fix: DR. Ute (3 chars), SAINT-GERMES composé, SODIUM MACO/BAX pharma

- force_names bypass le seuil 4 chars (prénoms courts après Dr/Mme : Ute, Eva)
- SAINT/SAINTE seul = bloqué, SAINT-xxx composé = accepté comme nom
- Labos pharma ajoutés aux stop-words + companion blacklist :
  MACO, AGUETTANT, RENAUDIN, LAVOISIER, COOPER, ARROW, BIOGARAN, MYLAN,
  TEVA, ZENTIVA
- Score : 99.8/100 (amélioration, "Sie" corrigé)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-31 15:17:37 +02:00
parent c157205751
commit 2f19f7c470
2 changed files with 28 additions and 4 deletions

View File

@@ -2385,12 +2385,21 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
_NEVER_MASK_AS_NAME = { _NEVER_MASK_AS_NAME = {
"Date", "DATE", "Note", "NOTE", "Heure", "HEURE", "Type", "TYPE", "Date", "DATE", "Note", "NOTE", "Heure", "HEURE", "Type", "TYPE",
"Soin", "SOIN", "Soins", "SOINS", "Surv", "SURV", "Soin", "SOIN", "Soins", "SOINS", "Surv", "SURV",
"Saint", "SAINT", "Sainte", "SAINTE",
"Page", "PAGE", "Presc", "PRESC", "Page", "PAGE", "Presc", "PRESC",
} }
safe_names = {n for n in names if len(n) >= 4 safe_names = set()
and n not in _NEVER_MASK_AS_NAME for n in names:
and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)} if len(n) < 4 and n not in _force:
# Tokens < 4 chars : accepter SEULEMENT les force_names (ex: "Ute" après Dr)
continue
if n in _NEVER_MASK_AS_NAME:
continue
# "Saint"/"SAINT" seul = bloquer. "Saint-Germes" composé = laisser passer
if n.upper() in ("SAINT", "SAINTE") and "-" not in n:
continue
if n not in _force and n.lower() in _MEDICAL_STOP_WORDS_SET:
continue
safe_names.add(n)
# Ajouter un hit global (page=-1) par nom pour la redaction PDF raster # Ajouter un hit global (page=-1) par nom pour la redaction PDF raster
# (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page) # (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page)
# Les noms forcés (contexte Dr/Mme) utilisent NOM_FORCE pour bypasser # Les noms forcés (contexte Dr/Mme) utilisent NOM_FORCE pour bypasser
@@ -4275,6 +4284,9 @@ def process_pdf(
"SULFAMIDES", "CLAVULANIQUE", "MECILLINAM", "SULFAMIDES", "CLAVULANIQUE", "MECILLINAM",
"TAZOBACTAM", "TEMOCILLINE", "ECOFLAC", "FURANES", "TAZOBACTAM", "TEMOCILLINE", "ECOFLAC", "FURANES",
"CONTENTION", "ISOLEMENT", "ELIMINATION", "CONTENTION", "ISOLEMENT", "ELIMINATION",
# Labos pharmaceutiques (FP dans tableaux prescriptions trackare)
"MACO", "AGUETTANT", "RENAUDIN", "LAVOISIER",
"COOPER", "ARROW", "BIOGARAN", "MYLAN", "TEVA", "ZENTIVA",
"PANCREATITE", "INFECTIEUX", "HEMODYNAMIQUE", "PANCREATITE", "INFECTIEUX", "HEMODYNAMIQUE",
"SENSIBLE", "VARIABLE", "DOSAGE", "CAT", "SENSIBLE", "VARIABLE", "DOSAGE", "CAT",
} }

View File

@@ -1309,3 +1309,15 @@ zymad
étage étage
évaluation évaluation
évolution évolution
# Laboratoires pharmaceutiques (FP prescriptions trackare)
maco
aguettant
renaudin
lavoisier
cooper
arrow
biogaran
mylan
teva
zentiva