refactor: externaliser DPI labels et companion blacklist (modifiables sans recompiler)
Suite de l'externalisation des règles. Trois listes étaient codées en dur dans anonymizer_core_refactored_onnx.py et impossibles à modifier par les établissements sans recompiler :
- _NEVER_MASK_AS_NAME (18 littéraux, soit 9 labels × 2 casses) — labels DPI structurels
- _DPI_LABELS_BLACKLIST (22 littéraux, soit 11 labels × 2 casses ; doublon partiel du précédent)
- _COMPANION_BLACKLIST (~75 entrées) — spécialités, labos pharma, mots ambigus
Les deux premières sont fusionnées dans data/dpi_labels_blacklist.txt (11 entrées uniques, comparaison case-insensitive). La troisième est déplacée dans data/companion_blacklist.txt (77 entrées, comparaison uppercase).
Ajout de deux clés YAML pour enrichissement par établissement :
- additional_dpi_labels (ex: "Service", "Statut")
- additional_companion_blacklist (ex: spécialités locales)
Les 3 niveaux cumulatifs habituels s'appliquent : code (vide) → fichiers data/ → YAML config. Chargement au démarrage avec log INFO du nombre d'entrées.
Test trackare-18007562-23054899 : 122 hits, 0 régression, 0 DPI label masqué comme NOM.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1133,6 +1133,31 @@ class NameCandidate:
|
|||||||
_WHITELIST_NEVER_MASK_TOKENS: set = set()
|
_WHITELIST_NEVER_MASK_TOKENS: set = set()
|
||||||
_WHITELIST_NEVER_MASK_PHRASES: set = set()
|
_WHITELIST_NEVER_MASK_PHRASES: set = set()
|
||||||
|
|
||||||
|
# Labels DPI structurels à ne JAMAIS masquer comme noms (Date, Note, Heure...)
|
||||||
|
# Stocké en LOWERCASE — la comparaison est case-insensitive.
|
||||||
|
# Chargé depuis data/dpi_labels_blacklist.txt + cfg["additional_dpi_labels"].
|
||||||
|
_DPI_LABELS_SET: set = set()
|
||||||
|
_dpi_file = Path(__file__).parent / "data" / "dpi_labels_blacklist.txt"
|
||||||
|
if _dpi_file.exists():
|
||||||
|
for _line in _dpi_file.read_text(encoding="utf-8").splitlines():
|
||||||
|
_w = _line.strip()
|
||||||
|
if _w and not _w.startswith("#"):
|
||||||
|
_DPI_LABELS_SET.add(_w.lower())
|
||||||
|
log.info("DPI labels blacklist chargés : %d entrées", len(_DPI_LABELS_SET))
|
||||||
|
|
||||||
|
# Companion blacklist : termes EN MAJUSCULES qui ne sont JAMAIS des noms
|
||||||
|
# (spécialités, labos pharma, mots courants ambigus).
|
||||||
|
# Stocké en UPPERCASE — la comparaison est faite contre des candidats déjà uppercase.
|
||||||
|
# Chargé depuis data/companion_blacklist.txt + cfg["additional_companion_blacklist"].
|
||||||
|
_COMPANION_BLACKLIST_SET: set = set()
|
||||||
|
_comp_file = Path(__file__).parent / "data" / "companion_blacklist.txt"
|
||||||
|
if _comp_file.exists():
|
||||||
|
for _line in _comp_file.read_text(encoding="utf-8").splitlines():
|
||||||
|
_w = _line.strip()
|
||||||
|
if _w and not _w.startswith("#"):
|
||||||
|
_COMPANION_BLACKLIST_SET.add(_w.upper())
|
||||||
|
log.info("Companion blacklist chargée : %d entrées", len(_COMPANION_BLACKLIST_SET))
|
||||||
|
|
||||||
|
|
||||||
_WHITELIST_FUNCTION_WORDS = {
|
_WHITELIST_FUNCTION_WORDS = {
|
||||||
"de", "du", "des", "le", "la", "les", "et", "ou", "à", "a",
|
"de", "du", "des", "le", "la", "les", "et", "ou", "à", "a",
|
||||||
@@ -1198,6 +1223,22 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
|
|||||||
log.info("Whitelist phrases chargées : %d phrases (%d tokens)",
|
log.info("Whitelist phrases chargées : %d phrases (%d tokens)",
|
||||||
len(wl_phrases), n_added)
|
len(wl_phrases), n_added)
|
||||||
|
|
||||||
|
# DPI labels supplémentaires (clé YAML additional_dpi_labels)
|
||||||
|
extra_dpi = cfg.get("additional_dpi_labels", []) or []
|
||||||
|
if extra_dpi:
|
||||||
|
for w in extra_dpi:
|
||||||
|
if w and str(w).strip():
|
||||||
|
_DPI_LABELS_SET.add(str(w).strip().lower())
|
||||||
|
log.info("DPI labels YAML supplémentaires : %d", len(extra_dpi))
|
||||||
|
|
||||||
|
# Companion blacklist supplémentaire (clé YAML additional_companion_blacklist)
|
||||||
|
extra_comp = cfg.get("additional_companion_blacklist", []) or []
|
||||||
|
if extra_comp:
|
||||||
|
for w in extra_comp:
|
||||||
|
if w and str(w).strip():
|
||||||
|
_COMPANION_BLACKLIST_SET.add(str(w).strip().upper())
|
||||||
|
log.info("Companion blacklist YAML supplémentaire : %d", len(extra_comp))
|
||||||
|
|
||||||
return cfg
|
return cfg
|
||||||
|
|
||||||
# ----------------- Extraction -----------------
|
# ----------------- Extraction -----------------
|
||||||
@@ -2428,18 +2469,12 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
|
|||||||
"""Remplace globalement chaque nom extrait dans le texte."""
|
"""Remplace globalement chaque nom extrait dans le texte."""
|
||||||
placeholder = PLACEHOLDERS["NOM"]
|
placeholder = PLACEHOLDERS["NOM"]
|
||||||
_force = force_names or set()
|
_force = force_names or set()
|
||||||
# Labels DPI structurels à ne jamais masquer comme noms
|
|
||||||
_NEVER_MASK_AS_NAME = {
|
|
||||||
"Date", "DATE", "Note", "NOTE", "Heure", "HEURE", "Type", "TYPE",
|
|
||||||
"Soin", "SOIN", "Soins", "SOINS", "Surv", "SURV",
|
|
||||||
"Page", "PAGE", "Presc", "PRESC",
|
|
||||||
}
|
|
||||||
safe_names = set()
|
safe_names = set()
|
||||||
for n in names:
|
for n in names:
|
||||||
if len(n) < 4 and n not in _force:
|
if len(n) < 4 and n not in _force:
|
||||||
# Tokens < 4 chars : accepter SEULEMENT les force_names (ex: "Ute" après Dr)
|
# Tokens < 4 chars : accepter SEULEMENT les force_names (ex: "Ute" après Dr)
|
||||||
continue
|
continue
|
||||||
if n in _NEVER_MASK_AS_NAME:
|
if n.lower() in _DPI_LABELS_SET:
|
||||||
continue
|
continue
|
||||||
# "Saint"/"SAINT" seul = bloquer. "Saint-Germes" composé = laisser passer
|
# "Saint"/"SAINT" seul = bloquer. "Saint-Germes" composé = laisser passer
|
||||||
if n.upper() in ("SAINT", "SAINTE") and "-" not in n:
|
if n.upper() in ("SAINT", "SAINTE") and "-" not in n:
|
||||||
@@ -4307,13 +4342,6 @@ def process_pdf(
|
|||||||
|
|
||||||
# 4a) Noms : extraire les tokens individuels
|
# 4a) Noms : extraire les tokens individuels
|
||||||
_nom_kinds = {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}
|
_nom_kinds = {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}
|
||||||
# Labels DPI / mots structurels à ne JAMAIS propager comme noms
|
|
||||||
_DPI_LABELS_BLACKLIST = {
|
|
||||||
"Date", "DATE", "Note", "NOTE", "Heure", "HEURE", "Type", "TYPE",
|
|
||||||
"Soin", "SOIN", "Soins", "SOINS", "Surv", "SURV",
|
|
||||||
"Saint", "SAINT", "Sainte", "SAINTE",
|
|
||||||
"Page", "PAGE", "Presc", "PRESC",
|
|
||||||
}
|
|
||||||
_global_name_tokens: set = set()
|
_global_name_tokens: set = set()
|
||||||
for h in anon.audit:
|
for h in anon.audit:
|
||||||
if h.kind not in _nom_kinds:
|
if h.kind not in _nom_kinds:
|
||||||
@@ -4324,36 +4352,13 @@ def process_pdf(
|
|||||||
continue
|
continue
|
||||||
if word.lower() in _MEDICAL_STOP_WORDS_SET:
|
if word.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||||
continue
|
continue
|
||||||
if word in _DPI_LABELS_BLACKLIST:
|
if word.lower() in _DPI_LABELS_SET:
|
||||||
continue
|
continue
|
||||||
if not word[0].isupper():
|
if not word[0].isupper():
|
||||||
continue
|
continue
|
||||||
_global_name_tokens.add(word)
|
_global_name_tokens.add(word)
|
||||||
# 4a-bis) Noms compagnons : si un token connu est suivi/précédé d'un mot majuscule inconnu
|
# 4a-bis) Noms compagnons : si un token connu est suivi/précédé d'un mot majuscule inconnu
|
||||||
# dans le texte brut, c'est aussi un nom (ex: "Diego OLIVER" → OLIVER est un nom)
|
# dans le texte brut, c'est aussi un nom (ex: "Diego OLIVER" → OLIVER est un nom)
|
||||||
_COMPANION_BLACKLIST = {
|
|
||||||
"ZONE", "PARTI", "PLAN", "MAIN", "FORT", "FORTE", "BILAN",
|
|
||||||
"MISE", "NOTE", "AIDE", "BASE", "FACE", "DOSE", "TIGE",
|
|
||||||
"VOIE", "ONDE", "SOIN", "DEMI", "MODE", "CURE", "PAGE",
|
|
||||||
# Spécialités/services
|
|
||||||
"CANCEROLOGIE", "ONCOLOGIE", "REANIMATION", "RADIOLOGIE",
|
|
||||||
"CARDIOLOGIE", "NEUROLOGIE", "PNEUMOLOGIE", "UROLOGIE",
|
|
||||||
"GERIATRIE", "PEDIATRIE", "NEPHROLOGIE", "HEMATOLOGIE",
|
|
||||||
"OPHTALMOLOGIE", "STOMATOLOGIE", "ALLERGOLOGIE",
|
|
||||||
"RHUMATOLOGIE", "DERMATOLOGIE", "IMMUNOLOGIE",
|
|
||||||
# Termes médicaux/courants FP OGC 21
|
|
||||||
"ALIMENTATION", "AUGMENTATION", "AMELIORATION",
|
|
||||||
"BILIAIRES", "BILIAIRE", "VOIES", "BILI",
|
|
||||||
"MEDECINE", "ENTERO", "DOSSIER", "AVIATION",
|
|
||||||
"SULFAMIDES", "CLAVULANIQUE", "MECILLINAM",
|
|
||||||
"TAZOBACTAM", "TEMOCILLINE", "ECOFLAC", "FURANES",
|
|
||||||
"CONTENTION", "ISOLEMENT", "ELIMINATION",
|
|
||||||
# Labos pharmaceutiques (FP dans tableaux prescriptions trackare)
|
|
||||||
"MACO", "AGUETTANT", "RENAUDIN", "LAVOISIER",
|
|
||||||
"COOPER", "ARROW", "BIOGARAN", "MYLAN", "TEVA", "ZENTIVA",
|
|
||||||
"PANCREATITE", "INFECTIEUX", "HEMODYNAMIQUE",
|
|
||||||
"SENSIBLE", "VARIABLE", "DOSAGE", "CAT",
|
|
||||||
}
|
|
||||||
raw_full = "\n\n".join(pages_text)
|
raw_full = "\n\n".join(pages_text)
|
||||||
_companion_tokens: set = set()
|
_companion_tokens: set = set()
|
||||||
for token in _global_name_tokens:
|
for token in _global_name_tokens:
|
||||||
@@ -4362,14 +4367,14 @@ def process_pdf(
|
|||||||
candidate = m.group(1)
|
candidate = m.group(1)
|
||||||
if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
|
if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
|
||||||
and candidate not in _global_name_tokens
|
and candidate not in _global_name_tokens
|
||||||
and candidate not in _COMPANION_BLACKLIST):
|
and candidate not in _COMPANION_BLACKLIST_SET):
|
||||||
_companion_tokens.add(candidate)
|
_companion_tokens.add(candidate)
|
||||||
# Mot ALL-CAPS suivi du token connu
|
# Mot ALL-CAPS suivi du token connu
|
||||||
for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{4,}})\s+{re.escape(token)}\b", raw_full):
|
for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{4,}})\s+{re.escape(token)}\b", raw_full):
|
||||||
candidate = m.group(1)
|
candidate = m.group(1)
|
||||||
if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
|
if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
|
||||||
and candidate not in _global_name_tokens
|
and candidate not in _global_name_tokens
|
||||||
and candidate not in _COMPANION_BLACKLIST):
|
and candidate not in _COMPANION_BLACKLIST_SET):
|
||||||
_companion_tokens.add(candidate)
|
_companion_tokens.add(candidate)
|
||||||
_global_name_tokens.update(_companion_tokens)
|
_global_name_tokens.update(_companion_tokens)
|
||||||
|
|
||||||
|
|||||||
@@ -72,6 +72,20 @@ additional_villes_blacklist: []
|
|||||||
# Exemple :
|
# Exemple :
|
||||||
# - "VOTRE_VILLE"
|
# - "VOTRE_VILLE"
|
||||||
|
|
||||||
|
# Labels DPI supplémentaires à ne jamais masquer comme noms
|
||||||
|
# (complète data/dpi_labels_blacklist.txt)
|
||||||
|
# Utiliser pour : titres de colonnes, en-têtes de sections, libellés de champs
|
||||||
|
additional_dpi_labels: []
|
||||||
|
# Exemple :
|
||||||
|
# - "Service"
|
||||||
|
# - "Statut"
|
||||||
|
|
||||||
|
# Termes en MAJUSCULES à ne jamais propager comme noms compagnons
|
||||||
|
# (complète data/companion_blacklist.txt — spécialités, labos pharma, mots ambigus)
|
||||||
|
additional_companion_blacklist: []
|
||||||
|
# Exemple :
|
||||||
|
# - "VOTRE_SPECIALITE"
|
||||||
|
|
||||||
flags:
|
flags:
|
||||||
case_insensitive: true
|
case_insensitive: true
|
||||||
unicode_word_boundaries: true
|
unicode_word_boundaries: true
|
||||||
|
|||||||
94
data/companion_blacklist.txt
Normal file
94
data/companion_blacklist.txt
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
# Companion blacklist : termes en MAJUSCULES qui apparaissent à côté d'un nom
|
||||||
|
# connu mais qui NE SONT PAS des noms (spécialités médicales, labos pharma,
|
||||||
|
# mots courants ambigus). Évite la propagation FP : "DUPONT CARDIOLOGIE"
|
||||||
|
# ne propage pas "CARDIOLOGIE" comme nom.
|
||||||
|
#
|
||||||
|
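# Format : un terme par ligne (converti en MAJUSCULES au chargement). Comparaison exacte : une entrée sans accent ne couvre pas la variante accentuée (ex. REANIMATION ≠ RÉANIMATION), ajouter les deux si nécessaire.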
|
||||||
|
# Lignes vides et lignes commençant par # ignorées.
|
||||||
|
|
||||||
|
# Mots ambigus courants
|
||||||
|
ZONE
|
||||||
|
PARTI
|
||||||
|
PLAN
|
||||||
|
MAIN
|
||||||
|
FORT
|
||||||
|
FORTE
|
||||||
|
BILAN
|
||||||
|
MISE
|
||||||
|
NOTE
|
||||||
|
AIDE
|
||||||
|
BASE
|
||||||
|
FACE
|
||||||
|
DOSE
|
||||||
|
TIGE
|
||||||
|
VOIE
|
||||||
|
ONDE
|
||||||
|
SOIN
|
||||||
|
DEMI
|
||||||
|
MODE
|
||||||
|
CURE
|
||||||
|
PAGE
|
||||||
|
|
||||||
|
# Spécialités / services médicaux
|
||||||
|
CANCEROLOGIE
|
||||||
|
ONCOLOGIE
|
||||||
|
REANIMATION
|
||||||
|
RADIOLOGIE
|
||||||
|
CARDIOLOGIE
|
||||||
|
NEUROLOGIE
|
||||||
|
PNEUMOLOGIE
|
||||||
|
UROLOGIE
|
||||||
|
GERIATRIE
|
||||||
|
PEDIATRIE
|
||||||
|
NEPHROLOGIE
|
||||||
|
HEMATOLOGIE
|
||||||
|
OPHTALMOLOGIE
|
||||||
|
STOMATOLOGIE
|
||||||
|
ALLERGOLOGIE
|
||||||
|
RHUMATOLOGIE
|
||||||
|
DERMATOLOGIE
|
||||||
|
IMMUNOLOGIE
|
||||||
|
|
||||||
|
# Termes médicaux / courants (FP signalés OGC 21)
|
||||||
|
ALIMENTATION
|
||||||
|
AUGMENTATION
|
||||||
|
AMELIORATION
|
||||||
|
BILIAIRES
|
||||||
|
BILIAIRE
|
||||||
|
VOIES
|
||||||
|
BILI
|
||||||
|
MEDECINE
|
||||||
|
ENTERO
|
||||||
|
DOSSIER
|
||||||
|
AVIATION
|
||||||
|
SULFAMIDES
|
||||||
|
CLAVULANIQUE
|
||||||
|
MECILLINAM
|
||||||
|
TAZOBACTAM
|
||||||
|
TEMOCILLINE
|
||||||
|
ECOFLAC
|
||||||
|
FURANES
|
||||||
|
CONTENTION
|
||||||
|
ISOLEMENT
|
||||||
|
ELIMINATION
|
||||||
|
|
||||||
|
# Labos pharmaceutiques (FP dans tableaux prescriptions trackare)
|
||||||
|
MACO
|
||||||
|
AGUETTANT
|
||||||
|
RENAUDIN
|
||||||
|
LAVOISIER
|
||||||
|
COOPER
|
||||||
|
ARROW
|
||||||
|
BIOGARAN
|
||||||
|
MYLAN
|
||||||
|
TEVA
|
||||||
|
ZENTIVA
|
||||||
|
|
||||||
|
# Termes médicaux additionnels
|
||||||
|
PANCREATITE
|
||||||
|
INFECTIEUX
|
||||||
|
HEMODYNAMIQUE
|
||||||
|
SENSIBLE
|
||||||
|
VARIABLE
|
||||||
|
DOSAGE
|
||||||
|
CAT
|
||||||
16
data/dpi_labels_blacklist.txt
Normal file
16
data/dpi_labels_blacklist.txt
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# Labels DPI / mots structurels à ne JAMAIS masquer comme noms
|
||||||
|
# (titres de colonnes, en-têtes de sections, libellés de champs DPI)
|
||||||
|
# Comparaison case-insensitive — un mot par ligne.
|
||||||
|
# Lignes vides et lignes commençant par # ignorées.
|
||||||
|
|
||||||
|
Date
|
||||||
|
Note
|
||||||
|
Heure
|
||||||
|
Type
|
||||||
|
Soin
|
||||||
|
Soins
|
||||||
|
Surv
|
||||||
|
Page
|
||||||
|
Presc
|
||||||
|
Saint
|
||||||
|
Sainte
|
||||||
Reference in New Issue
Block a user