fix: FP médicaments dans raster + texte — RE_EXTRACT_STAFF_ROLE + FINESS + stop-words

Bug #1 (critique) : RE_EXTRACT_STAFF_ROLE matchait à l'intérieur des mots (IDE dans METOCLOPRAMIDE, AS dans ATORVASTATINE) → ajout \b word boundaries et suppression du ? optionnel sur ASH (AS matchait partout) Bug #2 : raster multi-mots utilisait page.search_for() (substring matching) → ajout vérification frontières de mots pour les tokens multi-mots dans redact_pdf_raster et redact_pdf_vector FP FINESS Aho-Corasick : - "resistance" (Centre de la Résistance) matchait "résistance aux fluoroquinolones" - "radiotherapie" matchait "tumorectomie, radiothérapie et hormonothérapie" → ajout blacklist : resistance, radiotherapie, chimiotherapie, etc. FP villes : "COU" (commune) matchait dans "prurit (cou, décolleté, dos)" → ajout COU, DOS, SEIN, BRAS à _VILLE_BLACKLIST Stop-words : ajout "totale", "partielle", "prothese", "unicompartimentale" Score évaluation maintenu à 100.0/100 (A+) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-17 07:11:57 +01:00
parent 2731bc1ce7
commit ad7f1ffa8a
1 changed files with 35 additions and 3 deletions
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -200,6 +200,8 @@ _VILLE_BLACKLIST = {
    "FRANCE", "EUROPE",
    # Termes ambigus (aussi communes INSEE) - trackare/DPI
    "COURANT",  # "Médecin courant" ≠ ville
    # Parties du corps homonymes de communes (FP "prurit invalidant (COU, décolleté)")
    "COU", "DOS", "SEIN", "BRAS",
 }
 try:
@@ -753,6 +755,9 @@ _MEDICAL_STOP_WORDS_SET = {
    "chlorure",
    # Dispositifs médicaux (FP "OXYGENE LUNETTES" → [NOM])
    "canule", "canules", "masque", "sonde", "sondes",
    # Termes chirurgicaux FP comme [NOM] (retour relecteur 2026-03-17)
    "totale", "total", "partielle", "partiel",
    "prothese", "prothèse", "unicompartimentale",
 }
 # Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
 _MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
@@ -865,8 +870,8 @@ RE_EXTRACT_DR_DEST = re.compile(
 )
 # Noms du personnel médical après un rôle : "Aide : Marie-Paule BORDABERRY"
 RE_EXTRACT_STAFF_ROLE = re.compile(
-    r"(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH?|Cadre[ \t]+Infirmier"
+    r"\b(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH|Cadre[ \t]+Infirmier"
-    r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)[ \t]*:?[ \t]*"
+    r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)\b[ \t]*:?[ \t]*"
    r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:[ \t]*-[ \t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?[ \t]+)?"
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[ \t\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})",
 )
@@ -2559,6 +2564,12 @@ def _build_finess_ac():
        "comprimee", "comprimees", "injectable", "injectables",
        "maintenant", "actuellement", "auparavant", "prochainement",
        "rapidement", "correctement", "directement", "simplement",
        # Termes médicaux homonymes d'établissements FINESS (retour relecteur 2026-03-17)
        "resistance", "radiotherapie", "chimiotherapie", "curietherapie",
        "hormonotherapie", "immunotherapie", "kinesitherapie",
        "ergotherapie", "orthophonie", "psychomotricite",
        "reeducation", "readaptation", "convalescence",
        "dependance", "autonomie", "gerontologie",
    }
    # Expressions multi-mots trop génériques
    _ac_generic_phrases = {
@@ -3334,7 +3345,14 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
                        rects = _search_ocr_words(ocr_word_map[pno], token, page.rect)
                    all_rects.extend(rects)
                else:
                    # Vérification frontières de mots (comme raster)
                    rects = page.search_for(token)
                    if rects:
                        page_text = page.get_text()
                        import re as _re
                        if not _re.search(r"(?<![A-Za-zÀ-ÿ])" + _re.escape(token) + r"(?![A-Za-zÀ-ÿ])",
                                          page_text, _re.IGNORECASE):
                            rects = []
                    if not rects:
                        for word in token.split():
                            word = word.strip(" .-'")
@@ -3469,8 +3487,22 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
                        found_ww = _search_ocr_words(ocr_word_map[pno], token, page.rect)
                    rects.extend(found_ww)
                else:
-                    # Token multi-mots : d'abord chercher la chaîne complète
+                    # Token multi-mots : chercher la chaîne complète puis vérifier
                    # les frontières de mots pour éviter le substring matching
                    # (ex: "TATINE EG" trouvé dans "ATORVASTATINE EG")
                    found_multi = page.search_for(token)
                    if found_multi:
                        # Vérifier que le match est sur des frontières de mots
                        page_text = page.get_text()
                        verified = []
                        for rect in found_multi:
                            # Extraire le texte autour du match pour vérifier les limites
                            # Chercher le token dans le texte brut avec \b
                            import re as _re
                            if _re.search(r"(?<![A-Za-zÀ-ÿ])" + _re.escape(token) + r"(?![A-Za-zÀ-ÿ])",
                                          page_text, _re.IGNORECASE):
                                verified.append(rect)
                        found_multi = verified
                    if not found_multi:
                        # Fallback : chercher chaque mot comme mot entier
                        for word in token.split():