Qualité anonymisation : 10 améliorations (audit 59 OGC, +98% établissements, 0 FP médical)

- RE_ETABLISSEMENT élargi (CH/CHU/CHRU/CHS/HIA/CLCC/GHT/GCS) ; CH/CHS exigent un nom après le sigle
- RE_HOPITAL_VILLE enrichi (Centre de Soins, Maison de Santé/Retraite, Résidence, Foyer)
- Nouveau RE_SERVICE (service/unité/pôle/département + nom propre)
- org_gpe_keep=False : le NER masque désormais ORG/LOC
- +40 stop words (oncologie, confrères, préparations, spécialités médicales...)
- RE_IBAN accepte les espaces (groupes de 4, format standard)
- RE_TEL : tiret échappé + nouveau RE_TEL_COMPACT (numéros collés, ex. 0612345678)
- RE_ADRESSE : +10 types de voies (lotissement, hameau, esplanade, côte...)
- RE_AGE élargi (« patiente 72 ans », « , 88 ans », « (85A) »)
- Blacklist companion tokens (27 mots génériques/spécialités médicales)
- Propagation globale VLM_SERVICE et VLM_ETAB

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 10:43:30 +01:00
parent 86274b3b2a
commit 0684b77d68
1 changed files with 83 additions and 15 deletions
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -98,7 +98,7 @@ DEFAULTS_CFG = {
    "whitelist": {
        "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
        "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
-        "org_gpe_keep": True,
+        "org_gpe_keep": False,
    },
    "blacklist": {
        "force_mask_terms": [],
@@ -147,8 +147,9 @@ CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"}

 # Baseline regex
 RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
-RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[ .-]?\d){8}(?!\d)")
-RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
+RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
+RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)")
+RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b")
 RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
 RE_FINESS = re.compile(r"\bFINESS\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
 RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
@@ -356,6 +357,26 @@ _MEDICAL_STOP_WORDS_SET = {
    "paracetamol", "paracétamol", "unité", "unite",
    # FP résiduels batch 10 OGC (termes médicaux/instructions soins)
    "glyc", "glycosurie", "vider", "forte",
+    # FP audit batch 59 OGC (mots courants/médicaux flagués comme NOM)
+    "oncologie", "confrères", "confrere", "doubles", "chers", "motif",
+    "responsable", "autre", "autres", "autonome", "autonomes",
+    "préparations", "preparations", "prévenir", "prevenir",
+    "acétylsalicylique", "acetylsalicylique", "angio",
+    "desc", "diu", "cambo", "bains", "dogue", "barreau",
+    "haitz", "alde",
+    # Spécialités/services récurrents comme FP NOM
+    "cancérologie", "cancerologie", "réanimation", "reanimation",
+    "urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
+    "gériatrie", "geriatrie", "pédiatrie", "pediatrie",
+    "ophtalmologie", "stomatologie", "allergologie",
+    "kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie",
+    "orthopédie", "orthopedie", "traumatologie",
+    "palliatifs", "palliative", "palliatif",
+    "addictologie", "alcoologie", "tabacologie",
+    # Termes structurels trackare
+    "transmissions", "transmission", "releve", "relevé",
+    "objectif", "objectifs", "evaluation", "évaluation",
+    "planification", "planifié", "planifiee",
 }
 # Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
 _MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
@@ -467,7 +488,8 @@ RE_DATE = re.compile(
 )
 RE_ADRESSE = re.compile(
    r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*"
-    r"(?:rue|avenue|av\.?|boulevard|bd\.?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence)"
+    r"(?:rue|avenue|av\.?|boulevard|bd\.?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence"
+    r"|lotissement|lot\.?|cit[ée]|hameau|quartier|voie|parvis|esplanade|promenade|côte)"
    r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}",
    re.IGNORECASE,
 )
@@ -485,18 +507,37 @@ RE_BP = re.compile(
    re.IGNORECASE,
 )
 RE_AGE = re.compile(
-    r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+de\s+)(\d{1,3})\s*ans\b",
+    r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+(?:de\s+)?|(?:,\s*|\(\s*)"
+    r")(\d{1,3})\s*(?:ans|A)\b",
    re.IGNORECASE,
 )
-# Établissements de santé : avec nom (EHPAD Bayonne) ou seuls (EHPAD, SSR, USLD)
+# Établissements de santé : sigles longs peuvent être seuls, sigles courts (CH/CHS) nécessitent un nom
+_ETAB_NAME = (r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
+              r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)")
 RE_ETABLISSEMENT = re.compile(
-    r"\b((?:EHPAD|SSR|USLD|HAD|SSR/USLD|CSAPA|CMP|CMPP|UGA)"
-    r"(?:\s+(?:de\s+|d['']\s*)?[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
+    r"\b("
+    # Sigles longs : acceptés seuls ou avec nom
+    r"(?:EHPAD|SSR/USLD|SSR|USLD|HAD|CSAPA|CMPP|CMP|UGA|CHRU|CHU|HIA|CLCC|GHT|GCS)"
+    + _ETAB_NAME + r"*"
+    r"|"
+    # Sigles courts (CH, CHS) : obligent un nom après pour éviter les faux positifs
+    r"(?:CHS|CH)" + _ETAB_NAME + r"+"
+    r")",
 )
 RE_HOPITAL_VILLE = re.compile(
-    r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier)"
-    r"\s+(?:de\s+|d['']\s*)?(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
-    r"(?:\s+(?:de\s+|d['']\s*)?[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
+    r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier"
+    r"|[Cc]entre\s+[Dd]e\s+[Ss]oins|[Mm]aison\s+[Dd]e\s+[Ss]anté"
+    r"|[Mm]aison\s+[Dd]e\s+[Rr]etraite|[Rr]ésidence|[Ff]oyer)"
+    r"\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
+    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
+    r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
+    r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
+)
+RE_SERVICE = re.compile(
+    r"\b((?:[Ss]ervice|[Uu]nité|[Pp]ôle|[Dd]épartement)\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
+    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
+    r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
+    r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
 )
 RE_NUMERO_DOSSIER = re.compile(
    r"(?:dossier|n°\s*dossier|NDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})"
@@ -708,6 +749,7 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
        audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
        return PLACEHOLDERS["TEL"]
    line = RE_TEL.sub(_repl_tel, line)
+    line = RE_TEL_COMPACT.sub(_repl_tel, line)

    # IBAN
    def _repl_iban(m: re.Match) -> str:
@@ -779,6 +821,12 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
    line = RE_ETABLISSEMENT.sub(_repl_etab, line)
    line = RE_HOPITAL_VILLE.sub(_repl_etab, line)

+    # Services hospitaliers (service de Cardiologie, unité de soins palliatifs, etc.)
+    def _repl_service(m: re.Match) -> str:
+        audit.append(PiiHit(page_idx, "ETAB", m.group(0), PLACEHOLDERS["MASK"]))
+        return PLACEHOLDERS["MASK"]
+    line = RE_SERVICE.sub(_repl_service, line)
+
    # Champs structurés : Lieu de naissance, Ville de résidence (masquage direct, sans filtre stop words)
    _re_lieu = re.compile(r"(Lieu\s+de\s+naissance\s*:\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+)")
    def _repl_lieu(m: re.Match) -> str:
@@ -852,6 +900,7 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
        audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
        return PLACEHOLDERS["TEL"]
    key = RE_TEL.sub(_repl_tel, key)
+    key = RE_TEL_COMPACT.sub(_repl_tel, key)
    def _repl_email(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
        return PLACEHOLDERS["EMAIL"]
@@ -1193,7 +1242,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]

 def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
    # remplace via regex sur les 'word' détectés (approche pragmatique)
-    keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", True))
+    keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", False))
    def repl_once(s: str, old: str, new: str) -> str:
        return re.sub(rf"\b{re.escape(old)}\b", new, s)
    out = text
@@ -1364,6 +1413,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
    # PII critiques (comme avant)
    protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
    protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
+    protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
    protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
    # NIR avec validation
    def _rescan_nir(m: re.Match) -> str:
@@ -1382,6 +1432,8 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
    # Établissements
    protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
    protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
+    # Services hospitaliers
+    protected = RE_SERVICE.sub(PLACEHOLDERS["MASK"], protected)
    # Personnes contextuelles (avec whitelist)
    wl_sections = set()
    wl_phrases = set()
@@ -1832,18 +1884,33 @@ def process_pdf(
            _global_name_tokens.add(word)
    # 4a-bis) Noms compagnons : si un token connu est suivi/précédé d'un mot majuscule inconnu
    #         dans le texte brut, c'est aussi un nom (ex: "Diego OLIVER" → OLIVER est un nom)
+    _COMPANION_BLACKLIST = {
+        "ZONE", "PARTI", "PLAN", "MAIN", "FORT", "FORTE", "BILAN",
+        "MISE", "NOTE", "AIDE", "BASE", "FACE", "DOSE", "TIGE",
+        "VOIE", "ONDE", "SOIN", "DEMI", "MODE", "CURE", "PAGE",
+        # Spécialités/services
+        "CANCEROLOGIE", "ONCOLOGIE", "REANIMATION", "RADIOLOGIE",
+        "CARDIOLOGIE", "NEUROLOGIE", "PNEUMOLOGIE", "UROLOGIE",
+        "GERIATRIE", "PEDIATRIE", "NEPHROLOGIE", "HEMATOLOGIE",
+        "OPHTALMOLOGIE", "STOMATOLOGIE", "ALLERGOLOGIE",
+        "RHUMATOLOGIE", "DERMATOLOGIE", "IMMUNOLOGIE",
+    }
    raw_full = "\n\n".join(pages_text)
    _companion_tokens: set = set()
    for token in _global_name_tokens:
        # Token connu suivi d'un mot ALL-CAPS
        for m in re.finditer(rf"\b{re.escape(token)}\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\b", raw_full):
            candidate = m.group(1)
-            if candidate.lower() not in _MEDICAL_STOP_WORDS_SET and candidate not in _global_name_tokens:
+            if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
+                    and candidate not in _global_name_tokens
+                    and candidate not in _COMPANION_BLACKLIST):
                _companion_tokens.add(candidate)
        # Mot ALL-CAPS suivi du token connu
        for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\s+{re.escape(token)}\b", raw_full):
            candidate = m.group(1)
-            if candidate.lower() not in _MEDICAL_STOP_WORDS_SET and candidate not in _global_name_tokens:
+            if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
+                    and candidate not in _global_name_tokens
+                    and candidate not in _COMPANION_BLACKLIST):
                _companion_tokens.add(candidate)
    _global_name_tokens.update(_companion_tokens)

@@ -1882,7 +1949,8 @@ def process_pdf(
    # 4b) TEL, EMAIL, ADRESSE, CODE_POSTAL : propager les valeurs uniques sur toutes les pages
    _global_pii: Dict[str, set] = {}
    for h in anon.audit:
-        if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB"}:
+        if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
+                      "VLM_SERVICE", "VLM_ETAB"}:
            _global_pii.setdefault(h.kind, set()).add(h.original.strip())
    for kind, values in _global_pii.items():
        placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"])