Qualité anonymisation : 10 améliorations (audit 59 OGC, +98% établissements, 0 FP médical)

- RE_ETABLISSEMENT élargi (CH/CHU/CHRU/CHS/HIA/CLCC/GHT/GCS) ; CH/CHS exigent un nom après le sigle
- RE_HOPITAL_VILLE enrichi (Centre de Soins, Maison de Santé/Retraite, Résidence, Foyer)
- Nouveau RE_SERVICE (service/unité/pôle/département + nom propre)
- org_gpe_keep=False : le NER masque désormais ORG/LOC
- +40 stop words (oncologie, confrères, préparations, spécialités médicales...)
- RE_IBAN accepte les espaces (groupes de 4, format standard)
- RE_TEL : tiret échappé + nouveau RE_TEL_COMPACT (numéros collés, ex. 0612345678)
- RE_ADRESSE : +10 types de voies (lotissement, hameau, esplanade, côte...)
- RE_AGE élargi (« patiente 72 ans », « , 88 ans », « (85A) »)
- Blacklist companion tokens (27 mots génériques/spécialités médicales)
- Propagation globale VLM_SERVICE et VLM_ETAB

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 10:43:30 +01:00
parent 86274b3b2a
commit 0684b77d68
1 changed files with 83 additions and 15 deletions
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -98,7 +98,7 @@ DEFAULTS_CFG = {
    "whitelist": {
        "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
        "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
-        "org_gpe_keep": True,
+        "org_gpe_keep": False,
    },
    "blacklist": {
        "force_mask_terms": [],
@@ -147,8 +147,9 @@ CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"}
 # Baseline regex
 RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
-RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[ .-]?\d){8}(?!\d)")
+RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
-RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
+RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)")
 RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b")
 RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
 RE_FINESS = re.compile(r"\bFINESS\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
 RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
@@ -356,6 +357,26 @@ _MEDICAL_STOP_WORDS_SET = {
    "paracetamol", "paracétamol", "unité", "unite",
    # FP résiduels batch 10 OGC (termes médicaux/instructions soins)
    "glyc", "glycosurie", "vider", "forte",
    # FP audit batch 59 OGC (mots courants/médicaux flagués comme NOM)
    "oncologie", "confrères", "confrere", "doubles", "chers", "motif",
    "responsable", "autre", "autres", "autonome", "autonomes",
    "préparations", "preparations", "prévenir", "prevenir",
    "acétylsalicylique", "acetylsalicylique", "angio",
    "desc", "diu", "cambo", "bains", "dogue", "barreau",
    "haitz", "alde",
    # Spécialités/services récurrents comme FP NOM
    "cancérologie", "cancerologie", "réanimation", "reanimation",
    "urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
    "gériatrie", "geriatrie", "pédiatrie", "pediatrie",
    "ophtalmologie", "stomatologie", "allergologie",
    "kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie",
    "orthopédie", "orthopedie", "traumatologie",
    "palliatifs", "palliative", "palliatif",
    "addictologie", "alcoologie", "tabacologie",
    # Termes structurels trackare
    "transmissions", "transmission", "releve", "relevé",
    "objectif", "objectifs", "evaluation", "évaluation",
    "planification", "planifié", "planifiee",
 }
 # Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
 _MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
@@ -467,7 +488,8 @@ RE_DATE = re.compile(
 )
 RE_ADRESSE = re.compile(
    r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*"
-    r"(?:rue|avenue|av\.?|boulevard|bd\.?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence)"
+    r"(?:rue|avenue|av\.?|boulevard|bd\.?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence"
    r"|lotissement|lot\.?|cit[ée]|hameau|quartier|voie|parvis|esplanade|promenade|côte)"
    r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}",
    re.IGNORECASE,
 )
@@ -485,18 +507,37 @@ RE_BP = re.compile(
    re.IGNORECASE,
 )
 RE_AGE = re.compile(
-    r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+de\s+)(\d{1,3})\s*ans\b",
+    r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+(?:de\s+)?|(?:,\s*|\(\s*)"
    r")(\d{1,3})\s*(?:ans|A)\b",
    re.IGNORECASE,
 )
-# Établissements de santé : avec nom (EHPAD Bayonne) ou seuls (EHPAD, SSR, USLD)
+# Établissements de santé : sigles longs peuvent être seuls, sigles courts (CH/CHS) nécessitent un nom
 _ETAB_NAME = (r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
              r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)")
 RE_ETABLISSEMENT = re.compile(
-    r"\b((?:EHPAD|SSR|USLD|HAD|SSR/USLD|CSAPA|CMP|CMPP|UGA)"
+    r"\b("
-    r"(?:\s+(?:de\s+|d['']\s*)?[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
+    # Sigles longs : acceptés seuls ou avec nom
    r"(?:EHPAD|SSR/USLD|SSR|USLD|HAD|CSAPA|CMPP|CMP|UGA|CHRU|CHU|HIA|CLCC|GHT|GCS)"
    + _ETAB_NAME + r"*"
    r"|"
    # Sigles courts (CH, CHS) : obligent un nom après pour éviter les faux positifs
    r"(?:CHS|CH)" + _ETAB_NAME + r"+"
    r")",
 )
 RE_HOPITAL_VILLE = re.compile(
-    r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier)"
+    r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier"
-    r"\s+(?:de\s+|d['']\s*)?(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
+    r"|[Cc]entre\s+[Dd]e\s+[Ss]oins|[Mm]aison\s+[Dd]e\s+[Ss]anté"
-    r"(?:\s+(?:de\s+|d['']\s*)?[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
+    r"|[Mm]aison\s+[Dd]e\s+[Rr]etraite|[Rr]ésidence|[Ff]oyer)"
    r"\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
    r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
    r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
 )
 RE_SERVICE = re.compile(
    r"\b((?:[Ss]ervice|[Uu]nité|[Pp]ôle|[Dd]épartement)\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
    r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
    r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
 )
 RE_NUMERO_DOSSIER = re.compile(
    r"(?:dossier|n°\s*dossier|NDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})"
@@ -708,6 +749,7 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
        audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
        return PLACEHOLDERS["TEL"]
    line = RE_TEL.sub(_repl_tel, line)
    line = RE_TEL_COMPACT.sub(_repl_tel, line)
    # IBAN
    def _repl_iban(m: re.Match) -> str:
@@ -779,6 +821,12 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
    line = RE_ETABLISSEMENT.sub(_repl_etab, line)
    line = RE_HOPITAL_VILLE.sub(_repl_etab, line)
    # Services hospitaliers (service de Cardiologie, unité de soins palliatifs, etc.)
    def _repl_service(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "ETAB", m.group(0), PLACEHOLDERS["MASK"]))
        return PLACEHOLDERS["MASK"]
    line = RE_SERVICE.sub(_repl_service, line)
    # Champs structurés : Lieu de naissance, Ville de résidence (masquage direct, sans filtre stop words)
    _re_lieu = re.compile(r"(Lieu\s+de\s+naissance\s*:\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+)")
    def _repl_lieu(m: re.Match) -> str:
@@ -852,6 +900,7 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
        audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
        return PLACEHOLDERS["TEL"]
    key = RE_TEL.sub(_repl_tel, key)
    key = RE_TEL_COMPACT.sub(_repl_tel, key)
    def _repl_email(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
        return PLACEHOLDERS["EMAIL"]
@@ -1193,7 +1242,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
 def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
    # remplace via regex sur les 'word' détectés (approche pragmatique)
-    keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", True))
+    keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", False))
    def repl_once(s: str, old: str, new: str) -> str:
        return re.sub(rf"\b{re.escape(old)}\b", new, s)
    out = text
@@ -1364,6 +1413,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
    # PII critiques (comme avant)
    protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
    protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
    protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
    protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
    # NIR avec validation
    def _rescan_nir(m: re.Match) -> str:
@@ -1382,6 +1432,8 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
    # Établissements
    protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
    protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
    # Services hospitaliers
    protected = RE_SERVICE.sub(PLACEHOLDERS["MASK"], protected)
    # Personnes contextuelles (avec whitelist)
    wl_sections = set()
    wl_phrases = set()
@@ -1832,18 +1884,33 @@ def process_pdf(
            _global_name_tokens.add(word)
    # 4a-bis) Noms compagnons : si un token connu est suivi/précédé d'un mot majuscule inconnu
    #         dans le texte brut, c'est aussi un nom (ex: "Diego OLIVER" → OLIVER est un nom)
    _COMPANION_BLACKLIST = {
        "ZONE", "PARTI", "PLAN", "MAIN", "FORT", "FORTE", "BILAN",
        "MISE", "NOTE", "AIDE", "BASE", "FACE", "DOSE", "TIGE",
        "VOIE", "ONDE", "SOIN", "DEMI", "MODE", "CURE", "PAGE",
        # Spécialités/services
        "CANCEROLOGIE", "ONCOLOGIE", "REANIMATION", "RADIOLOGIE",
        "CARDIOLOGIE", "NEUROLOGIE", "PNEUMOLOGIE", "UROLOGIE",
        "GERIATRIE", "PEDIATRIE", "NEPHROLOGIE", "HEMATOLOGIE",
        "OPHTALMOLOGIE", "STOMATOLOGIE", "ALLERGOLOGIE",
        "RHUMATOLOGIE", "DERMATOLOGIE", "IMMUNOLOGIE",
    }
    raw_full = "\n\n".join(pages_text)
    _companion_tokens: set = set()
    for token in _global_name_tokens:
        # Token connu suivi d'un mot ALL-CAPS
        for m in re.finditer(rf"\b{re.escape(token)}\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\b", raw_full):
            candidate = m.group(1)
-            if candidate.lower() not in _MEDICAL_STOP_WORDS_SET and candidate not in _global_name_tokens:
+            if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
                    and candidate not in _global_name_tokens
                    and candidate not in _COMPANION_BLACKLIST):
                _companion_tokens.add(candidate)
        # Mot ALL-CAPS suivi du token connu
        for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\s+{re.escape(token)}\b", raw_full):
            candidate = m.group(1)
-            if candidate.lower() not in _MEDICAL_STOP_WORDS_SET and candidate not in _global_name_tokens:
+            if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
                    and candidate not in _global_name_tokens
                    and candidate not in _COMPANION_BLACKLIST):
                _companion_tokens.add(candidate)
    _global_name_tokens.update(_companion_tokens)
@@ -1882,7 +1949,8 @@ def process_pdf(
    # 4b) TEL, EMAIL, ADRESSE, CODE_POSTAL : propager les valeurs uniques sur toutes les pages
    _global_pii: Dict[str, set] = {}
    for h in anon.audit:
-        if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB"}:
+        if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
                      "VLM_SERVICE", "VLM_ETAB"}:
            _global_pii.setdefault(h.kind, set()).add(h.original.strip())
    for kind, values in _global_pii.items():
        placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"])