fix(whitelist): GUI whitelist_phrases enfin lue et appliquée par le core

Bug majeur depuis l'externalisation : la GUI v5.4 écrivait whitelist_phrases (clé racine), mais le core ne lisait que whitelist.sections_titres / noms_maj_excepts (imbriqué). _apply_whitelist post-masquage était par ailleurs désactivée (c157205) sans remplacement.

Correctif :
- load_dictionaries() lit whitelist_phrases et alimente deux sets globaux (_WHITELIST_NEVER_MASK_TOKENS, _WHITELIST_NEVER_MASK_PHRASES). Mots-outils (de, du, le...) écartés pour éviter blocages collatéraux.
- _apply_extracted_names : check whitelist en pré-masquage, prime sur les force_names (ex: "DUPONT" reste visible même après "Dr DUPONT").
- process_pdf : filtrage final de l'audit avant redact_pdf_vector. Les hits multi-mots dont au moins un sous-token est whitelist sont retirés.
- redact_pdf_vector : check whitelist sur les sous-mots cherchés individuellement quand le multi-mots n'est pas trouvé sur la page.

Validé sur trackare-18007562-23054899 :
- Avec whitelist BELLEAU : 0 hit dans audit, 31 occurrences préservées dans PDF
- Sans whitelist : 0 occurrence dans PDF (non-régression OK)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 10:23:09 +02:00
parent b23355ed23
commit b5058b9c4b
1 changed files with 94 additions and 3 deletions
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -1126,6 +1126,45 @@ class NameCandidate:

 # ----------------- Config loader -----------------

+# Absolute whitelist: tokens the user has declared must NEVER be masked.
+# Populated from cfg["whitelist_phrases"] in load_dictionaries().
+# Enforced at two levels: pre-masking (_apply_extracted_names) and a final
+# filter (before redact_pdf_vector) that drops any matching NOM/PER/ORG hit.
+_WHITELIST_NEVER_MASK_TOKENS: set = set()
+_WHITELIST_NEVER_MASK_PHRASES: set = set()
+
+# Function words that are never indexed as individual whitelist tokens.
+_WHITELIST_FUNCTION_WORDS = {
+    "de", "du", "des", "le", "la", "les", "et", "ou", "à", "a",
+    "en", "un", "une", "au", "aux", "of", "the", "and",
+}
+
+
+def _load_whitelist_phrases(phrases) -> int:
+    """Tokenize whitelist phrases into the module-level never-mask sets.
+
+    Each non-blank phrase is stored lowercased in _WHITELIST_NEVER_MASK_PHRASES;
+    its tokens (split on whitespace, hyphens, apostrophes) go into
+    _WHITELIST_NEVER_MASK_TOKENS unless shorter than 3 characters or listed in
+    _WHITELIST_FUNCTION_WORDS. A bare string is treated as a single phrase.
+    Returns the number of tokens newly added (duplicates are not counted)."""
+    if not phrases:
+        return 0
+    if isinstance(phrases, str):
+        phrases = [phrases]  # YAML scalar: avoid iterating character by character
+    before = len(_WHITELIST_NEVER_MASK_TOKENS)
+    for phrase in phrases:
+        p = str(phrase).strip() if phrase else ""
+        if not p:
+            continue
+        _WHITELIST_NEVER_MASK_PHRASES.add(p.lower())
+        for tok in re.split(r"[\s\-']+", p):
+            tok = tok.strip(" .,;:!?()[]{}\"'«»")
+            if len(tok) >= 3 and tok.lower() not in _WHITELIST_FUNCTION_WORDS:
+                _WHITELIST_NEVER_MASK_TOKENS.add(tok.lower())
+    return len(_WHITELIST_NEVER_MASK_TOKENS) - before
+
+
 def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
    cfg = DEFAULTS_CFG.copy()
    if config_path and config_path.exists() and yaml is not None:
@@ -1151,6 +1190,14 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
                _VILLE_BLACKLIST.add(str(v).strip().upper())
        log.info("Villes blacklist YAML supplémentaires : %d", len(extra_villes))

+    # Whitelist absolue : termes/phrases que l'utilisateur a déclarés "à ne JAMAIS masquer"
+    # Alimentée par la GUI v5.4 (clé racine whitelist_phrases du YAML).
+    wl_phrases = cfg.get("whitelist_phrases", []) or []
+    if wl_phrases:
+        n_added = _load_whitelist_phrases(wl_phrases)
+        log.info("Whitelist phrases chargées : %d phrases (%d tokens)",
+                 len(wl_phrases), n_added)
+
    return cfg

 # ----------------- Extraction -----------------
@@ -2399,6 +2446,11 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
            continue
        if n not in _force and n.lower() in _MEDICAL_STOP_WORDS_SET:
            continue
+        # Whitelist absolue (configurée par l'utilisateur via la GUI) : prime sur tout,
+        # y compris sur les force_names (Dr/Mme). Si l'établissement déclare un terme
+        # comme "à ne jamais masquer", on respecte même s'il apparaît après "Dr".
+        if n.lower() in _WHITELIST_NEVER_MASK_TOKENS:
+            continue
        safe_names.add(n)
    # Ajouter un hit global (page=-1) par nom pour la redaction PDF raster
    # (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page)
@@ -3804,6 +3856,9 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
            if h.kind in _VECTOR_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":
                if h.kind != "NOM_FORCE" and token.lower() in _MEDICAL_STOP_WORDS_SET:
                    continue
+                # Whitelist absolue : prime même sur NOM_FORCE
+                if token.lower() in _WHITELIST_NEVER_MASK_TOKENS:
+                    continue
                if " " not in token:
                    rects = _search_whole_word(page, token)
                    if not rects and ocr_word_map and pno in ocr_word_map:
@@ -3823,6 +3878,9 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
                            word = word.strip(" .-'")
                            if len(word) < 4 or word.lower() in _MEDICAL_STOP_WORDS_SET:
                                continue
+                            # Whitelist absolue sur sous-mots
+                            if word.lower() in _WHITELIST_NEVER_MASK_TOKENS:
+                                continue
                            rects.extend(_search_whole_word(page, word))
                            if not rects and ocr_word_map and pno in ocr_word_map:
                                rects.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
@@ -4519,9 +4577,42 @@ def process_pdf(
    )
    final_text = _RE_BRACKET_CLEAN.sub(r"\1", final_text)

-    # 6) Whitelist : DÉSACTIVÉ — l'approche post-masquage est défectueuse
-    # (injecte des phrases whitelist au mauvais endroit quand [NOM] masque un vrai nom)
-    # TODO: implémenter en pré-masquage (protéger les spans avant anonymisation)
+    # 6) Whitelist absolue : filtrer les hits qui matchent un terme whitelist
+    # de la GUI (clé YAML whitelist_phrases). Filet de sécurité après tous les
+    # mécanismes de détection — empêche DUPONT (whitelist) d'être masqué dans
+    # le PDF même s'il a été ajouté à l'audit par regex/NER/cross-validation.
+    if _WHITELIST_NEVER_MASK_TOKENS:
+        _NAME_LIKE_KINDS = {
+            "NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NOM_FORCE", "NOM_INITIAL",
+            "EDS_NOM", "EDS_PRENOM", "EDS_HOPITAL", "EDS_VILLE",
+            "ETAB", "ETAB_GLOBAL", "ETAB_FINESS", "ADDR_FINESS",
+            "NER_PER", "NER_ORG", "NER_LOC",
+            "VILLE", "force_term", "force_term_GLOBAL",
+        }
+        before = len(anon.audit)
+        kept = []
+        removed_tokens: set = set()
+        for h in anon.audit:
+            if h.kind not in _NAME_LIKE_KINDS:
+                kept.append(h); continue
+            tok = (h.original or "").strip()
+            if not tok:
+                kept.append(h); continue
+            tok_lower = tok.lower()
+            # Phrase complète whitelist → retirer
+            if tok_lower in _WHITELIST_NEVER_MASK_PHRASES:
+                removed_tokens.add(tok); continue
+            # Au moins un sous-token whitelist → retirer le hit (les sous-tokens
+            # non-whitelist sont déjà couverts par d'autres hits si nécessaire)
+            sub = [s for s in re.split(r"[\s\-']+", tok_lower) if s]
+            if any(s in _WHITELIST_NEVER_MASK_TOKENS for s in sub):
+                removed_tokens.add(tok); continue
+            kept.append(h)
+        anon.audit = kept
+        if before != len(anon.audit):
+            log.info("Whitelist : %d hit(s) filtré(s) (%s)",
+                     before - len(anon.audit),
+                     ", ".join(sorted(removed_tokens)[:10]))

    # Sauvegardes
    base = pdf_path.stem