From b5058b9c4b2e2c8917d7a6c865e261470b3b4641 Mon Sep 17 00:00:00 2001
From: Domi31tls <dbazin52@gmail.com>
Date: Tue, 14 Apr 2026 10:23:09 +0200
Subject: [PATCH] =?UTF-8?q?fix(whitelist):=20GUI=20whitelist=5Fphrases=20e?=
 =?UTF-8?q?nfin=20lue=20et=20appliqu=C3=A9e=20par=20le=20core?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug majeur depuis l'externalisation : la GUI v5.4 écrivait whitelist_phrases
(clé racine), mais le core ne lisait que whitelist.sections_titres /
noms_maj_excepts (imbriqué). _apply_whitelist post-masquage était par ailleurs
désactivée (c157205) sans remplacement.

Correctif :
- load_dictionaries() lit whitelist_phrases et alimente deux sets globaux
  (_WHITELIST_NEVER_MASK_TOKENS, _WHITELIST_NEVER_MASK_PHRASES). Mots-outils
  (de, du, le...) écartés pour éviter blocages collatéraux.
- _apply_extracted_names : check whitelist en pré-masquage, prime sur les
  force_names (ex: "DUPONT" reste visible même après "Dr DUPONT").
- process_pdf : filtrage final de l'audit avant redact_pdf_vector. Les hits
  multi-mots dont au moins un sous-token est whitelist sont retirés.
- redact_pdf_vector : check whitelist sur les sous-mots cherchés
  individuellement quand le multi-mots n'est pas trouvé sur la page.

Validé sur trackare-18007562-23054899 :
- Avec whitelist BELLEAU : 0 hit dans audit, 31 occurrences préservées dans PDF
- Sans whitelist : 0 occurrence dans PDF (non-régression OK)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 anonymizer_core_refactored_onnx.py | 97 +++++++++++++++++++++++++++++-
 1 file changed, 94 insertions(+), 3 deletions(-)

diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py
index 6c57ea0..d558033 100644
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -1126,6 +1126,45 @@ class NameCandidate:
 
 # ----------------- Config loader -----------------
 
+# Absolute whitelist: tokens the user has declared must NEVER be masked.
+# Populated from cfg["whitelist_phrases"] by load_dictionaries().
+# Checked before masking (_apply_extracted_names), in redact_pdf_vector, and
+# in the final audit filter of process_pdf, so matching name hits are dropped.
+_WHITELIST_NEVER_MASK_TOKENS: set = set()  # lowercased individual tokens
+_WHITELIST_NEVER_MASK_PHRASES: set = set()  # lowercased full phrases
+
+
+_WHITELIST_FUNCTION_WORDS = {  # never indexed as standalone whitelist tokens
+    "de", "du", "des", "le", "la", "les", "et", "ou", "à", "a",
+    "en", "un", "une", "au", "aux", "of", "the", "and",
+}
+
+
+def _load_whitelist_phrases(phrases) -> int:
+    """Index whitelist phrases into the module-level never-mask sets.
+    Returns the number of tokens newly added (duplicates are not recounted).
+    Function words (de, du, le...) and tokens under 3 chars are not indexed
+    on their own to avoid collateral blocking; the full phrase is kept."""
+    if isinstance(phrases, str):
+        phrases = [phrases]  # a lone string is one phrase, not characters
+    added = 0
+    for phrase in phrases or ():
+        # Skip None/empty/blank entries
+        p = str(phrase).strip() if phrase else ""
+        if not p:
+            continue
+        _WHITELIST_NEVER_MASK_PHRASES.add(p.lower())
+        for tok in re.split(r"[\s\-']+", p):
+            tok = tok.strip(" .,;:!?()[]{}\"'«»")
+            tok_lower = tok.lower()
+            if len(tok) < 3 or tok_lower in _WHITELIST_FUNCTION_WORDS:
+                continue
+            if tok_lower not in _WHITELIST_NEVER_MASK_TOKENS:
+                _WHITELIST_NEVER_MASK_TOKENS.add(tok_lower)
+                added += 1
+    return added
+
+
 def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
     cfg = DEFAULTS_CFG.copy()
     if config_path and config_path.exists() and yaml is not None:
@@ -1151,6 +1190,14 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
                 _VILLE_BLACKLIST.add(str(v).strip().upper())
         log.info("Villes blacklist YAML supplémentaires : %d", len(extra_villes))
 
+    # Whitelist absolue : termes/phrases que l'utilisateur a déclarés "à ne JAMAIS masquer"
+    # Alimentée par la GUI v5.4 (clé racine whitelist_phrases du YAML).
+    wl_phrases = cfg.get("whitelist_phrases", []) or []
+    if wl_phrases:
+        n_added = _load_whitelist_phrases(wl_phrases)
+        log.info("Whitelist phrases chargées : %d phrases (%d tokens)",
+                 len(wl_phrases), n_added)
+
     return cfg
 
 # ----------------- Extraction -----------------
@@ -2399,6 +2446,11 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
             continue
         if n not in _force and n.lower() in _MEDICAL_STOP_WORDS_SET:
             continue
+        # Whitelist absolue (configurée par l'utilisateur via la GUI) : prime sur tout,
+        # y compris sur les force_names (Dr/Mme). Si l'établissement déclare un terme
+        # comme "à ne jamais masquer", on respecte même s'il apparaît après "Dr".
+        if n.lower() in _WHITELIST_NEVER_MASK_TOKENS:
+            continue
         safe_names.add(n)
     # Ajouter un hit global (page=-1) par nom pour la redaction PDF raster
     # (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page)
@@ -3804,6 +3856,9 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
             if h.kind in _VECTOR_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":
                 if h.kind != "NOM_FORCE" and token.lower() in _MEDICAL_STOP_WORDS_SET:
                     continue
+                # Whitelist absolue : prime même sur NOM_FORCE
+                if token.lower() in _WHITELIST_NEVER_MASK_TOKENS:
+                    continue
                 if " " not in token:
                     rects = _search_whole_word(page, token)
                     if not rects and ocr_word_map and pno in ocr_word_map:
@@ -3823,6 +3878,9 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
                             word = word.strip(" .-'")
                             if len(word) < 4 or word.lower() in _MEDICAL_STOP_WORDS_SET:
                                 continue
+                            # Whitelist absolue sur sous-mots
+                            if word.lower() in _WHITELIST_NEVER_MASK_TOKENS:
+                                continue
                             rects.extend(_search_whole_word(page, word))
                             if not rects and ocr_word_map and pno in ocr_word_map:
                                 rects.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
@@ -4519,9 +4577,42 @@ def process_pdf(
     )
     final_text = _RE_BRACKET_CLEAN.sub(r"\1", final_text)
 
-    # 6) Whitelist : DÉSACTIVÉ — l'approche post-masquage est défectueuse
-    # (injecte des phrases whitelist au mauvais endroit quand [NOM] masque un vrai nom)
-    # TODO: implémenter en pré-masquage (protéger les spans avant anonymisation)
+    # 6) Whitelist absolue : filtrer les hits qui matchent un terme whitelist
+    # de la GUI (clé YAML whitelist_phrases). Filet de sécurité après tous les
+    # mécanismes de détection — empêche DUPONT (whitelist) d'être masqué dans
+    # le PDF même s'il a été ajouté à l'audit par regex/NER/cross-validation.
+    if _WHITELIST_NEVER_MASK_TOKENS:
+        _NAME_LIKE_KINDS = {
+            "NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NOM_FORCE", "NOM_INITIAL",
+            "EDS_NOM", "EDS_PRENOM", "EDS_HOPITAL", "EDS_VILLE",
+            "ETAB", "ETAB_GLOBAL", "ETAB_FINESS", "ADDR_FINESS",
+            "NER_PER", "NER_ORG", "NER_LOC",
+            "VILLE", "force_term", "force_term_GLOBAL",
+        }
+        before = len(anon.audit)
+        kept = []
+        removed_tokens: set = set()
+        for h in anon.audit:
+            if h.kind not in _NAME_LIKE_KINDS:
+                kept.append(h); continue
+            tok = (h.original or "").strip()
+            if not tok:
+                kept.append(h); continue
+            tok_lower = tok.lower()
+            # Phrase complète whitelist → retirer
+            if tok_lower in _WHITELIST_NEVER_MASK_PHRASES:
+                removed_tokens.add(tok); continue
+            # Au moins un sous-token whitelist → retirer le hit (les sous-tokens
+            # non-whitelist sont déjà couverts par d'autres hits si nécessaire)
+            sub = [s for s in re.split(r"[\s\-']+", tok_lower) if s]
+            if any(s in _WHITELIST_NEVER_MASK_TOKENS for s in sub):
+                removed_tokens.add(tok); continue
+            kept.append(h)
+        anon.audit = kept
+        if before != len(anon.audit):
+            log.info("Whitelist : %d hit(s) filtré(s) (%s)",
+                     before - len(anon.audit),
+                     ", ".join(sorted(removed_tokens)[:10]))
 
     # Sauvegardes
     base = pdf_path.stem