feat(phase2): Multi-signal NER — BDPM gazetteers, confiance EDS, safe patterns, GLiNER

Chantier 1: Intégration BDPM (5737 médicaments officiels) dans medication whitelist
Chantier 2: Safe patterns contextuels (dosages mg/mL/cpr, formes pharma, même ligne)
Chantier 3: Scores de confiance NER réels (edsnlp 0.20 ner_confidence_score)
Chantier 4: GLiNER zero-shot (urchade/gliner_multi_pii-v1) en vote croisé
Chantier 5: Scripts export silver annotations + fine-tuning CamemBERT-bio

0 fuite, 0 régression, -18 FP supplémentaires éliminés.
Sécurité: GLiNER ne peut rejeter que si confiance NER < 0.70.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 12:01:46 +01:00
parent 782551c1c6
commit 26ac02b0cb
16 changed files with 6431 additions and 41 deletions
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -97,6 +97,23 @@ def _load_edsnlp_drug_names() -> set:
        return set()


+def _load_bdpm_medication_names() -> set:
+    """Load medication names from the BDPM file (data/bdpm/medication_names.txt) as a lowercase set.
+    About 5700 brand and INN (DCI) names; returns an empty set if the file is missing or unreadable."""
+    bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medication_names.txt"  # resolved relative to this module's directory
+    if not bdpm_path.exists():
+        return set()
+    try:
+        names = set()
+        for line in bdpm_path.read_text(encoding="utf-8").splitlines():
+            w = line.strip()
+            if w and len(w) >= 3:  # skip blank lines and entries shorter than 3 characters
+                names.add(w.lower())
+        return names
+    except Exception:  # best-effort: any read/decode failure yields an empty set, with no logging
+        return set()
+
+
 # ----------------- Whitelists Médicales -----------------
 _MEDICAL_STRUCTURAL_TERMS = set()
 _MEDICATION_WHITELIST = set()
@@ -117,15 +134,16 @@ def load_medical_whitelists():
        except Exception as e:
            log.warning(f"Erreur chargement whitelist médicale: {e}")
    
-    # 2. Charger la whitelist des médicaments
+    # 2. Charger la whitelist des médicaments (edsnlp + BDPM + manuels)
    _MEDICATION_WHITELIST = _load_edsnlp_drug_names()
+    _MEDICATION_WHITELIST.update(_load_bdpm_medication_names())
    # Ajouter médicaments manquants
    additional_meds = {
        "idacio", "salazopyrine", "infliximab", "apranax",
        "ketoprofene", "prevenar", "pneumovax", "bétadine"
    }
    _MEDICATION_WHITELIST.update(additional_meds)
-    log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments")
+    log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)")

 # Charger les whitelists au démarrage du module
 load_medical_whitelists()
@@ -1828,13 +1846,41 @@ def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str,
            # Vérifier si c'est un médicament connu
            if w.lower() in _MEDICATION_WHITELIST:
                continue
-        # Règles de validation heuristiques par type d'entité
+        # Chantier 3+4 : Confiance NER + vote croisé GLiNER (combinés)
+        # Sécurité d'abord : haute confiance NER → toujours masquer
+        # GLiNER peut rejeter SEULEMENT si confiance NER basse
+        gliner_vote = e.get("gliner_confirmed")  # True=PII, False=médical, None=neutre
        if label in ("NOM", "PRENOM"):
-            # Rejeter si le contexte précédent (15 chars) contient un dosage
+            score = e.get("score", 1.0)
+            if isinstance(score, float) and score < 0.70:
+                # Basse confiance NER : GLiNER peut trancher
+                if gliner_vote is False:
+                    continue  # NER pas sûr + GLiNER dit "médical" → skip
+                if score < 0.30:
+                    continue  # Très basse confiance → skip même sans GLiNER
+        # Chantier 2 : Safe patterns contextuels (Philter-style)
+        # Token suivi/précédé de dosages ou formes pharma → jamais un nom de personne
            pos = text.find(w)
-            if pos > 0:
-                ctx_before = text[max(0, pos - 15):pos]
-                if re.search(r"\d+\s*(?:mg|UI|ml|µg|mcg)\b", ctx_before, re.IGNORECASE):
+            if pos >= 0:
+                # Contexte MÊME LIGNE seulement ([ \t] pas \n)
+                line_start = text.rfind('\n', 0, pos)
+                line_start = 0 if line_start < 0 else line_start + 1
+                line_end = text.find('\n', pos + len(w))
+                line_end = len(text) if line_end < 0 else line_end
+                ctx_before = text[max(line_start, pos - 30):pos]
+                ctx_after = text[pos + len(w):min(line_end, pos + len(w) + 30)]
+                # Safe pattern: précédé ou suivi d'un dosage (mg, mL, UI, comprimé, etc.)
+                _RE_DOSAGE = r"\d+[ \t]*(?:mg|ml|ui|µg|mcg|g|kg|cp|cpr|gel|amp|fl|dos|inh)\b"
+                if re.search(_RE_DOSAGE, ctx_before, re.IGNORECASE):
+                    continue
+                if re.search(_RE_DOSAGE, ctx_after, re.IGNORECASE):
+                    continue
+                # Safe pattern: suivi d'une forme pharmaceutique
+                _RE_PHARMA_FORM = r"^\s*(?:comprim[ée]s?|g[ée]lules?|sachets?|ampoules?|flacons?|solutions?|injectable|suppo(?:sitoire)?s?|sirop|pommade|cr[eè]me|gouttes?|patch|inhal)"
+                if re.search(_RE_PHARMA_FORM, ctx_after, re.IGNORECASE):
+                    continue
+                # Safe pattern: précédé de "taux de", "score de", "dosage de"
+                if re.search(r"(?:taux|score|dosage|indice|index|grade|stade|type)\s+(?:de\s+)?$", ctx_before, re.IGNORECASE):
                    continue
        elif label == "HOPITAL":
            _STRUCTURAL_WORDS = {"SERVICE", "POLE", "PÔLE", "UNITE", "UNITÉ", "SECTEUR"}
@@ -1848,8 +1894,9 @@ def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str,
    return out


-def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager") -> Tuple[str, List[PiiHit]]:
-    """Applique EDS-Pseudo sur le narratif (même structure que apply_hf_ner_on_narrative)."""
+def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager",
+                                   gliner_mgr: Any = None) -> Tuple[str, List[PiiHit]]:
+    """Applique EDS-Pseudo sur le narratif avec validation croisée GLiNER optionnelle."""
    if manager is None or not manager.is_loaded():
        return text_out, []
    # isoler [TABLES]
@@ -1871,6 +1918,10 @@ def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "
    for pg in pages:
        paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
        ents_per_para = manager.infer_paragraphs(paras)
+        # Chantier 4 : Validation croisée GLiNER (vote majoritaire)
+        if gliner_mgr is not None and hasattr(gliner_mgr, 'validate_entities') and gliner_mgr.is_loaded():
+            for i, (para, ents) in enumerate(zip(paras, ents_per_para)):
+                ents_per_para[i] = gliner_mgr.validate_entities(para, ents, threshold=0.4)
        buf = []
        for para, ents in zip(paras, ents_per_para):
            masked = _mask_with_eds_pseudo(para, ents, cfg, hits)
@@ -2309,6 +2360,7 @@ def process_pdf(
    ner_thresholds=None,
    ogc_label: Optional[str] = None,
    vlm_manager=None,
+    gliner_manager=None,
 ) -> Dict[str, str]:
    out_dir.mkdir(parents=True, exist_ok=True)
    cfg = load_dictionaries(config_path)
@@ -2331,7 +2383,7 @@ def process_pdf(
    if use_hf and ner_manager is not None and ner_manager.is_loaded():
        # Détecter le type de manager et appeler la bonne fonction
        if EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager):
-            final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager)
+            final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager, gliner_mgr=gliner_manager)
        else:
            final_text, hf_hits = apply_hf_ner_on_narrative(final_text, cfg, ner_manager, ner_thresholds)
        anon.audit.extend(hf_hits)