feat(phase2): Multi-signal NER — BDPM gazetteers, confiance EDS, safe patterns, GLiNER
Chantier 1 : Intégration BDPM (5737 médicaments officiels) dans la whitelist des médicaments
Chantier 2 : Safe patterns contextuels (dosages mg/mL/cpr, formes pharma, même ligne)
Chantier 3 : Scores de confiance NER réels (edsnlp 0.20 ner_confidence_score)
Chantier 4 : GLiNER zero-shot (urchade/gliner_multi_pii-v1) en vote croisé
Chantier 5 : Scripts d'export des annotations silver + fine-tuning CamemBERT-bio

0 fuite, 0 régression, 18 faux positifs (FP) supplémentaires éliminés.
Sécurité : GLiNER ne peut rejeter une entité que si la confiance NER est < 0.70.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -97,6 +97,23 @@ def _load_edsnlp_drug_names() -> set:
|
||||
return set()
|
||||
|
||||
|
||||
def _load_bdpm_medication_names() -> set:
    """Load medication names from the bundled BDPM list.

    Reads ``data/bdpm/medication_names.txt`` located next to this module,
    one name per line. Each line is stripped of surrounding whitespace and
    lowercased; entries shorter than 3 characters are dropped.

    Returns:
        A set of lowercase names. The set is empty when the file is missing,
        unreadable, or not valid UTF-8, so the result can always be merged
        into a whitelist with ``set.update()``.
    """
    bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medication_names.txt"
    try:
        # "utf-8-sig" decodes plain UTF-8 too, and drops a leading BOM that
        # str.strip() would otherwise leave glued to the first name.
        content = bdpm_path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeError):
        # Best-effort by design: the list is optional, so a missing,
        # unreadable or badly encoded file simply contributes no names.
        # Any other exception is a programming error and must surface.
        return set()
    stripped = (line.strip() for line in content.splitlines())
    return {name.lower() for name in stripped if len(name) >= 3}
|
||||
|
||||
|
||||
# ----------------- Whitelists Médicales -----------------
|
||||
_MEDICAL_STRUCTURAL_TERMS = set()
|
||||
_MEDICATION_WHITELIST = set()
|
||||
@@ -117,15 +134,16 @@ def load_medical_whitelists():
|
||||
except Exception as e:
|
||||
log.warning(f"Erreur chargement whitelist médicale: {e}")
|
||||
|
||||
# 2. Charger la whitelist des médicaments
|
||||
# 2. Charger la whitelist des médicaments (edsnlp + BDPM + manuels)
|
||||
_MEDICATION_WHITELIST = _load_edsnlp_drug_names()
|
||||
_MEDICATION_WHITELIST.update(_load_bdpm_medication_names())
|
||||
# Ajouter médicaments manquants
|
||||
additional_meds = {
|
||||
"idacio", "salazopyrine", "infliximab", "apranax",
|
||||
"ketoprofene", "prevenar", "pneumovax", "bétadine"
|
||||
}
|
||||
_MEDICATION_WHITELIST.update(additional_meds)
|
||||
log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments")
|
||||
log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)")
|
||||
|
||||
# Charger les whitelists au démarrage du module
|
||||
load_medical_whitelists()
|
||||
@@ -1828,13 +1846,41 @@ def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str,
|
||||
# Vérifier si c'est un médicament connu
|
||||
if w.lower() in _MEDICATION_WHITELIST:
|
||||
continue
|
||||
# Règles de validation heuristiques par type d'entité
|
||||
# Chantier 3+4 : Confiance NER + vote croisé GLiNER (combinés)
|
||||
# Sécurité d'abord : haute confiance NER → toujours masquer
|
||||
# GLiNER peut rejeter SEULEMENT si confiance NER basse
|
||||
gliner_vote = e.get("gliner_confirmed") # True=PII, False=médical, None=neutre
|
||||
if label in ("NOM", "PRENOM"):
|
||||
# Rejeter si le contexte précédent (15 chars) contient un dosage
|
||||
score = e.get("score", 1.0)
|
||||
if isinstance(score, float) and score < 0.70:
|
||||
# Basse confiance NER : GLiNER peut trancher
|
||||
if gliner_vote is False:
|
||||
continue # NER pas sûr + GLiNER dit "médical" → skip
|
||||
if score < 0.30:
|
||||
continue # Très basse confiance → skip même sans GLiNER
|
||||
# Chantier 2 : Safe patterns contextuels (Philter-style)
|
||||
# Token suivi/précédé de dosages ou formes pharma → jamais un nom de personne
|
||||
pos = text.find(w)
|
||||
if pos > 0:
|
||||
ctx_before = text[max(0, pos - 15):pos]
|
||||
if re.search(r"\d+\s*(?:mg|UI|ml|µg|mcg)\b", ctx_before, re.IGNORECASE):
|
||||
if pos >= 0:
|
||||
# Contexte MÊME LIGNE seulement ([ \t] pas \n)
|
||||
line_start = text.rfind('\n', 0, pos)
|
||||
line_start = 0 if line_start < 0 else line_start + 1
|
||||
line_end = text.find('\n', pos + len(w))
|
||||
line_end = len(text) if line_end < 0 else line_end
|
||||
ctx_before = text[max(line_start, pos - 30):pos]
|
||||
ctx_after = text[pos + len(w):min(line_end, pos + len(w) + 30)]
|
||||
# Safe pattern: précédé ou suivi d'un dosage (mg, mL, UI, comprimé, etc.)
|
||||
_RE_DOSAGE = r"\d+[ \t]*(?:mg|ml|ui|µg|mcg|g|kg|cp|cpr|gel|amp|fl|dos|inh)\b"
|
||||
if re.search(_RE_DOSAGE, ctx_before, re.IGNORECASE):
|
||||
continue
|
||||
if re.search(_RE_DOSAGE, ctx_after, re.IGNORECASE):
|
||||
continue
|
||||
# Safe pattern: suivi d'une forme pharmaceutique
|
||||
_RE_PHARMA_FORM = r"^\s*(?:comprim[ée]s?|g[ée]lules?|sachets?|ampoules?|flacons?|solutions?|injectable|suppo(?:sitoire)?s?|sirop|pommade|cr[eè]me|gouttes?|patch|inhal)"
|
||||
if re.search(_RE_PHARMA_FORM, ctx_after, re.IGNORECASE):
|
||||
continue
|
||||
# Safe pattern: précédé de "taux de", "score de", "dosage de"
|
||||
if re.search(r"(?:taux|score|dosage|indice|index|grade|stade|type)\s+(?:de\s+)?$", ctx_before, re.IGNORECASE):
|
||||
continue
|
||||
elif label == "HOPITAL":
|
||||
_STRUCTURAL_WORDS = {"SERVICE", "POLE", "PÔLE", "UNITE", "UNITÉ", "SECTEUR"}
|
||||
@@ -1848,8 +1894,9 @@ def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str,
|
||||
return out
|
||||
|
||||
|
||||
def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager") -> Tuple[str, List[PiiHit]]:
|
||||
"""Applique EDS-Pseudo sur le narratif (même structure que apply_hf_ner_on_narrative)."""
|
||||
def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager",
|
||||
gliner_mgr: Any = None) -> Tuple[str, List[PiiHit]]:
|
||||
"""Applique EDS-Pseudo sur le narratif avec validation croisée GLiNER optionnelle."""
|
||||
if manager is None or not manager.is_loaded():
|
||||
return text_out, []
|
||||
# isoler [TABLES]
|
||||
@@ -1871,6 +1918,10 @@ def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "
|
||||
for pg in pages:
|
||||
paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
|
||||
ents_per_para = manager.infer_paragraphs(paras)
|
||||
# Chantier 4 : Validation croisée GLiNER (vote majoritaire)
|
||||
if gliner_mgr is not None and hasattr(gliner_mgr, 'validate_entities') and gliner_mgr.is_loaded():
|
||||
for i, (para, ents) in enumerate(zip(paras, ents_per_para)):
|
||||
ents_per_para[i] = gliner_mgr.validate_entities(para, ents, threshold=0.4)
|
||||
buf = []
|
||||
for para, ents in zip(paras, ents_per_para):
|
||||
masked = _mask_with_eds_pseudo(para, ents, cfg, hits)
|
||||
@@ -2309,6 +2360,7 @@ def process_pdf(
|
||||
ner_thresholds=None,
|
||||
ogc_label: Optional[str] = None,
|
||||
vlm_manager=None,
|
||||
gliner_manager=None,
|
||||
) -> Dict[str, str]:
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
cfg = load_dictionaries(config_path)
|
||||
@@ -2331,7 +2383,7 @@ def process_pdf(
|
||||
if use_hf and ner_manager is not None and ner_manager.is_loaded():
|
||||
# Détecter le type de manager et appeler la bonne fonction
|
||||
if EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager):
|
||||
final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager)
|
||||
final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager, gliner_mgr=gliner_manager)
|
||||
else:
|
||||
final_text, hf_hits = apply_hf_ner_on_narrative(final_text, cfg, ner_manager, ner_thresholds)
|
||||
anon.audit.extend(hf_hits)
|
||||
|
||||
Reference in New Issue
Block a user