Files
anonymisation/eds_pseudo_manager.py
Domi31tls 26ac02b0cb feat(phase2): Multi-signal NER — BDPM gazetteers, confiance EDS, safe patterns, GLiNER
Chantier 1: Intégration BDPM (5737 médicaments officiels) dans medication whitelist
Chantier 2: Safe patterns contextuels (dosages mg/mL/cpr, formes pharma, même ligne)
Chantier 3: Scores de confiance NER réels (edsnlp 0.20 ner_confidence_score)
Chantier 4: GLiNER zero-shot (urchade/gliner_multi_pii-v1) en vote croisé
Chantier 5: Scripts export silver annotations + fine-tuning CamemBERT-bio

0 fuite, 0 régression, -18 FP supplémentaires éliminés.
Sécurité: GLiNER ne peut rejeter que si confiance NER < 0.70.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 12:01:46 +01:00

122 lines
4.1 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
EDS-Pseudo Manager — Interface compatible NerModelManager pour le modèle AP-HP eds-pseudo.
--------------------------------------------------------------------------------------------
Utilise edsnlp pour charger le pipeline eds-pseudo (F1=0.97 sur données cliniques AP-HP).
Mapping des 13 labels EDS-Pseudo vers les clés PLACEHOLDERS du core d'anonymisation.
Dépendance : pip install 'edsnlp[ml]>=0.12.0'
"""
from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional
try:
import edsnlp
_EDSNLP_AVAILABLE = True
except ImportError:
edsnlp = None # type: ignore
_EDSNLP_AVAILABLE = False
# EDS-Pseudo label -> PLACEHOLDERS key (anonymizer_core).
# Built from ordered (label, placeholder) pairs; insertion order is preserved.
EDS_LABEL_MAP: Dict[str, str] = dict(
    (
        # Last names and first names share the same placeholder.
        ("NOM", "NOM"),
        ("PRENOM", "NOM"),
        ("MAIL", "EMAIL"),
        ("TEL", "TEL"),
        ("SECU", "NIR"),
        ("ADRESSE", "ADRESSE"),
        ("ZIP", "CODE_POSTAL"),
        ("VILLE", "VILLE"),
        ("HOPITAL", "ETAB"),
        # ("DATE", "DATE") is DISABLED (Phase 1): only birth dates are masked,
        # not consultation/exam dates.
        ("DATE_NAISSANCE", "DATE_NAISSANCE"),
        ("IPP", "IPP"),
        ("NDA", "NDA"),
    )
)
# Catalogue shown in the GUI: display name -> model identifier.
EDS_MODELS_CATALOG: Dict[str, str] = dict(
    [("EDS-Pseudo AP-HP (edsnlp)", "AP-HP/eds-pseudo-public")]
)
class EdsPseudoManager:
    """Manager for the EDS-Pseudo model (edsnlp). Same interface as NerModelManager.

    Attributes:
        cache_dir: Optional directory given at construction, stored as a ``Path``.
            It is only stored here; no method of this class reads it.
        model_id: Identifier or path of the currently loaded model, or ``None``
            when nothing is loaded.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        """Create an unloaded manager.

        Args:
            cache_dir: Optional directory; kept as ``Path`` or ``None`` if falsy.
        """
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.model_id: Optional[str] = None
        self._nlp = None
        self._loaded = False

    def is_loaded(self) -> bool:
        """Return True when a pipeline has been loaded and is still held."""
        return self._loaded and self._nlp is not None

    def load(self, model_id_or_path: str = "AP-HP/eds-pseudo-public") -> None:
        """Load an edsnlp pipeline, replacing any previously loaded one.

        Args:
            model_id_or_path: Either an existing local directory or a model
                identifier; both are handed to ``edsnlp.load``.

        Raises:
            RuntimeError: If edsnlp could not be imported.
            Exception: Whatever ``edsnlp.load`` raises is propagated; in that
                case the manager is left fully unloaded (``model_id`` is None).
        """
        if not _EDSNLP_AVAILABLE:
            raise RuntimeError("edsnlp non disponible. Installez : pip install 'edsnlp[ml]>=0.12.0'")
        self.unload()
        path = Path(model_id_or_path)
        # A local directory is passed as a Path, anything else as the raw string.
        nlp = edsnlp.load(path if path.is_dir() else model_id_or_path)
        # Enable real NER confidence scores (edsnlp >= 0.16).
        try:
            nlp.get_pipe('ner').compute_confidence_score = True
        except Exception:
            # Deliberate best-effort: older versions have no confidence support,
            # and inference then falls back to a score of 1.0.
            pass
        # Commit state only once loading succeeded, so a failed load never
        # leaves model_id pointing at a model that is not actually loaded.
        self._nlp = nlp
        self.model_id = model_id_or_path
        self._loaded = True

    def unload(self) -> None:
        """Drop the pipeline reference and reset the manager to its unloaded state."""
        self._nlp = None
        self._loaded = False
        self.model_id = None

    def models_catalog(self) -> Dict[str, str]:
        """Return a copy of the GUI catalogue (display name -> model identifier)."""
        return dict(EDS_MODELS_CATALOG)

    @staticmethod
    def _coerce_score(raw_score: Any) -> float:
        """Convert a raw confidence value into a plain ``float``.

        Anything implementing ``__float__`` is accepted, not only builtin
        ``float``, so a real score is not silently replaced by the default.
        NOTE(review): which scalar type edsnlp actually stores (builtin float,
        numpy scalar, tensor, ...) is not visible from this file — confirm.

        Returns:
            The score as ``float``, or 1.0 when the value is missing, a bool,
            a string, or otherwise not convertible.
        """
        if raw_score is None or isinstance(raw_score, bool):
            return 1.0
        # str/bytes have no __float__, so numeric-looking text is not parsed.
        if not hasattr(raw_score, "__float__"):
            return 1.0
        try:
            return float(raw_score)
        except (TypeError, ValueError):
            return 1.0

    def infer_paragraphs(
        self,
        paragraphs: List[str],
        thresholds: Optional[Any] = None,
        max_length: int = 384,
        stride: int = 128,
    ) -> List[List[Dict[str, Any]]]:
        """Return, for each paragraph, the list of detected entities.

        Each entity dict has the keys: entity_group, word, start, end, score,
        eds_mapped_key. Entities whose upper-cased label is absent from
        ``EDS_LABEL_MAP`` are dropped.

        Args:
            paragraphs: Texts to analyse; ``None``, empty or whitespace-only
                items yield an empty entity list.
            thresholds: Accepted but not used by this implementation.
            max_length: Accepted but not used by this implementation.
            stride: Accepted but not used by this implementation.

        Returns:
            One list of entity dicts per input paragraph, in input order. When
            no model is loaded, every list is empty.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]
        out: List[List[Dict[str, Any]]] = []
        for para in paragraphs:
            if not para or not para.strip():
                out.append([])
                continue
            doc = self._nlp(para)
            ents: List[Dict[str, Any]] = []
            for ent in doc.ents:
                label = ent.label_.upper()
                mapped = EDS_LABEL_MAP.get(label)
                if mapped is None:
                    continue
                # Real confidence score when available (edsnlp >= 0.16), else 1.0.
                raw_score = getattr(ent._, 'ner_confidence_score', None)
                ents.append({
                    "entity_group": label,
                    "word": ent.text,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "score": self._coerce_score(raw_score),
                    "eds_mapped_key": mapped,
                })
            out.append(ents)
        return out