feat(phase2): Gazetteers FINESS 102K établissements + fine-tuning CamemBERT-bio F1=89%
Gazetteers FINESS (data.gouv.fr open data): - 102K numéros FINESS → détection par lookup exact dans _mask_admin_label + selective_rescan - 122K noms d'établissements, 113K téléphones, 76K adresses (disponibles) - Un nombre 9 chiffres matchant un vrai FINESS est masqué même sans label "FINESS" Fine-tuning CamemBERT-bio (almanach/camembert-bio-base): - Export silver annotations réécrit : alignement original↔pseudonymisé (difflib) → 6862 entités B- (vs 3344 avec l'ancien audit-only) sur 222K tokens - Sliding windows (200 tokens, stride 100) pour documents longs - WeightedNERTrainer avec class weights cappés (max 10x) + label smoothing - Résultat: Precision=88.1%, Recall=89.8%, F1=88.9% (20 epochs, lr=1e-5) - Modèle sauvegardé dans models/camembert-bio-deid/best (non commité) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -150,6 +150,55 @@ def _load_insee_gazetteers():
|
||||
_load_insee_gazetteers()
|
||||
|
||||
|
||||
# ----------------- FINESS gazetteer (healthcare facilities) -----------------
|
||||
_FINESS_NUMBERS: set = set() # 9-digit FINESS numbers
|
||||
_FINESS_ETAB_NAMES: set = set() # facility names (lowercase)
|
||||
_FINESS_TELEPHONES: set = set() # 10-digit phone numbers
|
||||
|
||||
def _read_finess_entries(path: Path, label: str, *, min_length: int = 0,
                         lowercase: bool = False) -> set | None:
    """Read one FINESS gazetteer file into a set of cleaned entries.

    Args:
        path: Text file holding one entry per line.
        label: Name of the gazetteer, interpolated into the log messages.
        min_length: Entries whose stripped length is below this are dropped.
        lowercase: When true, entries are lower-cased before being stored.

    Returns:
        The set of entries, or ``None`` when the file does not exist or
        could not be read, so the caller can keep its current value.
    """
    if not path.exists():
        return None
    try:
        # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
        # BOM. str.strip() does not remove a BOM, so with plain "utf-8" the
        # first entry of a BOM-prefixed file would never match a lookup.
        raw_lines = path.read_text(encoding="utf-8-sig").splitlines()
        entries = set()
        for raw in raw_lines:
            value = raw.strip()  # strip once, reuse for filter and storage
            if not value or len(value) < min_length:
                continue
            entries.add(value.lower() if lowercase else value)
        log.info(f"Gazetteer FINESS {label}: {len(entries)} entrées")
        return entries
    except Exception as e:
        # Deliberately broad: gazetteers are optional, and a damaged data
        # file must not prevent the module from being imported.
        log.warning(f"Erreur chargement FINESS {label}: {e}")
        return None


def _load_finess_gazetteers():
    """Load the FINESS gazetteers (numbers, facility names, phone numbers).

    Reads ``finess_numbers.txt``, ``etablissements_noms.txt`` and
    ``telephones.txt`` from the ``data/finess`` directory next to this
    module and rebinds the matching module-level sets. A file that is
    missing or unreadable leaves its set untouched.
    """
    global _FINESS_NUMBERS, _FINESS_ETAB_NAMES, _FINESS_TELEPHONES
    data_dir = Path(__file__).parent / "data" / "finess"

    # FINESS numbers
    numbers = _read_finess_entries(data_dir / "finess_numbers.txt", "numéros")
    if numbers is not None:
        _FINESS_NUMBERS = numbers

    # Facility names, stored lower-cased; names shorter than 6 characters
    # are skipped.
    names = _read_finess_entries(
        data_dir / "etablissements_noms.txt",
        "noms",
        min_length=6,
        lowercase=True,
    )
    if names is not None:
        _FINESS_ETAB_NAMES = names

    # Phone numbers
    phones = _read_finess_entries(data_dir / "telephones.txt", "téléphones")
    if phones is not None:
        _FINESS_TELEPHONES = phones
|
||||
|
||||
# Populate the FINESS gazetteers (module-level call).
_load_finess_gazetteers()
|
||||
|
||||
|
||||
# ----------------- Medical whitelists -----------------
|
||||
_MEDICAL_STRUCTURAL_TERMS = set()
|
||||
_MEDICATION_WHITELIST = set()
|
||||
@@ -1030,11 +1079,23 @@ def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[st
|
||||
return line
|
||||
|
||||
|
||||
# Standalone run of exactly nine digits, delimited by word boundaries;
# group 1 captures the digits.
RE_BARE_9DIGITS = re.compile(r"\b(\d{9})\b")
|
||||
|
||||
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
|
||||
m = RE_FINESS.search(line)
|
||||
if m:
|
||||
val = m.group(1); audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"]))
|
||||
return RE_FINESS.sub(lambda _: f"FINESS : {PLACEHOLDERS['FINESS']}", line)
|
||||
|
||||
# Détection FINESS par gazetteer : nombre 9 chiffres qui matche un vrai numéro FINESS
|
||||
if _FINESS_NUMBERS:
|
||||
for m9 in RE_BARE_9DIGITS.finditer(line):
|
||||
if m9.group(1) in _FINESS_NUMBERS:
|
||||
val = m9.group(1)
|
||||
audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"]))
|
||||
line = line.replace(val, PLACEHOLDERS["FINESS"], 1)
|
||||
return line
|
||||
|
||||
m = RE_OGC.search(line)
|
||||
if m:
|
||||
val = m.group(1); audit.append(PiiHit(page_idx, "OGC", val, PLACEHOLDERS["OGC"]))
|
||||
@@ -2012,6 +2073,11 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
||||
protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected)
|
||||
# N° RPPS
|
||||
protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected)
|
||||
# FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS)
|
||||
if _FINESS_NUMBERS:
|
||||
def _rescan_finess(m: re.Match) -> str:
|
||||
return PLACEHOLDERS["FINESS"] if m.group(1) in _FINESS_NUMBERS else m.group(0)
|
||||
protected = RE_BARE_9DIGITS.sub(_rescan_finess, protected)
|
||||
# Établissements
|
||||
protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
|
||||
protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
|
||||
|
||||
Reference in New Issue
Block a user