feat(phase2): Détection établissements par Aho-Corasick sur 108K noms FINESS
- Nouveau script build_finess_gazetteers.py : extraction noms distinctifs, villes, numéros depuis CSV open data - Automate Aho-Corasick (pyahocorasick) pour matching multi-pattern en ~1.7ms/page - 108K patterns indexés (noms composés >= 8 chars, mots uniques >= 10 chars) - Blacklist mots génériques (clinique, pharmacie, etc.) et stop words médicaux - Normalisation position-preserving (sans accents, même longueur) - Construction lazy de l'AC (après chargement des stop words) - Intégration dans _mask_line_by_regex et selective_rescan - Nouveau gazetteer villes_finess.txt (11,660 villes) - Résultats : "Girandières" → masqué, "Côte Basque" → masqué, 0 FP sur termes médicaux courants Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -154,10 +154,30 @@ _load_insee_gazetteers()
|
||||
_FINESS_NUMBERS: set = set() # numéros FINESS 9 chiffres
|
||||
_FINESS_ETAB_NAMES: set = set() # noms d'établissements (lowercase)
|
||||
_FINESS_TELEPHONES: set = set() # téléphones 10 chiffres
|
||||
_FINESS_VILLES: set = set() # villes FINESS (uppercase)
|
||||
_FINESS_AC = None # Automate Aho-Corasick pour noms distinctifs
|
||||
|
||||
try:
|
||||
import ahocorasick as _ahocorasick
|
||||
_AHO_AVAILABLE = True
|
||||
except ImportError:
|
||||
_ahocorasick = None
|
||||
_AHO_AVAILABLE = False
|
||||
|
||||
def _normalize_for_matching(s: str) -> str:
|
||||
"""Normalise pour matching gazetteer : lowercase, sans accents, espaces collapsés."""
|
||||
import unicodedata
|
||||
s = s.lower().strip()
|
||||
s = unicodedata.normalize("NFD", s)
|
||||
s = "".join(c for c in s if unicodedata.category(c) != "Mn")
|
||||
s = re.sub(r"[^a-z0-9\s\-]", " ", s)
|
||||
s = re.sub(r"\s+", " ", s).strip()
|
||||
return s
|
||||
|
||||
|
||||
def _load_finess_gazetteers():
|
||||
"""Charge les gazetteers FINESS (établissements, numéros, téléphones)."""
|
||||
global _FINESS_NUMBERS, _FINESS_ETAB_NAMES, _FINESS_TELEPHONES
|
||||
"""Charge les gazetteers FINESS (établissements, numéros, téléphones, villes, Aho-Corasick)."""
|
||||
global _FINESS_NUMBERS, _FINESS_ETAB_NAMES, _FINESS_TELEPHONES, _FINESS_VILLES, _FINESS_AC
|
||||
data_dir = Path(__file__).parent / "data" / "finess"
|
||||
|
||||
# Numéros FINESS
|
||||
@@ -172,7 +192,7 @@ def _load_finess_gazetteers():
|
||||
except Exception as e:
|
||||
log.warning(f"Erreur chargement FINESS numéros: {e}")
|
||||
|
||||
# Noms d'établissements (pour détection HOPITAL)
|
||||
# Noms d'établissements complets (pour debug/référence)
|
||||
noms_path = data_dir / "etablissements_noms.txt"
|
||||
if noms_path.exists():
|
||||
try:
|
||||
@@ -184,6 +204,21 @@ def _load_finess_gazetteers():
|
||||
except Exception as e:
|
||||
log.warning(f"Erreur chargement FINESS noms: {e}")
|
||||
|
||||
# Noms distinctifs : chargement différé (Aho-Corasick construit au premier appel,
|
||||
# car _MEDICAL_STOP_WORDS_SET n'est pas encore défini à ce stade du module)
|
||||
|
||||
# Villes FINESS
|
||||
villes_path = data_dir / "villes_finess.txt"
|
||||
if villes_path.exists():
|
||||
try:
|
||||
_FINESS_VILLES = {
|
||||
line.strip() for line in villes_path.read_text(encoding="utf-8").splitlines()
|
||||
if line.strip() and len(line.strip()) >= 3
|
||||
}
|
||||
log.info(f"Gazetteer FINESS villes: {len(_FINESS_VILLES)} entrées")
|
||||
except Exception as e:
|
||||
log.warning(f"Erreur chargement FINESS villes: {e}")
|
||||
|
||||
# Téléphones (pour validation)
|
||||
tel_path = data_dir / "telephones.txt"
|
||||
if tel_path.exists():
|
||||
@@ -1220,6 +1255,16 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
|
||||
line = RE_ETABLISSEMENT.sub(_repl_etab, line)
|
||||
line = RE_HOPITAL_VILLE.sub(_repl_etab, line)
|
||||
|
||||
# Établissements par gazetteer Aho-Corasick FINESS (116K noms distinctifs)
|
||||
if _FINESS_AC is not None:
|
||||
old_line = line
|
||||
line = _mask_finess_establishments(line)
|
||||
if line != old_line:
|
||||
# Enregistrer les hits dans l'audit
|
||||
# (on ne peut pas facilement savoir quels noms ont matché,
|
||||
# mais on log le fait qu'un match gazetteer a eu lieu)
|
||||
audit.append(PiiHit(page_idx, "ETAB_FINESS", "gazetteer", PLACEHOLDERS["ETAB"]))
|
||||
|
||||
# Services hospitaliers (service de Cardiologie, unité de soins palliatifs, etc.)
|
||||
def _repl_service(m: re.Match) -> str:
|
||||
full_match = m.group(0)
|
||||
@@ -2097,6 +2142,144 @@ def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "
|
||||
final = "".join(rebuilt_list)
|
||||
return final, hits
|
||||
|
||||
# ----------------- FINESS Aho-Corasick establishment matching -----------------
|
||||
|
||||
def _build_finess_ac():
|
||||
"""Construit l'automate Aho-Corasick FINESS (appelé en lazy au premier besoin)."""
|
||||
global _FINESS_AC
|
||||
if not _AHO_AVAILABLE:
|
||||
return
|
||||
data_dir = Path(__file__).parent / "data" / "finess"
|
||||
dist_path = data_dir / "etablissements_distinctifs.txt"
|
||||
if not dist_path.exists():
|
||||
return
|
||||
|
||||
# Mots génériques qui ne doivent jamais être matchés seuls
|
||||
_ac_generic_blacklist = {
|
||||
# Types d'établissements
|
||||
"clinique", "pharmacie", "hopital", "centre", "foyer",
|
||||
"residence", "maison", "cabinet", "service", "laboratoire",
|
||||
"institut", "association", "fondation", "mutuelle", "polyclinique",
|
||||
"dispensaire", "hospice", "annexe", "antenne", "site",
|
||||
# Mots français courants qui sont aussi des noms d'établissements
|
||||
"collegiale", "collegial", "cathedral", "cathedrale",
|
||||
"providence", "esperance", "renaissance", "liberation",
|
||||
"republique", "fraternite", "solidarite", "independance",
|
||||
"beauregard", "bellevue", "belvedere",
|
||||
"promenade", "esplanade", "corniche", "prefecture",
|
||||
"croissant", "confluence", "bienvenue",
|
||||
"chartreuse", "commanderie", "chapelle", "basilique",
|
||||
"departement", "departementale", "communautaire",
|
||||
}
|
||||
try:
|
||||
ac = _ahocorasick.Automaton()
|
||||
count = 0
|
||||
for line in dist_path.read_text(encoding="utf-8").splitlines():
|
||||
name = line.strip()
|
||||
if not name:
|
||||
continue
|
||||
# Exclure les mots génériques seuls
|
||||
if name in _ac_generic_blacklist:
|
||||
continue
|
||||
words = name.split()
|
||||
# Exclure les 2-mots dont le 1er est générique ET le 2e < 5 chars
|
||||
if len(words) == 2 and words[0] in _ac_generic_blacklist and len(words[1]) < 5:
|
||||
continue
|
||||
# Filtrer : >= 8 chars et >= 2 mots, OU >= 10 chars pour 1 mot
|
||||
# Les noms courts sont gérés par RE_HOPITAL_VILLE
|
||||
if len(words) >= 2 and len(name) >= 8:
|
||||
ac.add_word(name, name)
|
||||
count += 1
|
||||
elif (len(words) == 1 and len(name) >= 10
|
||||
and name not in _ac_generic_blacklist
|
||||
and name not in _MEDICAL_STOP_WORDS_SET
|
||||
and _normalize_for_matching(name) not in _MEDICAL_STOP_WORDS_SET):
|
||||
ac.add_word(name, name)
|
||||
count += 1
|
||||
ac.make_automaton()
|
||||
_FINESS_AC = ac
|
||||
log.info(f"Gazetteer FINESS Aho-Corasick: {count} patterns chargés")
|
||||
except Exception as e:
|
||||
log.warning(f"Erreur construction FINESS Aho-Corasick: {e}")
|
||||
|
||||
|
||||
def _normalize_positional(text: str) -> str:
|
||||
"""Normalise en préservant la longueur : lowercase + accents → base char.
|
||||
|
||||
Chaque caractère accentué est remplacé par sa version sans accent.
|
||||
Les caractères non-alphanumériques restent tels quels (même position).
|
||||
Longueur de sortie == longueur d'entrée.
|
||||
"""
|
||||
import unicodedata
|
||||
out = []
|
||||
for ch in text:
|
||||
# Lowercase
|
||||
ch = ch.lower()
|
||||
# Décomposer et retirer les accents
|
||||
decomposed = unicodedata.normalize("NFD", ch)
|
||||
base = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
|
||||
out.append(base if base else ch)
|
||||
return "".join(out)
|
||||
|
||||
|
||||
def _mask_finess_establishments(text: str) -> str:
|
||||
"""Masque les noms d'établissements FINESS détectés par Aho-Corasick.
|
||||
|
||||
Scanne le texte normalisé (position-preserving: même longueur) et remplace
|
||||
les occurrences trouvées dans le texte original par [ETABLISSEMENT].
|
||||
Seuls les matches sur des frontières de mots sont acceptés.
|
||||
"""
|
||||
global _FINESS_AC
|
||||
if _FINESS_AC is None:
|
||||
_build_finess_ac()
|
||||
if _FINESS_AC is None:
|
||||
return text
|
||||
|
||||
normalized = _normalize_positional(text)
|
||||
placeholder = PLACEHOLDERS["ETAB"]
|
||||
|
||||
# Collecter les matches Aho-Corasick (position fin, nom)
|
||||
matches = []
|
||||
for end_idx, name in _FINESS_AC.iter(normalized):
|
||||
start_idx = end_idx - len(name) + 1
|
||||
# Vérifier frontières de mots (pas au milieu d'un mot)
|
||||
if start_idx > 0 and normalized[start_idx - 1].isalnum():
|
||||
continue
|
||||
if end_idx + 1 < len(normalized) and normalized[end_idx + 1].isalnum():
|
||||
continue
|
||||
# Vérifier que ce n'est pas déjà dans un placeholder
|
||||
ctx_before = text[max(0, start_idx - 1):start_idx]
|
||||
ctx_after = text[end_idx + 1:min(len(text), end_idx + 2)]
|
||||
if "[" in ctx_before or "]" in ctx_after:
|
||||
continue
|
||||
matches.append((start_idx, end_idx + 1, name))
|
||||
|
||||
if not matches:
|
||||
return text
|
||||
|
||||
# Trier par position, dédupliquer (garder le plus long en cas de chevauchement)
|
||||
matches.sort(key=lambda x: (x[0], -(x[1] - x[0])))
|
||||
deduped = []
|
||||
last_end = 0
|
||||
for start, end, name in matches:
|
||||
if start >= last_end:
|
||||
deduped.append((start, end, name))
|
||||
last_end = end
|
||||
|
||||
# Reconstruire le texte avec les remplacements (positions 1:1 avec l'original)
|
||||
result = []
|
||||
last_pos = 0
|
||||
for start, end, name in deduped:
|
||||
if start > len(text) or end > len(text):
|
||||
continue
|
||||
result.append(text[last_pos:start])
|
||||
result.append(placeholder)
|
||||
last_pos = end
|
||||
result.append(text[last_pos:])
|
||||
|
||||
return "".join(result)
|
||||
|
||||
|
||||
# ----------------- Selective safety rescan -----------------
|
||||
|
||||
def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
||||
@@ -2140,9 +2323,12 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
||||
def _rescan_finess(m: re.Match) -> str:
|
||||
return PLACEHOLDERS["FINESS"] if m.group(1) in _FINESS_NUMBERS else m.group(0)
|
||||
protected = RE_BARE_9DIGITS.sub(_rescan_finess, protected)
|
||||
# Établissements
|
||||
# Établissements (regex)
|
||||
protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
|
||||
protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
|
||||
# Établissements (gazetteer Aho-Corasick FINESS — 116K noms distinctifs)
|
||||
if _FINESS_AC is not None:
|
||||
protected = _mask_finess_establishments(protected)
|
||||
# Services hospitaliers
|
||||
protected = RE_SERVICE.sub(PLACEHOLDERS["MASK"], protected)
|
||||
# Lieu de naissance / Ville de résidence (accepte tout : villes, codes INSEE, minuscules)
|
||||
|
||||
Reference in New Issue
Block a user