feat(phase2): Détection établissements par Aho-Corasick sur 108K noms FINESS
- Nouveau script build_finess_gazetteers.py : extraction noms distinctifs, villes, numéros depuis CSV open data - Automate Aho-Corasick (pyahocorasick) pour matching multi-pattern en ~1.7ms/page - 108K patterns indexés (noms composés >= 8 chars, mots uniques >= 10 chars) - Blacklist mots génériques (clinique, pharmacie, etc.) et stop words médicaux - Normalisation position-preserving (sans accents, même longueur) - Construction lazy de l'AC (après chargement des stop words) - Intégration dans _mask_line_by_regex et selective_rescan - Nouveau gazetteer villes_finess.txt (11,660 villes) - Résultats : "Girandières" → masqué, "Côte Basque" → masqué, 0 FP sur termes médicaux courants Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
206
scripts/build_finess_gazetteers.py
Normal file
206
scripts/build_finess_gazetteers.py
Normal file
@@ -0,0 +1,206 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Construit les gazetteers FINESS depuis le CSV open data.
|
||||
==========================================================
|
||||
Extrait et déduplique :
|
||||
- Noms d'établissements (courts + longs) normalisés
|
||||
- Villes
|
||||
- Numéros FINESS
|
||||
- Téléphones
|
||||
|
||||
Usage:
|
||||
python scripts/build_finess_gazetteers.py [--csv PATH]
|
||||
"""
|
||||
import csv
|
||||
import re
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
|
||||
|
||||
# Input CSV (FINESS open data) and output directory for the generated gazetteers.
CSV_PATH = Path(__file__).parent.parent / "data" / "finess" / "finess_etablissements.csv"
OUT_DIR = Path(__file__).parent.parent / "data" / "finess"

# Generic facility-type prefixes removed to isolate the distinctive part of a
# name (e.g. "CENTRE HOSPITALIER DE BAYONNE" -> "BAYONNE").  Each alternative
# also consumes an optional trailing preposition ("DE", "DU", "DES", "D'") and
# article ("LA", "LE", "LES", "L'").  Anchored at the start of the string;
# case-insensitive.
GENERIC_PREFIXES = re.compile(
    r"^(?:CENTRE\s+HOSPITALIER\s+(?:UNIVERSITAIRE\s+|REGIONAL\s+|INTERCOMMUNAL\s+|DEPARTEMENTAL\s+)?(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|CH\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|CLINIQUE\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|POLYCLINIQUE\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|HOPITAL\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|HÔPITAL\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|EHPAD\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|PHARMACIE\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|RESIDENCE\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|MAISON\s+DE\s+(?:RETRAITE|SANTÉ|SANTE)\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|FOYER\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|SSR\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|HAD\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|SAAD\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|SSIAD\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|CABINET\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|LABORATOIRE\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|LBM\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|SELARL\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|SERVICE\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|ASSOCIATION\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|INSTITUT\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r"|GCS\s+(?:DE\s+|D['']\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['']\s*|LES\s+)?"
    r")",
    re.IGNORECASE,
)

# Words too generic to be distinctive on their own (articles, prepositions,
# street/compass/facility vocabulary).  Used by the filtering pass in main().
GENERIC_WORDS = {
    "de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux",
    "sur", "sous", "par", "pour", "avec", "dans", "rue", "avenue", "boulevard",
    "centre", "service", "site", "antenne", "annexe", "pole", "pôle",
    "nord", "sud", "est", "ouest", "ville", "cedex",
}
|
||||
|
||||
|
||||
def normalize(s: str) -> str:
    """Normalize a string: lowercase, strip accents, collapse whitespace.

    Accents are removed via NFD decomposition (combining marks dropped),
    any character outside [a-z0-9], whitespace and '-' becomes a space,
    and whitespace runs are collapsed to single spaces.
    """
    decomposed = unicodedata.normalize("NFD", s.lower().strip())
    # Drop combining marks (category "Mn") left over from the decomposition.
    no_accents = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
    # Replace anything that is not a lowercase letter, digit, space or hyphen.
    cleaned = re.sub(r"[^a-z0-9\s\-]", " ", no_accents)
    # split()/join collapses runs of whitespace and trims the ends.
    return " ".join(cleaned.split())
|
||||
|
||||
|
||||
def extract_distinctive_name(full_name: str) -> str:
    """Extract the distinctive part of a facility name, normalized.

    Examples:
        'CENTRE HOSPITALIER DE BAYONNE' -> 'bayonne'
        'PHARMACIE DES GASCONS'         -> 'gascons'
        'LES GIRANDIERES'               -> 'les girandieres'
    """
    cleaned = full_name.strip()
    stripped = GENERIC_PREFIXES.sub("", cleaned).strip()
    # Fall back to the whole name when stripping leaves nothing usable
    # (fewer than 4 characters, which also covers the empty string).
    if len(stripped) < 4:
        stripped = cleaned
    return normalize(stripped)
|
||||
|
||||
|
||||
def _collect_rows(csv_path):
    """Read the FINESS CSV (';'-separated) and collect deduplicated raw data.

    Returns a tuple ``(finess_numbers, full_names, distinctive_names,
    cities, phones)`` of sets of strings.
    """
    finess_numbers = set()
    full_names = set()  # complete names (short + long)
    distinctive_names = set()  # distinctive parts
    cities = set()
    phones = set()

    with open(csv_path, encoding="utf-8") as f:
        reader = csv.reader(f, delimiter=";")
        # Skip the header row; the None default guards against an empty file
        # (bare next() would raise StopIteration).
        next(reader, None)

        for row in reader:
            if len(row) < 16:
                continue

            # FINESS number (col 1): exactly 9 digits.
            finess = row[1].strip()
            if re.match(r"^\d{9}$", finess):
                finess_numbers.add(finess)

            # Names (col 3 = short, col 4 = long).
            for col_idx in (3, 4):
                name = row[col_idx].strip() if col_idx < len(row) else ""
                if name and len(name) >= 5:
                    # Strip stray leading/trailing punctuation.
                    name = re.sub(r'^[%\.\(\)"]+\s*', "", name)
                    name = re.sub(r'\s*["\)]+$', "", name)
                    if len(name) >= 5:
                        full_names.add(name)
                        dist = extract_distinctive_name(name)
                        if dist and len(dist) >= 4:
                            distinctive_names.add(dist)

            # City (col 15 = "POSTAL_CODE CITY").
            city_field = row[15].strip() if len(row) > 15 else ""
            if city_field and " " in city_field:
                parts = city_field.split(None, 1)
                if len(parts) == 2 and re.match(r"^\d{5}$", parts[0]):
                    ville = parts[1].strip()
                    if ville and len(ville) >= 2:
                        cities.add(ville)

            # Phone numbers (cols 16, 17): 10 digits starting with 0.
            for col_idx in (16, 17):
                tel = row[col_idx].strip() if col_idx < len(row) else ""
                if re.match(r"^0\d{9}$", tel):
                    phones.add(tel)

    return finess_numbers, full_names, distinctive_names, cities, phones


def _filter_distinctive(distinctive_names):
    """Drop distinctive names that are too short or entirely generic.

    Keeps names of 5+ characters containing at least one word of 4+
    characters that is not in GENERIC_WORDS.
    """
    return {
        name
        for name in distinctive_names
        if len(name) >= 5
        and any(w not in GENERIC_WORDS and len(w) >= 4 for w in name.split())
    }


def _write_gazetteer(filename, entries, prefix=" "):
    """Write one sorted, newline-terminated gazetteer file under OUT_DIR
    and print its entry count."""
    out = OUT_DIR / filename
    out.write_text("\n".join(sorted(entries)) + "\n", encoding="utf-8")
    print(f"{prefix}→ {out.name}: {len(entries)} entrées")


def _print_stats(filtered_distinctive):
    """Print the word-count distribution and a 20-entry sample."""
    print("\nDistribution noms distinctifs par longueur (mots):")
    word_counts = Counter(len(n.split()) for n in filtered_distinctive)
    for k in sorted(word_counts):
        print(f" {k} mots: {word_counts[k]:>6d}")

    print("\nÉchantillon noms distinctifs (20 premiers):")
    for n in sorted(filtered_distinctive)[:20]:
        print(f" {n}")


def main():
    """Build all FINESS gazetteer files from the open-data CSV.

    Reads CSV_PATH, extracts FINESS numbers, full/distinctive facility
    names, cities and phone numbers, then writes one text gazetteer per
    category under OUT_DIR and prints summary statistics.
    """
    csv_path = CSV_PATH
    if not csv_path.exists():
        print(f"CSV non trouvé: {csv_path}")
        return

    print(f"Lecture {csv_path}...")
    finess_numbers, full_names, distinctive_names, cities, phones = _collect_rows(csv_path)

    print(f" Numéros FINESS: {len(finess_numbers)}")
    print(f" Noms complets: {len(full_names)}")
    print(f" Noms distinctifs: {len(distinctive_names)}")
    print(f" Villes: {len(cities)}")
    print(f" Téléphones: {len(phones)}")

    filtered_distinctive = _filter_distinctive(distinctive_names)
    print(f" Noms distinctifs filtrés: {len(filtered_distinctive)}")

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    _write_gazetteer("finess_numbers.txt", finess_numbers, prefix="\n ")
    _write_gazetteer("etablissements_noms.txt", full_names)
    _write_gazetteer("etablissements_distinctifs.txt", filtered_distinctive)
    _write_gazetteer("villes_finess.txt", cities)
    _write_gazetteer("telephones.txt", phones)

    _print_stats(filtered_distinctive)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user