Two fixes that make better use of the FINESS/INSEE gazetteers and reduce the
dependence on the force_mask_terms YAML.
1. scripts/build_finess_gazetteers.py: only read column 1 (finess_et) of the CSV.
   Column 2 (entjur, legal entity) was ignored, so ~48k legal-entity numbers were
   missed, including 640780417 (the CHCB entjur), which had to be forced via YAML
   because of that gap. Fix: read columns 1 and 2, with deduplication.
   Regeneration: 101,941 → 150,436 numbers (+48,495).
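
   A quick spot check for the first validation point below (a sketch, not part
   of the commit; assumes the script has been re-run so the regenerated file is
   at the path the script writes to):

       # Hypothetical check, run from the repo root after regeneration:
       nums = set(open("data/finess/finess_numbers.txt", encoding="utf-8").read().split())
       assert "640780417" in nums      # the CHCB entjur that previously leaked
       assert len(nums) == 150_436     # 101_941 + 48_495, post-deduplication
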
2. anonymizer_core_refactored_onnx.py:
   - _FINESS_ETAB_NAMES (122k names) was loaded but never consulted after the
     NER-first refactoring (matching goes through the Aho-Corasick automaton over
     etablissements_distinctifs.txt). Removed → 122k fewer entries in RAM.
   - _INSEE_PRENOMS (lowercase) and _INSEE_PRENOMS_SET (uppercase, accents
     stripped) each read the same prenoms_france.txt file from disk. Merged into
     a single pass, with both forms derived in memory (see the sketch after this
     list). 36k fewer line reads.
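
   A minimal sketch of the merged single-pass loader. Names and shapes here are
   assumptions (_load_prenoms and its locals are illustrative); the actual code
   lives in anonymizer_core_refactored_onnx.py:

       import unicodedata

       def _load_prenoms(path):
           # One disk pass over prenoms_france.txt; both lookup forms derived in memory.
           raw = [line.strip() for line in open(path, encoding="utf-8") if line.strip()]

           def strip_accents(s):
               return "".join(c for c in unicodedata.normalize("NFD", s)
                              if unicodedata.category(c) != "Mn")

           lower = {p.lower() for p in raw}                 # fills the _INSEE_PRENOMS role
           upper = {strip_accents(p).upper() for p in raw}  # fills the _INSEE_PRENOMS_SET role
           return lower, upper
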
Validation:
- 640780417 present in _FINESS_NUMBERS after the rebuild
- 122 hits on trackare-18007562 (no regression)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
scripts/build_finess_gazetteers.py (298 lines, 13 KiB, Python):
#!/usr/bin/env python3
"""
Build the FINESS gazetteers from the open data CSV.
===================================================

Extracts and deduplicates:
- Establishment names (short + long), normalized
- Cities
- FINESS numbers
- Phone numbers

Usage:
    python scripts/build_finess_gazetteers.py [--csv PATH]
"""
import argparse
import csv
import re
import unicodedata
from collections import Counter
from pathlib import Path


CSV_PATH = Path(__file__).parent.parent / "data" / "finess" / "finess_etablissements.csv"
OUT_DIR = Path(__file__).parent.parent / "data" / "finess"

# Generic establishment prefixes stripped to extract the distinctive name.
# Each prefix may be followed by French determiners ("DE", "DU", "DE LA", "L'", ...),
# captured once in _DET instead of repeating the group for every alternative.
_DET = r"(?:DE\s+|D['’]\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['’]\s*|LES\s+)?"
_PREFIX_ALTS = (
    r"CENTRE\s+HOSPITALIER\s+(?:UNIVERSITAIRE\s+|REGIONAL\s+|INTERCOMMUNAL\s+|DEPARTEMENTAL\s+)?",
    r"CH\s+",
    r"CLINIQUE\s+",
    r"POLYCLINIQUE\s+",
    r"HOPITAL\s+",
    r"HÔPITAL\s+",
    r"EHPAD\s+",
    r"PHARMACIE\s+",
    r"RESIDENCE\s+",
    r"MAISON\s+DE\s+(?:RETRAITE|SANTÉ|SANTE)\s+",
    r"FOYER\s+",
    r"SSR\s+",
    r"HAD\s+",
    r"SAAD\s+",
    r"SSIAD\s+",
    r"CABINET\s+",
    r"LABORATOIRE\s+",
    r"LBM\s+",
    r"SELARL\s+",
    r"SERVICE\s+",
    r"ASSOCIATION\s+",
    r"INSTITUT\s+",
    r"GCS\s+",
)
GENERIC_PREFIXES = re.compile(
    "^(?:" + "|".join(p + _DET for p in _PREFIX_ALTS) + ")",
    re.IGNORECASE,
)

# Words too generic to be distinctive on their own
GENERIC_WORDS = {
    "de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux",
    "sur", "sous", "par", "pour", "avec", "dans", "rue", "avenue", "boulevard",
    "centre", "service", "site", "antenne", "annexe", "pole", "pôle",
    "nord", "sud", "est", "ouest", "ville", "cedex",
}

def normalize(s: str) -> str:
    """Normalize: lowercase, strip accents, collapse whitespace."""
    s = s.lower().strip()
    # Strip accents: NFD decomposition, then drop combining marks
    s = unicodedata.normalize("NFD", s)
    s = "".join(c for c in s if unicodedata.category(c) != "Mn")
    # Replace special characters and collapse whitespace
    s = re.sub(r"[^a-z0-9\s\-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


def extract_distinctive_name(full_name: str) -> str:
    """Extract the distinctive part of an establishment name.

    E.g. 'CENTRE HOSPITALIER DE BAYONNE' → 'bayonne'
         'PHARMACIE DES GASCONS'         → 'gascons'
         'LES GIRANDIERES'               → 'les girandieres'
    """
    name = full_name.strip()
    # Strip the generic prefix
    distinctive = GENERIC_PREFIXES.sub("", name).strip()
    if not distinctive or len(distinctive) < 4:
        distinctive = name  # Keep the full name if nothing distinctive is left
    return normalize(distinctive)


def main():
    # --csv is advertised in the module docstring; parse it here
    parser = argparse.ArgumentParser(description="Build FINESS gazetteers")
    parser.add_argument("--csv", type=Path, default=CSV_PATH)
    csv_path = parser.parse_args().csv
    if not csv_path.exists():
        print(f"CSV not found: {csv_path}")
        return

    print(f"Reading {csv_path}...")

    # Collect all the data
    finess_numbers = set()
    full_names = set()         # full names (long + short)
    distinctive_names = set()  # distinctive parts
    cities = set()
    phones = set()

    with open(csv_path, encoding="utf-8") as f:
        reader = csv.reader(f, delimiter=";")
        next(reader)  # skip header

        for row in reader:
            if len(row) < 16:
                continue

            # FINESS numbers: col 1 = finess_et (facility), col 2 = entjur (legal
            # entity). Both are real 9-digit identifiers from the FINESS register
            # and must be masked. Previously only finess_et was extracted (~102k);
            # the ~48k entjur numbers were missed, causing leaks (e.g. 640780417,
            # the CHCB entjur).
            for col_idx in (1, 2):
                finess = row[col_idx].strip() if col_idx < len(row) else ""
                if re.match(r"^\d{9}$", finess):
                    finess_numbers.add(finess)

            # Names (col 3 = short, col 4 = long)
            for col_idx in (3, 4):
                name = row[col_idx].strip() if col_idx < len(row) else ""
                if name and len(name) >= 5:
                    # Strip stray leading/trailing characters
                    name = re.sub(r'^[%\.\(\)"]+\s*', "", name)
                    name = re.sub(r'\s*["\)]+$', "", name)
                    if len(name) >= 5:
                        full_names.add(name)
                        dist = extract_distinctive_name(name)
                        if dist and len(dist) >= 4:
                            distinctive_names.add(dist)

            # City (col 15 = "POSTAL_CODE CITY")
            city_field = row[15].strip() if len(row) > 15 else ""
            if city_field and " " in city_field:
                parts = city_field.split(None, 1)
                if len(parts) == 2 and re.match(r"^\d{5}$", parts[0]):
                    ville = parts[1].strip()
                    if ville and len(ville) >= 2:
                        cities.add(ville)

            # Phone numbers (cols 16, 17)
            for col_idx in (16, 17):
                tel = row[col_idx].strip() if col_idx < len(row) else ""
                if re.match(r"^0\d{9}$", tel):
                    phones.add(tel)

print(f" Numéros FINESS: {len(finess_numbers)}")
|
|
print(f" Noms complets: {len(full_names)}")
|
|
print(f" Noms distinctifs: {len(distinctive_names)}")
|
|
print(f" Villes: {len(cities)}")
|
|
print(f" Téléphones: {len(phones)}")
|
|
|
|
    # Filter out distinctive names that are too short or too generic
    filtered_distinctive = set()
    for name in distinctive_names:
        words = name.split()
        # Require at least one non-generic word of 4+ chars
        has_distinctive = any(
            w not in GENERIC_WORDS and len(w) >= 4
            for w in words
        )
        if has_distinctive and len(name) >= 5:
            filtered_distinctive.add(name)

    print(f"  Filtered distinctive names: {len(filtered_distinctive)}")

    # Write the output files
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # 1. FINESS numbers
    out = OUT_DIR / "finess_numbers.txt"
    out.write_text("\n".join(sorted(finess_numbers)) + "\n", encoding="utf-8")
    print(f"\n  → {out.name}: {len(finess_numbers)} entries")

    # 2. Full names (for display/debugging)
    out = OUT_DIR / "etablissements_noms.txt"
    out.write_text("\n".join(sorted(full_names)) + "\n", encoding="utf-8")
    print(f"  → {out.name}: {len(full_names)} entries")

    # 3. Normalized distinctive names (for Aho-Corasick matching)
    out = OUT_DIR / "etablissements_distinctifs.txt"
    out.write_text("\n".join(sorted(filtered_distinctive)) + "\n", encoding="utf-8")
    print(f"  → {out.name}: {len(filtered_distinctive)} entries")

    # 4. Cities
    out = OUT_DIR / "villes_finess.txt"
    out.write_text("\n".join(sorted(cities)) + "\n", encoding="utf-8")
    print(f"  → {out.name}: {len(cities)} entries")

    # 5. Phone numbers
    out = OUT_DIR / "telephones.txt"
    out.write_text("\n".join(sorted(phones)) + "\n", encoding="utf-8")
    print(f"  → {out.name}: {len(phones)} entries")

    # 6. FINESS addresses (type_voie + nom_voie) for Aho-Corasick matching
    # Map FINESS type_voie codes to their expanded forms
    TYPE_VOIE_MAP = {
        "AV": "avenue", "R": "rue", "BD": "boulevard", "RTE": "route",
        "CHE": "chemin", "PL": "place", "IMP": "impasse", "ALL": "allee",
        "SQ": "square", "PASS": "passage", "QU": "quai", "CRS": "cours",
        "SEN": "sentier", "RPT": "rond-point", "LD": "lieu-dit",
        "HAM": "hameau", "LOT": "lotissement", "TSSE": "traverse",
        "CHEM": "chemin", "RES": "residence", "CTRE": "centre",
        "ESP": "esplanade", "PRO": "promenade", "MTE": "montee",
        "VOI": "voie", "CAR": "carrefour", "FBG": "faubourg",
    }

    # Load the INSEE first names to generate abbreviated variants
    prenoms_path = Path(__file__).parent.parent / "data" / "insee" / "prenoms_france.txt"
    prenoms_set = set()
    if prenoms_path.exists():
        for line in prenoms_path.read_text(encoding="utf-8").splitlines():
            p = line.strip().lower()
            if p and len(p) >= 3:
                prenoms_set.add(p)
    print(f"  INSEE first names loaded: {len(prenoms_set)}")

    VOIE_GENERIC = {
        "de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux",
        "a", "sur", "sous", "par", "pour", "dans", "rue", "avenue", "boulevard",
        "route", "chemin", "place", "impasse", "square", "passage", "quai", "cours",
        "grande", "grand", "petit", "petite", "vieux", "vieille", "nouveau", "nouvelle",
        "haut", "haute", "bas", "basse",
    }

    addr_patterns = set()

    def _add_with_abbrev(pattern: str):
        """Add the pattern plus variants with abbreviated first names (initial only)."""
        addr_patterns.add(pattern)
        words = pattern.split()
        for i, w in enumerate(words):
            if w in prenoms_set and len(w) >= 3:
                # Initial-only variant, but only if a distinctive word follows
                remaining = words[i + 1:]
                if not remaining or all(len(r) <= 2 or r in VOIE_GENERIC for r in remaining):
                    continue  # No abbreviation if nothing distinctive follows
                abbrev_words = words[:i] + [w[0]] + words[i + 1:]
                abbrev = " ".join(abbrev_words)
                # At least 12 chars, and the pattern must not start with a lone initial
                if len(abbrev) >= 12 and len(abbrev_words[0]) >= 2:
                    addr_patterns.add(abbrev)

    with open(csv_path, encoding="utf-8") as f:
        reader = csv.reader(f, delimiter=";")
        next(reader)
        for row in reader:
            if len(row) < 10:
                continue
            type_voie_raw = row[8].strip() if len(row) > 8 else ""
            nom_voie = row[9].strip() if len(row) > 9 else ""
            if not nom_voie or len(nom_voie) < 3:
                continue
            nom_norm = normalize(nom_voie)
            words = nom_norm.split()

            # Full pattern: type_voie + nom_voie (e.g. "avenue de l interne jacques loeb")
            type_voie_expanded = TYPE_VOIE_MAP.get(type_voie_raw.upper(), type_voie_raw.lower())
            if type_voie_expanded and nom_norm:
                full = f"{type_voie_expanded} {nom_norm}"
                full_words = full.split()
                has_distinctive = any(
                    w not in VOIE_GENERIC and len(w) >= 4 for w in full_words
                )
                if has_distinctive and len(full) >= 12:
                    _add_with_abbrev(full)

            # nom_voie alone (only if highly distinctive)
            has_distinctive = any(w not in VOIE_GENERIC and len(w) >= 4 for w in words)
            if has_distinctive and len(nom_norm) >= 15:
                _add_with_abbrev(nom_norm)

out = OUT_DIR / "adresses_finess.txt"
|
|
out.write_text("\n".join(sorted(addr_patterns)) + "\n", encoding="utf-8")
|
|
print(f"\n → {out.name}: {len(addr_patterns)} entrées")
|
|
|
|
# Garder aussi voies_distinctives.txt pour compatibilité
|
|
voie_names = {p for p in addr_patterns if len(p) >= 15}
|
|
out = OUT_DIR / "voies_distinctives.txt"
|
|
out.write_text("\n".join(sorted(voie_names)) + "\n", encoding="utf-8")
|
|
print(f" → {out.name}: {len(voie_names)} entrées")
|
|
|
|
    # Length stats
    print("\nDistinctive-name distribution by word count:")
    word_counts = Counter(len(n.split()) for n in filtered_distinctive)
    for k in sorted(word_counts):
        print(f"  {k} words: {word_counts[k]:>6d}")

    # Sample
    print("\nSample distinctive names (first 20):")
    for n in sorted(filtered_distinctive)[:20]:
        print(f"  {n}")


if __name__ == "__main__":
    main()
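
For context, a minimal sketch of the consuming side: how etablissements_distinctifs.txt
can feed an Aho-Corasick automaton. The commit only states that matching goes through
Aho-Corasick; the pyahocorasick-based construction below is an assumption, not the
anonymizer's actual code.

    import ahocorasick  # pyahocorasick

    def build_automaton(path="data/finess/etablissements_distinctifs.txt"):
        A = ahocorasick.Automaton()
        for term in open(path, encoding="utf-8").read().splitlines():
            if term:
                A.add_word(term, term)
        A.make_automaton()
        return A

    # iter() yields (end_index, value); the input text must be normalized the
    # same way as the gazetteer (lowercase, accents stripped, cf. normalize()).
    def find_establishments(automaton, normalized_text):
        for end, term in automaton.iter(normalized_text):
            yield end - len(term) + 1, end, term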