#!/usr/bin/env python3
"""Build the FINESS gazetteers from the open-data CSV.

Extracts and deduplicates:
- establishment names (short + long), raw and normalized
- cities
- FINESS numbers
- phone numbers
- street-address patterns

Usage:
    python scripts/build_finess_gazetteers.py [--csv PATH] [--out-dir DIR] [--prenoms PATH]
"""
import argparse
import csv
import re
import sys
import unicodedata
from collections import Counter
from pathlib import Path
from typing import Iterable, Iterator, List, Optional, Set, Tuple

CSV_PATH = Path(__file__).parent.parent / "data" / "finess" / "finess_etablissements.csv"
OUT_DIR = Path(__file__).parent.parent / "data" / "finess"
PRENOMS_PATH = Path(__file__).parent.parent / "data" / "insee" / "prenoms_france.txt"

# Optional preposition + optional article that may follow a generic prefix
# ("DE LA ", "D'", "DES ", "L'", ...). Both the ASCII and the typographic
# apostrophe are accepted.
_ARTICLES = r"(?:DE\s+|D['’]\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['’]\s*|LES\s+)?"

# Generic establishment prefixes, in matching order (regex alternation is
# ordered, so the order is significant).
_PREFIX_HEADS = (
    r"CENTRE\s+HOSPITALIER\s+(?:UNIVERSITAIRE\s+|REGIONAL\s+|INTERCOMMUNAL\s+|DEPARTEMENTAL\s+)?",
    r"CH\s+",
    r"CLINIQUE\s+",
    r"POLYCLINIQUE\s+",
    r"HOPITAL\s+",
    r"HÔPITAL\s+",
    r"EHPAD\s+",
    r"PHARMACIE\s+",
    r"RESIDENCE\s+",
    r"MAISON\s+DE\s+(?:RETRAITE|SANTÉ|SANTE)\s+",
    r"FOYER\s+",
    r"SSR\s+",
    r"HAD\s+",
    r"SAAD\s+",
    r"SSIAD\s+",
    r"CABINET\s+",
    r"LABORATOIRE\s+",
    r"LBM\s+",
    r"SELARL\s+",
    r"SERVICE\s+",
    r"ASSOCIATION\s+",
    r"INSTITUT\s+",
    r"GCS\s+",
)

# Generic establishment prefixes to strip in order to extract the distinctive name.
GENERIC_PREFIXES = re.compile(
    "^(?:" + "|".join(head + _ARTICLES for head in _PREFIX_HEADS) + ")",
    re.IGNORECASE,
)

# Words too generic to be distinctive on their own.
GENERIC_WORDS = {
    "de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux",
    "sur", "sous", "par", "pour", "avec", "dans", "rue", "avenue",
    "boulevard", "centre", "service", "site", "antenne", "annexe",
    "pole", "pôle", "nord", "sud", "est", "ouest", "ville", "cedex",
}

# FINESS street-type codes mapped to their expanded (accent-free) form.
TYPE_VOIE_MAP = {
    "AV": "avenue", "R": "rue", "BD": "boulevard", "RTE": "route",
    "CHE": "chemin", "PL": "place", "IMP": "impasse", "ALL": "allee",
    "SQ": "square", "PASS": "passage", "QU": "quai", "CRS": "cours",
    "SEN": "sentier", "RPT": "rond-point", "LD": "lieu-dit",
    "HAM": "hameau", "LOT": "lotissement", "TSSE": "traverse",
    "CHEM": "chemin", "RES": "residence", "CTRE": "centre",
    "ESP": "esplanade", "PRO": "promenade", "MTE": "montee",
    "VOI": "voie", "CAR": "carrefour", "FBG": "faubourg",
}

# Words that do not make a street name distinctive.
VOIE_GENERIC = {
    "de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux", "a",
    "sur", "sous", "par", "pour", "dans", "rue", "avenue", "boulevard",
    "route", "chemin", "place", "impasse", "square", "passage", "quai",
    "cours", "grande", "grand", "petit", "petite", "vieux", "vieille",
    "nouveau", "nouvelle", "haut", "haute", "bas", "basse",
}

# FINESS identifiers are 9 characters long. They are all digits except for
# Corsican entries, whose leading department code is "2A" or "2B".
_FINESS_RE = re.compile(r"(?:\d{2}|2[AB])\d{7}")
_POSTCODE_RE = re.compile(r"\d{5}")
_PHONE_RE = re.compile(r"0\d{9}")
# Stray characters found at the start / end of some establishment names.
_LEADING_JUNK = re.compile(r'^[%\.\(\)"]+\s*')
_TRAILING_JUNK = re.compile(r'\s*["\)]+$')


def normalize(s: str) -> str:
    """Return *s* lower-cased, accent-free and with whitespace collapsed.

    Any character other than ``a-z``, ``0-9``, whitespace and ``-`` is
    replaced by a space before runs of whitespace are collapsed.
    """
    s = s.lower().strip()
    # Decompose accented characters, then drop the combining marks.
    s = unicodedata.normalize("NFD", s)
    s = "".join(c for c in s if unicodedata.category(c) != "Mn")
    # Replace special characters by spaces and collapse whitespace.
    s = re.sub(r"[^a-z0-9\s\-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


def extract_distinctive_name(full_name: str) -> str:
    """Return the normalized distinctive part of an establishment name.

    The leading generic prefix (establishment type plus optional
    preposition/article) is removed. When fewer than 4 characters remain,
    the full name is kept instead.

    Examples:
        'CENTRE HOSPITALIER DE BAYONNE' -> 'bayonne'
        'PHARMACIE DES GASCONS'         -> 'gascons'
        'LES GIRANDIERES'               -> 'les girandieres'
    """
    name = full_name.strip()
    distinctive = GENERIC_PREFIXES.sub("", name, count=1).strip()
    if len(distinctive) < 4:
        distinctive = name  # nothing distinctive left: keep the full name
    return normalize(distinctive)


def _read_rows(csv_path: Path) -> Iterator[List[str]]:
    """Yield the data rows of the ';'-separated CSV, skipping the header row."""
    # newline="" is what the csv module expects from the underlying file.
    with open(csv_path, encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter=";")
        next(reader, None)  # skip header; an empty file simply yields nothing
        yield from reader


def _has_distinctive_word(words: Iterable[str], generic: Set[str]) -> bool:
    """Return True if some word has 4+ characters and is not in *generic*."""
    return any(w not in generic and len(w) >= 4 for w in words)


def _collect_entities(
    csv_path: Path,
) -> Tuple[Set[str], Set[str], Set[str], Set[str], Set[str]]:
    """Scan the CSV once and return the deduplicated entity sets.

    Returns:
        ``(finess_numbers, full_names, distinctive_names, cities, phones)``.
    """
    finess_numbers: Set[str] = set()
    full_names: Set[str] = set()  # raw names (long + short)
    distinctive_names: Set[str] = set()  # normalized distinctive parts
    cities: Set[str] = set()
    phones: Set[str] = set()

    for row in _read_rows(csv_path):
        if len(row) < 16:
            continue

        # FINESS numbers: col 1 = finess_et (site), col 2 = entjur (legal
        # entity). Both are real identifiers and must both be collected,
        # otherwise the legal-entity numbers leak through the masking.
        for col_idx in (1, 2):
            finess = row[col_idx].strip()
            if _FINESS_RE.fullmatch(finess):
                finess_numbers.add(finess)

        # Names (col 3 = short, col 4 = long).
        for col_idx in (3, 4):
            name = row[col_idx].strip()
            if len(name) < 5:
                continue
            # Remove stray leading / trailing characters.
            name = _LEADING_JUNK.sub("", name)
            name = _TRAILING_JUNK.sub("", name)
            if len(name) < 5:
                continue
            full_names.add(name)
            dist = extract_distinctive_name(name)
            if len(dist) >= 4:
                distinctive_names.add(dist)

        # City (col 15 = "POSTCODE CITY").
        parts = row[15].strip().split(None, 1)
        if len(parts) == 2 and _POSTCODE_RE.fullmatch(parts[0]):
            ville = parts[1].strip()
            if len(ville) >= 2:
                cities.add(ville)

        # Phone numbers (col 16, 17) - these columns may be absent.
        for col_idx in (16, 17):
            tel = row[col_idx].strip() if col_idx < len(row) else ""
            if _PHONE_RE.fullmatch(tel):
                phones.add(tel)

    return finess_numbers, full_names, distinctive_names, cities, phones


def _filter_distinctive(names: Iterable[str]) -> Set[str]:
    """Keep names of 5+ chars that contain a non-generic word of 4+ chars."""
    return {
        name
        for name in names
        if len(name) >= 5 and _has_distinctive_word(name.split(), GENERIC_WORDS)
    }


def _load_prenoms(path: Path) -> Set[str]:
    """Load first names (one per line), normalized like the address words.

    Address patterns are built from ``normalize()``-d text, so the first
    names must go through the same normalization; otherwise accented names
    could never match.
    """
    prenoms = set()
    for line in path.read_text(encoding="utf-8").splitlines():
        p = normalize(line)
        if len(p) >= 3:
            prenoms.add(p)
    return prenoms


def _abbreviated_variants(pattern: str, prenoms: Set[str]) -> Iterator[str]:
    """Yield variants of *pattern* with one first name reduced to its initial."""
    words = pattern.split()
    for i, word in enumerate(words):
        if word not in prenoms or len(word) < 3:
            continue
        following = words[i + 1:]
        # Only abbreviate when something distinctive follows the first name
        # (all() is True for an empty list, which covers "nothing follows").
        if all(len(w) <= 2 or w in VOIE_GENERIC for w in following):
            continue
        abbrev_words = words[:i] + [word[0]] + following
        abbrev = " ".join(abbrev_words)
        # At least 12 chars, and the pattern must not start with a lone initial.
        if len(abbrev) >= 12 and len(abbrev_words[0]) >= 2:
            yield abbrev


def _collect_address_patterns(csv_path: Path, prenoms: Set[str]) -> Set[str]:
    """Build address patterns from col 8 (street type) and col 9 (street name)."""
    patterns: Set[str] = set()

    def add(pattern: str) -> None:
        patterns.add(pattern)
        patterns.update(_abbreviated_variants(pattern, prenoms))

    for row in _read_rows(csv_path):
        if len(row) < 10:
            continue
        type_voie_raw = row[8].strip()
        nom_voie = row[9].strip()
        if len(nom_voie) < 3:
            continue
        nom_norm = normalize(nom_voie)
        if not nom_norm:
            continue

        # Full pattern: street type + street name
        # (e.g. "avenue de l interne jacques loeb"). Unknown type codes are
        # kept as-is, lower-cased.
        type_voie = TYPE_VOIE_MAP.get(type_voie_raw.upper(), type_voie_raw.lower())
        if type_voie:
            full = f"{type_voie} {nom_norm}"
            # NOTE(review): the expanded street type takes part in this test,
            # so a type absent from VOIE_GENERIC (e.g. "allee") makes the
            # pattern "distinctive" by itself - confirm this is intended.
            if len(full) >= 12 and _has_distinctive_word(full.split(), VOIE_GENERIC):
                add(full)

        # Street name alone, only when long and distinctive enough.
        if len(nom_norm) >= 15 and _has_distinctive_word(nom_norm.split(), VOIE_GENERIC):
            add(nom_norm)

    return patterns


def _write_lines(path: Path, items: Iterable[str]) -> None:
    """Write *items* sorted, one per line, UTF-8, with a trailing newline."""
    path.write_text("\n".join(sorted(items)) + "\n", encoding="utf-8")


def main(argv: Optional[List[str]] = None) -> None:
    """Build every gazetteer file from the FINESS CSV.

    Args:
        argv: Command-line arguments (without the program name). ``None``
            means "no arguments", i.e. the module-level default paths are
            used, so calling ``main()`` programmatically never reads
            ``sys.argv``.

    Prints progress and statistics to stdout. If the CSV does not exist, a
    message is printed and nothing is written.
    """
    parser = argparse.ArgumentParser(
        description="Construit les gazetteers FINESS depuis le CSV open data."
    )
    parser.add_argument("--csv", type=Path, default=CSV_PATH,
                        help="chemin du CSV FINESS (défaut: %(default)s)")
    parser.add_argument("--out-dir", type=Path, default=OUT_DIR,
                        help="répertoire de sortie (défaut: %(default)s)")
    parser.add_argument("--prenoms", type=Path, default=PRENOMS_PATH,
                        help="fichier des prénoms INSEE (défaut: %(default)s)")
    args = parser.parse_args([] if argv is None else argv)

    csv_path = args.csv
    out_dir = args.out_dir

    if not csv_path.exists():
        print(f"CSV non trouvé: {csv_path}")
        return

    print(f"Lecture {csv_path}...")
    finess_numbers, full_names, distinctive_names, cities, phones = _collect_entities(csv_path)

    print(f" Numéros FINESS: {len(finess_numbers)}")
    print(f" Noms complets: {len(full_names)}")
    print(f" Noms distinctifs: {len(distinctive_names)}")
    print(f" Villes: {len(cities)}")
    print(f" Téléphones: {len(phones)}")

    # Drop distinctive names that are too short or too generic.
    filtered_distinctive = _filter_distinctive(distinctive_names)
    print(f" Noms distinctifs filtrés: {len(filtered_distinctive)}")

    out_dir.mkdir(parents=True, exist_ok=True)

    outputs = (
        ("finess_numbers.txt", finess_numbers),
        # Raw names, for display / debugging.
        ("etablissements_noms.txt", full_names),
        # Normalized distinctive names, for Aho-Corasick matching.
        ("etablissements_distinctifs.txt", filtered_distinctive),
        ("villes_finess.txt", cities),
        ("telephones.txt", phones),
    )
    for idx, (filename, items) in enumerate(outputs):
        _write_lines(out_dir / filename, items)
        lead = "\n" if idx == 0 else ""
        print(f"{lead} → {filename}: {len(items)} entrées")

    # First names are used to generate variants with an abbreviated first name.
    prenoms_set: Set[str] = set()
    if args.prenoms.exists():
        prenoms_set = _load_prenoms(args.prenoms)
        print(f" Prénoms INSEE chargés: {len(prenoms_set)}")

    # Address patterns (street type + street name) for Aho-Corasick matching.
    addr_patterns = _collect_address_patterns(csv_path, prenoms_set)
    out = out_dir / "adresses_finess.txt"
    _write_lines(out, addr_patterns)
    print(f"\n → {out.name}: {len(addr_patterns)} entrées")

    # voies_distinctives.txt is still produced for backward compatibility.
    voie_names = {p for p in addr_patterns if len(p) >= 15}
    out = out_dir / "voies_distinctives.txt"
    _write_lines(out, voie_names)
    print(f" → {out.name}: {len(voie_names)} entrées")

    # Distribution of distinctive names by word count.
    print("\nDistribution noms distinctifs par longueur (mots):")
    word_counts = Counter(len(n.split()) for n in filtered_distinctive)
    for k in sorted(word_counts):
        print(f" {k} mots: {word_counts[k]:>6d}")

    # Sample.
    print("\nÉchantillon noms distinctifs (20 premiers):")
    for n in sorted(filtered_distinctive)[:20]:
        print(f" {n}")


if __name__ == "__main__":
    main(sys.argv[1:])