Couvre les corrections PII batch A/A-2, le NIR multi-ligne en flux reel, le gazetteer FINESS Corse derive depuis la base locale, et les tests de regression associes. Aucun build ni diffusion.
299 lines
13 KiB
Python
299 lines
13 KiB
Python
#!/usr/bin/env python3
"""
Build the FINESS gazetteers from the open-data CSV.
===================================================

Extracts and deduplicates:
  - Establishment names (short + long), normalized
  - Cities
  - FINESS numbers
  - Phone numbers

Usage:
    python scripts/build_finess_gazetteers.py [--csv PATH]
"""
|
|
import csv
|
|
import re
|
|
import unicodedata
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
|
|
|
|
# The generated gazetteers are written next to the source CSV, in the
# data/finess directory located two levels above this file.
OUT_DIR = Path(__file__).parent.parent / "data" / "finess"
CSV_PATH = OUT_DIR / "finess_etablissements.csv"

# A FINESS identifier is either nine digits, or "2A"/"2B" followed by seven
# digits (alphanumeric prefix used by the Corsican departments).
RE_FINESS_IDENTIFIER = re.compile(r"^(?:\d{9}|2[AB]\d{7})$", re.IGNORECASE)
|
|
|
|
# Generic establishment-type prefixes, removed to isolate the distinctive part
# of a name.  Each prefix may be followed by an optional linking preposition
# (DE, D', DU, DES) and an optional article (LA, LE, L', LES).
#
# Both the straight (') and the typographic (’) apostrophe are accepted, and
# the accented spelling of a keyword is accepted wherever the plain one is.
_PREFIX_LINK = (
    r"(?:DE\s+|D['’]\s*|DU\s+|DES\s+)?"
    r"(?:LA\s+|LE\s+|L['’]\s*|LES\s+)?"
)

# Every head ends with the mandatory whitespace that separates it from the
# rest of the name, so "CH" never matches the start of e.g. "CHATEAU".
_PREFIX_HEADS = (
    r"CENTRE\s+HOSPITALIER\s+"
    r"(?:UNIVERSITAIRE\s+|R[EÉ]GIONAL\s+|INTERCOMMUNAL\s+|D[EÉ]PARTEMENTAL\s+)?",
    r"CH\s+",
    r"CLINIQUE\s+",
    r"POLYCLINIQUE\s+",
    r"H[OÔ]PITAL\s+",
    r"EHPAD\s+",
    r"PHARMACIE\s+",
    r"R[EÉ]SIDENCE\s+",
    r"MAISON\s+DE\s+(?:RETRAITE|SANT[EÉ])\s+",
    r"FOYER\s+",
    r"SSR\s+",
    r"HAD\s+",
    r"SAAD\s+",
    r"SSIAD\s+",
    r"CABINET\s+",
    r"LABORATOIRE\s+",
    r"LBM\s+",
    r"SELARL\s+",
    r"SERVICE\s+",
    r"ASSOCIATION\s+",
    r"INSTITUT\s+",
    r"GCS\s+",
)

GENERIC_PREFIXES = re.compile(
    "^(?:" + "|".join(head + _PREFIX_LINK for head in _PREFIX_HEADS) + ")",
    re.IGNORECASE,
)
|
|
|
|
# Words that are too generic to make a name distinctive on their own:
# articles and prepositions, street types, organisational terms, compass
# points and postal filler.
GENERIC_WORDS = set(
    "de du des la le les l et en au aux "
    "sur sous par pour avec dans rue avenue boulevard "
    "centre service site antenne annexe pole pôle "
    "nord sud est ouest ville cedex".split()
)
|
|
|
|
|
|
# Ligatures have no canonical (NFD) decomposition, so accent stripping alone
# would leave them to be blanked out by the ASCII filter ("coeur" written with
# the oe ligature would become "c ur").  Spell them out explicitly instead.
_LIGATURE_TABLE = str.maketrans({"œ": "oe", "æ": "ae"})


def normalize(s: str) -> str:
    """Return *s* lowercased, accent-free and with whitespace collapsed.

    Steps, in order:
      1. lowercase and trim;
      2. expand the ligatures ``œ`` and ``æ`` to ``oe`` and ``ae``;
      3. drop combining marks after NFD decomposition (removes accents);
      4. replace every character other than ``a-z``, ``0-9``, whitespace and
         ``-`` with a space;
      5. collapse whitespace runs to a single space and trim again.

    Args:
        s: Text to normalize.

    Returns:
        The normalized text; an empty string when nothing usable remains.
    """
    s = s.lower().strip().translate(_LIGATURE_TABLE)
    # Strip accents: decompose, then drop the combining marks (category Mn).
    s = unicodedata.normalize("NFD", s)
    s = "".join(c for c in s if unicodedata.category(c) != "Mn")
    # Punctuation and any remaining non-ASCII character become separators.
    s = re.sub(r"[^a-z0-9\s\-]", " ", s)
    return re.sub(r"\s+", " ", s).strip()
|
|
|
|
|
|
def extract_distinctive_name(full_name: str) -> str:
    """Return the normalized distinctive part of an establishment name.

    The leading generic prefix matched by ``GENERIC_PREFIXES`` is removed.
    When fewer than four characters remain after that, the whole name is kept
    instead.  The retained text is then passed through ``normalize``.

    Examples:
        'CENTRE HOSPITALIER DE CHICAGO' -> 'chicago'
        'PHARMACIE DES GASCONS'         -> 'gascons'
        'LES GIRANDIERES'               -> 'les girandieres'
    """
    trimmed = full_name.strip()
    remainder = GENERIC_PREFIXES.sub("", trimmed).strip()
    # An empty remainder has length 0, so the length test alone covers the
    # "nothing left" case as well as the "too short" one.
    candidate = remainder if len(remainder) >= 4 else trimmed
    return normalize(candidate)
|
|
|
|
|
|
# Patterns applied to every CSV row, compiled once instead of per row.
_RE_NAME_LEADING_JUNK = re.compile(r'^[%\.\(\)"]+\s*')
_RE_NAME_TRAILING_JUNK = re.compile(r'\s*["\)]+$')
_RE_POSTAL_CODE = re.compile(r"^\d{5}$")
_RE_PHONE = re.compile(r"^0\d{9}$")

# FINESS street-type codes mapped to their expanded (accent-free) form.
_TYPE_VOIE_MAP = {
    "AV": "avenue", "R": "rue", "BD": "boulevard", "RTE": "route",
    "CHE": "chemin", "PL": "place", "IMP": "impasse", "ALL": "allee",
    "SQ": "square", "PASS": "passage", "QU": "quai", "CRS": "cours",
    "SEN": "sentier", "RPT": "rond-point", "LD": "lieu-dit",
    "HAM": "hameau", "LOT": "lotissement", "TSSE": "traverse",
    "CHEM": "chemin", "RES": "residence", "CTRE": "centre",
    "ESP": "esplanade", "PRO": "promenade", "MTE": "montee",
    "VOI": "voie", "CAR": "carrefour", "FBG": "faubourg",
}

# Words that do not make a street name distinctive on their own.
_VOIE_GENERIC = {
    "de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux",
    "a", "sur", "sous", "par", "pour", "dans", "rue", "avenue", "boulevard",
    "route", "chemin", "place", "impasse", "square", "passage", "quai", "cours",
    "grande", "grand", "petit", "petite", "vieux", "vieille", "nouveau", "nouvelle",
    "haut", "haute", "bas", "basse",
}


def _parse_csv_path():
    """Return the CSV path given with ``--csv``, or ``CSV_PATH`` by default."""
    import argparse  # local import: only needed when the script entry point runs

    parser = argparse.ArgumentParser(
        description="Build the FINESS gazetteers from the open-data CSV.",
        allow_abbrev=False,
    )
    parser.add_argument(
        "--csv",
        type=Path,
        default=CSV_PATH,
        metavar="PATH",
        help="FINESS establishments CSV (default: %(default)s)",
    )
    # Unknown arguments are ignored so that calling main() from another
    # program (whose own argv is unrelated) keeps working as before.
    args, _unknown = parser.parse_known_args()
    return args.csv


def _cell(row, index):
    """Return the stripped cell at *index*, or "" when the row is too short."""
    return row[index].strip() if index < len(row) else ""


def _has_distinctive_word(words, generic_words):
    """Tell whether any word is at least 4 characters long and not generic."""
    return any(w not in generic_words and len(w) >= 4 for w in words)


def _write_gazetteer(filename, entries):
    """Write *entries* sorted, one per line, to OUT_DIR/filename; return the path."""
    out = OUT_DIR / filename
    out.write_text("\n".join(sorted(entries)) + "\n", encoding="utf-8")
    return out


def _collect_entities(csv_path):
    """Scan the CSV once and return (finess_numbers, full_names,
    distinctive_names, cities, phones) as sets."""
    finess_numbers = set()
    full_names = set()          # complete names (long + short)
    distinctive_names = set()   # distinctive parts, normalized
    cities = set()
    phones = set()

    # newline="" is what the csv module expects for files it reads.
    with open(csv_path, encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter=";")
        next(reader, None)  # skip header; tolerate a completely empty file

        for row in reader:
            if len(row) < 16:
                continue

            # FINESS numbers: col 1 = finess_et (structure), col 2 = entjur
            # (legal entity).  Both are real identifiers of the FINESS
            # registry and must be masked.  Corsican departments use the
            # alphanumeric prefix 2A/2B instead of two digits.
            for col_idx in (1, 2):
                finess = _cell(row, col_idx).upper()
                if RE_FINESS_IDENTIFIER.match(finess):
                    finess_numbers.add(finess)

            # Names (col 3 = short, col 4 = long).
            for col_idx in (3, 4):
                name = _cell(row, col_idx)
                if len(name) < 5:
                    continue
                # Remove stray leading/trailing punctuation.
                name = _RE_NAME_LEADING_JUNK.sub("", name)
                name = _RE_NAME_TRAILING_JUNK.sub("", name)
                if len(name) < 5:
                    continue
                full_names.add(name)
                dist = extract_distinctive_name(name)
                if len(dist) >= 4:
                    distinctive_names.add(dist)

            # City (col 15 = "POSTAL_CODE CITY").
            city_field = _cell(row, 15)
            if " " in city_field:
                parts = city_field.split(None, 1)
                if len(parts) == 2 and _RE_POSTAL_CODE.match(parts[0]):
                    ville = parts[1].strip()
                    if len(ville) >= 2:
                        cities.add(ville)

            # Phone numbers (col 16, 17): ten digits starting with 0.
            for col_idx in (16, 17):
                tel = _cell(row, col_idx)
                if _RE_PHONE.match(tel):
                    phones.add(tel)

    return finess_numbers, full_names, distinctive_names, cities, phones


def _load_first_names(path):
    """Load first names (one per line) in the same normalized form as street
    names, keeping those of at least 3 characters."""
    first_names = set()
    for line in path.read_text(encoding="utf-8").splitlines():
        # Street words are compared after normalize(), which strips accents;
        # the first names must go through the same transformation, otherwise
        # accented names (e.g. "andré") could never match.
        first_name = normalize(line)
        if len(first_name) >= 3:
            first_names.add(first_name)
    return first_names


def _abbreviation_variants(pattern, first_names):
    """Yield variants of *pattern* where one first name is cut to its initial."""
    words = pattern.split()
    for i, word in enumerate(words):
        if word not in first_names:
            continue
        remaining = words[i + 1:]
        # Only abbreviate when something distinctive follows the first name
        # (all() is also true for an empty remainder).
        if all(len(r) <= 2 or r in _VOIE_GENERIC for r in remaining):
            continue
        abbrev_words = words[:i] + [word[0]] + remaining
        abbrev = " ".join(abbrev_words)
        # At least 12 characters, and never starting with a lone initial.
        if len(abbrev) >= 12 and len(abbrev_words[0]) >= 2:
            yield abbrev


def _collect_address_patterns(csv_path, first_names):
    """Return the set of normalized street patterns found in the CSV
    (col 8 = street type code, col 9 = street name), with their
    abbreviated-first-name variants."""
    addr_patterns = set()

    def add_with_abbrev(pattern):
        addr_patterns.add(pattern)
        addr_patterns.update(_abbreviation_variants(pattern, first_names))

    with open(csv_path, encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter=";")
        next(reader, None)  # skip header; tolerate a completely empty file
        for row in reader:
            if len(row) < 10:
                continue
            type_voie_raw = row[8].strip()
            nom_voie = row[9].strip()
            if len(nom_voie) < 3:
                continue
            nom_norm = normalize(nom_voie)

            # Full pattern: street type + street name
            # (e.g. "avenue de l interne jacques loeb").
            type_voie_expanded = _TYPE_VOIE_MAP.get(
                type_voie_raw.upper(), type_voie_raw.lower()
            )
            if type_voie_expanded and nom_norm:
                full = f"{type_voie_expanded} {nom_norm}"
                if len(full) >= 12 and _has_distinctive_word(full.split(), _VOIE_GENERIC):
                    add_with_abbrev(full)

            # Street name alone, only when it is long and distinctive enough.
            if len(nom_norm) >= 15 and _has_distinctive_word(nom_norm.split(), _VOIE_GENERIC):
                add_with_abbrev(nom_norm)

    return addr_patterns


def main():
    """Build every FINESS gazetteer file from the open-data CSV.

    The CSV path comes from the ``--csv PATH`` command-line option and
    defaults to ``CSV_PATH``.  When the file does not exist a message is
    printed and nothing is written.

    Files written to ``OUT_DIR`` (sorted, one entry per line, UTF-8):
    ``finess_numbers.txt``, ``etablissements_noms.txt``,
    ``etablissements_distinctifs.txt``, ``villes_finess.txt``,
    ``telephones.txt``, ``adresses_finess.txt`` and
    ``voies_distinctives.txt``.  Progress and statistics go to stdout.
    """
    csv_path = _parse_csv_path()
    if not csv_path.exists():
        print(f"CSV non trouvé: {csv_path}")
        return

    print(f"Lecture {csv_path}...")

    finess_numbers, full_names, distinctive_names, cities, phones = _collect_entities(csv_path)

    print(f" Numéros FINESS: {len(finess_numbers)}")
    print(f" Noms complets: {len(full_names)}")
    print(f" Noms distinctifs: {len(distinctive_names)}")
    print(f" Villes: {len(cities)}")
    print(f" Téléphones: {len(phones)}")

    # Keep distinctive names of 5+ characters that contain at least one
    # non-generic word of 4+ characters.
    filtered_distinctive = {
        name
        for name in distinctive_names
        if len(name) >= 5 and _has_distinctive_word(name.split(), GENERIC_WORDS)
    }
    print(f" Noms distinctifs filtrés: {len(filtered_distinctive)}")

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # 1. FINESS numbers
    out = _write_gazetteer("finess_numbers.txt", finess_numbers)
    print(f"\n → {out.name}: {len(finess_numbers)} entrées")

    # 2. Complete names (for display/debugging)
    out = _write_gazetteer("etablissements_noms.txt", full_names)
    print(f" → {out.name}: {len(full_names)} entrées")

    # 3. Normalized distinctive names (for Aho-Corasick matching)
    out = _write_gazetteer("etablissements_distinctifs.txt", filtered_distinctive)
    print(f" → {out.name}: {len(filtered_distinctive)} entrées")

    # 4. Cities
    out = _write_gazetteer("villes_finess.txt", cities)
    print(f" → {out.name}: {len(cities)} entrées")

    # 5. Phone numbers
    out = _write_gazetteer("telephones.txt", phones)
    print(f" → {out.name}: {len(phones)} entrées")

    # 6. FINESS addresses (street type + street name) for Aho-Corasick.
    # INSEE first names are used to generate abbreviated variants; the list
    # is optional and simply yields no variants when the file is absent.
    prenoms_path = Path(__file__).parent.parent / "data" / "insee" / "prenoms_france.txt"
    first_names = set()
    if prenoms_path.exists():
        first_names = _load_first_names(prenoms_path)
        print(f" Prénoms INSEE chargés: {len(first_names)}")

    addr_patterns = _collect_address_patterns(csv_path, first_names)
    out = _write_gazetteer("adresses_finess.txt", addr_patterns)
    print(f"\n → {out.name}: {len(addr_patterns)} entrées")

    # voies_distinctives.txt is still produced for compatibility.
    voie_names = {p for p in addr_patterns if len(p) >= 15}
    out = _write_gazetteer("voies_distinctives.txt", voie_names)
    print(f" → {out.name}: {len(voie_names)} entrées")

    # Statistics: number of distinctive names per word count.
    print("\nDistribution noms distinctifs par longueur (mots):")
    word_counts = Counter(len(n.split()) for n in filtered_distinctive)
    for k in sorted(word_counts):
        print(f" {k} mots: {word_counts[k]:>6d}")

    # Sample
    print("\nÉchantillon noms distinctifs (20 premiers):")
    for n in sorted(filtered_distinctive)[:20]:
        print(f" {n}")


if __name__ == "__main__":
    main()
|