feat(phase2): Détection établissements par Aho-Corasick sur 108K noms FINESS
- Nouveau script build_finess_gazetteers.py : extraction noms distinctifs, villes, numéros depuis CSV open data - Automate Aho-Corasick (pyahocorasick) pour matching multi-pattern en ~1.7ms/page - 108K patterns indexés (noms composés >= 8 chars, mots uniques >= 10 chars) - Blacklist mots génériques (clinique, pharmacie, etc.) et stop words médicaux - Normalisation position-preserving (sans accents, même longueur) - Construction lazy de l'AC (après chargement des stop words) - Intégration dans _mask_line_by_regex et selective_rescan - Nouveau gazetteer villes_finess.txt (11,660 villes) - Résultats : "Girandières" → masqué, "Côte Basque" → masqué, 0 FP sur termes médicaux courants Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -154,10 +154,30 @@ _load_insee_gazetteers()
|
|||||||
_FINESS_NUMBERS: set = set() # numéros FINESS 9 chiffres
|
_FINESS_NUMBERS: set = set() # numéros FINESS 9 chiffres
|
||||||
_FINESS_ETAB_NAMES: set = set() # noms d'établissements (lowercase)
|
_FINESS_ETAB_NAMES: set = set() # noms d'établissements (lowercase)
|
||||||
_FINESS_TELEPHONES: set = set() # téléphones 10 chiffres
|
_FINESS_TELEPHONES: set = set() # téléphones 10 chiffres
|
||||||
|
_FINESS_VILLES: set = set() # villes FINESS (uppercase)
|
||||||
|
_FINESS_AC = None # Automate Aho-Corasick pour noms distinctifs
|
||||||
|
|
||||||
|
try:
|
||||||
|
import ahocorasick as _ahocorasick
|
||||||
|
_AHO_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
_ahocorasick = None
|
||||||
|
_AHO_AVAILABLE = False
|
||||||
|
|
||||||
|
def _normalize_for_matching(s: str) -> str:
|
||||||
|
"""Normalise pour matching gazetteer : lowercase, sans accents, espaces collapsés."""
|
||||||
|
import unicodedata
|
||||||
|
s = s.lower().strip()
|
||||||
|
s = unicodedata.normalize("NFD", s)
|
||||||
|
s = "".join(c for c in s if unicodedata.category(c) != "Mn")
|
||||||
|
s = re.sub(r"[^a-z0-9\s\-]", " ", s)
|
||||||
|
s = re.sub(r"\s+", " ", s).strip()
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
def _load_finess_gazetteers():
|
def _load_finess_gazetteers():
|
||||||
"""Charge les gazetteers FINESS (établissements, numéros, téléphones)."""
|
"""Charge les gazetteers FINESS (établissements, numéros, téléphones, villes, Aho-Corasick)."""
|
||||||
global _FINESS_NUMBERS, _FINESS_ETAB_NAMES, _FINESS_TELEPHONES
|
global _FINESS_NUMBERS, _FINESS_ETAB_NAMES, _FINESS_TELEPHONES, _FINESS_VILLES, _FINESS_AC
|
||||||
data_dir = Path(__file__).parent / "data" / "finess"
|
data_dir = Path(__file__).parent / "data" / "finess"
|
||||||
|
|
||||||
# Numéros FINESS
|
# Numéros FINESS
|
||||||
@@ -172,7 +192,7 @@ def _load_finess_gazetteers():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"Erreur chargement FINESS numéros: {e}")
|
log.warning(f"Erreur chargement FINESS numéros: {e}")
|
||||||
|
|
||||||
# Noms d'établissements (pour détection HOPITAL)
|
# Noms d'établissements complets (pour debug/référence)
|
||||||
noms_path = data_dir / "etablissements_noms.txt"
|
noms_path = data_dir / "etablissements_noms.txt"
|
||||||
if noms_path.exists():
|
if noms_path.exists():
|
||||||
try:
|
try:
|
||||||
@@ -184,6 +204,21 @@ def _load_finess_gazetteers():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"Erreur chargement FINESS noms: {e}")
|
log.warning(f"Erreur chargement FINESS noms: {e}")
|
||||||
|
|
||||||
|
# Noms distinctifs : chargement différé (Aho-Corasick construit au premier appel,
|
||||||
|
# car _MEDICAL_STOP_WORDS_SET n'est pas encore défini à ce stade du module)
|
||||||
|
|
||||||
|
# Villes FINESS
|
||||||
|
villes_path = data_dir / "villes_finess.txt"
|
||||||
|
if villes_path.exists():
|
||||||
|
try:
|
||||||
|
_FINESS_VILLES = {
|
||||||
|
line.strip() for line in villes_path.read_text(encoding="utf-8").splitlines()
|
||||||
|
if line.strip() and len(line.strip()) >= 3
|
||||||
|
}
|
||||||
|
log.info(f"Gazetteer FINESS villes: {len(_FINESS_VILLES)} entrées")
|
||||||
|
except Exception as e:
|
||||||
|
log.warning(f"Erreur chargement FINESS villes: {e}")
|
||||||
|
|
||||||
# Téléphones (pour validation)
|
# Téléphones (pour validation)
|
||||||
tel_path = data_dir / "telephones.txt"
|
tel_path = data_dir / "telephones.txt"
|
||||||
if tel_path.exists():
|
if tel_path.exists():
|
||||||
@@ -1220,6 +1255,16 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
|
|||||||
line = RE_ETABLISSEMENT.sub(_repl_etab, line)
|
line = RE_ETABLISSEMENT.sub(_repl_etab, line)
|
||||||
line = RE_HOPITAL_VILLE.sub(_repl_etab, line)
|
line = RE_HOPITAL_VILLE.sub(_repl_etab, line)
|
||||||
|
|
||||||
|
# Établissements par gazetteer Aho-Corasick FINESS (116K noms distinctifs)
|
||||||
|
if _FINESS_AC is not None:
|
||||||
|
old_line = line
|
||||||
|
line = _mask_finess_establishments(line)
|
||||||
|
if line != old_line:
|
||||||
|
# Enregistrer les hits dans l'audit
|
||||||
|
# (on ne peut pas facilement savoir quels noms ont matché,
|
||||||
|
# mais on log le fait qu'un match gazetteer a eu lieu)
|
||||||
|
audit.append(PiiHit(page_idx, "ETAB_FINESS", "gazetteer", PLACEHOLDERS["ETAB"]))
|
||||||
|
|
||||||
# Services hospitaliers (service de Cardiologie, unité de soins palliatifs, etc.)
|
# Services hospitaliers (service de Cardiologie, unité de soins palliatifs, etc.)
|
||||||
def _repl_service(m: re.Match) -> str:
|
def _repl_service(m: re.Match) -> str:
|
||||||
full_match = m.group(0)
|
full_match = m.group(0)
|
||||||
@@ -2097,6 +2142,144 @@ def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "
|
|||||||
final = "".join(rebuilt_list)
|
final = "".join(rebuilt_list)
|
||||||
return final, hits
|
return final, hits
|
||||||
|
|
||||||
|
# ----------------- FINESS Aho-Corasick establishment matching -----------------
|
||||||
|
|
||||||
|
def _build_finess_ac():
    """Build the FINESS Aho-Corasick automaton and publish it in ``_FINESS_AC``.

    Reads ``data/finess/etablissements_distinctifs.txt`` (one name per line)
    and indexes a name when it is either

    * made of two or more words and at least 8 characters long, or
    * a single word of at least 10 characters that is not a medical stop word.

    Names equal to a generic word (establishment types, common French nouns)
    are never indexed, nor are two-word names whose first word is generic and
    whose second word is shorter than 5 characters.

    The function is a no-op when pyahocorasick is not installed or the data
    file is missing. Any error during construction is logged and leaves
    ``_FINESS_AC`` unchanged. Returns ``None`` in every case.
    """
    global _FINESS_AC
    if not _AHO_AVAILABLE:
        return
    data_dir = Path(__file__).parent / "data" / "finess"
    dist_path = data_dir / "etablissements_distinctifs.txt"
    if not dist_path.exists():
        return

    # Generic words that must never be matched on their own.
    _ac_generic_blacklist = {
        # Establishment types
        "clinique", "pharmacie", "hopital", "centre", "foyer",
        "residence", "maison", "cabinet", "service", "laboratoire",
        "institut", "association", "fondation", "mutuelle", "polyclinique",
        "dispensaire", "hospice", "annexe", "antenne", "site",
        # Common French words that also happen to be establishment names
        "collegiale", "collegial", "cathedral", "cathedrale",
        "providence", "esperance", "renaissance", "liberation",
        "republique", "fraternite", "solidarite", "independance",
        "beauregard", "bellevue", "belvedere",
        "promenade", "esplanade", "corniche", "prefecture",
        "croissant", "confluence", "bienvenue",
        "chartreuse", "commanderie", "chapelle", "basilique",
        "departement", "departementale", "communautaire",
    }
    try:
        ac = _ahocorasick.Automaton()
        count = 0
        for line in dist_path.read_text(encoding="utf-8").splitlines():
            name = line.strip()
            # Skip blank lines and generic words standing alone.
            if not name or name in _ac_generic_blacklist:
                continue
            words = name.split()
            # Skip "<generic word> <short word>" pairs: too ambiguous.
            if len(words) == 2 and words[0] in _ac_generic_blacklist and len(words[1]) < 5:
                continue
            if len(words) >= 2:
                keep = len(name) >= 8
            else:
                # Single words need to be longer and must not collide with
                # the medical vocabulary (checked raw and normalised).
                keep = (
                    len(name) >= 10
                    and name not in _MEDICAL_STOP_WORDS_SET
                    and _normalize_for_matching(name) not in _MEDICAL_STOP_WORDS_SET
                )
            if keep:
                ac.add_word(name, name)
                count += 1
        if count == 0:
            # An automaton without any key cannot be searched: pyahocorasick
            # raises on iter(). Leave _FINESS_AC unset so masking stays off
            # instead of failing on every call.
            log.warning("Gazetteer FINESS Aho-Corasick: aucun pattern retenu, automate non construit")
            return
        ac.make_automaton()
        _FINESS_AC = ac
        log.info(f"Gazetteer FINESS Aho-Corasick: {count} patterns chargés")
    except Exception as e:
        # Best effort: the gazetteer is an optional extra detection layer.
        log.warning(f"Erreur construction FINESS Aho-Corasick: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_positional(text: str) -> str:
|
||||||
|
"""Normalise en préservant la longueur : lowercase + accents → base char.
|
||||||
|
|
||||||
|
Chaque caractère accentué est remplacé par sa version sans accent.
|
||||||
|
Les caractères non-alphanumériques restent tels quels (même position).
|
||||||
|
Longueur de sortie == longueur d'entrée.
|
||||||
|
"""
|
||||||
|
import unicodedata
|
||||||
|
out = []
|
||||||
|
for ch in text:
|
||||||
|
# Lowercase
|
||||||
|
ch = ch.lower()
|
||||||
|
# Décomposer et retirer les accents
|
||||||
|
decomposed = unicodedata.normalize("NFD", ch)
|
||||||
|
base = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
|
||||||
|
out.append(base if base else ch)
|
||||||
|
return "".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def _mask_finess_establishments(text: str) -> str:
    """Replace FINESS establishment names found in *text* by the ETAB placeholder.

    The automaton is run over a length-preserving normalisation of *text*
    (lowercase, no accents), so match offsets apply directly to the original
    string. A match is kept only when it sits on word boundaries and is not
    directly enclosed in square brackets (i.e. not already a placeholder).
    Overlapping matches are merged into a single masked span so that no
    fragment of a longer name is left readable.

    Returns *text* unchanged when the automaton is unavailable or nothing
    matches.

    NOTE(review): the call sites visible in this file test
    ``_FINESS_AC is not None`` before calling this function, so the lazy
    build below only runs if something else triggers it first — confirm that
    ``_build_finess_ac()`` is invoked somewhere at start-up.
    """
    if _FINESS_AC is None:
        _build_finess_ac()
    automaton = _FINESS_AC
    if automaton is None:
        return text

    normalized = _normalize_positional(text)
    placeholder = PLACEHOLDERS["ETAB"]
    norm_len = len(normalized)
    text_len = len(text)

    # Collect candidate spans as half-open [start, end) intervals.
    spans = []
    for end_idx, name in automaton.iter(normalized):
        start = end_idx - len(name) + 1
        end = end_idx + 1
        # Reject matches that begin or end in the middle of a word.
        if start > 0 and normalized[start - 1].isalnum():
            continue
        if end < norm_len and normalized[end].isalnum():
            continue
        # Reject matches directly wrapped in brackets: already a placeholder.
        if text[max(0, start - 1):start] == "[" or text[end:end + 1] == "]":
            continue
        spans.append((start, end))

    if not spans:
        return text

    # Merge overlapping spans. Keeping only the earliest one would leave the
    # tail of a longer, later-starting name unmasked.
    spans.sort()
    merged = []
    for start, end in spans:
        if merged and start < merged[-1][1]:
            if end > merged[-1][1]:
                merged[-1][1] = end
        else:
            merged.append([start, end])

    # Rebuild the text, copying the untouched stretches between the spans.
    pieces = []
    cursor = 0
    for start, end in merged:
        if end > text_len:
            # Defensive: ignore offsets that do not fit the original text.
            continue
        pieces.append(text[cursor:start])
        pieces.append(placeholder)
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
# ----------------- Selective safety rescan -----------------
|
# ----------------- Selective safety rescan -----------------
|
||||||
|
|
||||||
def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
||||||
@@ -2140,9 +2323,12 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
|||||||
def _rescan_finess(m: re.Match) -> str:
|
def _rescan_finess(m: re.Match) -> str:
|
||||||
return PLACEHOLDERS["FINESS"] if m.group(1) in _FINESS_NUMBERS else m.group(0)
|
return PLACEHOLDERS["FINESS"] if m.group(1) in _FINESS_NUMBERS else m.group(0)
|
||||||
protected = RE_BARE_9DIGITS.sub(_rescan_finess, protected)
|
protected = RE_BARE_9DIGITS.sub(_rescan_finess, protected)
|
||||||
# Établissements
|
# Établissements (regex)
|
||||||
protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
|
protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
|
||||||
protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
|
protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
|
||||||
|
# Établissements (gazetteer Aho-Corasick FINESS — 116K noms distinctifs)
|
||||||
|
if _FINESS_AC is not None:
|
||||||
|
protected = _mask_finess_establishments(protected)
|
||||||
# Services hospitaliers
|
# Services hospitaliers
|
||||||
protected = RE_SERVICE.sub(PLACEHOLDERS["MASK"], protected)
|
protected = RE_SERVICE.sub(PLACEHOLDERS["MASK"], protected)
|
||||||
# Lieu de naissance / Ville de résidence (accepte tout : villes, codes INSEE, minuscules)
|
# Lieu de naissance / Ville de résidence (accepte tout : villes, codes INSEE, minuscules)
|
||||||
|
|||||||
116606
data/finess/etablissements_distinctifs.txt
Normal file
116606
data/finess/etablissements_distinctifs.txt
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -101938,4 +101938,4 @@
|
|||||||
980503346
|
980503346
|
||||||
980503395
|
980503395
|
||||||
980600027
|
980600027
|
||||||
980600035
|
980600035
|
||||||
|
|||||||
@@ -113234,14 +113234,3 @@
|
|||||||
0989324787
|
0989324787
|
||||||
0989450002
|
0989450002
|
||||||
0999991377
|
0999991377
|
||||||
1473143031
|
|
||||||
1545915013
|
|
||||||
1677708604
|
|
||||||
1698408775
|
|
||||||
1749771871
|
|
||||||
2561752635
|
|
||||||
3080428670
|
|
||||||
3088102831
|
|
||||||
3134335540
|
|
||||||
4242745138
|
|
||||||
4326549224
|
|
||||||
11660
data/finess/villes_finess.txt
Normal file
11660
data/finess/villes_finess.txt
Normal file
File diff suppressed because it is too large
Load Diff
206
scripts/build_finess_gazetteers.py
Normal file
206
scripts/build_finess_gazetteers.py
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Construit les gazetteers FINESS depuis le CSV open data.
|
||||||
|
==========================================================
|
||||||
|
Extrait et déduplique :
|
||||||
|
- Noms d'établissements (courts + longs) normalisés
|
||||||
|
- Villes
|
||||||
|
- Numéros FINESS
|
||||||
|
- Téléphones
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/build_finess_gazetteers.py [--csv PATH]
|
||||||
|
"""
|
||||||
|
import csv
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
|
||||||
|
CSV_PATH = Path(__file__).parent.parent / "data" / "finess" / "finess_etablissements.csv"
OUT_DIR = Path(__file__).parent.parent / "data" / "finess"

# Optional preposition and article that may follow an establishment type
# ("DE LA ", "D'", "DES ", "L'" ...). Both the straight and the typographic
# apostrophe are accepted so that elided forms are stripped whichever way
# they are typed.
_ARTICLE_TAIL = r"(?:DE\s+|D['’]\s*|DU\s+|DES\s+)?(?:LA\s+|LE\s+|L['’]\s*|LES\s+)?"

# Establishment-type openers, tried in this order. Each fragment consumes the
# type keyword(s) and the whitespace that follows.
_PREFIX_HEADS = (
    r"CENTRE\s+HOSPITALIER\s+(?:UNIVERSITAIRE\s+|REGIONAL\s+|INTERCOMMUNAL\s+|DEPARTEMENTAL\s+)?",
    r"CH\s+",
    r"CLINIQUE\s+",
    r"POLYCLINIQUE\s+",
    r"HOPITAL\s+",
    r"HÔPITAL\s+",
    r"EHPAD\s+",
    r"PHARMACIE\s+",
    r"RESIDENCE\s+",
    r"MAISON\s+DE\s+(?:RETRAITE|SANTÉ|SANTE)\s+",
    r"FOYER\s+",
    r"SSR\s+",
    r"HAD\s+",
    r"SAAD\s+",
    r"SSIAD\s+",
    r"CABINET\s+",
    r"LABORATOIRE\s+",
    r"LBM\s+",
    r"SELARL\s+",
    r"SERVICE\s+",
    r"ASSOCIATION\s+",
    r"INSTITUT\s+",
    r"GCS\s+",
)

# Generic establishment prefixes removed to extract the distinctive name.
GENERIC_PREFIXES = re.compile(
    "^(?:" + "|".join(head + _ARTICLE_TAIL for head in _PREFIX_HEADS) + ")",
    re.IGNORECASE,
)

# Words too generic to make a name distinctive on their own.
GENERIC_WORDS = {
    "de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux",
    "sur", "sous", "par", "pour", "avec", "dans", "rue", "avenue", "boulevard",
    "centre", "service", "site", "antenne", "annexe", "pole", "pôle",
    "nord", "sud", "est", "ouest", "ville", "cedex",
}
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(s: str) -> str:
    """Return the canonical gazetteer form of *s*.

    Lowercases and trims the text, expands the French ligatures, removes
    combining accents, turns every character other than an ASCII letter, a
    digit, whitespace or a hyphen into a space, and collapses runs of
    whitespace into a single space.
    """
    s = s.lower().strip()
    # NFD has no decomposition for these ligatures; without this expansion the
    # ASCII filter below would turn them into a space and split the word.
    s = s.replace("œ", "oe").replace("æ", "ae")
    # Strip accents: decompose, then drop combining marks (category Mn).
    s = unicodedata.normalize("NFD", s)
    s = "".join(c for c in s if unicodedata.category(c) != "Mn")
    # Neutralise special characters, then collapse whitespace.
    s = re.sub(r"[^a-z0-9\s\-]", " ", s)
    return re.sub(r"\s+", " ", s).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_distinctive_name(full_name: str) -> str:
    """Return the normalised distinctive part of an establishment name.

    The generic leading type (as matched by ``GENERIC_PREFIXES``) is removed;
    when fewer than 4 characters remain, the whole name is used instead. The
    chosen text is then passed through ``normalize``.

    Examples:
        'CENTRE HOSPITALIER DE BAYONNE' -> 'bayonne'
        'PHARMACIE DES GASCONS'         -> 'gascons'
        'LES GIRANDIERES'               -> 'les girandieres'
    """
    cleaned = full_name.strip()
    remainder = GENERIC_PREFIXES.sub("", cleaned).strip()
    # Nothing distinctive left after the prefix: fall back to the full name.
    candidate = remainder if len(remainder) >= 4 else cleaned
    return normalize(candidate)
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None):
    """Build every FINESS gazetteer file from the open-data CSV.

    Args:
        argv: Command-line arguments (without the program name). ``None``
            means "use ``sys.argv``". The only option is ``--csv PATH``,
            which overrides the default input file ``CSV_PATH``.

    Reads the semicolon-separated CSV, collects FINESS numbers, full names,
    distinctive names, cities and phone numbers, then writes one sorted,
    de-duplicated text file per category into ``OUT_DIR`` and prints a short
    report. Returns ``None``; when the CSV is missing a message is printed
    and nothing is written.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Construit les gazetteers FINESS depuis le CSV open data."
    )
    parser.add_argument(
        "--csv",
        type=Path,
        default=CSV_PATH,
        metavar="PATH",
        help="chemin du CSV FINESS (défaut: %(default)s)",
    )
    args = parser.parse_args(argv)

    csv_path = args.csv
    if not csv_path.exists():
        print(f"CSV non trouvé: {csv_path}")
        return

    print(f"Lecture {csv_path}...")

    # Collected values, de-duplicated through sets.
    finess_numbers = set()
    full_names = set()  # full names (long + short)
    distinctive_names = set()  # distinctive parts
    cities = set()
    phones = set()

    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(csv_path, encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter=";")
        next(reader, None)  # skip header; tolerate an empty file

        for row in reader:
            if len(row) < 16:
                continue

            # FINESS number (column 1)
            finess = row[1].strip()
            if re.match(r"^\d{9}$", finess):
                finess_numbers.add(finess)

            # Names (column 3 = short, column 4 = long)
            for col_idx in (3, 4):
                name = row[col_idx].strip()
                if name and len(name) >= 5:
                    # Strip stray leading/trailing punctuation.
                    name = re.sub(r'^[%\.\(\)"]+\s*', "", name)
                    name = re.sub(r'\s*["\)]+$', "", name)
                    if len(name) >= 5:
                        full_names.add(name)
                        dist = extract_distinctive_name(name)
                        if dist and len(dist) >= 4:
                            distinctive_names.add(dist)

            # City (column 15 = "POSTAL_CODE CITY")
            city_field = row[15].strip()
            if city_field and " " in city_field:
                parts = city_field.split(None, 1)
                if len(parts) == 2 and re.match(r"^\d{5}$", parts[0]):
                    ville = parts[1].strip()
                    if ville and len(ville) >= 2:
                        cities.add(ville)

            # Phone numbers (columns 16 and 17, which may be absent)
            for col_idx in (16, 17):
                tel = row[col_idx].strip() if col_idx < len(row) else ""
                if re.match(r"^0\d{9}$", tel):
                    phones.add(tel)

    print(f" Numéros FINESS: {len(finess_numbers)}")
    print(f" Noms complets: {len(full_names)}")
    print(f" Noms distinctifs: {len(distinctive_names)}")
    print(f" Villes: {len(cities)}")
    print(f" Téléphones: {len(phones)}")

    # Keep distinctive names of 5+ characters that contain at least one
    # non-generic word of 4+ characters.
    filtered_distinctive = {
        name
        for name in distinctive_names
        if len(name) >= 5
        and any(w not in GENERIC_WORDS and len(w) >= 4 for w in name.split())
    }

    print(f" Noms distinctifs filtrés: {len(filtered_distinctive)}")

    # Write one sorted file per category.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    outputs = [
        ("finess_numbers.txt", finess_numbers),
        ("etablissements_noms.txt", full_names),  # for display/debug
        ("etablissements_distinctifs.txt", filtered_distinctive),  # Aho-Corasick input
        ("villes_finess.txt", cities),
        ("telephones.txt", phones),
    ]
    for idx, (filename, values) in enumerate(outputs):
        out = OUT_DIR / filename
        out.write_text("\n".join(sorted(values)) + "\n", encoding="utf-8")
        lead = "\n" if idx == 0 else ""
        print(f"{lead} → {out.name}: {len(values)} entrées")

    # Distribution of distinctive names by word count.
    print("\nDistribution noms distinctifs par longueur (mots):")
    word_counts = Counter(len(n.split()) for n in filtered_distinctive)
    for k in sorted(word_counts):
        print(f" {k} mots: {word_counts[k]:>6d}")

    # Sample of the output.
    print("\nÉchantillon noms distinctifs (20 premiers):")
    for n in sorted(filtered_distinctive)[:20]:
        print(f" {n}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user