feat: réduction FP + gazetteers adresses FINESS + batch parallèle + corrections multi-axes
- Token min length relevé de 2-3 → 4 chars (élimine FP EPO, IRC, SIB...) - Stop-words enrichis : acronymes médicaux 3 lettres, termes pharma, soins infirmiers - BDPM stop-words : ~7300 noms commerciaux + DCI/substances actives - Gazetteers adresses FINESS : 63K patterns Aho-Corasick (position-preserving normalization) - Filtre contextuel anatomique pour FINESS établissements - Nouvelles regex : RE_CIVILITE_COMMA_LIST, RE_EXTRACT_NOM_UTILISE, RE_EXTRACT_PRENOM, RE_NUM_EXAMEN_PATIENT, RE_ADRESSE_LIEU_DIT, RE_CIVILITE_INITIALE, Dr X.NOM - URLs complètes (RE_URL) + détection multiline - N° venue inversé (layout-aware) + EPISODE/NDA dans _CRITICAL_PII_TYPES - HospitalFilter désactivé pour ADRESSE/TEL/VILLE/EPISODE (identifient le patient) - Batch silver export parallélisé (multiprocessing spawn, N workers) - Seuil sur-masquage relevé à 8%, server.py enrichi (source regex/ner) - Blacklist villes : COURANT, PARIS ; contexte villes étendu (UHCD, spécialités) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -190,6 +190,93 @@ def main():
|
||||
out.write_text("\n".join(sorted(phones)) + "\n", encoding="utf-8")
|
||||
print(f" → {out.name}: {len(phones)} entrées")
|
||||
|
||||
# 6. Adresses FINESS (type_voie + nom_voie) pour Aho-Corasick
|
||||
# Mapping des codes type_voie FINESS vers formes étendues
|
||||
TYPE_VOIE_MAP = {
|
||||
"AV": "avenue", "R": "rue", "BD": "boulevard", "RTE": "route",
|
||||
"CHE": "chemin", "PL": "place", "IMP": "impasse", "ALL": "allee",
|
||||
"SQ": "square", "PASS": "passage", "QU": "quai", "CRS": "cours",
|
||||
"SEN": "sentier", "RPT": "rond-point", "LD": "lieu-dit",
|
||||
"HAM": "hameau", "LOT": "lotissement", "TSSE": "traverse",
|
||||
"CHEM": "chemin", "RES": "residence", "CTRE": "centre",
|
||||
"ESP": "esplanade", "PRO": "promenade", "MTE": "montee",
|
||||
"VOI": "voie", "CAR": "carrefour", "FBG": "faubourg",
|
||||
}
|
||||
# Charger les prénoms INSEE pour générer des variantes abrégées
|
||||
prenoms_path = Path(__file__).parent.parent / "data" / "insee" / "prenoms_france.txt"
|
||||
prenoms_set = set()
|
||||
if prenoms_path.exists():
|
||||
for line in prenoms_path.read_text(encoding="utf-8").splitlines():
|
||||
p = line.strip().lower()
|
||||
if p and len(p) >= 3:
|
||||
prenoms_set.add(p)
|
||||
print(f" Prénoms INSEE chargés: {len(prenoms_set)}")
|
||||
|
||||
VOIE_GENERIC = {
|
||||
"de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux",
|
||||
"a", "sur", "sous", "par", "pour", "dans", "rue", "avenue", "boulevard",
|
||||
"route", "chemin", "place", "impasse", "square", "passage", "quai", "cours",
|
||||
"grande", "grand", "petit", "petite", "vieux", "vieille", "nouveau", "nouvelle",
|
||||
"haut", "haute", "bas", "basse",
|
||||
}
|
||||
|
||||
addr_patterns = set()
|
||||
|
||||
def _add_with_abbrev(pattern: str):
    """Register *pattern* and its variants with a first name cut to an initial.

    The pattern itself is always added to ``addr_patterns``. Then, for every
    word of the pattern that is found in ``prenoms_set`` (and is at least
    3 characters long), a variant is built in which that single word is
    replaced by its first character. A variant is only kept when:

    * at least one later word is longer than 2 characters and is not in
      ``VOIE_GENERIC`` (something distinctive must follow the initial);
    * the variant is at least 12 characters long;
    * the first word of the variant has at least 2 characters (the variant
      must not start with a lone initial).
    """
    addr_patterns.add(pattern)
    tokens = pattern.split()

    for idx, token in enumerate(tokens):
        # Only words recognised as first names are candidates for abbreviation.
        if len(token) < 3 or token not in prenoms_set:
            continue

        tail = tokens[idx + 1:]
        # An empty tail yields False here, so "nothing follows" is covered too.
        followed_by_distinctive = any(
            len(t) > 2 and t not in VOIE_GENERIC for t in tail
        )
        if not followed_by_distinctive:
            continue  # No abbreviation when nothing distinctive comes after

        candidate_words = [*tokens[:idx], token[0], *tail]
        candidate = " ".join(candidate_words)

        # Reject variants that are too short or that open with a lone initial.
        if len(candidate) < 12 or len(candidate_words[0]) < 2:
            continue
        addr_patterns.add(candidate)
|
||||
|
||||
with open(csv_path, encoding="utf-8") as f:
|
||||
reader = csv.reader(f, delimiter=";")
|
||||
next(reader)
|
||||
for row in reader:
|
||||
if len(row) < 10:
|
||||
continue
|
||||
type_voie_raw = row[8].strip() if len(row) > 8 else ""
|
||||
nom_voie = row[9].strip() if len(row) > 9 else ""
|
||||
if not nom_voie or len(nom_voie) < 3:
|
||||
continue
|
||||
nom_norm = normalize(nom_voie)
|
||||
words = nom_norm.split()
|
||||
|
||||
# Pattern complet : type_voie + nom_voie (ex: "avenue de l interne jacques loeb")
|
||||
type_voie_expanded = TYPE_VOIE_MAP.get(type_voie_raw.upper(), type_voie_raw.lower())
|
||||
if type_voie_expanded and nom_norm:
|
||||
full = f"{type_voie_expanded} {nom_norm}"
|
||||
full_words = full.split()
|
||||
has_distinctive = any(
|
||||
w not in VOIE_GENERIC and len(w) >= 4 for w in full_words
|
||||
)
|
||||
if has_distinctive and len(full) >= 12:
|
||||
_add_with_abbrev(full)
|
||||
|
||||
# Pattern nom_voie seul (seulement si très distinctif)
|
||||
has_distinctive = any(w not in VOIE_GENERIC and len(w) >= 4 for w in words)
|
||||
if has_distinctive and len(nom_norm) >= 15:
|
||||
_add_with_abbrev(nom_norm)
|
||||
|
||||
out = OUT_DIR / "adresses_finess.txt"
|
||||
out.write_text("\n".join(sorted(addr_patterns)) + "\n", encoding="utf-8")
|
||||
print(f"\n → {out.name}: {len(addr_patterns)} entrées")
|
||||
|
||||
# Garder aussi voies_distinctives.txt pour compatibilité
|
||||
voie_names = {p for p in addr_patterns if len(p) >= 15}
|
||||
out = OUT_DIR / "voies_distinctives.txt"
|
||||
out.write_text("\n".join(sorted(voie_names)) + "\n", encoding="utf-8")
|
||||
print(f" → {out.name}: {len(voie_names)} entrées")
|
||||
|
||||
# Stats par longueur
|
||||
print(f"\nDistribution noms distinctifs par longueur (mots):")
|
||||
word_counts = Counter(len(n.split()) for n in filtered_distinctive)
|
||||
|
||||
@@ -300,7 +300,7 @@ def check_fp_density(text: str) -> dict:
|
||||
"density_pct": round(density, 2),
|
||||
"nom_count": nom_count,
|
||||
"nom_pct": round(nom_pct, 2),
|
||||
"alert": nom_pct > 5.0,
|
||||
"alert": nom_pct > 8.0, # seuil relevé : CRO/CRH courts listent 8-10 soignants = légitime
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user