feat: FP reduction + FINESS address gazetteers + parallel batch + multi-axis fixes

- Minimum token length raised from 2-3 to 4 chars (eliminates FPs such as EPO, IRC, SIB...)
- Enriched stop-words: 3-letter medical acronyms, pharma terms, nursing vocabulary
- BDPM stop-words: ~7,300 brand names + INNs/active substances
- FINESS address gazetteers: 63K Aho-Corasick patterns with position-preserving
  normalization (see the sketch after this list)
- Anatomical context filter for FINESS facilities
- New regexes: RE_CIVILITE_COMMA_LIST, RE_EXTRACT_NOM_UTILISE, RE_EXTRACT_PRENOM,
  RE_NUM_EXAMEN_PATIENT, RE_ADRESSE_LIEU_DIT, RE_CIVILITE_INITIALE, Dr X.NOM
- Full URLs (RE_URL) + multiline detection
- Inverted visit numbers (layout-aware) + EPISODE/NDA added to _CRITICAL_PII_TYPES
- HospitalFilter disabled for ADRESSE/TEL/VILLE/EPISODE (these identify the patient)
- Silver-export batch parallelized (multiprocessing spawn, N workers)
- Over-masking alert threshold raised to 8%; server.py audit enriched (regex/ner source)
- City blacklist: COURANT, PARIS; city-context list extended (UHCD, specialties)
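What "position-preserving normalization" means for the gazetteer matching, sketched minimally: match on a lowercased, accent-stripped copy of the text while mapping every normalized character back to its original offset, so masks land on the raw document. A plain str.find stands in for the 63K-pattern Aho-Corasick automaton, and the repo's real normalize() presumably also folds punctuation; this sketch only handles case and accents.

    import unicodedata

    def normalize_with_map(text):
        """Lowercase + strip accents, keeping normalized->original offsets."""
        chars, offsets = [], []
        for i, ch in enumerate(text):
            for sub in unicodedata.normalize("NFKD", ch.lower()):
                if unicodedata.combining(sub):
                    continue  # drop the combining accent, keep the base letter
                chars.append(sub)
                offsets.append(i)  # each normalized char remembers its source index
        return "".join(chars), offsets

    def find_gazetteer_hits(text, pattern):
        """Spans in the ORIGINAL text; str.find stands in for the automaton."""
        norm, offsets = normalize_with_map(text)
        hits, pos = [], norm.find(pattern)
        while pos != -1:
            hits.append((offsets[pos], offsets[pos + len(pattern) - 1] + 1))
            pos = norm.find(pattern, pos + 1)
        return hits

    print(find_gazetteer_hits("13, Avenue de l'Interne J. LOEB", "avenue de l"))
    # [(4, 15)] -> the span of "Avenue de l" in the raw text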

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
commit 49ff464e6e
parent a827d860f1
2026-03-16 09:26:56 +01:00
18 changed files with 358579 additions and 232 deletions

File diff suppressed because it is too large


@@ -24,9 +24,14 @@ blacklist:
 - '640780417'
 - 'Dates du séjour :'
 - CONCERTATION
+- BAYONNE CEDEX
+- BAYONNE
+- '64109'
+- LABORATOIRE de BIOLOGIE MEDICALE
 force_mask_regex:
 - 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque'
 - 'Polyclinique\s+C[oôÔ]te\s+Basque\s+Sud'
+- '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+'
 kv_labels_preserve:
 - FINESS
 - IPP
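For orientation, a minimal sketch of how blacklist and force_mask_regex entries like these are typically consumed downstream; the function name and the [MASQUE] placeholder are hypothetical, not the repo's actual API.

    import re

    BLACKLIST = ["BAYONNE CEDEX", "BAYONNE", "64109",
                 "LABORATOIRE de BIOLOGIE MEDICALE"]
    FORCE_MASK = [re.compile(p, re.IGNORECASE) for p in (
        r"Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque",
        r"13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+",
    )]

    def apply_config_masks(text: str) -> str:
        # Longest terms first so "BAYONNE CEDEX" wins over "BAYONNE"
        for term in sorted(BLACKLIST, key=len, reverse=True):
            text = re.sub(re.escape(term), "[MASQUE]", text, flags=re.IGNORECASE)
        for rx in FORCE_MASK:
            text = rx.sub("[MASQUE]", text)
        return text

    print(apply_config_masks("CH 64109 BAYONNE CEDEX"))  # CH [MASQUE] [MASQUE]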

data/bdpm/CIS_bdpm.txt (new file, 15816 lines)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

data/insee/noms2008nat_txt.txt (new file, 218984 lines)

File diff suppressed because it is too large


@@ -1184,8 +1184,8 @@ déglobulisation. O
 Bladder O
 négatif. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 antalgique O
 : O
 Faux B-VILLE
@@ -1515,8 +1515,8 @@ cette O
 patiente O
 altérée O
 sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 général, O
 OMS2/3. O
 > O
@@ -1529,8 +1529,8 @@ du O
 traitement O
 antalgique. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 infectieux O
 : O
 Pic O
@@ -2817,8 +2817,8 @@ apyrexie O
 au O
 décours. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 urologique O
 : O
 Un O
@@ -2919,8 +2919,8 @@ oncologique O
 Nette O
 amélioration O
 sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 général O
 avec O
 la O


@@ -2572,8 +2572,8 @@ de O
 traitement O
 antibiotique O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 hématologique O
 Anémie O
 autour O


@@ -1812,8 +1812,8 @@ de O
 cette O
 décision. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 hématologique: O
 Elle O
 présente O


@@ -1420,8 +1420,8 @@ en O
 charge O
 antalgique. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 de O
 la O
 gravité: O


@@ -1102,8 +1102,8 @@ de O
 l'épisode O
 aigüe. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 infectieux, O
 présence O
 de O
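Every "Sur le plan ..." hunk above applies the same mechanical retag to the BIO training files: the determiner "le" is pulled into the VILLE span, so "le plan" becomes one two-token entity instead of "plan" alone. A minimal sketch of that pass over these token/TAG lines (hypothetical helper, not the repo's actual script):

    def extend_ville_over_determiner(lines):
        """Retag 'le O' + 'plan B-VILLE' as 'le B-VILLE' + 'plan I-VILLE'."""
        rows = [line.rsplit(" ", 1) for line in lines]  # [token, tag]
        for prev, cur in zip(rows, rows[1:]):
            if prev[1] == "O" and prev[0].lower() == "le" \
                    and cur[0] == "plan" and cur[1] == "B-VILLE":
                prev[1] = "B-VILLE"
                cur[1] = "I-VILLE"
        return [" ".join(r) for r in rows]

    print(extend_ville_over_determiner(
        ["Sur O", "le O", "plan B-VILLE", "infectieux O"]))
    # ['Sur O', 'le B-VILLE', 'plan I-VILLE', 'infectieux O']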


@@ -166,23 +166,12 @@ class HospitalFilter:
         Returns:
             True si la détection doit être filtrée (faux positif)
         """
-        # Filtrer par type
-        if pii_type == "ADRESSE":
-            return self.is_hospital_address(text)
-        elif pii_type == "CODE_POSTAL":
-            return self.is_hospital_postal_code(text)
-        elif pii_type == "VILLE":
-            return self.is_hospital_city(text)
-        elif pii_type == "TEL":
-            return self.is_hospital_phone(text)
-        elif pii_type == "EPISODE":
-            # Filtrer les épisodes qui proviennent du nom de fichier
-            # (répétés dans les en-têtes/pieds de page des documents trackare)
-            return self.is_episode_in_filename(text, filename)
+        # ADRESSE, CODE_POSTAL, VILLE, TEL : NE PAS filtrer.
+        # Les coordonnées hospitalières identifient indirectement le patient
+        # et doivent être masquées (validé par contrôle humain 2026-03-12).
+        # EPISODE : NE PAS filtrer.
+        # Les numéros d'épisode identifient le patient (validé 2026-03-14).
         return False
@@ -222,15 +211,17 @@ if __name__ == "__main__":
     # Tests
     test_cases = [
-        ("ADRESSE", "13, Avenue de l'Interne J", "", -1, True),
+        # ADRESSE, CODE_POSTAL, VILLE, TEL : ne sont plus filtrés (identifient le patient)
+        ("ADRESSE", "13, Avenue de l'Interne J", "", -1, False),
         ("ADRESSE", "22 LOT MENDI ALDE", "", -1, False),
-        ("CODE_POSTAL", "64109 BAYONNE CEDEX", "", -1, True),
+        ("CODE_POSTAL", "64109 BAYONNE CEDEX", "", -1, False),
         ("CODE_POSTAL", "64130", "", -1, False),
-        ("VILLE", "BAYONNE CEDEX", "", -1, True),
+        ("VILLE", "BAYONNE CEDEX", "", -1, False),
         ("VILLE", "CHERAUTE", "", -1, False),
-        ("VILLE", "DROIT", "", -1, True),  # Terme anatomique
+        ("VILLE", "DROIT", "", -1, False),
-        ("TEL", "05 59 44 35 35", "", -1, True),
+        ("TEL", "05 59 44 35 35", "", -1, False),
         ("TEL", "0676085336", "", -1, False),
+        # EPISODE : filtré uniquement si provient du nom de fichier trackare
         ("EPISODE", "23202435", "trackare-14004105-23202435", -1, True),
         ("EPISODE", "23102610", "CRH_23102610", 0, False),
     ]
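The tuple layout reads (pii_type, text, filename, page, expected). A hedged sketch of the runner presumably sitting below this list; the HospitalFilter constructor and the keyword names (filename=, page=) are assumptions read off the calls shown in the diff, not confirmed signatures.

    hf = HospitalFilter()
    failures = 0
    for pii_type, text, filename, page, expected in test_cases:
        got = hf.should_filter(pii_type, text, filename=filename, page=page)
        failures += got != expected
        print(("OK  " if got == expected else "FAIL"),
              pii_type, repr(text), "->", got, f"(attendu {expected})")
    print(f"{failures} échec(s)")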


@@ -1,18 +1,18 @@
{ {
"date": "2026-03-12T10:24:59.261417", "date": "2026-03-12T17:16:25.993851",
"scores": { "scores": {
"global_score": 97.0, "global_score": 97.0,
"leak_score": 100.0, "leak_score": 100.0,
"fp_score": 90, "fp_score": 90,
"totals": { "totals": {
"documents": 29, "documents": 29,
"audit_hits": 2797, "audit_hits": 3186,
"name_tokens_known": 461, "name_tokens_known": 457,
"leak_audit": 0, "leak_audit": 0,
"leak_occurrences": 0, "leak_occurrences": 0,
"leak_regex": 0, "leak_regex": 0,
"leak_insee_high": 0, "leak_insee_high": 0,
"leak_insee_medium": 569, "leak_insee_medium": 570,
"fp_medical": 0, "fp_medical": 0,
"fp_overmasking": 2 "fp_overmasking": 2
} }
@@ -110,7 +110,7 @@
"leak_audit": 0, "leak_audit": 0,
"leak_regex": 0, "leak_regex": 0,
"leak_insee_high": 0, "leak_insee_high": 0,
"leak_insee_medium": 23, "leak_insee_medium": 24,
"fp_medical": 0, "fp_medical": 0,
"fp_overmasking": 0 "fp_overmasking": 0
}, },
@@ -206,7 +206,7 @@
"leak_audit": 0, "leak_audit": 0,
"leak_regex": 0, "leak_regex": 0,
"leak_insee_high": 0, "leak_insee_high": 0,
"leak_insee_medium": 32, "leak_insee_medium": 33,
"fp_medical": 0, "fp_medical": 0,
"fp_overmasking": 0 "fp_overmasking": 0
}, },
@@ -222,7 +222,7 @@
"leak_audit": 0, "leak_audit": 0,
"leak_regex": 0, "leak_regex": 0,
"leak_insee_high": 0, "leak_insee_high": 0,
"leak_insee_medium": 34, "leak_insee_medium": 32,
"fp_medical": 0, "fp_medical": 0,
"fp_overmasking": 0 "fp_overmasking": 0
}, },
@@ -246,7 +246,7 @@
"leak_audit": 0, "leak_audit": 0,
"leak_regex": 0, "leak_regex": 0,
"leak_insee_high": 0, "leak_insee_high": 0,
"leak_insee_medium": 26, "leak_insee_medium": 27,
"fp_medical": 0, "fp_medical": 0,
"fp_overmasking": 0 "fp_overmasking": 0
} }


@@ -1,27 +1,24 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Batch anonymisation de PDFs pour enrichir le dataset silver. """Batch anonymisation parallèle de PDFs pour enrichir le dataset silver.
Traite TOUS les PDFs disponibles (excluant ceux déjà dans audit_30) en mode CPU Traite TOUS les PDFs disponibles en mode CPU (sans VLM), avec N workers
uniquement (sans VLM) pour générer des .pseudonymise.txt utilisables par parallèles. Chaque worker charge ses propres modèles NER.
export_silver_annotations.py.
Timeout par fichier pour éviter les blocages sur les gros documents.
Reprend automatiquement là où il s'est arrêté (skip les déjà traités). Reprend automatiquement là où il s'est arrêté (skip les déjà traités).
Usage:
python run_batch_silver_export.py # 6 workers (défaut)
python run_batch_silver_export.py --workers 4 # 4 workers
""" """
import sys import sys
import os
import time import time
import signal import argparse
import random import multiprocessing as mp
from pathlib import Path from pathlib import Path
from collections import Counter
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager
from gliner_manager import GlinerManager
from camembert_ner_manager import CamembertNerManager
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTDIR = SRC / "anonymise_silver_extra" OUTDIR = SRC / "anonymise_silver_extra"
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml") CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
@@ -62,16 +59,102 @@ ALREADY_DONE_AUDIT30 = {
TIMEOUT_PER_FILE = 120 # secondes max par PDF TIMEOUT_PER_FILE = 120 # secondes max par PDF
# Variables globales par worker (initialisées une seule fois)
class TimeoutError(Exception): _worker_ner = None
pass _worker_gliner = None
_worker_camembert = None
_worker_id = None
def timeout_handler(signum, frame): def init_worker(worker_id):
raise TimeoutError("Timeout") """Initialise les modèles NER dans chaque worker (appelé une seule fois)."""
global _worker_ner, _worker_gliner, _worker_camembert, _worker_id
_worker_id = worker_id
# Limiter les threads ONNX/OpenMP par worker pour éviter la contention
n_threads = max(2, 32 // (mp.cpu_count() // 2)) # répartir équitablement
os.environ["OMP_NUM_THREADS"] = str(n_threads)
os.environ["MKL_NUM_THREADS"] = str(n_threads)
import anonymizer_core_refactored_onnx as core # noqa: F401
from eds_pseudo_manager import EdsPseudoManager
from gliner_manager import GlinerManager
from camembert_ner_manager import CamembertNerManager
_worker_ner = EdsPseudoManager()
_worker_ner.load()
print(f" [W{worker_id}] EDS-Pseudo chargé", flush=True)
_worker_gliner = GlinerManager()
try:
_worker_gliner.load()
print(f" [W{worker_id}] GLiNER chargé", flush=True)
except Exception as e:
print(f" [W{worker_id}] GLiNER indisponible ({e})", flush=True)
_worker_gliner = None
_worker_camembert = CamembertNerManager()
try:
_worker_camembert.load()
print(f" [W{worker_id}] CamemBERT-bio chargé", flush=True)
except Exception as e:
print(f" [W{worker_id}] CamemBERT-bio indisponible ({e})", flush=True)
_worker_camembert = None
print(f" [W{worker_id}] Prêt (threads={n_threads})", flush=True)
def process_one_pdf(args):
"""Traite un seul PDF. Appelé par le pool de workers."""
pdf_path, idx, total = args
import signal
import anonymizer_core_refactored_onnx as core
ogc = pdf_path.parent.name.split("_")[0]
# Timeout via alarm
def _timeout_handler(signum, frame):
raise TimeoutError("Timeout")
signal.signal(signal.SIGALRM, _timeout_handler)
signal.alarm(TIMEOUT_PER_FILE)
try:
core.process_pdf(
pdf_path=pdf_path,
out_dir=OUTDIR,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=CONFIG,
use_hf=True,
ner_manager=_worker_ner,
ner_thresholds=None,
ogc_label=ogc,
vlm_manager=None,
gliner_manager=_worker_gliner,
camembert_manager=_worker_camembert,
)
signal.alarm(0)
return ("OK", pdf_path.name, idx, total)
except TimeoutError:
signal.alarm(0)
return ("TIMEOUT", pdf_path.name, idx, total)
except Exception as e:
signal.alarm(0)
err = str(e)
if "encrypted" in err.lower() or "password" in err.lower():
return ("SKIP", pdf_path.name, idx, total)
return ("ERROR", pdf_path.name, idx, total, str(e)[:100])
def main(): def main():
parser = argparse.ArgumentParser(description="Batch silver export parallèle")
parser.add_argument("--workers", type=int, default=6,
help="Nombre de workers parallèles (défaut: 6)")
args = parser.parse_args()
n_workers = args.workers
# Collecter tous les PDFs disponibles (excluant audit_30) # Collecter tous les PDFs disponibles (excluant audit_30)
all_pdfs = [] all_pdfs = []
for ogc_dir in sorted(SRC.iterdir()): for ogc_dir in sorted(SRC.iterdir()):
@@ -81,7 +164,6 @@ def main():
if pdf.name not in ALREADY_DONE_AUDIT30: if pdf.name not in ALREADY_DONE_AUDIT30:
all_pdfs.append(pdf) all_pdfs.append(pdf)
# Trier par OGC pour reproductibilité
all_pdfs.sort(key=lambda p: (p.parent.name, p.name)) all_pdfs.sort(key=lambda p: (p.parent.name, p.name))
# Détecter les fichiers déjà traités (reprise) # Détecter les fichiers déjà traités (reprise)
@@ -95,96 +177,73 @@ def main():
print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)") print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)")
print(f"Déjà traités: {len(already_done)}") print(f"Déjà traités: {len(already_done)}")
print(f"Restant: {len(pdfs_to_do)}") print(f"Restant: {len(pdfs_to_do)}")
print(f"Workers: {n_workers}")
print(f"RAM par worker: ~4 Go (NER models)")
print(f"RAM totale estimée: ~{n_workers * 4} Go\n")
if not pdfs_to_do: if not pdfs_to_do:
print("Rien à faire.") print("Rien à faire.")
return return
# Chargement des modèles NER (CPU uniquement, pas de VLM) # Préparer les arguments : (pdf_path, index, total)
print("\nChargement EDS-Pseudo...", flush=True) tasks = [(pdf, i, len(pdfs_to_do)) for i, pdf in enumerate(pdfs_to_do, 1)]
ner = EdsPseudoManager()
ner.load()
assert ner.is_loaded(), "EDS-Pseudo non chargé"
print("EDS-Pseudo chargé.", flush=True)
print("Chargement GLiNER...", flush=True) print(f"Chargement des modèles dans {n_workers} workers...", flush=True)
gliner = GlinerManager()
try:
gliner.load()
print("GLiNER chargé.", flush=True)
except Exception as e:
print(f"GLiNER indisponible ({e}), on continue sans.", flush=True)
gliner = None
print("Chargement CamemBERT-bio ONNX...", flush=True) # Créer le pool avec initialisation des modèles par worker
camembert = CamembertNerManager() # On utilise mp.Pool avec initializer pour charger les modèles une seule fois
try: # Note: fork + ONNX peut poser problème, on utilise 'spawn'
camembert.load() ctx = mp.get_context("spawn")
print("CamemBERT-bio ONNX chargé.", flush=True)
except Exception as e:
print(f"CamemBERT-bio indisponible ({e}), on continue sans.", flush=True)
camembert = None
print(f"\nPas de VLM (CPU only pour silver export).\n", flush=True)
ok = ko = skip_encrypted = skip_timeout = 0 ok = ko = skip_encrypted = skip_timeout = 0
t0 = time.time() t0 = time.time()
total = len(pdfs_to_do)
for i, pdf in enumerate(pdfs_to_do, 1): # Lancer les workers séquentiellement pour l'init (éviter pic mémoire)
ogc = pdf.parent.name.split("_")[0] # puis traiter en parallèle
print(f"[{i}/{total}] {pdf.name} (OGC {ogc})...", end=" ", flush=True) with ctx.Pool(
processes=n_workers,
initializer=init_worker,
initargs=(0,), # worker_id simplifié
) as pool:
for result in pool.imap_unordered(process_one_pdf, tasks, chunksize=1):
status = result[0]
name = result[1]
idx = result[2]
total = result[3]
# Timeout par fichier
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(TIMEOUT_PER_FILE)
try:
core.process_pdf(
pdf_path=pdf,
out_dir=OUTDIR,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=CONFIG,
use_hf=True,
ner_manager=ner,
ner_thresholds=None,
ogc_label=ogc,
vlm_manager=None,
gliner_manager=gliner,
camembert_manager=camembert,
)
signal.alarm(0)
elapsed_file = time.time() - t0
rate = ok / elapsed_file * 3600 if elapsed_file > 0 and ok > 0 else 0
print(f"OK ({rate:.0f}/h)", flush=True)
ok += 1
except TimeoutError:
signal.alarm(0)
print(f"TIMEOUT ({TIMEOUT_PER_FILE}s)", flush=True)
skip_timeout += 1
except Exception as e:
signal.alarm(0)
err = str(e)
if "encrypted" in err.lower() or "password" in err.lower():
print("SKIP (chiffré)", flush=True)
skip_encrypted += 1
else:
print(f"ERREUR: {e}", flush=True)
ko += 1
# Rapport intermédiaire toutes les 50 fichiers
if i % 50 == 0:
elapsed = time.time() - t0 elapsed = time.time() - t0
remaining = (elapsed / i) * (total - i) done = ok + ko + skip_encrypted + skip_timeout + 1
print(f"\n --- Progression: {i}/{total} | OK: {ok} | "
f"Erreurs: {ko} | Timeout: {skip_timeout} | " if status == "OK":
f"Temps restant estimé: {remaining/60:.0f}min ---\n", flush=True) ok += 1
rate = ok / elapsed * 3600 if elapsed > 0 else 0
print(f"[{done}/{total}] {name} OK ({rate:.0f}/h)", flush=True)
elif status == "TIMEOUT":
skip_timeout += 1
print(f"[{done}/{total}] {name} TIMEOUT", flush=True)
elif status == "SKIP":
skip_encrypted += 1
print(f"[{done}/{total}] {name} SKIP (chiffré)", flush=True)
else:
ko += 1
err_msg = result[4] if len(result) > 4 else "?"
print(f"[{done}/{total}] {name} ERREUR: {err_msg}", flush=True)
# Rapport intermédiaire toutes les 50 fichiers
if done % 50 == 0:
remaining = (elapsed / done) * (total - done)
print(f"\n --- Progression: {done}/{total} | OK: {ok} | "
f"Erreurs: {ko} | Timeout: {skip_timeout} | "
f"Débit: {ok/elapsed*3600:.0f}/h | "
f"Restant: {remaining/60:.0f}min ---\n", flush=True)
elapsed = time.time() - t0 elapsed = time.time() - t0
total_pseudo = len(list(OUTDIR.glob("*.pseudonymise.txt")))
print(f"\n{'='*60}") print(f"\n{'='*60}")
print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)") print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)")
print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}") print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}")
print(f"Total .pseudonymise.txt: {len(list(OUTDIR.glob('*.pseudonymise.txt')))}") print(f"Total .pseudonymise.txt: {total_pseudo}")
print(f"Débit moyen: {ok/elapsed*3600:.0f} fichiers/h")
print(f"Sortie: {OUTDIR}") print(f"Sortie: {OUTDIR}")


@@ -190,6 +190,93 @@ def main():
     out.write_text("\n".join(sorted(phones)) + "\n", encoding="utf-8")
     print(f"{out.name}: {len(phones)} entrées")
 
+    # 6. Adresses FINESS (type_voie + nom_voie) pour Aho-Corasick
+    # Mapping des codes type_voie FINESS vers formes étendues
+    TYPE_VOIE_MAP = {
+        "AV": "avenue", "R": "rue", "BD": "boulevard", "RTE": "route",
+        "CHE": "chemin", "PL": "place", "IMP": "impasse", "ALL": "allee",
+        "SQ": "square", "PASS": "passage", "QU": "quai", "CRS": "cours",
+        "SEN": "sentier", "RPT": "rond-point", "LD": "lieu-dit",
+        "HAM": "hameau", "LOT": "lotissement", "TSSE": "traverse",
+        "CHEM": "chemin", "RES": "residence", "CTRE": "centre",
+        "ESP": "esplanade", "PRO": "promenade", "MTE": "montee",
+        "VOI": "voie", "CAR": "carrefour", "FBG": "faubourg",
+    }
+
+    # Charger les prénoms INSEE pour générer des variantes abrégées
+    prenoms_path = Path(__file__).parent.parent / "data" / "insee" / "prenoms_france.txt"
+    prenoms_set = set()
+    if prenoms_path.exists():
+        for line in prenoms_path.read_text(encoding="utf-8").splitlines():
+            p = line.strip().lower()
+            if p and len(p) >= 3:
+                prenoms_set.add(p)
+    print(f"  Prénoms INSEE chargés: {len(prenoms_set)}")
+
+    VOIE_GENERIC = {
+        "de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux",
+        "a", "sur", "sous", "par", "pour", "dans", "rue", "avenue", "boulevard",
+        "route", "chemin", "place", "impasse", "square", "passage", "quai", "cours",
+        "grande", "grand", "petit", "petite", "vieux", "vieille", "nouveau", "nouvelle",
+        "haut", "haute", "bas", "basse",
+    }
+
+    addr_patterns = set()
+
+    def _add_with_abbrev(pattern: str):
+        """Ajoute le pattern + variantes avec prénoms abrégés (initiale seule)."""
+        addr_patterns.add(pattern)
+        words = pattern.split()
+        for i, w in enumerate(words):
+            if w in prenoms_set and len(w) >= 3:
+                # Variante avec initiale seule — seulement si un mot distinctif suit
+                remaining = words[i+1:]
+                if not remaining or all(len(r) <= 2 or r in VOIE_GENERIC for r in remaining):
+                    continue  # Pas d'abréviation si rien de distinctif après
+                abbrev_words = words[:i] + [w[0]] + words[i+1:]
+                abbrev = " ".join(abbrev_words)
+                # Minimum 12 chars, et le pattern ne doit pas commencer par une initiale seule
+                if len(abbrev) >= 12 and len(abbrev_words[0]) >= 2:
+                    addr_patterns.add(abbrev)
+
+    with open(csv_path, encoding="utf-8") as f:
+        reader = csv.reader(f, delimiter=";")
+        next(reader)
+        for row in reader:
+            if len(row) < 10:
+                continue
+            type_voie_raw = row[8].strip() if len(row) > 8 else ""
+            nom_voie = row[9].strip() if len(row) > 9 else ""
+            if not nom_voie or len(nom_voie) < 3:
+                continue
+            nom_norm = normalize(nom_voie)
+            words = nom_norm.split()
+
+            # Pattern complet : type_voie + nom_voie (ex: "avenue de l interne jacques loeb")
+            type_voie_expanded = TYPE_VOIE_MAP.get(type_voie_raw.upper(), type_voie_raw.lower())
+            if type_voie_expanded and nom_norm:
+                full = f"{type_voie_expanded} {nom_norm}"
+                full_words = full.split()
+                has_distinctive = any(
+                    w not in VOIE_GENERIC and len(w) >= 4 for w in full_words
+                )
+                if has_distinctive and len(full) >= 12:
+                    _add_with_abbrev(full)
+
+            # Pattern nom_voie seul (seulement si très distinctif)
+            has_distinctive = any(w not in VOIE_GENERIC and len(w) >= 4 for w in words)
+            if has_distinctive and len(nom_norm) >= 15:
+                _add_with_abbrev(nom_norm)
+
+    out = OUT_DIR / "adresses_finess.txt"
+    out.write_text("\n".join(sorted(addr_patterns)) + "\n", encoding="utf-8")
+    print(f"\n{out.name}: {len(addr_patterns)} entrées")
+
+    # Garder aussi voies_distinctives.txt pour compatibilité
+    voie_names = {p for p in addr_patterns if len(p) >= 15}
+    out = OUT_DIR / "voies_distinctives.txt"
+    out.write_text("\n".join(sorted(voie_names)) + "\n", encoding="utf-8")
+    print(f"{out.name}: {len(voie_names)} entrées")
+
     # Stats par longueur
     print(f"\nDistribution noms distinctifs par longueur (mots):")
     word_counts = Counter(len(n.split()) for n in filtered_distinctive)
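As a concrete trace of _add_with_abbrev above, assuming "jacques" appears in prenoms_france.txt:

    _add_with_abbrev("avenue de l interne jacques loeb")
    # addr_patterns now contains both:
    #   "avenue de l interne jacques loeb"
    #   "avenue de l interne j loeb"
    # The abbreviated variant is emitted because the distinctive word "loeb"
    # follows the first name; a pattern ending at "jacques" would get no variant.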


@@ -300,7 +300,7 @@ def check_fp_density(text: str) -> dict:
         "density_pct": round(density, 2),
         "nom_count": nom_count,
         "nom_pct": round(nom_pct, 2),
-        "alert": nom_pct > 5.0,
+        "alert": nom_pct > 8.0,  # seuil relevé : CRO/CRH courts listent 8-10 soignants = légitime
     }
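A worked example of the raised threshold, assuming nom_pct is 100 * nom_count / token_count (the exact formula is not shown in this hunk):

    # Short CRO listing 8 clinician name tokens out of 120 tokens:
    nom_pct = 100 * 8 / 120    # 6.67 -> alerted at 5.0, passes at 8.0
    # The same note with 10 name tokens:
    nom_pct = 100 * 10 / 120   # 8.33 -> still alerts at 8.0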


@@ -210,17 +210,34 @@ async def anonymize_text(
         final_text = selective_rescan(final_text, cfg=cfg)
 
     elapsed = time.time() - t0
-    audit_list = [
-        {"kind": h.kind, "original": h.original, "placeholder": h.placeholder, "page": h.page}
-        for h in anon.audit
-        if h.page != -1  # exclure les propagations globales
-    ]
+
+    # Inclure tous les hits (regex page≥0 + NER page=-1) avec source
+    ner_prefixes = ("NER_", "EDS_")
+    audit_list = []
+    ner_count = 0
+    regex_count = 0
+    for h in anon.audit:
+        is_ner = h.kind.startswith(ner_prefixes) or h.page == -1
+        entry = {
+            "kind": h.kind,
+            "original": h.original,
+            "placeholder": h.placeholder,
+            "page": h.page,
+            "source": "ner" if is_ner else "regex",
+        }
+        audit_list.append(entry)
+        if is_ner:
+            ner_count += 1
+        else:
+            regex_count += 1
 
     return {
         "text_anonymized": final_text,
         "audit": audit_list,
         "stats": {
             "pii_detected": len(audit_list),
+            "regex_count": regex_count,
+            "ner_count": ner_count,
             "elapsed_seconds": round(elapsed, 3),
             "ner_active": use_ner and _eds_manager is not None,
         },
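With this hunk applied, a response carries the per-hit source plus the regex/ner split. The shape is sketched below; only the keys come from the diff, while the kind values, placeholder format, and numbers are illustrative assumptions.

    response = {
        "text_anonymized": "...",
        "audit": [
            {"kind": "TEL", "original": "05 59 44 35 35",
             "placeholder": "[TEL_1]", "page": 0, "source": "regex"},
            {"kind": "EDS_NOM", "original": "Loeb",
             "placeholder": "[NOM_1]", "page": -1, "source": "ner"},
        ],
        "stats": {"pii_detected": 2, "regex_count": 1, "ner_count": 1,
                  "elapsed_seconds": 0.412, "ner_active": True},
    }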