feat: FP reduction + FINESS address gazetteers + parallel batch + multi-axis fixes

- Minimum token length raised from 2-3 to 4 chars (eliminates FPs such as EPO, IRC, SIB...)
- Enriched stop-words: 3-letter medical acronyms, pharma terms, nursing vocabulary
- BDPM stop-words: ~7,300 brand names + INNs/active substances
- FINESS address gazetteers: 63K Aho-Corasick patterns with position-preserving
  normalization (see the sketch after this list)
- Anatomical context filter for FINESS facilities
- New regexes: RE_CIVILITE_COMMA_LIST, RE_EXTRACT_NOM_UTILISE, RE_EXTRACT_PRENOM,
  RE_NUM_EXAMEN_PATIENT, RE_ADRESSE_LIEU_DIT, RE_CIVILITE_INITIALE, Dr X.NOM
- Full URLs (RE_URL) + multiline detection
- Inverted visit numbers (layout-aware) + EPISODE/NDA added to _CRITICAL_PII_TYPES
- HospitalFilter disabled for ADRESSE/TEL/VILLE/EPISODE (these identify the patient)
- Silver-export batch parallelized (multiprocessing spawn, N workers)
- Over-masking alert threshold raised to 8%; server.py audit enriched (regex/ner source)
- City blacklist: COURANT, PARIS; city-context list extended (UHCD, specialties)
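What "position-preserving normalization" means for the gazetteer matching, sketched minimally: match on a lowercased, accent-stripped copy of the text while mapping every normalized character back to its original offset, so masks land on the raw document. A plain str.find stands in for the 63K-pattern Aho-Corasick automaton, and the repo's real normalize() presumably also folds punctuation; this sketch only handles case and accents.

    import unicodedata

    def normalize_with_map(text):
        """Lowercase + strip accents, keeping normalized->original offsets."""
        chars, offsets = [], []
        for i, ch in enumerate(text):
            for sub in unicodedata.normalize("NFKD", ch.lower()):
                if unicodedata.combining(sub):
                    continue  # drop the combining accent, keep the base letter
                chars.append(sub)
                offsets.append(i)  # each normalized char remembers its source index
        return "".join(chars), offsets

    def find_gazetteer_hits(text, pattern):
        """Spans in the ORIGINAL text; str.find stands in for the automaton."""
        norm, offsets = normalize_with_map(text)
        hits, pos = [], norm.find(pattern)
        while pos != -1:
            hits.append((offsets[pos], offsets[pos + len(pattern) - 1] + 1))
            pos = norm.find(pattern, pos + 1)
        return hits

    print(find_gazetteer_hits("13, Avenue de l'Interne J. LOEB", "avenue de l"))
    # [(4, 15)] -> the span of "Avenue de l" in the raw text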

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
commit 49ff464e6e
parent a827d860f1
2026-03-16 09:26:56 +01:00
18 changed files with 358579 additions and 232 deletions

File diff suppressed because it is too large


@@ -24,9 +24,14 @@ blacklist:
 - '640780417'
 - 'Dates du séjour :'
 - CONCERTATION
+- BAYONNE CEDEX
+- BAYONNE
+- '64109'
+- LABORATOIRE de BIOLOGIE MEDICALE
 force_mask_regex:
 - 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque'
 - 'Polyclinique\s+C[oôÔ]te\s+Basque\s+Sud'
+- '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+'
 kv_labels_preserve:
 - FINESS
 - IPP
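For orientation, a minimal sketch of how blacklist and force_mask_regex entries like these are typically consumed downstream; the function name and the [MASQUE] placeholder are hypothetical, not the repo's actual API.

    import re

    BLACKLIST = ["BAYONNE CEDEX", "BAYONNE", "64109",
                 "LABORATOIRE de BIOLOGIE MEDICALE"]
    FORCE_MASK = [re.compile(p, re.IGNORECASE) for p in (
        r"Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque",
        r"13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+",
    )]

    def apply_config_masks(text: str) -> str:
        # Longest terms first so "BAYONNE CEDEX" wins over "BAYONNE"
        for term in sorted(BLACKLIST, key=len, reverse=True):
            text = re.sub(re.escape(term), "[MASQUE]", text, flags=re.IGNORECASE)
        for rx in FORCE_MASK:
            text = rx.sub("[MASQUE]", text)
        return text

    print(apply_config_masks("CH 64109 BAYONNE CEDEX"))  # CH [MASQUE] [MASQUE]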

data/bdpm/CIS_bdpm.txt (new file, 15816 lines)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

data/insee/noms2008nat_txt.txt (new file, 218984 lines)

File diff suppressed because it is too large


@@ -1184,8 +1184,8 @@ déglobulisation. O
 Bladder O
 négatif. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 antalgique O
 : O
 Faux B-VILLE
@@ -1515,8 +1515,8 @@ cette O
 patiente O
 altérée O
 sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 général, O
 OMS2/3. O
 > O
@@ -1529,8 +1529,8 @@ du O
 traitement O
 antalgique. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 infectieux O
 : O
 Pic O
@@ -2817,8 +2817,8 @@ apyrexie O
 au O
 décours. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 urologique O
 : O
 Un O
@@ -2919,8 +2919,8 @@ oncologique O
 Nette O
 amélioration O
 sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 général O
 avec O
 la O


@@ -2572,8 +2572,8 @@ de O
 traitement O
 antibiotique O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 hématologique O
 Anémie O
 autour O


@@ -1812,8 +1812,8 @@ de O
 cette O
 décision. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 hématologique: O
 Elle O
 présente O


@@ -1420,8 +1420,8 @@ en O
 charge O
 antalgique. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 de O
 la O
 gravité: O


@@ -1102,8 +1102,8 @@ de O
 l'épisode O
 aigüe. O
 Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
 infectieux, O
 présence O
 de O
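Every "Sur le plan ..." hunk above applies the same mechanical retag to the BIO training files: the determiner "le" is pulled into the VILLE span, so "le plan" becomes one two-token entity instead of "plan" alone. A minimal sketch of that pass over these token/TAG lines (hypothetical helper, not the repo's actual script):

    def extend_ville_over_determiner(lines):
        """Retag 'le O' + 'plan B-VILLE' as 'le B-VILLE' + 'plan I-VILLE'."""
        rows = [line.rsplit(" ", 1) for line in lines]  # [token, tag]
        for prev, cur in zip(rows, rows[1:]):
            if prev[1] == "O" and prev[0].lower() == "le" \
                    and cur[0] == "plan" and cur[1] == "B-VILLE":
                prev[1] = "B-VILLE"
                cur[1] = "I-VILLE"
        return [" ".join(r) for r in rows]

    print(extend_ville_over_determiner(
        ["Sur O", "le O", "plan B-VILLE", "infectieux O"]))
    # ['Sur O', 'le B-VILLE', 'plan I-VILLE', 'infectieux O']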


@@ -166,23 +166,12 @@ class HospitalFilter:
         Returns:
             True si la détection doit être filtrée (faux positif)
         """
-        # Filtrer par type
-        if pii_type == "ADRESSE":
-            return self.is_hospital_address(text)
-        elif pii_type == "CODE_POSTAL":
-            return self.is_hospital_postal_code(text)
-        elif pii_type == "VILLE":
-            return self.is_hospital_city(text)
-        elif pii_type == "TEL":
-            return self.is_hospital_phone(text)
-        elif pii_type == "EPISODE":
-            # Filtrer les épisodes qui proviennent du nom de fichier
-            # (répétés dans les en-têtes/pieds de page des documents trackare)
-            return self.is_episode_in_filename(text, filename)
+        # ADRESSE, CODE_POSTAL, VILLE, TEL : NE PAS filtrer.
+        # Les coordonnées hospitalières identifient indirectement le patient
+        # et doivent être masquées (validé par contrôle humain 2026-03-12).
+        # EPISODE : NE PAS filtrer.
+        # Les numéros d'épisode identifient le patient (validé 2026-03-14).
         return False
@@ -222,15 +211,17 @@ if __name__ == "__main__":
     # Tests
     test_cases = [
-        ("ADRESSE", "13, Avenue de l'Interne J", "", -1, True),
+        # ADRESSE, CODE_POSTAL, VILLE, TEL : ne sont plus filtrés (identifient le patient)
+        ("ADRESSE", "13, Avenue de l'Interne J", "", -1, False),
         ("ADRESSE", "22 LOT MENDI ALDE", "", -1, False),
-        ("CODE_POSTAL", "64109 BAYONNE CEDEX", "", -1, True),
+        ("CODE_POSTAL", "64109 BAYONNE CEDEX", "", -1, False),
         ("CODE_POSTAL", "64130", "", -1, False),
-        ("VILLE", "BAYONNE CEDEX", "", -1, True),
+        ("VILLE", "BAYONNE CEDEX", "", -1, False),
         ("VILLE", "CHERAUTE", "", -1, False),
-        ("VILLE", "DROIT", "", -1, True),  # Terme anatomique
+        ("VILLE", "DROIT", "", -1, False),
-        ("TEL", "05 59 44 35 35", "", -1, True),
+        ("TEL", "05 59 44 35 35", "", -1, False),
         ("TEL", "0676085336", "", -1, False),
+        # EPISODE : filtré uniquement si provient du nom de fichier trackare
         ("EPISODE", "23202435", "trackare-14004105-23202435", -1, True),
         ("EPISODE", "23102610", "CRH_23102610", 0, False),
     ]
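The tuple layout reads (pii_type, text, filename, page, expected). A hedged sketch of the runner presumably sitting below this list; the HospitalFilter constructor and the keyword names (filename=, page=) are assumptions read off the calls shown in the diff, not confirmed signatures.

    hf = HospitalFilter()
    failures = 0
    for pii_type, text, filename, page, expected in test_cases:
        got = hf.should_filter(pii_type, text, filename=filename, page=page)
        failures += got != expected
        print(("OK  " if got == expected else "FAIL"),
              pii_type, repr(text), "->", got, f"(attendu {expected})")
    print(f"{failures} échec(s)")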


@@ -1,18 +1,18 @@
{ {
"date": "2026-03-12T10:24:59.261417", "date": "2026-03-12T17:16:25.993851",
"scores": { "scores": {
"global_score": 97.0, "global_score": 97.0,
"leak_score": 100.0, "leak_score": 100.0,
"fp_score": 90, "fp_score": 90,
"totals": { "totals": {
"documents": 29, "documents": 29,
"audit_hits": 2797, "audit_hits": 3186,
"name_tokens_known": 461, "name_tokens_known": 457,
"leak_audit": 0, "leak_audit": 0,
"leak_occurrences": 0, "leak_occurrences": 0,
"leak_regex": 0, "leak_regex": 0,
"leak_insee_high": 0, "leak_insee_high": 0,
"leak_insee_medium": 569, "leak_insee_medium": 570,
"fp_medical": 0, "fp_medical": 0,
"fp_overmasking": 2 "fp_overmasking": 2
} }
@@ -110,7 +110,7 @@
"leak_audit": 0, "leak_audit": 0,
"leak_regex": 0, "leak_regex": 0,
"leak_insee_high": 0, "leak_insee_high": 0,
"leak_insee_medium": 23, "leak_insee_medium": 24,
"fp_medical": 0, "fp_medical": 0,
"fp_overmasking": 0 "fp_overmasking": 0
}, },
@@ -206,7 +206,7 @@
"leak_audit": 0, "leak_audit": 0,
"leak_regex": 0, "leak_regex": 0,
"leak_insee_high": 0, "leak_insee_high": 0,
"leak_insee_medium": 32, "leak_insee_medium": 33,
"fp_medical": 0, "fp_medical": 0,
"fp_overmasking": 0 "fp_overmasking": 0
}, },
@@ -222,7 +222,7 @@
"leak_audit": 0, "leak_audit": 0,
"leak_regex": 0, "leak_regex": 0,
"leak_insee_high": 0, "leak_insee_high": 0,
"leak_insee_medium": 34, "leak_insee_medium": 32,
"fp_medical": 0, "fp_medical": 0,
"fp_overmasking": 0 "fp_overmasking": 0
}, },
@@ -246,7 +246,7 @@
"leak_audit": 0, "leak_audit": 0,
"leak_regex": 0, "leak_regex": 0,
"leak_insee_high": 0, "leak_insee_high": 0,
"leak_insee_medium": 26, "leak_insee_medium": 27,
"fp_medical": 0, "fp_medical": 0,
"fp_overmasking": 0 "fp_overmasking": 0
} }


@@ -1,27 +1,24 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Batch anonymisation de PDFs pour enrichir le dataset silver. """Batch anonymisation parallèle de PDFs pour enrichir le dataset silver.
Traite TOUS les PDFs disponibles (excluant ceux déjà dans audit_30) en mode CPU Traite TOUS les PDFs disponibles en mode CPU (sans VLM), avec N workers
uniquement (sans VLM) pour générer des .pseudonymise.txt utilisables par parallèles. Chaque worker charge ses propres modèles NER.
export_silver_annotations.py.
Timeout par fichier pour éviter les blocages sur les gros documents.
Reprend automatiquement là où il s'est arrêté (skip les déjà traités). Reprend automatiquement là où il s'est arrêté (skip les déjà traités).
Usage:
python run_batch_silver_export.py # 6 workers (défaut)
python run_batch_silver_export.py --workers 4 # 4 workers
""" """
import sys import sys
import os
import time import time
import signal import argparse
import random import multiprocessing as mp
from pathlib import Path from pathlib import Path
from collections import Counter
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager
from gliner_manager import GlinerManager
from camembert_ner_manager import CamembertNerManager
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTDIR = SRC / "anonymise_silver_extra" OUTDIR = SRC / "anonymise_silver_extra"
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml") CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
@@ -62,16 +59,102 @@ ALREADY_DONE_AUDIT30 = {
TIMEOUT_PER_FILE = 120 # secondes max par PDF TIMEOUT_PER_FILE = 120 # secondes max par PDF
# Variables globales par worker (initialisées une seule fois)
class TimeoutError(Exception): _worker_ner = None
pass _worker_gliner = None
_worker_camembert = None
_worker_id = None
def timeout_handler(signum, frame): def init_worker(worker_id):
raise TimeoutError("Timeout") """Initialise les modèles NER dans chaque worker (appelé une seule fois)."""
global _worker_ner, _worker_gliner, _worker_camembert, _worker_id
_worker_id = worker_id
# Limiter les threads ONNX/OpenMP par worker pour éviter la contention
n_threads = max(2, 32 // (mp.cpu_count() // 2)) # répartir équitablement
os.environ["OMP_NUM_THREADS"] = str(n_threads)
os.environ["MKL_NUM_THREADS"] = str(n_threads)
import anonymizer_core_refactored_onnx as core # noqa: F401
from eds_pseudo_manager import EdsPseudoManager
from gliner_manager import GlinerManager
from camembert_ner_manager import CamembertNerManager
_worker_ner = EdsPseudoManager()
_worker_ner.load()
print(f" [W{worker_id}] EDS-Pseudo chargé", flush=True)
_worker_gliner = GlinerManager()
try:
_worker_gliner.load()
print(f" [W{worker_id}] GLiNER chargé", flush=True)
except Exception as e:
print(f" [W{worker_id}] GLiNER indisponible ({e})", flush=True)
_worker_gliner = None
_worker_camembert = CamembertNerManager()
try:
_worker_camembert.load()
print(f" [W{worker_id}] CamemBERT-bio chargé", flush=True)
except Exception as e:
print(f" [W{worker_id}] CamemBERT-bio indisponible ({e})", flush=True)
_worker_camembert = None
print(f" [W{worker_id}] Prêt (threads={n_threads})", flush=True)
def process_one_pdf(args):
"""Traite un seul PDF. Appelé par le pool de workers."""
pdf_path, idx, total = args
import signal
import anonymizer_core_refactored_onnx as core
ogc = pdf_path.parent.name.split("_")[0]
# Timeout via alarm
def _timeout_handler(signum, frame):
raise TimeoutError("Timeout")
signal.signal(signal.SIGALRM, _timeout_handler)
signal.alarm(TIMEOUT_PER_FILE)
try:
core.process_pdf(
pdf_path=pdf_path,
out_dir=OUTDIR,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=CONFIG,
use_hf=True,
ner_manager=_worker_ner,
ner_thresholds=None,
ogc_label=ogc,
vlm_manager=None,
gliner_manager=_worker_gliner,
camembert_manager=_worker_camembert,
)
signal.alarm(0)
return ("OK", pdf_path.name, idx, total)
except TimeoutError:
signal.alarm(0)
return ("TIMEOUT", pdf_path.name, idx, total)
except Exception as e:
signal.alarm(0)
err = str(e)
if "encrypted" in err.lower() or "password" in err.lower():
return ("SKIP", pdf_path.name, idx, total)
return ("ERROR", pdf_path.name, idx, total, str(e)[:100])
def main(): def main():
parser = argparse.ArgumentParser(description="Batch silver export parallèle")
parser.add_argument("--workers", type=int, default=6,
help="Nombre de workers parallèles (défaut: 6)")
args = parser.parse_args()
n_workers = args.workers
# Collecter tous les PDFs disponibles (excluant audit_30) # Collecter tous les PDFs disponibles (excluant audit_30)
all_pdfs = [] all_pdfs = []
for ogc_dir in sorted(SRC.iterdir()): for ogc_dir in sorted(SRC.iterdir()):
@@ -81,7 +164,6 @@ def main():
if pdf.name not in ALREADY_DONE_AUDIT30: if pdf.name not in ALREADY_DONE_AUDIT30:
all_pdfs.append(pdf) all_pdfs.append(pdf)
# Trier par OGC pour reproductibilité
all_pdfs.sort(key=lambda p: (p.parent.name, p.name)) all_pdfs.sort(key=lambda p: (p.parent.name, p.name))
# Détecter les fichiers déjà traités (reprise) # Détecter les fichiers déjà traités (reprise)
@@ -95,96 +177,73 @@ def main():
print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)") print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)")
print(f"Déjà traités: {len(already_done)}") print(f"Déjà traités: {len(already_done)}")
print(f"Restant: {len(pdfs_to_do)}") print(f"Restant: {len(pdfs_to_do)}")
print(f"Workers: {n_workers}")
print(f"RAM par worker: ~4 Go (NER models)")
print(f"RAM totale estimée: ~{n_workers * 4} Go\n")
if not pdfs_to_do: if not pdfs_to_do:
print("Rien à faire.") print("Rien à faire.")
return return
# Chargement des modèles NER (CPU uniquement, pas de VLM) # Préparer les arguments : (pdf_path, index, total)
print("\nChargement EDS-Pseudo...", flush=True) tasks = [(pdf, i, len(pdfs_to_do)) for i, pdf in enumerate(pdfs_to_do, 1)]
ner = EdsPseudoManager()
ner.load()
assert ner.is_loaded(), "EDS-Pseudo non chargé"
print("EDS-Pseudo chargé.", flush=True)
print("Chargement GLiNER...", flush=True) print(f"Chargement des modèles dans {n_workers} workers...", flush=True)
gliner = GlinerManager()
try:
gliner.load()
print("GLiNER chargé.", flush=True)
except Exception as e:
print(f"GLiNER indisponible ({e}), on continue sans.", flush=True)
gliner = None
print("Chargement CamemBERT-bio ONNX...", flush=True) # Créer le pool avec initialisation des modèles par worker
camembert = CamembertNerManager() # On utilise mp.Pool avec initializer pour charger les modèles une seule fois
try: # Note: fork + ONNX peut poser problème, on utilise 'spawn'
camembert.load() ctx = mp.get_context("spawn")
print("CamemBERT-bio ONNX chargé.", flush=True)
except Exception as e:
print(f"CamemBERT-bio indisponible ({e}), on continue sans.", flush=True)
camembert = None
print(f"\nPas de VLM (CPU only pour silver export).\n", flush=True)
ok = ko = skip_encrypted = skip_timeout = 0 ok = ko = skip_encrypted = skip_timeout = 0
t0 = time.time() t0 = time.time()
total = len(pdfs_to_do)
for i, pdf in enumerate(pdfs_to_do, 1): # Lancer les workers séquentiellement pour l'init (éviter pic mémoire)
ogc = pdf.parent.name.split("_")[0] # puis traiter en parallèle
print(f"[{i}/{total}] {pdf.name} (OGC {ogc})...", end=" ", flush=True) with ctx.Pool(
processes=n_workers,
initializer=init_worker,
initargs=(0,), # worker_id simplifié
) as pool:
for result in pool.imap_unordered(process_one_pdf, tasks, chunksize=1):
status = result[0]
name = result[1]
idx = result[2]
total = result[3]
# Timeout par fichier
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(TIMEOUT_PER_FILE)
try:
core.process_pdf(
pdf_path=pdf,
out_dir=OUTDIR,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=CONFIG,
use_hf=True,
ner_manager=ner,
ner_thresholds=None,
ogc_label=ogc,
vlm_manager=None,
gliner_manager=gliner,
camembert_manager=camembert,
)
signal.alarm(0)
elapsed_file = time.time() - t0
rate = ok / elapsed_file * 3600 if elapsed_file > 0 and ok > 0 else 0
print(f"OK ({rate:.0f}/h)", flush=True)
ok += 1
except TimeoutError:
signal.alarm(0)
print(f"TIMEOUT ({TIMEOUT_PER_FILE}s)", flush=True)
skip_timeout += 1
except Exception as e:
signal.alarm(0)
err = str(e)
if "encrypted" in err.lower() or "password" in err.lower():
print("SKIP (chiffré)", flush=True)
skip_encrypted += 1
else:
print(f"ERREUR: {e}", flush=True)
ko += 1
# Rapport intermédiaire toutes les 50 fichiers
if i % 50 == 0:
elapsed = time.time() - t0 elapsed = time.time() - t0
remaining = (elapsed / i) * (total - i) done = ok + ko + skip_encrypted + skip_timeout + 1
print(f"\n --- Progression: {i}/{total} | OK: {ok} | "
f"Erreurs: {ko} | Timeout: {skip_timeout} | " if status == "OK":
f"Temps restant estimé: {remaining/60:.0f}min ---\n", flush=True) ok += 1
rate = ok / elapsed * 3600 if elapsed > 0 else 0
print(f"[{done}/{total}] {name} OK ({rate:.0f}/h)", flush=True)
elif status == "TIMEOUT":
skip_timeout += 1
print(f"[{done}/{total}] {name} TIMEOUT", flush=True)
elif status == "SKIP":
skip_encrypted += 1
print(f"[{done}/{total}] {name} SKIP (chiffré)", flush=True)
else:
ko += 1
err_msg = result[4] if len(result) > 4 else "?"
print(f"[{done}/{total}] {name} ERREUR: {err_msg}", flush=True)
# Rapport intermédiaire toutes les 50 fichiers
if done % 50 == 0:
remaining = (elapsed / done) * (total - done)
print(f"\n --- Progression: {done}/{total} | OK: {ok} | "
f"Erreurs: {ko} | Timeout: {skip_timeout} | "
f"Débit: {ok/elapsed*3600:.0f}/h | "
f"Restant: {remaining/60:.0f}min ---\n", flush=True)
elapsed = time.time() - t0 elapsed = time.time() - t0
total_pseudo = len(list(OUTDIR.glob("*.pseudonymise.txt")))
print(f"\n{'='*60}") print(f"\n{'='*60}")
print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)") print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)")
print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}") print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}")
print(f"Total .pseudonymise.txt: {len(list(OUTDIR.glob('*.pseudonymise.txt')))}") print(f"Total .pseudonymise.txt: {total_pseudo}")
print(f"Débit moyen: {ok/elapsed*3600:.0f} fichiers/h")
print(f"Sortie: {OUTDIR}") print(f"Sortie: {OUTDIR}")


@@ -190,6 +190,93 @@ def main():
     out.write_text("\n".join(sorted(phones)) + "\n", encoding="utf-8")
     print(f"{out.name}: {len(phones)} entrées")
 
+    # 6. Adresses FINESS (type_voie + nom_voie) pour Aho-Corasick
+    # Mapping des codes type_voie FINESS vers formes étendues
+    TYPE_VOIE_MAP = {
+        "AV": "avenue", "R": "rue", "BD": "boulevard", "RTE": "route",
+        "CHE": "chemin", "PL": "place", "IMP": "impasse", "ALL": "allee",
+        "SQ": "square", "PASS": "passage", "QU": "quai", "CRS": "cours",
+        "SEN": "sentier", "RPT": "rond-point", "LD": "lieu-dit",
+        "HAM": "hameau", "LOT": "lotissement", "TSSE": "traverse",
+        "CHEM": "chemin", "RES": "residence", "CTRE": "centre",
+        "ESP": "esplanade", "PRO": "promenade", "MTE": "montee",
+        "VOI": "voie", "CAR": "carrefour", "FBG": "faubourg",
+    }
+
+    # Charger les prénoms INSEE pour générer des variantes abrégées
+    prenoms_path = Path(__file__).parent.parent / "data" / "insee" / "prenoms_france.txt"
+    prenoms_set = set()
+    if prenoms_path.exists():
+        for line in prenoms_path.read_text(encoding="utf-8").splitlines():
+            p = line.strip().lower()
+            if p and len(p) >= 3:
+                prenoms_set.add(p)
+    print(f"  Prénoms INSEE chargés: {len(prenoms_set)}")
+
+    VOIE_GENERIC = {
+        "de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux",
+        "a", "sur", "sous", "par", "pour", "dans", "rue", "avenue", "boulevard",
+        "route", "chemin", "place", "impasse", "square", "passage", "quai", "cours",
+        "grande", "grand", "petit", "petite", "vieux", "vieille", "nouveau", "nouvelle",
+        "haut", "haute", "bas", "basse",
+    }
+
+    addr_patterns = set()
+
+    def _add_with_abbrev(pattern: str):
+        """Ajoute le pattern + variantes avec prénoms abrégés (initiale seule)."""
+        addr_patterns.add(pattern)
+        words = pattern.split()
+        for i, w in enumerate(words):
+            if w in prenoms_set and len(w) >= 3:
+                # Variante avec initiale seule — seulement si un mot distinctif suit
+                remaining = words[i+1:]
+                if not remaining or all(len(r) <= 2 or r in VOIE_GENERIC for r in remaining):
+                    continue  # Pas d'abréviation si rien de distinctif après
+                abbrev_words = words[:i] + [w[0]] + words[i+1:]
+                abbrev = " ".join(abbrev_words)
+                # Minimum 12 chars, et le pattern ne doit pas commencer par une initiale seule
+                if len(abbrev) >= 12 and len(abbrev_words[0]) >= 2:
+                    addr_patterns.add(abbrev)
+
+    with open(csv_path, encoding="utf-8") as f:
+        reader = csv.reader(f, delimiter=";")
+        next(reader)
+        for row in reader:
+            if len(row) < 10:
+                continue
+            type_voie_raw = row[8].strip() if len(row) > 8 else ""
+            nom_voie = row[9].strip() if len(row) > 9 else ""
+            if not nom_voie or len(nom_voie) < 3:
+                continue
+            nom_norm = normalize(nom_voie)
+            words = nom_norm.split()
+
+            # Pattern complet : type_voie + nom_voie (ex: "avenue de l interne jacques loeb")
+            type_voie_expanded = TYPE_VOIE_MAP.get(type_voie_raw.upper(), type_voie_raw.lower())
+            if type_voie_expanded and nom_norm:
+                full = f"{type_voie_expanded} {nom_norm}"
+                full_words = full.split()
+                has_distinctive = any(
+                    w not in VOIE_GENERIC and len(w) >= 4 for w in full_words
+                )
+                if has_distinctive and len(full) >= 12:
+                    _add_with_abbrev(full)
+
+            # Pattern nom_voie seul (seulement si très distinctif)
+            has_distinctive = any(w not in VOIE_GENERIC and len(w) >= 4 for w in words)
+            if has_distinctive and len(nom_norm) >= 15:
+                _add_with_abbrev(nom_norm)
+
+    out = OUT_DIR / "adresses_finess.txt"
+    out.write_text("\n".join(sorted(addr_patterns)) + "\n", encoding="utf-8")
+    print(f"\n{out.name}: {len(addr_patterns)} entrées")
+
+    # Garder aussi voies_distinctives.txt pour compatibilité
+    voie_names = {p for p in addr_patterns if len(p) >= 15}
+    out = OUT_DIR / "voies_distinctives.txt"
+    out.write_text("\n".join(sorted(voie_names)) + "\n", encoding="utf-8")
+    print(f"{out.name}: {len(voie_names)} entrées")
+
     # Stats par longueur
     print(f"\nDistribution noms distinctifs par longueur (mots):")
     word_counts = Counter(len(n.split()) for n in filtered_distinctive)
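As a concrete trace of _add_with_abbrev above, assuming "jacques" appears in prenoms_france.txt:

    _add_with_abbrev("avenue de l interne jacques loeb")
    # addr_patterns now contains both:
    #   "avenue de l interne jacques loeb"
    #   "avenue de l interne j loeb"
    # The abbreviated variant is emitted because the distinctive word "loeb"
    # follows the first name; a pattern ending at "jacques" would get no variant.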


@@ -300,7 +300,7 @@ def check_fp_density(text: str) -> dict:
         "density_pct": round(density, 2),
         "nom_count": nom_count,
         "nom_pct": round(nom_pct, 2),
-        "alert": nom_pct > 5.0,
+        "alert": nom_pct > 8.0,  # seuil relevé : CRO/CRH courts listent 8-10 soignants = légitime
     }
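A worked example of the raised threshold, assuming nom_pct is 100 * nom_count / token_count (the exact formula is not shown in this hunk):

    # Short CRO listing 8 clinician name tokens out of 120 tokens:
    nom_pct = 100 * 8 / 120    # 6.67 -> alerted at 5.0, passes at 8.0
    # The same note with 10 name tokens:
    nom_pct = 100 * 10 / 120   # 8.33 -> still alerts at 8.0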


@@ -210,17 +210,34 @@ async def anonymize_text(
         final_text = selective_rescan(final_text, cfg=cfg)
 
     elapsed = time.time() - t0
-    audit_list = [
-        {"kind": h.kind, "original": h.original, "placeholder": h.placeholder, "page": h.page}
-        for h in anon.audit
-        if h.page != -1  # exclure les propagations globales
-    ]
+
+    # Inclure tous les hits (regex page≥0 + NER page=-1) avec source
+    ner_prefixes = ("NER_", "EDS_")
+    audit_list = []
+    ner_count = 0
+    regex_count = 0
+    for h in anon.audit:
+        is_ner = h.kind.startswith(ner_prefixes) or h.page == -1
+        entry = {
+            "kind": h.kind,
+            "original": h.original,
+            "placeholder": h.placeholder,
+            "page": h.page,
+            "source": "ner" if is_ner else "regex",
+        }
+        audit_list.append(entry)
+        if is_ner:
+            ner_count += 1
+        else:
+            regex_count += 1
 
     return {
         "text_anonymized": final_text,
         "audit": audit_list,
         "stats": {
             "pii_detected": len(audit_list),
+            "regex_count": regex_count,
+            "ner_count": ner_count,
             "elapsed_seconds": round(elapsed, 3),
             "ner_active": use_ner and _eds_manager is not None,
         },
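With this hunk applied, a response carries the per-hit source plus the regex/ner split. The shape is sketched below; only the keys come from the diff, while the kind values, placeholder format, and numbers are illustrative assumptions.

    response = {
        "text_anonymized": "...",
        "audit": [
            {"kind": "TEL", "original": "05 59 44 35 35",
             "placeholder": "[TEL_1]", "page": 0, "source": "regex"},
            {"kind": "EDS_NOM", "original": "Loeb",
             "placeholder": "[NOM_1]", "page": -1, "source": "ner"},
        ],
        "stats": {"pii_detected": 2, "regex_count": 1, "ner_count": 1,
                  "elapsed_seconds": 0.412, "ner_active": True},
    }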