feat: FP reduction + FINESS address gazetteers + parallel batch + multi-axis fixes

- Minimum token length raised from 2-3 to 4 chars (eliminates FPs like EPO, IRC, SIB...)
- Stop-words enriched: 3-letter medical acronyms, pharma terms, nursing-care vocabulary
- BDPM stop-words: ~7,300 brand names + INNs/active substances
- FINESS address gazetteers: 63K Aho-Corasick patterns (position-preserving normalization; see the sketch below)
- Anatomical-context filter for FINESS facility detections
- New regexes: RE_CIVILITE_COMMA_LIST, RE_EXTRACT_NOM_UTILISE, RE_EXTRACT_PRENOM,
  RE_NUM_EXAMEN_PATIENT, RE_ADRESSE_LIEU_DIT, RE_CIVILITE_INITIALE, Dr X.NOM
- Full URLs (RE_URL) + multiline detection
- Inverted visit numbers (layout-aware) + EPISODE/NDA added to _CRITICAL_PII_TYPES
- HospitalFilter disabled for ADRESSE/TEL/VILLE/EPISODE (they identify the patient)
- Batch silver export parallelized (multiprocessing spawn, N workers)
- Over-masking threshold raised to 8%; server.py enriched (regex/ner source)
- City blacklist: COURANT, PARIS; city context extended (UHCD, medical specialties)
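
The "position-preserving normalization" mentioned above is not itself visible in this diff. As a minimal sketch (hypothetical helper, assuming lowercasing, accent stripping and punctuation-to-space folding), it keeps a char-offset map so gazetteer hits found in the normalized text can be projected back onto the original spans:

import unicodedata

def normalize_with_map(text):
    """Lowercase + de-accent while recording, for each normalized char,
    the offset of the original char it came from."""
    out, omap = [], []
    for i, ch in enumerate(text):
        for sub in unicodedata.normalize("NFKD", ch.lower()):
            if unicodedata.combining(sub):
                continue  # drop the accent mark, keep the base letter
            out.append(sub if sub.isalnum() else " ")
            omap.append(i)
    return "".join(out), omap

norm, omap = normalize_with_map("13, Avenue de l'Interne J. LOEB")
# norm[4:10] == "avenue"; the matching original span is text[omap[4]:omap[9] + 1]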

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-16 09:26:56 +01:00
parent a827d860f1
commit 49ff464e6e
18 changed files with 358579 additions and 232 deletions

@@ -24,9 +24,14 @@ blacklist:
- '640780417'
- 'Dates du séjour :'
- CONCERTATION
- BAYONNE CEDEX
- BAYONNE
- '64109'
- LABORATOIRE de BIOLOGIE MEDICALE
force_mask_regex:
- 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque'
- 'Polyclinique\s+C[oôÔ]te\s+Basque\s+Sud'
- '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+'
kv_labels_preserve:
- FINESS
- IPP
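
These force_mask_regex entries are ordinary Python-style regexes. A quick standalone check of the first pattern against accented and unaccented spellings (the re.IGNORECASE flag is an assumption about how the config is compiled):

import re

pat = re.compile(r"Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque",
                 re.IGNORECASE)
print(bool(pat.search("Centre Hospitalier de la Côte Basque")))  # True
print(bool(pat.search("CENTRE HOSPITALIER COTE BASQUE")))        # True
print(bool(pat.search("Centre Hospitalier de Pau")))             # False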

data/bdpm/CIS_bdpm.txt: new file, 15,816 lines (diff suppressed: too large)

(3 more large file diffs suppressed)

data/insee/noms2008nat_txt.txt: new file, 218,984 lines (diff suppressed: too large)


@@ -1184,8 +1184,8 @@ déglobulisation. O
Bladder O
négatif. O
Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
antalgique O
: O
Faux B-VILLE
@@ -1515,8 +1515,8 @@ cette O
patiente O
altérée O
sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
général, O
OMS2/3. O
> O
@@ -1529,8 +1529,8 @@ du O
traitement O
antalgique. O
Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
infectieux O
: O
Pic O
@@ -2817,8 +2817,8 @@ apyrexie O
au O
décours. O
Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
urologique O
: O
Un O
@@ -2919,8 +2919,8 @@ oncologique O
Nette O
amélioration O
sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
général O
avec O
la O


@@ -2572,8 +2572,8 @@ de O
traitement O
antibiotique O
Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
hématologique O
Anémie O
autour O


@@ -1812,8 +1812,8 @@ de O
cette O
décision. O
Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
hématologique: O
Elle O
présente O


@@ -1420,8 +1420,8 @@ en O
charge O
antalgique. O
Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
de O
la O
gravité: O


@@ -1102,8 +1102,8 @@ de O
l'épisode O
aigüe. O
Sur O
-le O
-plan B-VILLE
+le B-VILLE
+plan I-VILLE
infectieux, O
présence O
de O


@@ -166,23 +166,12 @@ class HospitalFilter:
Returns:
True if this detection should be filtered out (a false positive)
"""
-        # Filter by type
-        if pii_type == "ADRESSE":
-            return self.is_hospital_address(text)
+        # ADRESSE, CODE_POSTAL, VILLE, TEL: do NOT filter.
+        # Hospital contact details indirectly identify the patient
+        # and must be masked (validated by human review, 2026-03-12).
-        elif pii_type == "CODE_POSTAL":
-            return self.is_hospital_postal_code(text)
-        elif pii_type == "VILLE":
-            return self.is_hospital_city(text)
-        elif pii_type == "TEL":
-            return self.is_hospital_phone(text)
-        elif pii_type == "EPISODE":
-            # Filter out episodes that come from the file name
-            # (repeated in headers/footers of trackare documents)
-            return self.is_episode_in_filename(text, filename)
+        # EPISODE: do NOT filter.
+        # Episode numbers identify the patient (validated 2026-03-14).
return False
@@ -222,15 +211,17 @@ if __name__ == "__main__":
# Tests
test_cases = [
-    ("ADRESSE", "13, Avenue de l'Interne J", "", -1, True),
+    # ADRESSE, CODE_POSTAL, VILLE, TEL: no longer filtered (they identify the patient)
+    ("ADRESSE", "13, Avenue de l'Interne J", "", -1, False),
("ADRESSE", "22 LOT MENDI ALDE", "", -1, False),
-    ("CODE_POSTAL", "64109 BAYONNE CEDEX", "", -1, True),
+    ("CODE_POSTAL", "64109 BAYONNE CEDEX", "", -1, False),
("CODE_POSTAL", "64130", "", -1, False),
-    ("VILLE", "BAYONNE CEDEX", "", -1, True),
+    ("VILLE", "BAYONNE CEDEX", "", -1, False),
("VILLE", "CHERAUTE", "", -1, False),
-    ("VILLE", "DROIT", "", -1, True),  # anatomical term
-    ("TEL", "05 59 44 35 35", "", -1, True),
+    ("VILLE", "DROIT", "", -1, False),
+    ("TEL", "05 59 44 35 35", "", -1, False),
("TEL", "0676085336", "", -1, False),
+    # EPISODE: filtered only when it comes from a trackare file name
("EPISODE", "23202435", "trackare-14004105-23202435", -1, True),
("EPISODE", "23102610", "CRH_23102610", 0, False),
]
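
The loop that consumes test_cases sits outside this hunk. A plausible runner, assuming should_filter(pii_type, text, filename, page) mirrors the tuple layout (constructor arguments omitted; the exact signature is an assumption):

filt = HospitalFilter()
for pii_type, text, filename, page, expected in test_cases:
    got = filt.should_filter(pii_type, text, filename, page)
    mark = "OK  " if got == expected else "FAIL"
    print(f"{mark} {pii_type:<12} {text!r} -> {got} (expected {expected})")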


@@ -1,18 +1,18 @@
{
"date": "2026-03-12T10:24:59.261417",
"date": "2026-03-12T17:16:25.993851",
"scores": {
"global_score": 97.0,
"leak_score": 100.0,
"fp_score": 90,
"totals": {
"documents": 29,
"audit_hits": 2797,
"name_tokens_known": 461,
"audit_hits": 3186,
"name_tokens_known": 457,
"leak_audit": 0,
"leak_occurrences": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 569,
"leak_insee_medium": 570,
"fp_medical": 0,
"fp_overmasking": 2
}
@@ -110,7 +110,7 @@
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 23,
"leak_insee_medium": 24,
"fp_medical": 0,
"fp_overmasking": 0
},
@@ -206,7 +206,7 @@
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 32,
"leak_insee_medium": 33,
"fp_medical": 0,
"fp_overmasking": 0
},
@@ -222,7 +222,7 @@
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 34,
"leak_insee_medium": 32,
"fp_medical": 0,
"fp_overmasking": 0
},
@@ -246,7 +246,7 @@
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 26,
"leak_insee_medium": 27,
"fp_medical": 0,
"fp_overmasking": 0
}


@@ -1,27 +1,24 @@
#!/usr/bin/env python3
"""Batch anonymisation de PDFs pour enrichir le dataset silver.
"""Batch anonymisation parallèle de PDFs pour enrichir le dataset silver.
Traite TOUS les PDFs disponibles (excluant ceux déjà dans audit_30) en mode CPU
uniquement (sans VLM) pour générer des .pseudonymise.txt utilisables par
export_silver_annotations.py.
Traite TOUS les PDFs disponibles en mode CPU (sans VLM), avec N workers
parallèles. Chaque worker charge ses propres modèles NER.
Timeout par fichier pour éviter les blocages sur les gros documents.
Reprend automatiquement là où il s'est arrêté (skip les déjà traités).
Usage:
python run_batch_silver_export.py # 6 workers (défaut)
python run_batch_silver_export.py --workers 4 # 4 workers
"""
import sys
import os
import time
import signal
import random
import argparse
import multiprocessing as mp
from pathlib import Path
from collections import Counter
sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager
from gliner_manager import GlinerManager
from camembert_ner_manager import CamembertNerManager
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTDIR = SRC / "anonymise_silver_extra"
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
@@ -62,16 +59,102 @@ ALREADY_DONE_AUDIT30 = {
TIMEOUT_PER_FILE = 120 # max seconds per PDF
class TimeoutError(Exception):
pass
# Per-worker globals (initialized once per worker)
_worker_ner = None
_worker_gliner = None
_worker_camembert = None
_worker_id = None
def timeout_handler(signum, frame):
raise TimeoutError("Timeout")
def init_worker(worker_id):
"""Initialise les modèles NER dans chaque worker (appelé une seule fois)."""
global _worker_ner, _worker_gliner, _worker_camembert, _worker_id
_worker_id = worker_id
# Cap ONNX/OpenMP threads per worker to avoid contention
n_threads = max(2, 32 // (mp.cpu_count() // 2)) # spread threads evenly
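# e.g. mp.cpu_count() == 32 -> 32 // (32 // 2) = 2 threads per worker;
# mp.cpu_count() == 16 -> 32 // 8 = 4. Note the share is derived from the
# CPU count alone, not from the actual --workers value.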
os.environ["OMP_NUM_THREADS"] = str(n_threads)
os.environ["MKL_NUM_THREADS"] = str(n_threads)
import anonymizer_core_refactored_onnx as core # noqa: F401
from eds_pseudo_manager import EdsPseudoManager
from gliner_manager import GlinerManager
from camembert_ner_manager import CamembertNerManager
_worker_ner = EdsPseudoManager()
_worker_ner.load()
print(f" [W{worker_id}] EDS-Pseudo loaded", flush=True)
_worker_gliner = GlinerManager()
try:
_worker_gliner.load()
print(f" [W{worker_id}] GLiNER loaded", flush=True)
except Exception as e:
print(f" [W{worker_id}] GLiNER unavailable ({e})", flush=True)
_worker_gliner = None
_worker_camembert = CamembertNerManager()
try:
_worker_camembert.load()
print(f" [W{worker_id}] CamemBERT-bio loaded", flush=True)
except Exception as e:
print(f" [W{worker_id}] CamemBERT-bio unavailable ({e})", flush=True)
_worker_camembert = None
print(f" [W{worker_id}] Ready (threads={n_threads})", flush=True)
def process_one_pdf(args):
"""Traite un seul PDF. Appelé par le pool de workers."""
pdf_path, idx, total = args
import signal
import anonymizer_core_refactored_onnx as core
ogc = pdf_path.parent.name.split("_")[0]
# Timeout via alarm
def _timeout_handler(signum, frame):
raise TimeoutError("Timeout")
signal.signal(signal.SIGALRM, _timeout_handler)
signal.alarm(TIMEOUT_PER_FILE)
try:
core.process_pdf(
pdf_path=pdf_path,
out_dir=OUTDIR,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=CONFIG,
use_hf=True,
ner_manager=_worker_ner,
ner_thresholds=None,
ogc_label=ogc,
vlm_manager=None,
gliner_manager=_worker_gliner,
camembert_manager=_worker_camembert,
)
signal.alarm(0)
return ("OK", pdf_path.name, idx, total)
except TimeoutError:
signal.alarm(0)
return ("TIMEOUT", pdf_path.name, idx, total)
except Exception as e:
signal.alarm(0)
err = str(e)
if "encrypted" in err.lower() or "password" in err.lower():
return ("SKIP", pdf_path.name, idx, total)
return ("ERROR", pdf_path.name, idx, total, str(e)[:100])
def main():
parser = argparse.ArgumentParser(description="Parallel batch silver export")
parser.add_argument("--workers", type=int, default=6,
help="Number of parallel workers (default: 6)")
args = parser.parse_args()
n_workers = args.workers
# Collect all available PDFs (excluding audit_30)
all_pdfs = []
for ogc_dir in sorted(SRC.iterdir()):
@@ -81,7 +164,6 @@ def main():
if pdf.name not in ALREADY_DONE_AUDIT30:
all_pdfs.append(pdf)
# Sort by OGC for reproducibility
all_pdfs.sort(key=lambda p: (p.parent.name, p.name))
# Detect already-processed files (resume)
@@ -95,96 +177,73 @@ def main():
print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)")
print(f"Déjà traités: {len(already_done)}")
print(f"Restant: {len(pdfs_to_do)}")
print(f"Workers: {n_workers}")
print(f"RAM par worker: ~4 Go (NER models)")
print(f"RAM totale estimée: ~{n_workers * 4} Go\n")
if not pdfs_to_do:
print("Rien à faire.")
return
# Load the NER models (CPU only, no VLM)
print("\nLoading EDS-Pseudo...", flush=True)
ner = EdsPseudoManager()
ner.load()
assert ner.is_loaded(), "EDS-Pseudo not loaded"
print("EDS-Pseudo loaded.", flush=True)
# Prepare the task arguments: (pdf_path, index, total)
tasks = [(pdf, i, len(pdfs_to_do)) for i, pdf in enumerate(pdfs_to_do, 1)]
print("Chargement GLiNER...", flush=True)
gliner = GlinerManager()
try:
gliner.load()
print("GLiNER chargé.", flush=True)
except Exception as e:
print(f"GLiNER indisponible ({e}), on continue sans.", flush=True)
gliner = None
print(f"Chargement des modèles dans {n_workers} workers...", flush=True)
print("Chargement CamemBERT-bio ONNX...", flush=True)
camembert = CamembertNerManager()
try:
camembert.load()
print("CamemBERT-bio ONNX chargé.", flush=True)
except Exception as e:
print(f"CamemBERT-bio indisponible ({e}), on continue sans.", flush=True)
camembert = None
print(f"\nPas de VLM (CPU only pour silver export).\n", flush=True)
# Create the pool, initializing the models in each worker
# (mp.Pool with an initializer loads the models a single time per worker)
# Note: fork + ONNX can misbehave, so use 'spawn'
ctx = mp.get_context("spawn")
ok = ko = skip_encrypted = skip_timeout = 0
t0 = time.time()
total = len(pdfs_to_do)
for i, pdf in enumerate(pdfs_to_do, 1):
ogc = pdf.parent.name.split("_")[0]
print(f"[{i}/{total}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)
# Start the workers sequentially for init (avoids a memory spike),
# then process in parallel
with ctx.Pool(
processes=n_workers,
initializer=init_worker,
initargs=(0,), # simplified worker_id
) as pool:
for result in pool.imap_unordered(process_one_pdf, tasks, chunksize=1):
status = result[0]
name = result[1]
idx = result[2]
total = result[3]
# Per-file timeout
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(TIMEOUT_PER_FILE)
try:
core.process_pdf(
pdf_path=pdf,
out_dir=OUTDIR,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=CONFIG,
use_hf=True,
ner_manager=ner,
ner_thresholds=None,
ogc_label=ogc,
vlm_manager=None,
gliner_manager=gliner,
camembert_manager=camembert,
)
signal.alarm(0)
elapsed_file = time.time() - t0
rate = ok / elapsed_file * 3600 if elapsed_file > 0 and ok > 0 else 0
print(f"OK ({rate:.0f}/h)", flush=True)
ok += 1
except TimeoutError:
signal.alarm(0)
print(f"TIMEOUT ({TIMEOUT_PER_FILE}s)", flush=True)
skip_timeout += 1
except Exception as e:
signal.alarm(0)
err = str(e)
if "encrypted" in err.lower() or "password" in err.lower():
print("SKIP (chiffré)", flush=True)
skip_encrypted += 1
else:
print(f"ERREUR: {e}", flush=True)
ko += 1
# Rapport intermédiaire toutes les 50 fichiers
if i % 50 == 0:
elapsed = time.time() - t0
remaining = (elapsed / i) * (total - i)
print(f"\n --- Progression: {i}/{total} | OK: {ok} | "
f"Erreurs: {ko} | Timeout: {skip_timeout} | "
f"Temps restant estimé: {remaining/60:.0f}min ---\n", flush=True)
elapsed = time.time() - t0 # elapsed so far; needed by the rate and report below
done = ok + ko + skip_encrypted + skip_timeout + 1
if status == "OK":
ok += 1
rate = ok / elapsed * 3600 if elapsed > 0 else 0
print(f"[{done}/{total}] {name} OK ({rate:.0f}/h)", flush=True)
elif status == "TIMEOUT":
skip_timeout += 1
print(f"[{done}/{total}] {name} TIMEOUT", flush=True)
elif status == "SKIP":
skip_encrypted += 1
print(f"[{done}/{total}] {name} SKIP (encrypted)", flush=True)
else:
ko += 1
err_msg = result[4] if len(result) > 4 else "?"
print(f"[{done}/{total}] {name} ERROR: {err_msg}", flush=True)
# Intermediate report every 50 files
if done % 50 == 0:
remaining = (elapsed / done) * (total - done)
print(f"\n --- Progress: {done}/{total} | OK: {ok} | "
f"Errors: {ko} | Timeouts: {skip_timeout} | "
f"Throughput: {ok/elapsed*3600:.0f}/h | "
f"Remaining: {remaining/60:.0f}min ---\n", flush=True)
elapsed = time.time() - t0
total_pseudo = len(list(OUTDIR.glob("*.pseudonymise.txt")))
print(f"\n{'='*60}")
print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)")
print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}")
print(f"Total .pseudonymise.txt: {len(list(OUTDIR.glob('*.pseudonymise.txt')))}")
print(f"Total .pseudonymise.txt: {total_pseudo}")
print(f"Débit moyen: {ok/elapsed*3600:.0f} fichiers/h")
print(f"Sortie: {OUTDIR}")


@@ -190,6 +190,93 @@ def main():
out.write_text("\n".join(sorted(phones)) + "\n", encoding="utf-8")
print(f"{out.name}: {len(phones)} entrées")
# 6. FINESS addresses (type_voie + nom_voie) for Aho-Corasick
# Map FINESS type_voie codes to their expanded forms
TYPE_VOIE_MAP = {
"AV": "avenue", "R": "rue", "BD": "boulevard", "RTE": "route",
"CHE": "chemin", "PL": "place", "IMP": "impasse", "ALL": "allee",
"SQ": "square", "PASS": "passage", "QU": "quai", "CRS": "cours",
"SEN": "sentier", "RPT": "rond-point", "LD": "lieu-dit",
"HAM": "hameau", "LOT": "lotissement", "TSSE": "traverse",
"CHEM": "chemin", "RES": "residence", "CTRE": "centre",
"ESP": "esplanade", "PRO": "promenade", "MTE": "montee",
"VOI": "voie", "CAR": "carrefour", "FBG": "faubourg",
}
# Load INSEE first names to generate abbreviated variants
prenoms_path = Path(__file__).parent.parent / "data" / "insee" / "prenoms_france.txt"
prenoms_set = set()
if prenoms_path.exists():
for line in prenoms_path.read_text(encoding="utf-8").splitlines():
p = line.strip().lower()
if p and len(p) >= 3:
prenoms_set.add(p)
print(f" Prénoms INSEE chargés: {len(prenoms_set)}")
VOIE_GENERIC = {
"de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux",
"a", "sur", "sous", "par", "pour", "dans", "rue", "avenue", "boulevard",
"route", "chemin", "place", "impasse", "square", "passage", "quai", "cours",
"grande", "grand", "petit", "petite", "vieux", "vieille", "nouveau", "nouvelle",
"haut", "haute", "bas", "basse",
}
addr_patterns = set()
def _add_with_abbrev(pattern: str):
"""Ajoute le pattern + variantes avec prénoms abrégés (initiale seule)."""
addr_patterns.add(pattern)
words = pattern.split()
for i, w in enumerate(words):
if w in prenoms_set and len(w) >= 3:
# Variant with the initial only: generated only if a distinctive word follows
remaining = words[i+1:]
if not remaining or all(len(r) <= 2 or r in VOIE_GENERIC for r in remaining):
continue # no abbreviation if nothing distinctive follows
abbrev_words = words[:i] + [w[0]] + words[i+1:]
abbrev = " ".join(abbrev_words)
# At least 12 chars, and the pattern must not start with a lone initial
if len(abbrev) >= 12 and len(abbrev_words[0]) >= 2:
addr_patterns.add(abbrev)
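# e.g. _add_with_abbrev("avenue de l interne jacques loeb") also adds
# "avenue de l interne j loeb": "jacques" is an INSEE first name and a
# distinctive word ("loeb") follows it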
with open(csv_path, encoding="utf-8") as f:
reader = csv.reader(f, delimiter=";")
next(reader)
for row in reader:
if len(row) < 10:
continue
type_voie_raw = row[8].strip() if len(row) > 8 else ""
nom_voie = row[9].strip() if len(row) > 9 else ""
if not nom_voie or len(nom_voie) < 3:
continue
nom_norm = normalize(nom_voie)
words = nom_norm.split()
# Full pattern: type_voie + nom_voie (e.g. "avenue de l interne jacques loeb")
type_voie_expanded = TYPE_VOIE_MAP.get(type_voie_raw.upper(), type_voie_raw.lower())
if type_voie_expanded and nom_norm:
full = f"{type_voie_expanded} {nom_norm}"
full_words = full.split()
has_distinctive = any(
w not in VOIE_GENERIC and len(w) >= 4 for w in full_words
)
if has_distinctive and len(full) >= 12:
_add_with_abbrev(full)
# nom_voie alone (only when highly distinctive)
has_distinctive = any(w not in VOIE_GENERIC and len(w) >= 4 for w in words)
if has_distinctive and len(nom_norm) >= 15:
_add_with_abbrev(nom_norm)
out = OUT_DIR / "adresses_finess.txt"
out.write_text("\n".join(sorted(addr_patterns)) + "\n", encoding="utf-8")
print(f"\n{out.name}: {len(addr_patterns)} entrées")
# Also keep voies_distinctives.txt for backward compatibility
voie_names = {p for p in addr_patterns if len(p) >= 15}
out = OUT_DIR / "voies_distinctives.txt"
out.write_text("\n".join(sorted(voie_names)) + "\n", encoding="utf-8")
print(f"{out.name}: {len(voie_names)} entrées")
# Stats by length
print("\nDistribution of distinctive names by length (words):")
word_counts = Counter(len(n.split()) for n in filtered_distinctive)
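
The diff above only generates adresses_finess.txt; the matching side is not shown. A consumption sketch using the pyahocorasick package (whether the project uses this exact library is an assumption), with whole-word filtering on the normalized text:

import ahocorasick

A = ahocorasick.Automaton()
with open("adresses_finess.txt", encoding="utf-8") as f:
    for line in f:
        pattern = line.strip()
        if pattern:
            A.add_word(pattern, pattern)
A.make_automaton()

norm_text = "patient domicilie 13 avenue de l interne jacques loeb a bayonne"
for end, pattern in A.iter(norm_text):
    start = end - len(pattern) + 1
    # keep whole-word matches only
    if (start == 0 or norm_text[start - 1] == " ") and \
       (end == len(norm_text) - 1 or norm_text[end + 1] == " "):
        print(start, end, pattern)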


@@ -300,7 +300,7 @@ def check_fp_density(text: str) -> dict:
"density_pct": round(density, 2),
"nom_count": nom_count,
"nom_pct": round(nom_pct, 2),
"alert": nom_pct > 5.0,
"alert": nom_pct > 8.0, # seuil relevé : CRO/CRH courts listent 8-10 soignants = légitime
}
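
Only the tail of the returned dict is visible here. For orientation, a sketch of how such a metric can be computed (the placeholder format <NOM_n> and the whitespace tokenization are assumptions, not taken from the project):

import re

def check_fp_density(text):
    tokens = text.split()
    masked = [t for t in tokens if re.fullmatch(r"<[A-Z_]+_\d+>", t)]
    noms = [t for t in masked if t.startswith("<NOM")]
    density = 100.0 * len(masked) / max(1, len(tokens))
    nom_pct = 100.0 * len(noms) / max(1, len(tokens))
    return {
        "density_pct": round(density, 2),
        "nom_count": len(noms),
        "nom_pct": round(nom_pct, 2),
        "alert": nom_pct > 8.0,  # raised threshold, as in the diff above
    }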


@@ -210,17 +210,34 @@ async def anonymize_text(
final_text = selective_rescan(final_text, cfg=cfg)
elapsed = time.time() - t0
-audit_list = [
-    {"kind": h.kind, "original": h.original, "placeholder": h.placeholder, "page": h.page}
-    for h in anon.audit
-    if h.page != -1  # exclude global propagations
-]
+# Include all hits (regex page >= 0 + NER page = -1) with their source
+ner_prefixes = ("NER_", "EDS_")
+audit_list = []
+ner_count = 0
+regex_count = 0
+for h in anon.audit:
+    is_ner = h.kind.startswith(ner_prefixes) or h.page == -1
+    entry = {
+        "kind": h.kind,
+        "original": h.original,
+        "placeholder": h.placeholder,
+        "page": h.page,
+        "source": "ner" if is_ner else "regex",
+    }
+    audit_list.append(entry)
+    if is_ner:
+        ner_count += 1
+    else:
+        regex_count += 1
return {
"text_anonymized": final_text,
"audit": audit_list,
"stats": {
"pii_detected": len(audit_list),
"regex_count": regex_count,
"ner_count": ner_count,
"elapsed_seconds": round(elapsed, 3),
"ner_active": use_ner and _eds_manager is not None,
},