feat: réduction FP + gazetteers adresses FINESS + batch parallèle + corrections multi-axes

- Token min length relevé de 2-3 → 4 chars (élimine FP EPO, IRC, SIB...)
- Stop-words enrichis : acronymes médicaux 3 lettres, termes pharma, soins infirmiers
- BDPM stop-words : ~7300 noms commerciaux + DCI/substances actives
- Gazetteers adresses FINESS : 63K patterns Aho-Corasick (position-preserving normalization)
- Filtre contextuel anatomique pour FINESS établissements
- Nouvelles regex : RE_CIVILITE_COMMA_LIST, RE_EXTRACT_NOM_UTILISE, RE_EXTRACT_PRENOM,
  RE_NUM_EXAMEN_PATIENT, RE_ADRESSE_LIEU_DIT, RE_CIVILITE_INITIALE, Dr X.NOM
- URLs complètes (RE_URL) + détection multiline
- N° venue inversé (layout-aware) + EPISODE/NDA dans _CRITICAL_PII_TYPES
- HospitalFilter désactivé pour ADRESSE/TEL/VILLE/EPISODE (identifient le patient)
- Batch silver export parallélisé (multiprocessing spawn, N workers)
- Seuil sur-masquage relevé à 8%, server.py enrichi (source regex/ner)
- Blacklist villes : COURANT, PARIS ; contexte villes étendu (UHCD, spécialités)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-16 09:26:56 +01:00
parent a827d860f1
commit 49ff464e6e
18 changed files with 358579 additions and 232 deletions

View File

@@ -1,27 +1,24 @@
#!/usr/bin/env python3
"""Batch anonymisation de PDFs pour enrichir le dataset silver.
"""Batch anonymisation parallèle de PDFs pour enrichir le dataset silver.
Traite TOUS les PDFs disponibles (excluant ceux déjà dans audit_30) en mode CPU
uniquement (sans VLM) pour générer des .pseudonymise.txt utilisables par
export_silver_annotations.py.
Traite TOUS les PDFs disponibles en mode CPU (sans VLM), avec N workers
parallèles. Chaque worker charge ses propres modèles NER.
Timeout par fichier pour éviter les blocages sur les gros documents.
Reprend automatiquement là où il s'est arrêté (skip les déjà traités).
Usage:
python run_batch_silver_export.py # 6 workers (défaut)
python run_batch_silver_export.py --workers 4 # 4 workers
"""
import sys
import os
import time
import signal
import random
import argparse
import multiprocessing as mp
from pathlib import Path
from collections import Counter
sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager
from gliner_manager import GlinerManager
from camembert_ner_manager import CamembertNerManager
# Root directory that main() iterates over to collect input PDFs.
# NOTE(review): hard-coded, machine-specific absolute path — consider making it configurable.
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
# Output directory passed to core.process_pdf(out_dir=...); main() also globs it
# for "*.pseudonymise.txt" files when printing the final summary.
OUTDIR = SRC / "anonymise_silver_extra"
# YAML configuration file passed to core.process_pdf(config_path=...).
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
@@ -62,16 +59,102 @@ ALREADY_DONE_AUDIT30 = {
# Per-file time budget: process_one_pdf() arms signal.alarm() with this value.
TIMEOUT_PER_FILE = 120 # maximum number of seconds allowed per PDF
class TimeoutError(Exception):
    """Raised by the SIGALRM handlers when a file exceeds its time budget.

    This class intentionally carries no extra state. Be aware that, inside
    this module, its name shadows the builtin ``TimeoutError``.
    """
# Per-worker module globals: assigned once by init_worker() and then read by
# process_one_pdf(). They remain None until init_worker() has run.
_worker_ner = None  # EdsPseudoManager instance created by init_worker()
_worker_gliner = None  # GlinerManager instance, reset to None if its load() fails
_worker_camembert = None  # CamembertNerManager instance, reset to None if its load() fails
_worker_id = None  # value passed to init_worker(); not read in the code visible here
def timeout_handler(signum, frame):
    """Signal-handler callback that aborts the running work.

    Both arguments follow the ``signal.signal`` handler signature and are
    ignored; the function always raises ``TimeoutError("Timeout")``.
    """
    expired = TimeoutError("Timeout")
    raise expired
def init_worker(worker_id):
    """Load the NER models once inside a pool worker process.

    Intended as the ``initializer`` of a multiprocessing pool. The loaded
    managers are stored in the module globals ``_worker_ner``,
    ``_worker_gliner`` and ``_worker_camembert`` so that ``process_one_pdf``
    can reuse them for every task handled by this process.

    Args:
        worker_id: Identifier used to tag this worker's log lines; it is also
            stored in the module global ``_worker_id``.

    Raises:
        Exception: Anything raised while creating or loading the EDS-Pseudo
            manager propagates (that model is mandatory). Failures while
            loading GLiNER or CamemBERT-bio are printed and the corresponding
            global is set to ``None`` instead.
    """
    global _worker_ner, _worker_gliner, _worker_camembert, _worker_id
    _worker_id = worker_id

    # Cap the ONNX/OpenMP threads per worker to avoid contention between
    # workers. On a single-CPU host ``cpu_count() // 2`` is 0, which used to
    # raise ZeroDivisionError; clamp the divisor to at least 1.
    try:
        cpu_total = mp.cpu_count()
    except NotImplementedError:
        # The CPU count could not be determined: fall back to one core.
        cpu_total = 1
    half_cores = max(1, cpu_total // 2)
    n_threads = max(2, 32 // half_cores)
    # NOTE(review): these variables only help if the native runtimes have not
    # already sized their thread pools — confirm nothing imported at module
    # import time initializes OpenMP/MKL before this point.
    os.environ["OMP_NUM_THREADS"] = str(n_threads)
    os.environ["MKL_NUM_THREADS"] = str(n_threads)

    # Imports are done here, after the thread limits have been exported.
    import anonymizer_core_refactored_onnx as core  # noqa: F401
    from eds_pseudo_manager import EdsPseudoManager
    from gliner_manager import GlinerManager
    from camembert_ner_manager import CamembertNerManager

    # Mandatory model: any failure here aborts the worker initialisation.
    _worker_ner = EdsPseudoManager()
    _worker_ner.load()
    print(f"  [W{worker_id}] EDS-Pseudo chargé", flush=True)

    # Optional model: continue without it when loading fails (best effort).
    _worker_gliner = GlinerManager()
    try:
        _worker_gliner.load()
        print(f"  [W{worker_id}] GLiNER chargé", flush=True)
    except Exception as e:
        print(f"  [W{worker_id}] GLiNER indisponible ({e})", flush=True)
        _worker_gliner = None

    # Optional model: continue without it when loading fails (best effort).
    _worker_camembert = CamembertNerManager()
    try:
        _worker_camembert.load()
        print(f"  [W{worker_id}] CamemBERT-bio chargé", flush=True)
    except Exception as e:
        print(f"  [W{worker_id}] CamemBERT-bio indisponible ({e})", flush=True)
        _worker_camembert = None

    print(f"  [W{worker_id}] Prêt (threads={n_threads})", flush=True)
def process_one_pdf(args):
    """Anonymise a single PDF inside a pool worker, with a wall-clock limit.

    The models stored by ``init_worker`` in the module globals are passed to
    ``core.process_pdf``. A ``SIGALRM`` alarm of ``TIMEOUT_PER_FILE`` seconds
    bounds the call, so this function relies on ``signal.SIGALRM`` /
    ``signal.alarm`` being available and must run in the process's main
    thread.

    Args:
        args: A ``(pdf_path, idx, total)`` tuple. ``pdf_path`` is the PDF to
            process; ``idx`` and ``total`` are echoed back unchanged in the
            result.

    Returns:
        A tuple whose first item is the status:
        ``("OK", name, idx, total)`` on success,
        ``("TIMEOUT", name, idx, total)`` when the alarm fired,
        ``("SKIP", name, idx, total)`` when the error message mentions
        "encrypted" or "password", and
        ``("ERROR", name, idx, total, message)`` otherwise, with ``message``
        truncated to 100 characters.
    """
    pdf_path, idx, total = args
    import signal
    import anonymizer_core_refactored_onnx as core

    # The OGC label is the part of the parent directory name before the first "_".
    ogc = pdf_path.parent.name.split("_")[0]

    def _timeout_handler(signum, frame):
        # TimeoutError is the module-level exception class, not the builtin.
        raise TimeoutError("Timeout")

    signal.signal(signal.SIGALRM, _timeout_handler)
    signal.alarm(TIMEOUT_PER_FILE)
    try:
        try:
            core.process_pdf(
                pdf_path=pdf_path,
                out_dir=OUTDIR,
                make_vector_redaction=False,
                also_make_raster_burn=False,
                config_path=CONFIG,
                use_hf=True,
                ner_manager=_worker_ner,
                ner_thresholds=None,
                ogc_label=ogc,
                vlm_manager=None,
                gliner_manager=_worker_gliner,
                camembert_manager=_worker_camembert,
            )
        finally:
            # Disarm the alarm on every exit path (success, error, and
            # BaseException such as KeyboardInterrupt) before any handler
            # below runs, so a stale alarm cannot fire afterwards.
            signal.alarm(0)
    except TimeoutError:
        return ("TIMEOUT", pdf_path.name, idx, total)
    except Exception as e:
        err = str(e)
        lowered = err.lower()
        # Password-protected PDFs are reported as skipped, not as errors.
        if "encrypted" in lowered or "password" in lowered:
            return ("SKIP", pdf_path.name, idx, total)
        # Some exceptions have an empty message; fall back to the class name
        # so the caller never prints a blank error.
        return ("ERROR", pdf_path.name, idx, total, (err or type(e).__name__)[:100])
    return ("OK", pdf_path.name, idx, total)
def main():
parser = argparse.ArgumentParser(description="Batch silver export parallèle")
parser.add_argument("--workers", type=int, default=6,
help="Nombre de workers parallèles (défaut: 6)")
args = parser.parse_args()
n_workers = args.workers
# Collecter tous les PDFs disponibles (excluant audit_30)
all_pdfs = []
for ogc_dir in sorted(SRC.iterdir()):
@@ -81,7 +164,6 @@ def main():
if pdf.name not in ALREADY_DONE_AUDIT30:
all_pdfs.append(pdf)
# Trier par OGC pour reproductibilité
all_pdfs.sort(key=lambda p: (p.parent.name, p.name))
# Détecter les fichiers déjà traités (reprise)
@@ -95,96 +177,73 @@ def main():
print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)")
print(f"Déjà traités: {len(already_done)}")
print(f"Restant: {len(pdfs_to_do)}")
print(f"Workers: {n_workers}")
print(f"RAM par worker: ~4 Go (NER models)")
print(f"RAM totale estimée: ~{n_workers * 4} Go\n")
if not pdfs_to_do:
print("Rien à faire.")
return
# Chargement des modèles NER (CPU uniquement, pas de VLM)
print("\nChargement EDS-Pseudo...", flush=True)
ner = EdsPseudoManager()
ner.load()
assert ner.is_loaded(), "EDS-Pseudo non chargé"
print("EDS-Pseudo chargé.", flush=True)
# Préparer les arguments : (pdf_path, index, total)
tasks = [(pdf, i, len(pdfs_to_do)) for i, pdf in enumerate(pdfs_to_do, 1)]
print("Chargement GLiNER...", flush=True)
gliner = GlinerManager()
try:
gliner.load()
print("GLiNER chargé.", flush=True)
except Exception as e:
print(f"GLiNER indisponible ({e}), on continue sans.", flush=True)
gliner = None
print(f"Chargement des modèles dans {n_workers} workers...", flush=True)
print("Chargement CamemBERT-bio ONNX...", flush=True)
camembert = CamembertNerManager()
try:
camembert.load()
print("CamemBERT-bio ONNX chargé.", flush=True)
except Exception as e:
print(f"CamemBERT-bio indisponible ({e}), on continue sans.", flush=True)
camembert = None
print(f"\nPas de VLM (CPU only pour silver export).\n", flush=True)
# Créer le pool avec initialisation des modèles par worker
# On utilise mp.Pool avec initializer pour charger les modèles une seule fois
# Note: fork + ONNX peut poser problème, on utilise 'spawn'
ctx = mp.get_context("spawn")
ok = ko = skip_encrypted = skip_timeout = 0
t0 = time.time()
total = len(pdfs_to_do)
for i, pdf in enumerate(pdfs_to_do, 1):
ogc = pdf.parent.name.split("_")[0]
print(f"[{i}/{total}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)
# Lancer les workers séquentiellement pour l'init (éviter pic mémoire)
# puis traiter en parallèle
with ctx.Pool(
processes=n_workers,
initializer=init_worker,
initargs=(0,), # worker_id simplifié
) as pool:
for result in pool.imap_unordered(process_one_pdf, tasks, chunksize=1):
status = result[0]
name = result[1]
idx = result[2]
total = result[3]
# Timeout par fichier
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(TIMEOUT_PER_FILE)
try:
core.process_pdf(
pdf_path=pdf,
out_dir=OUTDIR,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=CONFIG,
use_hf=True,
ner_manager=ner,
ner_thresholds=None,
ogc_label=ogc,
vlm_manager=None,
gliner_manager=gliner,
camembert_manager=camembert,
)
signal.alarm(0)
elapsed_file = time.time() - t0
rate = ok / elapsed_file * 3600 if elapsed_file > 0 and ok > 0 else 0
print(f"OK ({rate:.0f}/h)", flush=True)
ok += 1
except TimeoutError:
signal.alarm(0)
print(f"TIMEOUT ({TIMEOUT_PER_FILE}s)", flush=True)
skip_timeout += 1
except Exception as e:
signal.alarm(0)
err = str(e)
if "encrypted" in err.lower() or "password" in err.lower():
print("SKIP (chiffré)", flush=True)
skip_encrypted += 1
else:
print(f"ERREUR: {e}", flush=True)
ko += 1
# Rapport intermédiaire toutes les 50 fichiers
if i % 50 == 0:
elapsed = time.time() - t0
remaining = (elapsed / i) * (total - i)
print(f"\n --- Progression: {i}/{total} | OK: {ok} | "
f"Erreurs: {ko} | Timeout: {skip_timeout} | "
f"Temps restant estimé: {remaining/60:.0f}min ---\n", flush=True)
done = ok + ko + skip_encrypted + skip_timeout + 1
if status == "OK":
ok += 1
rate = ok / elapsed * 3600 if elapsed > 0 else 0
print(f"[{done}/{total}] {name} OK ({rate:.0f}/h)", flush=True)
elif status == "TIMEOUT":
skip_timeout += 1
print(f"[{done}/{total}] {name} TIMEOUT", flush=True)
elif status == "SKIP":
skip_encrypted += 1
print(f"[{done}/{total}] {name} SKIP (chiffré)", flush=True)
else:
ko += 1
err_msg = result[4] if len(result) > 4 else "?"
print(f"[{done}/{total}] {name} ERREUR: {err_msg}", flush=True)
# Rapport intermédiaire toutes les 50 fichiers
if done % 50 == 0:
remaining = (elapsed / done) * (total - done)
print(f"\n --- Progression: {done}/{total} | OK: {ok} | "
f"Erreurs: {ko} | Timeout: {skip_timeout} | "
f"Débit: {ok/elapsed*3600:.0f}/h | "
f"Restant: {remaining/60:.0f}min ---\n", flush=True)
elapsed = time.time() - t0
total_pseudo = len(list(OUTDIR.glob("*.pseudonymise.txt")))
print(f"\n{'='*60}")
print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)")
print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}")
print(f"Total .pseudonymise.txt: {len(list(OUTDIR.glob('*.pseudonymise.txt')))}")
print(f"Total .pseudonymise.txt: {total_pseudo}")
print(f"Débit moyen: {ok/elapsed*3600:.0f} fichiers/h")
print(f"Sortie: {OUTDIR}")