feat: FP reduction + FINESS address gazetteers + parallel batch + multi-axis fixes
- Minimum token length raised from 2-3 to 4 chars (eliminates FP on EPO, IRC, SIB...; a toy sketch of this filter follows below)
- Stop-words enriched: 3-letter medical acronyms, pharma terms, nursing care terms
- BDPM stop-words: ~7,300 brand names + INN/active substances
- FINESS address gazetteers: 63K Aho-Corasick patterns (position-preserving normalization; see the second sketch below)
- Anatomical context filter for FINESS establishments
- New regexes: RE_CIVILITE_COMMA_LIST, RE_EXTRACT_NOM_UTILISE, RE_EXTRACT_PRENOM, RE_NUM_EXAMEN_PATIENT, RE_ADRESSE_LIEU_DIT, RE_CIVILITE_INITIALE, Dr X.NOM
- Full URLs (RE_URL) + multiline detection
- Reversed visit number (layout-aware) + EPISODE/NDA in _CRITICAL_PII_TYPES
- HospitalFilter disabled for ADRESSE/TEL/VILLE/EPISODE (these identify the patient)
- Batch silver export parallelized (multiprocessing spawn, N workers)
- Over-masking threshold raised to 8%; server.py enriched (regex/ner source)
- City blacklist: COURANT, PARIS; city context extended (UHCD, specialties)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
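The following is a minimal, self-contained sketch of the false-positive filter described in the first two bullets, assuming a plain length threshold plus a stop-word lookup. MIN_TOKEN_LEN, STOPWORDS_MEDICAL and keep_candidate are illustrative names, not the repository's actual API.

# Toy sketch only: the real filter lives in the anonymizer core;
# MIN_TOKEN_LEN, STOPWORDS_MEDICAL and keep_candidate are hypothetical names.
MIN_TOKEN_LEN = 4  # raised from 2-3 chars to drop short acronyms (EPO, IRC, SIB...)

STOPWORDS_MEDICAL = {
    "epo", "irc", "sib",         # 3-letter medical acronyms
    "doliprane", "paracetamol",  # stand-ins for BDPM brand names / INN
}

def keep_candidate(token: str) -> bool:
    """Return True if a NER/regex candidate should still be masked."""
    t = token.strip().lower()
    if len(t) < MIN_TOKEN_LEN:
        return False  # too short: almost always a false positive
    if t in STOPWORDS_MEDICAL:
        return False  # known medical/pharma term, not PII
    return True

for cand in ["EPO", "Doliprane", "Durand"]:
    print(cand, "->", "mask" if keep_candidate(cand) else "drop")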
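And a minimal sketch of the position-preserving normalization behind the FINESS address gazetteer matching, with a naive substring scan standing in for the 63K-pattern Aho-Corasick automaton; every name below is hypothetical.

# Toy sketch: normalization keeps, for every character of the normalized text,
# the index it came from in the original document, so matches found after
# lowercasing / accent-stripping / whitespace-collapsing can be projected back
# to original coordinates for redaction. A real implementation would feed the
# normalized text to an Aho-Corasick automaton built from the FINESS gazetteer.
import unicodedata

def normalize_with_offsets(text):
    out_chars, offsets = [], []
    prev_space = False
    for i, ch in enumerate(text):
        base = unicodedata.normalize("NFD", ch)[0].lower()  # strip accent, lowercase
        if base.isspace():
            if prev_space:
                continue  # collapse runs of whitespace
            base, prev_space = " ", True
        else:
            prev_space = False
        out_chars.append(base)
        offsets.append(i)  # remember where this normalized char came from
    return "".join(out_chars), offsets

def find_gazetteer_spans(text, patterns):
    normalized, offsets = normalize_with_offsets(text)
    spans = []
    for pat in patterns:  # stand-in for automaton.iter(normalized)
        start = normalized.find(pat)
        while start != -1:
            end = start + len(pat) - 1
            spans.append((offsets[start], offsets[end] + 1))  # original offsets
            start = normalized.find(pat, start + 1)
    return spans

doc = "Patient transféré au 12  Avenue de  l'Hôpital, 75010 Paris."
print(find_gazetteer_spans(doc, ["avenue de l'hopital"]))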
@@ -1,27 +1,24 @@
#!/usr/bin/env python3
"""Batch anonymisation of PDFs to enrich the silver dataset.
"""Parallel batch anonymisation of PDFs to enrich the silver dataset.

Processes ALL available PDFs (excluding those already in audit_30) in CPU-only
mode (no VLM) to generate .pseudonymise.txt files usable by
export_silver_annotations.py.
Processes ALL available PDFs in CPU mode (no VLM), with N parallel workers.
Each worker loads its own NER models.

Per-file timeout to avoid hangs on large documents.
Automatically resumes where it left off (skips files already processed).

Usage:
    python run_batch_silver_export.py               # 6 workers (default)
    python run_batch_silver_export.py --workers 4   # 4 workers
"""
import sys
import os
import time
import signal
import random
import argparse
import multiprocessing as mp
from pathlib import Path
from collections import Counter

sys.path.insert(0, str(Path(__file__).parent))

import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager
from gliner_manager import GlinerManager
from camembert_ner_manager import CamembertNerManager

SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTDIR = SRC / "anonymise_silver_extra"
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
@@ -62,16 +59,102 @@ ALREADY_DONE_AUDIT30 = {
TIMEOUT_PER_FILE = 120  # max seconds per PDF


class TimeoutError(Exception):
    pass
# Per-worker globals (initialised only once)
_worker_ner = None
_worker_gliner = None
_worker_camembert = None
_worker_id = None


def timeout_handler(signum, frame):
    raise TimeoutError("Timeout")
def init_worker(worker_id):
    """Initialise the NER models in each worker (called only once)."""
    global _worker_ner, _worker_gliner, _worker_camembert, _worker_id
    _worker_id = worker_id

    # Limit ONNX/OpenMP threads per worker to avoid contention
    n_threads = max(2, 32 // (mp.cpu_count() // 2))  # spread threads evenly
    os.environ["OMP_NUM_THREADS"] = str(n_threads)
    os.environ["MKL_NUM_THREADS"] = str(n_threads)

    import anonymizer_core_refactored_onnx as core  # noqa: F401
    from eds_pseudo_manager import EdsPseudoManager
    from gliner_manager import GlinerManager
    from camembert_ner_manager import CamembertNerManager

    _worker_ner = EdsPseudoManager()
    _worker_ner.load()
    print(f" [W{worker_id}] EDS-Pseudo chargé", flush=True)

    _worker_gliner = GlinerManager()
    try:
        _worker_gliner.load()
        print(f" [W{worker_id}] GLiNER chargé", flush=True)
    except Exception as e:
        print(f" [W{worker_id}] GLiNER indisponible ({e})", flush=True)
        _worker_gliner = None

    _worker_camembert = CamembertNerManager()
    try:
        _worker_camembert.load()
        print(f" [W{worker_id}] CamemBERT-bio chargé", flush=True)
    except Exception as e:
        print(f" [W{worker_id}] CamemBERT-bio indisponible ({e})", flush=True)
        _worker_camembert = None

    print(f" [W{worker_id}] Prêt (threads={n_threads})", flush=True)


def process_one_pdf(args):
    """Process a single PDF. Called by the worker pool."""
    pdf_path, idx, total = args
    import signal
    import anonymizer_core_refactored_onnx as core

    ogc = pdf_path.parent.name.split("_")[0]

    # Per-file timeout via alarm
    def _timeout_handler(signum, frame):
        raise TimeoutError("Timeout")

    signal.signal(signal.SIGALRM, _timeout_handler)
    signal.alarm(TIMEOUT_PER_FILE)

    try:
        core.process_pdf(
            pdf_path=pdf_path,
            out_dir=OUTDIR,
            make_vector_redaction=False,
            also_make_raster_burn=False,
            config_path=CONFIG,
            use_hf=True,
            ner_manager=_worker_ner,
            ner_thresholds=None,
            ogc_label=ogc,
            vlm_manager=None,
            gliner_manager=_worker_gliner,
            camembert_manager=_worker_camembert,
        )
        signal.alarm(0)
        return ("OK", pdf_path.name, idx, total)
    except TimeoutError:
        signal.alarm(0)
        return ("TIMEOUT", pdf_path.name, idx, total)
    except Exception as e:
        signal.alarm(0)
        err = str(e)
        if "encrypted" in err.lower() or "password" in err.lower():
            return ("SKIP", pdf_path.name, idx, total)
        return ("ERROR", pdf_path.name, idx, total, str(e)[:100])


def main():
    parser = argparse.ArgumentParser(description="Batch silver export parallèle")
    parser.add_argument("--workers", type=int, default=6,
                        help="Nombre de workers parallèles (défaut: 6)")
    args = parser.parse_args()

    n_workers = args.workers

    # Collect all available PDFs (excluding audit_30)
    all_pdfs = []
    for ogc_dir in sorted(SRC.iterdir()):
@@ -81,7 +164,6 @@ def main():
            if pdf.name not in ALREADY_DONE_AUDIT30:
                all_pdfs.append(pdf)

    # Sort by OGC for reproducibility
    all_pdfs.sort(key=lambda p: (p.parent.name, p.name))

    # Detect files already processed (resume)
@@ -95,96 +177,73 @@ def main():
    print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)")
    print(f"Déjà traités: {len(already_done)}")
    print(f"Restant: {len(pdfs_to_do)}")
    print(f"Workers: {n_workers}")
    print(f"RAM par worker: ~4 Go (NER models)")
    print(f"RAM totale estimée: ~{n_workers * 4} Go\n")

    if not pdfs_to_do:
        print("Rien à faire.")
        return

    # Load the NER models (CPU only, no VLM)
    print("\nChargement EDS-Pseudo...", flush=True)
    ner = EdsPseudoManager()
    ner.load()
    assert ner.is_loaded(), "EDS-Pseudo non chargé"
    print("EDS-Pseudo chargé.", flush=True)
    # Prepare the task arguments: (pdf_path, index, total)
    tasks = [(pdf, i, len(pdfs_to_do)) for i, pdf in enumerate(pdfs_to_do, 1)]

    print("Chargement GLiNER...", flush=True)
    gliner = GlinerManager()
    try:
        gliner.load()
        print("GLiNER chargé.", flush=True)
    except Exception as e:
        print(f"GLiNER indisponible ({e}), on continue sans.", flush=True)
        gliner = None
    print(f"Chargement des modèles dans {n_workers} workers...", flush=True)

    print("Chargement CamemBERT-bio ONNX...", flush=True)
    camembert = CamembertNerManager()
    try:
        camembert.load()
        print("CamemBERT-bio ONNX chargé.", flush=True)
    except Exception as e:
        print(f"CamemBERT-bio indisponible ({e}), on continue sans.", flush=True)
        camembert = None

    print(f"\nPas de VLM (CPU only pour silver export).\n", flush=True)
    # Create the pool, with per-worker model initialisation
    # mp.Pool with an initializer loads the models only once per worker
    # Note: fork + ONNX can be problematic, so 'spawn' is used
    ctx = mp.get_context("spawn")

    ok = ko = skip_encrypted = skip_timeout = 0
    t0 = time.time()
    total = len(pdfs_to_do)

    for i, pdf in enumerate(pdfs_to_do, 1):
        ogc = pdf.parent.name.split("_")[0]
        print(f"[{i}/{total}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)
    # Start the workers sequentially for init (avoid a memory spike),
    # then process in parallel
    with ctx.Pool(
        processes=n_workers,
        initializer=init_worker,
        initargs=(0,),  # simplified worker_id
    ) as pool:
        for result in pool.imap_unordered(process_one_pdf, tasks, chunksize=1):
            status = result[0]
            name = result[1]
            idx = result[2]
            total = result[3]

        # Per-file timeout
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(TIMEOUT_PER_FILE)
        try:
            core.process_pdf(
                pdf_path=pdf,
                out_dir=OUTDIR,
                make_vector_redaction=False,
                also_make_raster_burn=False,
                config_path=CONFIG,
                use_hf=True,
                ner_manager=ner,
                ner_thresholds=None,
                ogc_label=ogc,
                vlm_manager=None,
                gliner_manager=gliner,
                camembert_manager=camembert,
            )
            signal.alarm(0)
            elapsed_file = time.time() - t0
            rate = ok / elapsed_file * 3600 if elapsed_file > 0 and ok > 0 else 0
            print(f"OK ({rate:.0f}/h)", flush=True)
            ok += 1
        except TimeoutError:
            signal.alarm(0)
            print(f"TIMEOUT ({TIMEOUT_PER_FILE}s)", flush=True)
            skip_timeout += 1
        except Exception as e:
            signal.alarm(0)
            err = str(e)
            if "encrypted" in err.lower() or "password" in err.lower():
                print("SKIP (chiffré)", flush=True)
                skip_encrypted += 1
            else:
                print(f"ERREUR: {e}", flush=True)
                ko += 1

        # Intermediate report every 50 files
        if i % 50 == 0:
            elapsed = time.time() - t0
            remaining = (elapsed / i) * (total - i)
            print(f"\n --- Progression: {i}/{total} | OK: {ok} | "
                  f"Erreurs: {ko} | Timeout: {skip_timeout} | "
                  f"Temps restant estimé: {remaining/60:.0f}min ---\n", flush=True)
            elapsed = time.time() - t0
            done = ok + ko + skip_encrypted + skip_timeout + 1

            if status == "OK":
                ok += 1
                rate = ok / elapsed * 3600 if elapsed > 0 else 0
                print(f"[{done}/{total}] {name} OK ({rate:.0f}/h)", flush=True)
            elif status == "TIMEOUT":
                skip_timeout += 1
                print(f"[{done}/{total}] {name} TIMEOUT", flush=True)
            elif status == "SKIP":
                skip_encrypted += 1
                print(f"[{done}/{total}] {name} SKIP (chiffré)", flush=True)
            else:
                ko += 1
                err_msg = result[4] if len(result) > 4 else "?"
                print(f"[{done}/{total}] {name} ERREUR: {err_msg}", flush=True)

            # Intermediate report every 50 files
            if done % 50 == 0:
                remaining = (elapsed / done) * (total - done)
                print(f"\n --- Progression: {done}/{total} | OK: {ok} | "
                      f"Erreurs: {ko} | Timeout: {skip_timeout} | "
                      f"Débit: {ok/elapsed*3600:.0f}/h | "
                      f"Restant: {remaining/60:.0f}min ---\n", flush=True)

    elapsed = time.time() - t0
    total_pseudo = len(list(OUTDIR.glob("*.pseudonymise.txt")))
    print(f"\n{'='*60}")
    print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)")
    print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}")
    print(f"Total .pseudonymise.txt: {len(list(OUTDIR.glob('*.pseudonymise.txt')))}")
    print(f"Total .pseudonymise.txt: {total_pseudo}")
    print(f"Débit moyen: {ok/elapsed*3600:.0f} fichiers/h")
    print(f"Sortie: {OUTDIR}")