Files
anonymisation/run_batch_silver_export.py
Domi31tls c9572c383a feat(phase2): Fine-tuning CamemBERT-bio v2 (F1=0.90) + enrichissement données
- Fine-tuning camembert-bio-base : F1=0.903, Recall=0.930 (vs 0.89/0.85)
- Data augmentation : substitution noms INSEE (219K patronymes, x3 copies)
- Hard negatives BDPM (5.7K médicaments) + QUAERO (1319 termes médicaux)
- Annotations silver enrichies par gazetteers (+612 VILLE, +5 HOPITAL)
- Export silver avec support multi-répertoires (--extra-dir)
- Gazetteers QUAERO : CHEM, DISO, PROC, ANAT depuis DrBenchmark/QUAERO
- Gazetteers INSEE : noms de famille fréquents (96K) et complets (219K)
- Batch silver 1194 PDFs (run_batch_silver_export.py) pour dataset v3

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 02:06:08 +01:00

193 lines
6.8 KiB
Python

#!/usr/bin/env python3
"""Batch anonymisation de PDFs pour enrichir le dataset silver.
Traite TOUS les PDFs disponibles (excluant ceux déjà dans audit_30) en mode CPU
uniquement (sans VLM) pour générer des .pseudonymise.txt utilisables par
export_silver_annotations.py.
Timeout par fichier pour éviter les blocages sur les gros documents.
Reprend automatiquement là où il s'est arrêté (skip les déjà traités).
"""
import sys
import time
import signal
import random
from pathlib import Path
from collections import Counter
sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager
from gliner_manager import GlinerManager
from camembert_ner_manager import CamembertNerManager
# Root folder of the source documents; main() scans its sub-directories for PDFs.
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
# Output directory passed to core.process_pdf; also scanned for existing
# *.pseudonymise.txt files to decide what can be skipped on resume.
OUTDIR = SRC / "anonymise_silver_extra"
# Passed as config_path to core.process_pdf.
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
# PDFs already processed in the "audit 30" run (to be excluded).
# Matching is done on the bare file name, not on the full path.
ALREADY_DONE_AUDIT30 = {
"CONSULTATION ANESTHESISTE 23060661.pdf",
"trackare-05000272-23074376_05000272_23074376.pdf",
"CONSULTATION ANESTHESISTE 23056022.pdf",
"trackare-BA042686-23090597_BA042686_23090597.pdf",
"trackare-23000862-23018396_23000862_23018396.pdf",
"LETTRE DE SORTIE 23087212.pdf",
"CRO 23159905.pdf",
"trackare-99246761-23159905_99246761_23159905.pdf",
"CONSULTATION ANESTHESISTE 23139653.pdf",
"CRO 23160703.pdf",
"trackare-BA192486-23127395_BA192486_23127395.pdf",
"BACTERIO 23232115.pdf",
"CR consultation anesth-290-23025988.pdf",
"trackare-05012965-23060770_05012965_23060770.pdf",
"trackare-BA065989-23102874_BA065989_23102874.pdf",
"trackare-BA127127-23135726_BA127127_23135726.pdf",
"trackare-99252128-23177582_99252128_23177582.pdf",
"trackare-BA171849-23214501_BA171849_23214501.pdf",
"trackare-17015185-23043950_17015185_23043950.pdf",
"CRH 60_23106634.pdf",
"trackare-00260974-23070213_00260974_23070213.pdf",
"trackare-BA067657-23076655_BA067657_23076655.pdf",
"trackare-05012679-23098722_05012679_23098722.pdf",
"trackare-11004431-23124019_11004431_23124019.pdf",
"trackare-07003136-23135847_07003136_23135847.pdf",
"trackare-13013848-23165708_13013848_23165708.pdf",
"trackare-03020576-23175616_03020576_23175616.pdf",
"trackare-BA093659-23074520_BA093659_23074520.pdf",
"trackare-14025311-23034958_14025311_23034958.pdf",
"trackare-BA121804-23016863_BA121804_23016863.pdf",
}
TIMEOUT_PER_FILE = 120 # max seconds per PDF
class TimeoutError(Exception):
    """Raised by ``timeout_handler`` when the per-file alarm goes off.

    NOTE: inside this module the name shadows the built-in ``TimeoutError``
    (which derives from ``OSError``); this class derives directly from
    ``Exception``.
    """
def timeout_handler(signum, frame):
    """Signal handler that interrupts the running work with ``TimeoutError``.

    Args:
        signum: Signal number supplied by the ``signal`` machinery (unused).
        frame: Interrupted stack frame supplied by the ``signal`` machinery
            (unused).

    Raises:
        TimeoutError: always, with the message "Timeout".
    """
    message = "Timeout"
    raise TimeoutError(message)
def _collect_pdfs():
    """Return the candidate PDFs found one level below ``SRC``.

    Sub-directories whose name starts with "anonymise" (such as ``OUTDIR``)
    and files whose name is listed in ``ALREADY_DONE_AUDIT30`` are left out.
    The result is sorted by (folder name, file name) so that successive runs
    visit the files in the same order.
    """
    pdfs = []
    for ogc_dir in SRC.iterdir():
        if not ogc_dir.is_dir() or ogc_dir.name.startswith("anonymise"):
            continue
        pdfs.extend(
            pdf
            for pdf in ogc_dir.glob("*.pdf")
            if pdf.name not in ALREADY_DONE_AUDIT30
        )
    pdfs.sort(key=lambda p: (p.parent.name, p.name))
    return pdfs


def _load_optional_manager(factory, label, short_label):
    """Instantiate and load an optional NER manager; return ``None`` on failure.

    Args:
        factory: Zero-argument callable building the manager (its class).
        label: Name printed in the "loading" / "loaded" messages.
        short_label: Name printed in the "unavailable" message.

    Returns:
        The loaded manager, or ``None`` when ``load()`` raised.
    """
    print(f"Chargement {label}...", flush=True)
    manager = factory()
    try:
        manager.load()
    except Exception as e:
        # Best effort by design: the batch keeps going without this model.
        print(f"{short_label} indisponible ({e}), on continue sans.", flush=True)
        return None
    print(f"{label} chargé.", flush=True)
    return manager


def main():
    """Run the batch export over every PDF that has no output yet.

    Steps: collect the candidate PDFs, drop those that already have a
    ``.pseudonymise.txt`` file in ``OUTDIR`` (resume), load the NER models,
    then call ``core.process_pdf`` on each remaining file under a per-file
    ``SIGALRM`` timeout of ``TIMEOUT_PER_FILE`` seconds. Progress is printed
    per file, every 50 files, and as a final summary.

    Raises:
        RuntimeError: if the EDS-Pseudo manager reports it is not loaded.

    NOTE: relies on ``signal.SIGALRM`` / ``signal.alarm``, which exist only on
    Unix and can only be used from the main thread.
    """
    all_pdfs = _collect_pdfs()

    # Resume support: an existing "<stem>.pseudonymise.txt" marks "<stem>.pdf"
    # as done. Only the suffix is stripped (a plain str.replace would also
    # rewrite an occurrence in the middle of the name).
    # NOTE(review): matching is by bare file name, so two PDFs with the same
    # name in different sub-directories are treated as one — confirm this is
    # acceptable for the dataset.
    OUTDIR.mkdir(exist_ok=True)
    suffix = ".pseudonymise.txt"
    already_done = {
        p.name[: -len(suffix)] + ".pdf" for p in OUTDIR.glob("*" + suffix)
    }
    pdfs_to_do = [p for p in all_pdfs if p.name not in already_done]

    print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)")
    print(f"Déjà traités: {len(already_done)}")
    print(f"Restant: {len(pdfs_to_do)}")
    if not pdfs_to_do:
        print("Rien à faire.")
        return

    # Load the NER models; no VLM manager is created for this export.
    print("\nChargement EDS-Pseudo...", flush=True)
    ner = EdsPseudoManager()
    ner.load()
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not ner.is_loaded():
        raise RuntimeError("EDS-Pseudo non chargé")
    print("EDS-Pseudo chargé.", flush=True)

    gliner = _load_optional_manager(GlinerManager, "GLiNER", "GLiNER")
    camembert = _load_optional_manager(
        CamembertNerManager, "CamemBERT-bio ONNX", "CamemBERT-bio"
    )
    print("\nPas de VLM (CPU only pour silver export).\n", flush=True)

    ok = ko = skip_encrypted = skip_timeout = 0
    # Monotonic clock: durations are immune to wall-clock adjustments.
    t0 = time.monotonic()
    total = len(pdfs_to_do)

    for i, pdf in enumerate(pdfs_to_do, 1):
        # The OGC label is the part of the folder name before the first "_".
        ogc = pdf.parent.name.split("_")[0]
        print(f"[{i}/{total}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)

        # Arm the per-file timeout.
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(TIMEOUT_PER_FILE)
        try:
            try:
                core.process_pdf(
                    pdf_path=pdf,
                    out_dir=OUTDIR,
                    make_vector_redaction=False,
                    also_make_raster_burn=False,
                    config_path=CONFIG,
                    use_hf=True,
                    ner_manager=ner,
                    ner_thresholds=None,
                    ogc_label=ogc,
                    vlm_manager=None,
                    gliner_manager=gliner,
                    camembert_manager=camembert,
                )
            finally:
                # Disarm on every exit path, before any handler below runs,
                # so a late alarm cannot fire while reporting the outcome.
                signal.alarm(0)
        except TimeoutError:
            # This is the module-level TimeoutError raised by timeout_handler.
            print(f"TIMEOUT ({TIMEOUT_PER_FILE}s)", flush=True)
            skip_timeout += 1
        except Exception as e:
            # Top-level boundary: one bad file must not stop the batch.
            err = str(e).lower()
            if "encrypted" in err or "password" in err:
                print("SKIP (chiffré)", flush=True)
                skip_encrypted += 1
            else:
                print(f"ERREUR: {e}", flush=True)
                ko += 1
        else:
            # Count the file first so the displayed rate includes it.
            ok += 1
            elapsed_total = time.monotonic() - t0
            rate = ok / elapsed_total * 3600 if elapsed_total > 0 else 0
            print(f"OK ({rate:.0f}/h)", flush=True)

        # Intermediate report every 50 files.
        if i % 50 == 0:
            elapsed = time.monotonic() - t0
            remaining = (elapsed / i) * (total - i)
            print(f"\n --- Progression: {i}/{total} | OK: {ok} | "
                  f"Erreurs: {ko} | Timeout: {skip_timeout} | "
                  f"Temps restant estimé: {remaining/60:.0f}min ---\n", flush=True)

    elapsed = time.monotonic() - t0
    n_outputs = sum(1 for _ in OUTDIR.glob("*.pseudonymise.txt"))
    print(f"\n{'='*60}")
    print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)")
    print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}")
    print(f"Total .pseudonymise.txt: {n_outputs}")
    print(f"Sortie: {OUTDIR}")
if __name__ == "__main__":
main()