#!/usr/bin/env python3
"""Batch-anonymise PDFs to enrich the silver dataset.

Processes ALL available PDFs (excluding those already in audit_30) in
CPU-only mode (no VLM) to generate .pseudonymise.txt files usable by
export_silver_annotations.py.

A per-file timeout avoids getting stuck on large documents.
The script resumes automatically where it stopped (already-processed files
are skipped).
"""

import sys
import time
import signal
import random
from pathlib import Path
from collections import Counter

# Make the sibling project modules importable when run as a script.
sys.path.insert(0, str(Path(__file__).parent))

import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager
from gliner_manager import GlinerManager
from camembert_ner_manager import CamembertNerManager

SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTDIR = SRC / "anonymise_silver_extra"
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")

# PDFs already processed in the audit-30 run (to be excluded).
ALREADY_DONE_AUDIT30 = {
    "CONSULTATION ANESTHESISTE 23060661.pdf",
    "trackare-05000272-23074376_05000272_23074376.pdf",
    "CONSULTATION ANESTHESISTE 23056022.pdf",
    "trackare-BA042686-23090597_BA042686_23090597.pdf",
    "trackare-23000862-23018396_23000862_23018396.pdf",
    "LETTRE DE SORTIE 23087212.pdf",
    "CRO 23159905.pdf",
    "trackare-99246761-23159905_99246761_23159905.pdf",
    "CONSULTATION ANESTHESISTE 23139653.pdf",
    "CRO 23160703.pdf",
    "trackare-BA192486-23127395_BA192486_23127395.pdf",
    "BACTERIO 23232115.pdf",
    "CR consultation anesth-290-23025988.pdf",
    "trackare-05012965-23060770_05012965_23060770.pdf",
    "trackare-BA065989-23102874_BA065989_23102874.pdf",
    "trackare-BA127127-23135726_BA127127_23135726.pdf",
    "trackare-99252128-23177582_99252128_23177582.pdf",
    "trackare-BA171849-23214501_BA171849_23214501.pdf",
    "trackare-17015185-23043950_17015185_23043950.pdf",
    "CRH 60_23106634.pdf",
    "trackare-00260974-23070213_00260974_23070213.pdf",
    "trackare-BA067657-23076655_BA067657_23076655.pdf",
    "trackare-05012679-23098722_05012679_23098722.pdf",
    "trackare-11004431-23124019_11004431_23124019.pdf",
    "trackare-07003136-23135847_07003136_23135847.pdf",
    "trackare-13013848-23165708_13013848_23165708.pdf",
    "trackare-03020576-23175616_03020576_23175616.pdf",
    "trackare-BA093659-23074520_BA093659_23074520.pdf",
    "trackare-14025311-23034958_14025311_23034958.pdf",
    "trackare-BA121804-23016863_BA121804_23016863.pdf",
}

TIMEOUT_PER_FILE = 120  # maximum seconds per PDF


class TimeoutError(Exception):
    """Raised by ``timeout_handler`` when the per-file alarm fires.

    NOTE: this deliberately keeps its historical name, which shadows the
    builtin ``TimeoutError`` inside this module. A builtin ``TimeoutError``
    raised by a library is therefore NOT caught by ``except TimeoutError``
    here; it falls through to the generic ``except Exception`` branch.
    """


def timeout_handler(signum, frame):
    """SIGALRM handler: abort the current PDF by raising ``TimeoutError``.

    Args:
        signum: Signal number (unused).
        frame: Interrupted stack frame (unused).

    Raises:
        TimeoutError: always.
    """
    raise TimeoutError("Timeout")


def _collect_candidate_pdfs():
    """Return every PDF under ``SRC`` sub-directories not in audit_30.

    Sub-directories whose name starts with ``anonymise`` (output folders)
    are skipped. The result is sorted by (directory name, file name) so
    successive runs process files in the same order.
    """
    candidates = [
        pdf
        for ogc_dir in sorted(SRC.iterdir())
        if ogc_dir.is_dir() and not ogc_dir.name.startswith("anonymise")
        for pdf in ogc_dir.glob("*.pdf")
        if pdf.name not in ALREADY_DONE_AUDIT30
    ]
    candidates.sort(key=lambda p: (p.parent.name, p.name))
    return candidates


def _load_optional(manager_cls, loading_name, failure_name):
    """Instantiate and load an optional NER manager.

    Args:
        manager_cls: Manager class to instantiate (no-argument constructor).
        loading_name: Label used in the "loading"/"loaded" messages.
        failure_name: Label used in the "unavailable" message.

    Returns:
        The loaded manager, or ``None`` if ``load()`` raised.
    """
    print(f"Chargement {loading_name}...", flush=True)
    manager = manager_cls()
    try:
        manager.load()
    except Exception as e:  # best effort: the batch can run without it
        print(f"{failure_name} indisponible ({e}), on continue sans.", flush=True)
        return None
    print(f"{loading_name} chargé.", flush=True)
    return manager


def _load_models():
    """Load the NER models (CPU only, no VLM).

    Returns:
        Tuple ``(ner, gliner, camembert)``; the last two may be ``None``
        when the corresponding model failed to load.

    Raises:
        RuntimeError: if EDS-Pseudo reports it is not loaded.
    """
    print("\nChargement EDS-Pseudo...", flush=True)
    ner = EdsPseudoManager()
    ner.load()
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if not ner.is_loaded():
        raise RuntimeError("EDS-Pseudo non chargé")
    print("EDS-Pseudo chargé.", flush=True)

    gliner = _load_optional(GlinerManager, "GLiNER", "GLiNER")
    camembert = _load_optional(
        CamembertNerManager, "CamemBERT-bio ONNX", "CamemBERT-bio"
    )
    return ner, gliner, camembert


def _process_with_timeout(pdf, ogc, ner, gliner, camembert):
    """Run ``core.process_pdf`` on one PDF under the per-file alarm.

    The alarm is always cancelled in ``finally``, before any caller-side
    exception handler runs, so a late SIGALRM cannot fire while the caller
    is reporting an unrelated error (or after a KeyboardInterrupt).

    Assumes ``timeout_handler`` is installed for SIGALRM (``main`` does it).

    Raises:
        TimeoutError: if processing exceeds ``TIMEOUT_PER_FILE`` seconds.
        Exception: whatever ``core.process_pdf`` raises.
    """
    signal.alarm(TIMEOUT_PER_FILE)
    try:
        core.process_pdf(
            pdf_path=pdf,
            out_dir=OUTDIR,
            make_vector_redaction=False,
            also_make_raster_burn=False,
            config_path=CONFIG,
            use_hf=True,
            ner_manager=ner,
            ner_thresholds=None,
            ogc_label=ogc,
            vlm_manager=None,
            gliner_manager=gliner,
            camembert_manager=camembert,
        )
    finally:
        signal.alarm(0)


def main():
    """Anonymise every pending PDF and print a progress/summary report.

    Steps: collect candidate PDFs, drop those that already have a
    ``.pseudonymise.txt`` in ``OUTDIR`` (resume), load the NER models, then
    process each remaining PDF with a ``TIMEOUT_PER_FILE`` second limit.
    Per-file failures are counted and reported but never stop the batch.
    """
    all_pdfs = _collect_candidate_pdfs()

    # Resume support: an output file named <stem>.pseudonymise.txt marks
    # <stem>.pdf as already processed.
    OUTDIR.mkdir(exist_ok=True)
    already_done = {
        p.name.replace(".pseudonymise.txt", ".pdf")
        for p in OUTDIR.glob("*.pseudonymise.txt")
    }
    pdfs_to_do = [p for p in all_pdfs if p.name not in already_done]

    print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)")
    print(f"Déjà traités: {len(already_done)}")
    print(f"Restant: {len(pdfs_to_do)}")

    if not pdfs_to_do:
        print("Rien à faire.")
        return

    ner, gliner, camembert = _load_models()
    print("\nPas de VLM (CPU only pour silver export).\n", flush=True)

    # Loop-invariant: install the SIGALRM handler once for the whole batch.
    signal.signal(signal.SIGALRM, timeout_handler)

    ok = ko = skip_encrypted = skip_timeout = 0
    t0 = time.time()
    total = len(pdfs_to_do)

    for i, pdf in enumerate(pdfs_to_do, 1):
        # OGC label: parent directory name up to the first underscore.
        ogc = pdf.parent.name.split("_")[0]
        print(f"[{i}/{total}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)

        try:
            _process_with_timeout(pdf, ogc, ner, gliner, camembert)
        except TimeoutError:
            print(f"TIMEOUT ({TIMEOUT_PER_FILE}s)", flush=True)
            skip_timeout += 1
        except Exception as e:  # per-file boundary: keep the batch going
            err = str(e).lower()
            if "encrypted" in err or "password" in err:
                print("SKIP (chiffré)", flush=True)
                skip_encrypted += 1
            else:
                print(f"ERREUR: {e}", flush=True)
                ko += 1
        else:
            # Count this file BEFORE computing the rate so the figure
            # includes the PDF that just finished.
            ok += 1
            elapsed_total = time.time() - t0
            rate = ok / elapsed_total * 3600 if elapsed_total > 0 else 0
            print(f"OK ({rate:.0f}/h)", flush=True)

        # Intermediate report every 50 files.
        if i % 50 == 0:
            elapsed = time.time() - t0
            remaining = (elapsed / i) * (total - i)
            print(f"\n --- Progression: {i}/{total} | OK: {ok} | "
                  f"Erreurs: {ko} | Timeout: {skip_timeout} | "
                  f"Temps restant estimé: {remaining/60:.0f}min ---\n", flush=True)

    elapsed = time.time() - t0
    produced = sum(1 for _ in OUTDIR.glob("*.pseudonymise.txt"))
    print(f"\n{'='*60}")
    print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)")
    print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}")
    print(f"Total .pseudonymise.txt: {produced}")
    print(f"Sortie: {OUTDIR}")


if __name__ == "__main__":
    main()