#!/usr/bin/env python3
"""Run the anonymizer on a fixed sample of 30 PDFs for human review.

The script loads the EDS-Pseudo NER manager once, runs
``core.process_pdf`` on every sample file that exists on disk, and prints a
summary: number of files processed, skipped (encrypted) or failed, and the
number of detections per ``kind`` read from each file's audit output.
"""
import sys
import time
import json
from pathlib import Path
from collections import Counter

# The project modules live next to this script; make them importable before
# the project imports below.
sys.path.insert(0, str(Path(__file__).parent))

import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager

SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTDIR = SRC / "anonymise_audit_30"
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")

PDFS = [
    SRC / "110_23061319/trackare-07026002-23061319_07026002_23061319.pdf",
    SRC / "115_23066188/CRH 23066188.pdf",
    SRC / "161_23098838/CRO 23098838.pdf",
    SRC / "179_23126805/trackare-23005591-23126805_23005591_23126805.pdf",
    SRC / "181_23127286/CRH 23127286.pdf",
    SRC / "192_23132490/CRH 23132490.pdf",
    SRC / "208_23151988/trackare-23020064-23151988_23020064_23151988.pdf",
    SRC / "215_23158603/trackare-22028007-23158603_22028007_23158603.pdf",
    SRC / "227_23173599/CRH 23173599.pdf",
    SRC / "236_23116794/trackare-BA054633-23116794_BA054633_23116794.pdf",
    SRC / "248_23194278/CRH 23194278.pdf",
    SRC / "263_23203642/CRO 23203642.pdf",
    SRC / "28_23135549/trackare-15021750-23135549_15021750_23135549.pdf",
    SRC / "321_23043929/CRH 321_23066387.pdf",
    SRC / "379_23098754/trackare-18009635-23098754_18009635_23098754.pdf",
    SRC / "39_23167029/trackare-23022121-23167029_23022121_23167029.pdf",
    SRC / "444_23141032/trackare-BA102259-23141032_BA102259_23141032.pdf",
    SRC / "478_23161697/cro 478_23161697.pdf",
    SRC / "50_23219173/trackare-07019278-23219173_07019278_23219173.pdf",
    SRC / "520_23177582/trackare-99252128-23177582_99252128_23177582.pdf",
    SRC / "556_23220878/trackare-21041742-23220878_21041742_23220878.pdf",
    SRC / "602_23070052/trackare-20028293-23070052_20028293_23070052.pdf",
    SRC / "604_23070704/trackare-23008170-23070704_23008170_23070704.pdf",
    SRC / "655_23163458/trackare-01296746-23163458_01296746_23163458.pdf",
    SRC / "684_23207941/CRH 684_23207941.pdf",
    SRC / "79_23187785/79_23187785 Dossier.pdf",
    # When the first candidate is absent this falls back to the same path as
    # the next entry; main() de-duplicates so the file is processed only once.
    (
        SRC / "12_23084754/CRO 23084754.pdf"
        if (SRC / "12_23084754/CRO 23084754.pdf").exists()
        else SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf"
    ),
    SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf",
    SRC / "131_23079402/CRH 23079402.pdf",
    SRC / "290_23025988/cr anesth 290_23025988.pdf",
]


def _count_audit_kinds(audit_path: Path) -> Counter:
    """Count the ``"kind"`` values found in a line-oriented JSON audit file.

    Args:
        audit_path: Path of the audit file to read.

    Returns:
        A Counter mapping each ``kind`` value to its number of occurrences.
        The Counter is empty when ``audit_path`` is not a regular file.
        Lines that are not valid JSON, or that carry no usable ``"kind"``
        key, are skipped.
    """
    counts = Counter()
    if not audit_path.is_file():
        return counts
    # NOTE(review): assumes the audit file is UTF-8 encoded -- confirm against
    # the code that writes it.
    for line in audit_path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        try:
            counts[json.loads(line)["kind"]] += 1
        except (ValueError, KeyError, TypeError):
            # Best effort: malformed JSON (ValueError), a missing "kind" key
            # (KeyError) or a non-object / unhashable value (TypeError) must
            # not abort the batch.
            continue
    return counts


def _is_encrypted_error(exc: Exception) -> bool:
    """Return True when the exception message mentions encryption or a password."""
    message = str(exc).lower()
    return "encrypted" in message or "password" in message


def main() -> None:
    """Process every available sample PDF and print a summary report.

    Raises:
        RuntimeError: if the EDS-Pseudo manager reports that it is not loaded.
    """
    print("Chargement EDS-Pseudo...", flush=True)
    ner = EdsPseudoManager()
    ner.load()
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not ner.is_loaded():
        raise RuntimeError("EDS-Pseudo non chargé")
    print("EDS-Pseudo chargé.\n", flush=True)

    # Drop duplicate paths while keeping the declared order, so that no file
    # is processed (and counted) twice.
    targets = list(dict.fromkeys(PDFS))

    # Check which files are actually present on disk.
    existing = [p for p in targets if p.exists()]
    missing = [p for p in targets if not p.exists()]
    if missing:
        print(f"ATTENTION: {len(missing)} fichiers manquants:")
        for p in missing:
            print(f" - {p.name}")
        print()
    print(f"Fichiers à traiter: {len(existing)}/{len(targets)}\n")

    OUTDIR.mkdir(exist_ok=True)

    ok = ko = skip_encrypted = 0
    global_counts = Counter()
    t0 = time.perf_counter()

    for i, pdf in enumerate(existing, 1):
        # The parent folder is named "<ogc>_<id>"; keep the part before "_".
        ogc = pdf.parent.name.split("_")[0]
        print(f"[{i}/{len(existing)}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)
        try:
            outputs = core.process_pdf(
                pdf_path=pdf,
                out_dir=OUTDIR,
                make_vector_redaction=False,
                also_make_raster_burn=True,
                config_path=CONFIG,
                use_hf=True,
                ner_manager=ner,
                ner_thresholds=None,
                ogc_label=ogc,
            )
            # A missing/empty "audit" entry must not be turned into Path(""),
            # which is the current directory and therefore "exists".
            audit = outputs.get("audit")
            if audit:
                global_counts.update(_count_audit_kinds(Path(audit)))
            print("OK", flush=True)
            ok += 1
        except Exception as e:
            # Per-file boundary: one failing PDF must not stop the batch.
            if _is_encrypted_error(e):
                print("SKIP (chiffré)", flush=True)
                skip_encrypted += 1
            else:
                print(f"ERREUR: {e}", flush=True)
                ko += 1

    elapsed = time.perf_counter() - t0
    print(f"\n{'='*60}")
    print(f"Terminé en {elapsed:.0f}s — OK: {ok}, Chiffrés: {skip_encrypted}, Erreurs: {ko}")
    print(f"Total PII détectés: {sum(global_counts.values())}")
    print("\nDétail par type:")
    # most_common() yields kinds by decreasing count, ties in insertion order.
    for k, v in global_counts.most_common():
        print(f" {k:30s} {v:6d}")
    print(f"\nSortie: {OUTDIR}")


if __name__ == "__main__":
    main()