#!/usr/bin/env python3
"""Batch-process the first 59 OGC folders — CLI script for post-change testing."""
import json
import sys
import time
from collections import Counter
from pathlib import Path
from typing import Optional

# Put this script's own directory first on sys.path so the two project modules
# below resolve regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent))

import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager

SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTDIR = SRC / "anonymise"
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")


def _ogc_number(directory: Path) -> Optional[int]:
    """Return the integer prefix of an OGC folder name, or None if it has none.

    A name qualifies when it contains an underscore, starts with a digit, and
    the text before the first underscore parses as an integer. Names such as
    ``12a_foo`` start with a digit but do not parse; they yield None instead
    of raising while the folders are being sorted.
    """
    name = directory.name
    if "_" not in name or not name[0].isdigit():
        return None
    try:
        return int(name.split("_")[0])
    except ValueError:
        return None


def _count_audit_kinds(audit_path: Path) -> Counter:
    """Count the ``kind`` value of every JSON line in *audit_path*.

    Lines that are not valid JSON, are not objects, or have no usable
    ``kind`` entry are skipped: the tally is best-effort statistics and must
    not abort the batch.
    """
    counts = Counter()
    # errors="replace" keeps a stray non-UTF-8 byte from failing the whole file;
    # the affected line simply may not parse and is then skipped below.
    text = audit_path.read_text(encoding="utf-8", errors="replace")
    for line in text.splitlines():
        try:
            counts[json.loads(line)["kind"]] += 1
        except (ValueError, KeyError, TypeError):
            # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
            # KeyError: object without "kind".
            # TypeError: line is not an object, or "kind" is unhashable.
            continue
    return counts


def main(max_dirs: int = 59) -> None:
    """Anonymise every PDF of the first *max_dirs* OGC folders and print a summary.

    Folders directly under ``SRC`` are selected by ``_ogc_number`` and ordered
    by that number. Each ``*.pdf`` inside them is passed to
    ``core.process_pdf`` with ``OUTDIR`` as output directory. A failure on one
    PDF is reported and counted, and the batch continues.

    Args:
        max_dirs: Maximum number of OGC folders to process (default 59, the
            value the script was originally hard-coded to).

    Raises:
        RuntimeError: If the EDS-Pseudo model is not loaded after ``load()``.
    """
    # Load EDS-Pseudo
    print("Chargement EDS-Pseudo...", flush=True)
    ner = EdsPseudoManager()
    ner.load()
    # Explicit raise rather than `assert`: assertions are removed under -O.
    if not ner.is_loaded():
        raise RuntimeError("EDS-Pseudo non chargé")
    print("EDS-Pseudo chargé.", flush=True)

    # List the first `max_dirs` OGC folders, ordered by numeric prefix
    numbered = []
    for entry in SRC.iterdir():
        if not entry.is_dir():
            continue
        number = _ogc_number(entry)
        if number is not None:
            numbered.append((number, entry))
    numbered.sort(key=lambda pair: pair[0])
    ogc_dirs = [entry for _, entry in numbered[:max_dirs]]
    print(f"Dossiers OGC: {len(ogc_dirs)}")

    # Collect all PDFs, sorted within each folder
    pdfs = [pdf for d in ogc_dirs for pdf in sorted(d.glob("*.pdf"))]
    print(f"PDFs à traiter: {len(pdfs)}")

    OUTDIR.mkdir(exist_ok=True)

    ok = ko = 0
    global_counts = Counter()
    t0 = time.time()

    for i, pdf in enumerate(pdfs, 1):
        ogc = pdf.parent.name.split("_")[0]
        print(f"[{i}/{len(pdfs)}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)
        try:
            outputs = core.process_pdf(
                pdf_path=pdf,
                out_dir=OUTDIR,
                make_vector_redaction=False,
                also_make_raster_burn=True,
                config_path=CONFIG,
                use_hf=True,
                ner_manager=ner,
                ner_thresholds=None,
                ogc_label=ogc,
            )
            # Count audit hits.
            # NOTE(review): assumes `outputs` is a mapping whose optional
            # "audit" entry is the path of a JSON-lines file — confirm against
            # core.process_pdf.
            audit = outputs.get("audit")
            if audit:
                audit_path = Path(audit)
                # is_file(), not exists(): an empty value used to become
                # Path("."), which exists but is a directory, so a successful
                # PDF was reported as an error when reading it failed.
                if audit_path.is_file():
                    global_counts.update(_count_audit_kinds(audit_path))
            print("OK", flush=True)
            ok += 1
        except Exception as e:
            # Per-PDF boundary: report the failure and keep the batch running.
            print(f"ERREUR: {e}", flush=True)
            ko += 1

    elapsed = time.time() - t0
    print(f"\n{'='*60}")
    print(f"Terminé en {elapsed:.0f}s — OK: {ok}, Erreurs: {ko}")
    print(f"Total PII détectés: {sum(global_counts.values())}")
    print("\nDétail par type:")
    # most_common() orders by descending count; ties keep first-seen order.
    for k, v in global_counts.most_common():
        # str(): a non-string "kind" would otherwise break the `s` format spec.
        print(f" {str(k):30s} {v:6d}")


if __name__ == "__main__":
    main()