# anonymisation/run_batch_59ogc.py

#!/usr/bin/env python3
"""Batch processing des 59 premiers OGC — script CLI pour test post-modifications."""
import sys
import time
import json
from pathlib import Path
from collections import Counter

sys.path.insert(0, str(Path(__file__).parent))

import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager

# Root folder that contains the numbered OGC sub-folders to process.
SRC: Path = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
# Destination folder for the anonymised outputs (created inside SRC).
OUTDIR: Path = SRC.joinpath("anonymise")
# YAML dictionaries file handed to the anonymiser as its configuration.
CONFIG: Path = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")

def _ogc_number(name):
    """Return the integer OGC number prefixing *name*, or None.

    A folder name qualifies when it starts with a digit, contains an
    underscore, and the text before the first underscore parses as an int.
    """
    if not name or not name[0].isdigit() or "_" not in name:
        return None
    try:
        return int(name.split("_")[0])
    except ValueError:
        # e.g. "12a_foo": starts with a digit but is not a plain number.
        return None


def _list_ogc_dirs(src, limit=59):
    """Return the first *limit* OGC folders of *src*, ordered by OGC number.

    Folders whose prefix is not a valid integer are ignored instead of
    aborting the whole batch. Ties on the number are broken by folder name
    so the selection is deterministic.
    """
    numbered = []
    for entry in src.iterdir():
        if not entry.is_dir():
            continue
        number = _ogc_number(entry.name)
        if number is not None:
            numbered.append((number, entry.name, entry))
    numbered.sort(key=lambda item: item[:2])
    return [entry for _, _, entry in numbered[:limit]]


def _count_audit_kinds(audit_value):
    """Count the "kind" field of every JSON line of an audit file.

    Args:
        audit_value: path of the audit file as reported by the anonymiser;
            may be missing/empty when no audit was produced.

    Returns:
        A Counter mapping each kind to its number of occurrences. Empty when
        there is no audit file. Lines that are not valid JSON or carry no
        usable "kind" entry are skipped (best-effort statistics).
    """
    counts = Counter()
    if not audit_value:
        return counts
    audit_path = Path(audit_value)
    # is_file() rather than exists(): an empty value used to become Path("."),
    # which exists but is a directory and made the read fail.
    if not audit_path.is_file():
        return counts
    # NOTE(review): the audit is assumed to be UTF-8 line-delimited JSON —
    # confirm against the writer in the anonymiser core.
    with audit_path.open(encoding="utf-8") as handle:
        for line in handle:
            try:
                counts[json.loads(line)["kind"]] += 1
            except (ValueError, KeyError, TypeError):
                # Malformed line, missing key or non-dict record: ignore it.
                continue
    return counts


def main():
    """Anonymise the PDFs of the first 59 OGC folders and print a summary.

    Loads the EDS-Pseudo NER manager, collects every ``*.pdf`` found directly
    inside the first 59 numbered folders of ``SRC``, runs
    ``core.process_pdf`` on each of them with ``OUTDIR`` as destination, and
    finally prints the elapsed time, the OK/error totals and the number of
    audit hits per kind.

    A failure on one PDF is reported and counted but does not stop the batch.

    Raises:
        FileNotFoundError: if ``SRC`` is not an existing directory.
        RuntimeError: if the EDS-Pseudo model could not be loaded.
    """
    # Fail fast before paying for the model load.
    if not SRC.is_dir():
        raise FileNotFoundError(f"Dossier source introuvable: {SRC}")

    # Load EDS-Pseudo.
    print("Chargement EDS-Pseudo...", flush=True)
    ner = EdsPseudoManager()
    ner.load()
    # Explicit raise instead of assert: asserts are removed under `python -O`.
    if not ner.is_loaded():
        raise RuntimeError("EDS-Pseudo non chargé")
    print("EDS-Pseudo chargé.", flush=True)

    # Select the first 59 OGC folders.
    ogc_dirs = _list_ogc_dirs(SRC, limit=59)
    print(f"Dossiers OGC: {len(ogc_dirs)}")

    # Collect every PDF, folder by folder, in name order.
    pdfs = [pdf for folder in ogc_dirs for pdf in sorted(folder.glob("*.pdf"))]
    print(f"PDFs à traiter: {len(pdfs)}")

    OUTDIR.mkdir(exist_ok=True)
    ok = ko = 0
    global_counts = Counter()
    t0 = time.perf_counter()

    for i, pdf in enumerate(pdfs, 1):
        ogc = pdf.parent.name.split("_")[0]
        print(f"[{i}/{len(pdfs)}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)
        try:
            outputs = core.process_pdf(
                pdf_path=pdf,
                out_dir=OUTDIR,
                make_vector_redaction=False,
                also_make_raster_burn=True,
                config_path=CONFIG,
                use_hf=True,
                ner_manager=ner,
                ner_thresholds=None,
                ogc_label=ogc,
            )
            # Accumulate the audit hits of this document.
            global_counts.update(_count_audit_kinds(outputs.get("audit")))
            print("OK", flush=True)
            ok += 1
        except Exception as e:
            # Batch boundary: report the failure and move on to the next PDF.
            print(f"ERREUR: {e}", flush=True)
            ko += 1

    elapsed = time.perf_counter() - t0
    print(f"\n{'='*60}")
    print(f"Terminé en {elapsed:.0f}s — OK: {ok}, Erreurs: {ko}")
    print(f"Total PII détectés: {sum(global_counts.values())}")
    print("\nDétail par type:")
    # most_common() yields kinds by decreasing count.
    for k, v in global_counts.most_common():
        print(f"  {str(k):30s} {v:6d}")

# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()