Files
anonymisation/run_batch_59ogc.py

86 lines
2.7 KiB
Python

#!/usr/bin/env python3
"""Batch processing des 59 premiers OGC — script CLI pour test post-modifications."""
import sys
import time
import json
from pathlib import Path
from collections import Counter
sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from eds_pseudo_manager import EdsPseudoManager
# Root folder that is scanned for the numbered OGC sub-folders.
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
# Output folder, located inside the source folder.
OUTDIR = SRC.joinpath("anonymise")
# Dictionaries configuration path handed to the anonymizer as ``config_path``.
CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
def _list_ogc_dirs(root, limit):
    """Return up to *limit* OGC folders found directly under *root*.

    A folder qualifies when its name has the form ``<number>_<rest>``; the
    result is ordered by that numeric prefix.
    """
    numbered = []
    for entry in root.iterdir():
        prefix, sep, _rest = entry.name.partition("_")
        # isdecimal() guarantees int() succeeds, so a name such as "12a_x" is
        # skipped instead of aborting the whole run with ValueError.
        if sep and prefix.isdecimal() and entry.is_dir():
            numbered.append((int(prefix), entry))
    # Sort on the number only; folders sharing a prefix keep listing order.
    numbered.sort(key=lambda pair: pair[0])
    return [entry for _number, entry in numbered[:limit]]


def _count_audit_kinds(audit_path):
    """Count the ``"kind"`` value of every JSON line in *audit_path*."""
    counts = Counter()
    # NOTE(review): read with the platform default encoding, as before —
    # confirm it matches the encoding used when the audit file is written.
    for line in audit_path.read_text().splitlines():
        try:
            counts[json.loads(line)["kind"]] += 1
        except (ValueError, KeyError, TypeError):
            # Best effort: a line that is not valid JSON, has no "kind" key,
            # or is not a JSON object is left out of the statistics.
            continue
    return counts


def main(limit=59):
    """Anonymize every PDF of the first *limit* OGC folders and print a summary.

    Loads the EDS-Pseudo NER manager once, runs ``core.process_pdf`` on each
    PDF found in the selected folders of ``SRC`` (outputs go to ``OUTDIR``),
    then prints the elapsed time, the OK/error counts and the number of audit
    hits per kind.

    Args:
        limit: Maximum number of OGC folders to process, taken in increasing
            order of their numeric prefix.

    Raises:
        RuntimeError: If the EDS-Pseudo model is not loaded after ``load()``.
    """
    # Load EDS-Pseudo
    print("Chargement EDS-Pseudo...", flush=True)
    ner = EdsPseudoManager()
    ner.load()
    # Explicit raise rather than assert: asserts are stripped under ``-O``.
    if not ner.is_loaded():
        raise RuntimeError("EDS-Pseudo non chargé")
    print("EDS-Pseudo chargé.", flush=True)

    # List the first OGC folders
    ogc_dirs = _list_ogc_dirs(SRC, limit)
    print(f"Dossiers OGC: {len(ogc_dirs)}")

    # Collect every PDF, folder by folder, sorted by path inside each folder
    pdfs = [pdf for folder in ogc_dirs for pdf in sorted(folder.glob("*.pdf"))]
    total = len(pdfs)
    print(f"PDFs à traiter: {total}")

    OUTDIR.mkdir(exist_ok=True)
    ok = ko = 0
    global_counts = Counter()
    t0 = time.time()
    for i, pdf in enumerate(pdfs, 1):
        ogc = pdf.parent.name.split("_")[0]
        print(f"[{i}/{total}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)
        try:
            outputs = core.process_pdf(
                pdf_path=pdf,
                out_dir=OUTDIR,
                make_vector_redaction=False,
                also_make_raster_burn=True,
                config_path=CONFIG,
                use_hf=True,
                ner_manager=ner,
                ner_thresholds=None,
                ogc_label=ogc,
            )
            # Count the audit hits. A missing "audit" entry must not be turned
            # into Path(""): that is the current directory, which exists, and
            # reading it would flag a successfully processed PDF as an error.
            audit_ref = outputs.get("audit")
            if audit_ref and Path(audit_ref).is_file():
                global_counts.update(_count_audit_kinds(Path(audit_ref)))
            print("OK", flush=True)
            ok += 1
        except Exception as e:
            # Per-file boundary: one failing PDF must not stop the batch.
            print(f"ERREUR: {e}", flush=True)
            ko += 1
    elapsed = time.time() - t0

    print(f"\n{'='*60}")
    print(f"Terminé en {elapsed:.0f}s — OK: {ok}, Erreurs: {ko}")
    print(f"Total PII détectés: {sum(global_counts.values())}")
    print("\nDétail par type:")
    # most_common() orders by decreasing count and keeps first-seen order on ties.
    for kind, count in global_counts.most_common():
        # str() keeps the summary from crashing on a non-string "kind" value.
        print(f" {str(kind):30s} {count:6d}")
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()