#!/usr/bin/env python3 """Reprocess corpus audit_30 avec le code actuel. Lit la liste des documents depuis evaluation/baseline_scores.json, retrouve chaque PDF source dans le dossier des justificatifs CHCB, et appelle process_pdf() pour chacun. Sortie : un dossier horodaté sous /tmp/reprocess_audit30// avec les .pseudonymise.txt + .audit.jsonl + .redacted_raster.pdf pour pouvoir lancer ensuite `evaluate_quality.py --compare --dir `. Usage: python scripts/reprocess_audit30.py [--out /tmp/.../foo] Note : NER ONNX activé par défaut (gazetteers, INSEE, FINESS, BDPM). """ from __future__ import annotations import argparse import json import sys import time import traceback from datetime import datetime from pathlib import Path PROJECT_DIR = Path(__file__).parent.parent sys.path.insert(0, str(PROJECT_DIR)) from anonymizer_core_refactored_onnx import process_pdf, NerModelManager, NerThresholds # noqa: E402 BASELINE_PATH = PROJECT_DIR / "evaluation" / "baseline_scores.json" SOURCE_ROOT = Path( "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)" ) def list_baseline_docs() -> list[str]: data = json.loads(BASELINE_PATH.read_text(encoding="utf-8")) return list(data["per_file"].keys()) def find_source_pdf(doc_name: str) -> Path | None: """Cherche un PDF dont le stem matche doc_name dans SOURCE_ROOT.""" # Cas Trackare : nom complexe, on garde le pattern principal # Sinon : recherche directe sur le stem matches = list(SOURCE_ROOT.glob(f"**/{doc_name}.pdf")) if matches: return matches[0] # Tentative avec variantes (espaces vs underscores) alt = doc_name.replace("_", " ") matches = list(SOURCE_ROOT.glob(f"**/{alt}.pdf")) if matches: return matches[0] return None def main(): ap = argparse.ArgumentParser() ap.add_argument("--out", type=str, default=None) ap.add_argument("--no-ner", action="store_true", help="Désactiver NER ONNX (test gazetteers + regex seulement)") args = ap.parse_args() timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") out_dir = Path(args.out) if args.out else Path(f"/tmp/reprocess_audit30/{timestamp}") out_dir.mkdir(parents=True, exist_ok=True) docs = list_baseline_docs() print(f"Reprocess {len(docs)} docs from baseline → {out_dir}") print(f"NER ONNX : {'OFF' if args.no_ner else 'ON (gazetteers + regex + NER)'}") print("=" * 70) # NER ONNX ner = None if not args.no_ner and NerModelManager is not None: try: ner = NerModelManager(cache_dir=PROJECT_DIR / "models") ner.load("cmarkea/distilcamembert-base-ner") print("NER ONNX loaded.") except Exception as e: print(f"NER ONNX failed to load: {e} → fallback regex+gazetteers only") ner = None ner_thresholds = NerThresholds() if NerThresholds and ner else None results = [] missing = [] failed = [] t_start = time.time() for i, doc_name in enumerate(docs, 1): pdf = find_source_pdf(doc_name) if pdf is None: missing.append(doc_name) print(f"[{i:3}/{len(docs)}] MISSING : {doc_name}") continue t0 = time.time() try: out = process_pdf( pdf, out_dir, make_vector_redaction=True, also_make_raster_burn=True, use_hf=bool(ner), ner_manager=ner, ner_thresholds=ner_thresholds, ) dt = time.time() - t0 status = out.get("status", "ok") if isinstance(out, dict) else "ok" print(f"[{i:3}/{len(docs)}] {status:11} ({dt:5.1f}s) : {doc_name}") results.append((doc_name, status, dt)) except Exception as e: dt = time.time() - t0 failed.append((doc_name, str(e))) print(f"[{i:3}/{len(docs)}] FAILED ({dt:5.1f}s) : {doc_name} — {e}") traceback.print_exc() t_total = time.time() - t_start print("=" * 70) print(f"Total time : {t_total:.1f}s ({t_total/60:.1f} min)") print(f"OK : {len(results)}") print(f"Missing : {len(missing)}") print(f"Failed : {len(failed)}") print() print(f"Output dir : {out_dir}") print() print("Pour évaluer la qualité :") print(f" python scripts/evaluate_quality.py --dir {out_dir} --compare") if __name__ == "__main__": main()