From 2f96f56432e6f85feb7bc614fc9ea53d64f2cffa Mon Sep 17 00:00:00 2001
From: Domi31tls
Date: Tue, 2 Jun 2026 14:26:02 +0200
Subject: [PATCH] chore(scripts): add reprocess_audit30.py for quality regression testing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Petit utilitaire pour re-traiter le corpus audit_30 avec le code courant
et générer un dossier de sortie horodaté.

Usage:
  python scripts/reprocess_audit30.py [--out /tmp/.../foo] [--no-ner]

Lit la liste des 29 docs depuis evaluation/baseline_scores.json, retrouve
chaque PDF source dans /home/dom/Téléchargements/.../CHCB_DocJustificatifs,
appelle process_pdf() pour chacun, sortie dans /tmp/reprocess_audit30/
(ou --out).

Permet ensuite de mesurer la qualité avec :
  python scripts/evaluate_quality.py --dir OUT_DIR --compare

Validé sur audit_30 — 29 docs en ~4 min avec NER ONNX.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 scripts/reprocess_audit30.py | 132 +++++++++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)
 create mode 100644 scripts/reprocess_audit30.py

diff --git a/scripts/reprocess_audit30.py b/scripts/reprocess_audit30.py
new file mode 100644
index 0000000..7e455b6
--- /dev/null
+++ b/scripts/reprocess_audit30.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""Reprocess corpus audit_30 avec le code actuel.
+
+Lit la liste des documents depuis evaluation/baseline_scores.json, retrouve
+chaque PDF source dans le dossier des justificatifs CHCB, et appelle
+process_pdf() pour chacun.
+
+Sortie : un dossier horodaté sous /tmp/reprocess_audit30/YYYYMMDD-HHMMSS/ avec
+les .pseudonymise.txt + .audit.jsonl + .redacted_raster.pdf pour pouvoir
+lancer ensuite `evaluate_quality.py --compare --dir OUT_DIR`.
+
+Usage:
+    python scripts/reprocess_audit30.py [--out /tmp/.../foo] [--no-ner]
+
+Note : NER ONNX activé par défaut (gazetteers, INSEE, FINESS, BDPM).
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import sys
+import time
+import traceback
+from datetime import datetime
+from pathlib import Path
+
+PROJECT_DIR = Path(__file__).parent.parent
+sys.path.insert(0, str(PROJECT_DIR))
+
+from anonymizer_core_refactored_onnx import process_pdf, NerModelManager, NerThresholds  # noqa: E402
+
+BASELINE_PATH = PROJECT_DIR / "evaluation" / "baseline_scores.json"
+SOURCE_ROOT = Path(
+    "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)"
+)
+
+
+def list_baseline_docs() -> list[str]:
+    """Return the document names keyed under "per_file" in BASELINE_PATH."""
+    data = json.loads(BASELINE_PATH.read_text(encoding="utf-8"))
+    return list(data["per_file"])
+
+
+def find_source_pdf(doc_name: str) -> Path | None:
+    """Return the PDF named ``doc_name`` found under SOURCE_ROOT, or None.
+
+    Tries the exact name, then the name with underscores replaced by spaces.
+    Glob metacharacters are matched literally; among several matches the
+    smallest path wins so that repeated runs pick the same file.
+    """
+    for stem in (doc_name, doc_name.replace("_", " ")):
+        match = min(SOURCE_ROOT.glob(f"**/{glob.escape(stem)}.pdf"), default=None)
+        if match is not None:
+            return match
+    return None
+
+
+def main() -> int:
+    """Reprocess every baseline document and print a summary.
+
+    Returns 1 when at least one document raised during processing, else 0.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--out", type=str, default=None)
+    ap.add_argument("--no-ner", action="store_true",
+                    help="Désactiver NER ONNX (test gazetteers + regex seulement)")
+    args = ap.parse_args()
+
+    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+    out_dir = Path(args.out) if args.out else Path(f"/tmp/reprocess_audit30/{timestamp}")
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    docs = list_baseline_docs()
+    print(f"Reprocess {len(docs)} docs from baseline → {out_dir}")
+    print(f"NER ONNX : {'OFF' if args.no_ner else 'ON (gazetteers + regex + NER)'}")
+    print("=" * 70)
+
+    # Load the NER model unless disabled; on failure continue without it.
+    ner = None
+    if not args.no_ner and NerModelManager is not None:
+        try:
+            ner = NerModelManager(cache_dir=PROJECT_DIR / "models")
+            ner.load("cmarkea/distilcamembert-base-ner")
+            print("NER ONNX loaded.")
+        except Exception as e:
+            print(f"NER ONNX failed to load: {e} → fallback regex+gazetteers only")
+            ner = None
+
+    ner_thresholds = NerThresholds() if NerThresholds and ner else None
+
+    results, missing, failed = [], [], []
+    t_start = time.perf_counter()  # monotonic: immune to system clock changes
+
+    for i, doc_name in enumerate(docs, 1):
+        pdf = find_source_pdf(doc_name)
+        if pdf is None:
+            missing.append(doc_name)
+            print(f"[{i:3}/{len(docs)}] MISSING : {doc_name}")
+            continue
+        t0 = time.perf_counter()
+        try:
+            out = process_pdf(
+                pdf, out_dir, make_vector_redaction=True, also_make_raster_burn=True,
+                use_hf=bool(ner), ner_manager=ner, ner_thresholds=ner_thresholds,
+            )
+            dt = time.perf_counter() - t0
+            status = out.get("status", "ok") if isinstance(out, dict) else "ok"
+            print(f"[{i:3}/{len(docs)}] {status:11} ({dt:5.1f}s) : {doc_name}")
+            results.append((doc_name, status, dt))
+        except Exception as e:
+            # Keep going: one bad document must not abort the whole corpus run.
+            dt = time.perf_counter() - t0
+            failed.append((doc_name, str(e)))
+            print(f"[{i:3}/{len(docs)}] FAILED ({dt:5.1f}s) : {doc_name} — {e}")
+            traceback.print_exc()
+
+    t_total = time.perf_counter() - t_start
+    print("=" * 70)
+    print(f"Total time : {t_total:.1f}s ({t_total/60:.1f} min)")
+    print(f"OK : {len(results)}")
+    print(f"Missing : {len(missing)}")
+    print(f"Failed : {len(failed)}")
+    print()
+    print(f"Output dir : {out_dir}")
+    print("\nPour évaluer la qualité :")
+    print(f" python scripts/evaluate_quality.py --dir {out_dir} --compare")
+    return 1 if failed else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())