chore(scripts): add reprocess_audit30.py for quality regression testing
Petit utilitaire pour re-traiter le corpus audit_30 avec le code courant
et générer un dossier de sortie horodaté.
Usage:
python scripts/reprocess_audit30.py [--out /tmp/.../foo] [--no-ner]
Lit la liste des 29 docs depuis evaluation/baseline_scores.json, retrouve
chaque PDF source dans /home/dom/Téléchargements/.../CHCB_DocJustificatifs,
appelle process_pdf() pour chacun, sortie dans /tmp/reprocess_audit30/<timestamp>/
(ou --out).
Permet ensuite de mesurer la qualité avec :
python scripts/evaluate_quality.py --dir <output> --compare
Validé sur audit_30 — 29 docs en ~4 min avec NER ONNX.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
132
scripts/reprocess_audit30.py
Normal file
132
scripts/reprocess_audit30.py
Normal file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Reprocess corpus audit_30 avec le code actuel.
|
||||
|
||||
Lit la liste des documents depuis evaluation/baseline_scores.json, retrouve
|
||||
chaque PDF source dans le dossier des justificatifs CHCB, et appelle
|
||||
process_pdf() pour chacun.
|
||||
|
||||
Sortie : un dossier horodaté sous /tmp/reprocess_audit30/<timestamp>/ avec
|
||||
les .pseudonymise.txt + .audit.jsonl + .redacted_raster.pdf pour pouvoir
|
||||
lancer ensuite `evaluate_quality.py --compare --dir <ce dossier>`.
|
||||
|
||||
Usage:
|
||||
python scripts/reprocess_audit30.py [--out /tmp/.../foo] [--no-ner]
|
||||
|
||||
Note : NER ONNX activé par défaut (gazetteers, INSEE, FINESS, BDPM).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Repository root: two levels above this file (the script is run as
# scripts/reprocess_audit30.py).
PROJECT_DIR = Path(__file__).parent.parent

# Put the repository root first on sys.path so the project module imported
# below resolves when this file is executed directly as a script.
sys.path.insert(0, str(PROJECT_DIR))

from anonymizer_core_refactored_onnx import (  # noqa: E402
    NerModelManager,
    NerThresholds,
    process_pdf,
)
|
||||
|
||||
# JSON file listing the documents to reprocess.
BASELINE_PATH = PROJECT_DIR.joinpath("evaluation", "baseline_scores.json")

# Directory tree searched for the original PDFs (machine-specific absolute path).
SOURCE_ROOT = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
|
||||
|
||||
def list_baseline_docs() -> list[str]:
    """Return the document names recorded in the baseline scores file.

    The names are the keys of the ``"per_file"`` mapping stored in
    ``BASELINE_PATH``, in the order they appear in the file.
    """
    with BASELINE_PATH.open(encoding="utf-8") as handle:
        baseline = json.load(handle)
    return [*baseline["per_file"].keys()]
|
||||
|
||||
|
||||
def find_source_pdf(doc_name: str) -> Path | None:
|
||||
"""Cherche un PDF dont le stem matche doc_name dans SOURCE_ROOT."""
|
||||
# Cas Trackare : nom complexe, on garde le pattern principal
|
||||
# Sinon : recherche directe sur le stem
|
||||
matches = list(SOURCE_ROOT.glob(f"**/{doc_name}.pdf"))
|
||||
if matches:
|
||||
return matches[0]
|
||||
# Tentative avec variantes (espaces vs underscores)
|
||||
alt = doc_name.replace("_", " ")
|
||||
matches = list(SOURCE_ROOT.glob(f"**/{alt}.pdf"))
|
||||
if matches:
|
||||
return matches[0]
|
||||
return None
|
||||
|
||||
|
||||
def _load_ner_manager():
    """Load the ONNX NER model and return its manager, or ``None`` if unavailable.

    Loading is best-effort: any failure is reported on stdout and the caller
    carries on without NER.
    """
    if NerModelManager is None:
        # Report the fallback explicitly; otherwise the run would proceed
        # without NER while the banner still announces it as ON.
        print("NER ONNX unavailable → fallback regex+gazetteers only")
        return None
    try:
        manager = NerModelManager(cache_dir=PROJECT_DIR / "models")
        manager.load("cmarkea/distilcamembert-base-ner")
    except Exception as e:  # deliberate best-effort: degrade instead of aborting
        print(f"NER ONNX failed to load: {e} → fallback regex+gazetteers only")
        return None
    print("NER ONNX loaded.")
    return manager


def main():
    """Reprocess every baseline document and print a progress and summary report.

    Command-line options (read from ``sys.argv``):
        --out DIR   Output directory; defaults to a timestamped folder under
                    ``/tmp/reprocess_audit30/``.
        --no-ner    Skip loading the ONNX NER model.

    For each name returned by ``list_baseline_docs()`` the source PDF is
    looked up with ``find_source_pdf()`` and passed to ``process_pdf()``.
    Documents whose PDF cannot be found are reported as MISSING; documents
    for which ``process_pdf()`` raises are reported as FAILED with a
    traceback, and processing continues with the next document.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", type=str, default=None,
                    help="Dossier de sortie (défaut : /tmp/reprocess_audit30/<horodatage>)")
    ap.add_argument("--no-ner", action="store_true",
                    help="Désactiver NER ONNX (test gazetteers + regex seulement)")
    args = ap.parse_args()

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    out_dir = Path(args.out) if args.out else Path(f"/tmp/reprocess_audit30/{timestamp}")
    out_dir.mkdir(parents=True, exist_ok=True)

    docs = list_baseline_docs()
    total = len(docs)
    print(f"Reprocess {total} docs from baseline → {out_dir}")
    print(f"NER ONNX : {'OFF' if args.no_ner else 'ON (gazetteers + regex + NER)'}")
    print("=" * 70)

    if not SOURCE_ROOT.is_dir():
        # Without this hint the only symptom is every document showing as MISSING.
        print(f"WARNING: source root not found: {SOURCE_ROOT}")

    # NER ONNX
    ner = None if args.no_ner else _load_ner_manager()
    ner_thresholds = NerThresholds() if NerThresholds and ner else None

    results = []
    missing = []
    failed = []
    # perf_counter() is monotonic, so elapsed times are immune to clock adjustments.
    t_start = time.perf_counter()

    for i, doc_name in enumerate(docs, 1):
        pdf = find_source_pdf(doc_name)
        if pdf is None:
            missing.append(doc_name)
            print(f"[{i:3}/{total}] MISSING : {doc_name}")
            continue

        t0 = time.perf_counter()
        # Only the processing call is guarded: a problem while formatting the
        # report must not make a successfully processed document count as FAILED.
        try:
            out = process_pdf(
                pdf, out_dir,
                make_vector_redaction=True,
                also_make_raster_burn=True,
                use_hf=bool(ner),
                ner_manager=ner,
                ner_thresholds=ner_thresholds,
            )
        except Exception as e:
            dt = time.perf_counter() - t0
            failed.append((doc_name, str(e)))
            print(f"[{i:3}/{total}] FAILED ({dt:5.1f}s) : {doc_name} — {e}")
            traceback.print_exc()
            continue

        dt = time.perf_counter() - t0
        # str() keeps the width format spec valid even if "status" is not a string.
        status = str(out.get("status", "ok")) if isinstance(out, dict) else "ok"
        print(f"[{i:3}/{total}] {status:11} ({dt:5.1f}s) : {doc_name}")
        results.append((doc_name, status, dt))

    t_total = time.perf_counter() - t_start
    print("=" * 70)
    print(f"Total time : {t_total:.1f}s ({t_total/60:.1f} min)")
    print(f"OK : {len(results)}")
    print(f"Missing : {len(missing)}")
    print(f"Failed : {len(failed)}")
    # Recap the names so the summary is usable without scrolling through tracebacks.
    for doc_name in missing:
        print(f"  missing : {doc_name}")
    for doc_name, error in failed:
        print(f"  failed : {doc_name} — {error}")
    print()
    print(f"Output dir : {out_dir}")
    print()
    print("Pour évaluer la qualité :")
    print(f" python scripts/evaluate_quality.py --dir {out_dir} --compare")
|
||||
|
||||
|
||||
# Script entry point: run the reprocessing only when executed directly,
# never on import.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user