chore(scripts): add reprocess_audit30.py for quality regression testing
Petit utilitaire pour re-traiter le corpus audit_30 avec le code courant
et générer un dossier de sortie horodaté.
Usage:
python scripts/reprocess_audit30.py [--out /tmp/.../foo] [--no-ner]
Lit la liste des 29 docs depuis evaluation/baseline_scores.json, retrouve
chaque PDF source dans /home/dom/Téléchargements/.../CHCB_DocJustificatifs,
appelle process_pdf() pour chacun, sortie dans /tmp/reprocess_audit30/
(ou --out).
Permet ensuite de mesurer la qualité avec :
python scripts/evaluate_quality.py --dir <output> --compare
Validé sur audit_30 — 29 docs en ~4 min avec NER ONNX.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
132
scripts/reprocess_audit30.py
Normal file
132
scripts/reprocess_audit30.py
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Reprocess corpus audit_30 avec le code actuel.
|
||||||
|
|
||||||
|
Lit la liste des documents depuis evaluation/baseline_scores.json, retrouve
|
||||||
|
chaque PDF source dans le dossier des justificatifs CHCB, et appelle
|
||||||
|
process_pdf() pour chacun.
|
||||||
|
|
||||||
|
Sortie : un dossier horodaté sous /tmp/reprocess_audit30/<timestamp>/ avec
|
||||||
|
les .pseudonymise.txt + .audit.jsonl + .redacted_raster.pdf pour pouvoir
|
||||||
|
lancer ensuite `evaluate_quality.py --compare --dir <ce dossier>`.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/reprocess_audit30.py [--out /tmp/.../foo] [--no-ner]
|
||||||
|
|
||||||
|
Note : NER ONNX activé par défaut (gazetteers, INSEE, FINESS, BDPM).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Two levels up from this file; the usage line places the script in scripts/,
# so this is presumably the repository root.
PROJECT_DIR = Path(__file__).parent.parent
# Put PROJECT_DIR first on sys.path before the project-local import below
# (hence the E402 suppression on that import).
sys.path.insert(0, str(PROJECT_DIR))

from anonymizer_core_refactored_onnx import process_pdf, NerModelManager, NerThresholds  # noqa: E402

# JSON file whose "per_file" keys name the documents to reprocess.
BASELINE_PATH = PROJECT_DIR / "evaluation" / "baseline_scores.json"
# Directory searched recursively for the source PDFs.
# NOTE(review): absolute path under one user's home directory — the script
# will report every document as MISSING on a machine without this folder.
SOURCE_ROOT = Path(
    "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)"
)
|
||||||
|
|
||||||
|
|
||||||
|
def list_baseline_docs() -> list[str]:
    """Return the document names recorded in the baseline scores file.

    The names are the keys of the ``per_file`` mapping stored in
    ``BASELINE_PATH``, in the order they appear in that JSON file.
    """
    with BASELINE_PATH.open(encoding="utf-8") as handle:
        baseline = json.load(handle)
    return list(baseline["per_file"])
|
||||||
|
|
||||||
|
|
||||||
|
def find_source_pdf(doc_name: str, root: Path | None = None) -> Path | None:
    """Locate the source PDF whose stem is ``doc_name``.

    The tree under ``root`` is searched recursively, first for
    ``<doc_name>.pdf`` and then, if nothing was found, for the same name with
    underscores replaced by spaces.  ``doc_name`` is matched literally: glob
    metacharacters it may contain (``[``, ``]``, ``*``, ``?``) are escaped
    instead of being interpreted as wildcards.

    Args:
        doc_name: File stem to look for (without the ``.pdf`` suffix).
        root: Directory to search; defaults to the module-level
            ``SOURCE_ROOT``.

    Returns:
        The first match in sorted path order, or ``None`` when no file
        matches either spelling.
    """
    # Function-scope import so the block does not rely on a new file-level import.
    from glob import escape

    search_root = SOURCE_ROOT if root is None else root

    # Exact stem first, then the underscore -> space variant (only when it
    # actually differs, to avoid walking the whole tree twice for nothing).
    stems = [doc_name]
    spaced = doc_name.replace("_", " ")
    if spaced != doc_name:
        stems.append(spaced)

    for stem in stems:
        # Sorting makes the pick deterministic when several sub-folders
        # contain a file with the same name.
        matches = sorted(search_root.glob(f"**/{escape(stem)}.pdf"))
        if matches:
            return matches[0]
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _load_ner_manager():
    """Try to load the ONNX NER model; return the manager, or ``None``.

    Loading is best-effort: any exception is reported on stdout and the
    caller falls back to running without NER.
    """
    if NerModelManager is None:
        return None
    try:
        manager = NerModelManager(cache_dir=PROJECT_DIR / "models")
        manager.load("cmarkea/distilcamembert-base-ner")
    except Exception as e:  # deliberate catch-all: NER is optional here
        print(f"NER ONNX failed to load: {e} → fallback regex+gazetteers only")
        return None
    print("NER ONNX loaded.")
    return manager


def main() -> int:
    """Reprocess every baseline document and print a summary.

    Command-line options:
        --out DIR   Output directory (default:
                    ``/tmp/reprocess_audit30/<timestamp>``).
        --no-ner    Skip loading the ONNX NER model.

    For each name returned by ``list_baseline_docs()`` the source PDF is
    looked up with ``find_source_pdf()`` and passed to ``process_pdf()`` with
    vector redaction and raster burn enabled.  A document whose PDF cannot be
    found is recorded as missing; a document for which processing raises is
    recorded as failed (with traceback) and the loop continues.

    Returns:
        ``0`` when every document was found and processed without raising,
        ``1`` otherwise, so that shell chaining and CI can detect an
        incomplete run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", type=str, default=None)
    ap.add_argument("--no-ner", action="store_true",
                    help="Désactiver NER ONNX (test gazetteers + regex seulement)")
    args = ap.parse_args()

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    out_dir = Path(args.out) if args.out else Path(f"/tmp/reprocess_audit30/{timestamp}")
    out_dir.mkdir(parents=True, exist_ok=True)

    docs = list_baseline_docs()
    print(f"Reprocess {len(docs)} docs from baseline → {out_dir}")
    print(f"NER ONNX : {'OFF' if args.no_ner else 'ON (gazetteers + regex + NER)'}")
    print("=" * 70)

    # NER ONNX (optional; None means regex + gazetteers only)
    ner = None if args.no_ner else _load_ner_manager()
    ner_thresholds = NerThresholds() if NerThresholds and ner else None

    results = []
    missing = []
    failed = []
    # perf_counter is monotonic, so durations are immune to clock adjustments.
    t_start = time.perf_counter()

    for i, doc_name in enumerate(docs, 1):
        pdf = find_source_pdf(doc_name)
        if pdf is None:
            missing.append(doc_name)
            print(f"[{i:3}/{len(docs)}] MISSING : {doc_name}")
            continue
        t0 = time.perf_counter()
        try:
            out = process_pdf(
                pdf, out_dir,
                make_vector_redaction=True,
                also_make_raster_burn=True,
                use_hf=bool(ner),
                ner_manager=ner,
                ner_thresholds=ner_thresholds,
            )
            dt = time.perf_counter() - t0
            # process_pdf may return a dict carrying a "status"; anything
            # else is reported as "ok".
            status = out.get("status", "ok") if isinstance(out, dict) else "ok"
            print(f"[{i:3}/{len(docs)}] {status:11} ({dt:5.1f}s) : {doc_name}")
            results.append((doc_name, status, dt))
        except Exception as e:  # keep going: one bad document must not stop the run
            dt = time.perf_counter() - t0
            failed.append((doc_name, str(e)))
            print(f"[{i:3}/{len(docs)}] FAILED ({dt:5.1f}s) : {doc_name} — {e}")
            traceback.print_exc()

    t_total = time.perf_counter() - t_start
    print("=" * 70)
    print(f"Total time : {t_total:.1f}s ({t_total/60:.1f} min)")
    print(f"OK : {len(results)}")
    print(f"Missing : {len(missing)}")
    print(f"Failed : {len(failed)}")
    print()
    print(f"Output dir : {out_dir}")
    print()
    print("Pour évaluer la qualité :")
    print(f" python scripts/evaluate_quality.py --dir {out_dir} --compare")

    return 1 if (missing or failed) else 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
Reference in New Issue
Block a user