anonymisation/scripts/reprocess_audit30.py

#!/usr/bin/env python3
"""Reprocess corpus audit_30 avec le code actuel.

Lit la liste des documents depuis evaluation/baseline_scores.json, retrouve
chaque PDF source dans le dossier des justificatifs CHCB, et appelle
process_pdf() pour chacun.

Sortie : un dossier horodaté sous /tmp/reprocess_audit30/<timestamp>/ avec
les .pseudonymise.txt + .audit.jsonl + .redacted_raster.pdf pour pouvoir
lancer ensuite `evaluate_quality.py --compare --dir <ce dossier>`.

Usage:
    python scripts/reprocess_audit30.py [--out /tmp/.../foo] [--no-ner]

Note : NER ONNX activé par défaut (gazetteers, INSEE, FINESS, BDPM).
"""
from __future__ import annotations

import argparse
import json
import sys
import time
import traceback
from datetime import datetime
from pathlib import Path

# Project root: the parent of the directory that contains this script.
PROJECT_DIR = Path(__file__).parent.parent
# Put the project root first on sys.path. Presumably this is what lets the
# project-local import below resolve when the script is run directly
# (hence the E402 suppression on that import) — confirm against the repo layout.
sys.path.insert(0, str(PROJECT_DIR))

from anonymizer_core_refactored_onnx import process_pdf, NerModelManager, NerThresholds  # noqa: E402

# JSON file read by list_baseline_docs() to obtain the documents to reprocess.
BASELINE_PATH = PROJECT_DIR / "evaluation" / "baseline_scores.json"
# Directory searched recursively by find_source_pdf() for the source PDFs.
# NOTE(review): this is an absolute path inside one user's home directory, so
# the script only finds documents on a machine where this folder exists.
SOURCE_ROOT = Path(
    "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)"
)


def list_baseline_docs(baseline_path: Path | str | None = None) -> list[str]:
    """Return the document names recorded in a baseline scores file.

    Args:
        baseline_path: JSON file to read. When omitted, the module-level
            ``BASELINE_PATH`` is used (resolved at call time).

    Returns:
        The keys of the top-level ``"per_file"`` object, in the order they
        appear in the file.

    Raises:
        OSError: the file cannot be opened or read.
        json.JSONDecodeError: the file is not valid JSON.
        KeyError: the JSON document has no ``"per_file"`` entry.
    """
    path = BASELINE_PATH if baseline_path is None else baseline_path
    with open(path, encoding="utf-8") as handle:
        payload = json.load(handle)
    return list(payload["per_file"])


def find_source_pdf(doc_name: str, root: Path | str | None = None) -> Path | None:
    """Locate the source PDF whose stem is ``doc_name``.

    The search is recursive. The exact name is tried first; if nothing is
    found and the name contains underscores, a variant with every underscore
    replaced by a space is tried.

    Args:
        doc_name: File stem to look for (without the ``.pdf`` suffix).
        root: Directory to search. When omitted, the module-level
            ``SOURCE_ROOT`` is used (resolved at call time).

    Returns:
        The matching path, or ``None`` when no file matches. When several
        files match, the first one in sorted path order is returned so the
        choice is reproducible from run to run.
    """
    # Function-scope import: ``glob`` is not among the file's top-level imports.
    from glob import escape

    search_root = SOURCE_ROOT if root is None else Path(root)

    stems = [doc_name]
    spaced = doc_name.replace("_", " ")
    if spaced != doc_name:
        # Only worth a second directory walk when the variant actually differs.
        stems.append(spaced)

    for stem in stems:
        # Escape the stem so characters such as "[", "]", "*" or "?" in a
        # document name are matched literally instead of acting as wildcards.
        hits = sorted(search_root.glob(f"**/{escape(stem)}.pdf"))
        if hits:
            return hits[0]
    return None


def main(argv: list[str] | None = None) -> int:
    """Reprocess every baseline document and print a per-document report.

    The document names come from ``list_baseline_docs()``; each one is
    resolved to a PDF with ``find_source_pdf()`` and handed to
    ``process_pdf()`` with ``make_vector_redaction=True`` and
    ``also_make_raster_burn=True``. Unless ``--no-ner`` is given, an ONNX NER
    model is loaded first; if it is unavailable or fails to load, processing
    continues without it.

    Args:
        argv: Command-line arguments (without the program name). ``None``
            makes argparse read ``sys.argv``.

    Returns:
        ``0`` when every document was found and processed without raising,
        ``1`` when at least one document was missing or failed.
    """
    # Function-scope import: ``shlex`` is not among the file's top-level imports.
    import shlex

    ap = argparse.ArgumentParser()
    ap.add_argument("--out", type=str, default=None)
    ap.add_argument("--no-ner", action="store_true",
                    help="Désactiver NER ONNX (test gazetteers + regex seulement)")
    args = ap.parse_args(argv)

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    out_dir = Path(args.out) if args.out else Path(f"/tmp/reprocess_audit30/{timestamp}")
    out_dir.mkdir(parents=True, exist_ok=True)

    docs = list_baseline_docs()
    total = len(docs)
    print(f"Reprocess {total} docs from baseline → {out_dir}")
    print(f"NER ONNX : {'OFF' if args.no_ner else 'ON (gazetteers + regex + NER)'}")
    print("=" * 70)

    # Load the NER model unless disabled; any loading problem degrades to the
    # regex + gazetteers pipeline instead of aborting the whole run.
    ner = None
    if not args.no_ner:
        if NerModelManager is None:
            # Previously this case fell back silently while the banner said "ON".
            print("NER ONNX unavailable → fallback regex+gazetteers only")
        else:
            try:
                ner = NerModelManager(cache_dir=PROJECT_DIR / "models")
                ner.load("cmarkea/distilcamembert-base-ner")
                print("NER ONNX loaded.")
            except Exception as e:  # deliberate best-effort boundary
                print(f"NER ONNX failed to load: {e} → fallback regex+gazetteers only")
                ner = None

    ner_thresholds = NerThresholds() if NerThresholds and ner else None

    results = []
    missing = []
    failed = []
    # perf_counter is monotonic, so durations are immune to system clock changes.
    t_start = time.perf_counter()

    for i, doc_name in enumerate(docs, 1):
        pdf = find_source_pdf(doc_name)
        if pdf is None:
            missing.append(doc_name)
            print(f"[{i:3}/{total}] MISSING : {doc_name}")
            continue
        t0 = time.perf_counter()
        try:
            out = process_pdf(
                pdf, out_dir,
                make_vector_redaction=True,
                also_make_raster_burn=True,
                use_hf=bool(ner),
                ner_manager=ner,
                ner_thresholds=ner_thresholds,
            )
        except Exception as e:  # one bad document must not stop the batch
            dt = time.perf_counter() - t0
            failed.append((doc_name, str(e)))
            print(f"[{i:3}/{total}] FAILED      ({dt:5.1f}s) : {doc_name} — {e}")
            traceback.print_exc()
        else:
            dt = time.perf_counter() - t0
            # process_pdf may return a dict carrying a "status"; anything else counts as "ok".
            status = out.get("status", "ok") if isinstance(out, dict) else "ok"
            print(f"[{i:3}/{total}] {status:11} ({dt:5.1f}s) : {doc_name}")
            results.append((doc_name, status, dt))

    t_total = time.perf_counter() - t_start
    print("=" * 70)
    print(f"Total time : {t_total:.1f}s ({t_total/60:.1f} min)")
    print(f"OK       : {len(results)}")
    print(f"Missing  : {len(missing)}")
    print(f"Failed   : {len(failed)}")
    # Repeat the problem documents here so they are not lost in the scrollback.
    if missing:
        print("Missing documents :")
        for name in missing:
            print(f"    {name}")
    if failed:
        print("Failed documents :")
        for name, reason in failed:
            print(f"    {name} — {reason}")
    print()
    print(f"Output dir : {out_dir}")
    print()
    print("Pour évaluer la qualité :")
    # Quote the path so the suggested command stays copy-pastable when the
    # output directory contains spaces or shell metacharacters.
    print(f"    python scripts/evaluate_quality.py --dir {shlex.quote(str(out_dir))} --compare")

    return 1 if (missing or failed) else 0


if __name__ == "__main__":
    # Propagate the outcome to the shell so callers can detect incomplete runs.
    sys.exit(main())