# anonymisation/tools/quick_test_date_correction.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Test rapide de la correction DATE"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf

# Three documents taken from the ground-truth test dataset. The paths are
# relative, so they are resolved against the current working directory.
test_docs = [
    "tests/ground_truth/pdfs/001_simple_compte_rendu_460_23153652_CR_COLOSCOPIE.pdf",
    "tests/ground_truth/pdfs/008_moyen_compte_rendu_195_23144210_ANAPATH.pdf",
    "tests/ground_truth/pdfs/013_moyen_compte_rendu_363_23085243_CRO.pdf",
]

# Directory handed to process_pdf as its output location.
out_dir = Path("tests/phase1_test_output")


def count_date_tags(text):
    """Count the date placeholders found in *text*.

    Args:
        text: Content of an anonymised text file.

    Returns:
        A ``(date_count, date_naissance_count)`` tuple holding the number of
        literal ``[DATE]`` and ``[DATE_NAISSANCE]`` occurrences. The closing
        bracket is part of each needle, so ``[DATE_NAISSANCE]`` never
        contributes to the ``[DATE]`` count.
    """
    return text.count("[DATE]"), text.count("[DATE_NAISSANCE]")


def check_document(pdf_path, output_dir):
    """Run process_pdf on one PDF and report its placeholder counts.

    Args:
        pdf_path: ``Path`` of an existing PDF to process.
        output_dir: ``Path`` of an existing directory passed to process_pdf.

    Returns:
        True when the text output exists and contains no ``[DATE]``
        placeholder; False when it contains at least one, or when the text
        output is missing after processing.

    Raises:
        Exception: Whatever process_pdf or reading the text file raises; the
            caller is responsible for reporting it.
    """
    # This script looks for the text output under this name; it is presumably
    # written by process_pdf -- TODO confirm against its implementation.
    text_file = output_dir / f"{pdf_path.stem}.pseudonymise.txt"

    # Remove the output of a previous run so that stale text can never be
    # mistaken for the result of this run.
    if text_file.exists():
        text_file.unlink()

    process_pdf(
        pdf_path=pdf_path,
        out_dir=output_dir,
        make_vector_redaction=False,
        also_make_raster_burn=False,
        config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
        use_hf=False,
        ner_manager=None,
        vlm_manager=None,
    )

    if not text_file.exists():
        # Nothing to inspect: the check could not be performed, so it must
        # not be reported as a success.
        print(f"⚠️  {pdf_path.name}: fichier texte non trouvé")
        return False

    text = text_file.read_text(encoding='utf-8')
    date_count, date_naissance_count = count_date_tags(text)

    passed = date_count == 0
    status = "✅" if passed else "❌"
    print(f"{status} {pdf_path.name}")
    print(f"   [DATE]: {date_count} (attendu: 0)")
    print(f"   [DATE_NAISSANCE]: {date_naissance_count}")
    return passed


def main():
    """Check every document of ``test_docs`` and return a process exit code.

    Documents that are not found on disk are reported and skipped. Any
    exception raised while checking a document is reported and counted as a
    failure, so one bad document does not stop the others from being checked.

    Returns:
        0 when no checked document failed, 1 otherwise.
    """
    print("Test correction DATE (Phase 1)")
    print("=" * 80)

    # parents=True: the parent "tests" directory may not exist yet.
    out_dir.mkdir(parents=True, exist_ok=True)

    failures = 0
    for doc in test_docs:
        pdf_path = Path(doc)
        if not pdf_path.exists():
            print(f"⚠️  {pdf_path.name}: non trouvé")
            continue

        try:
            passed = check_document(pdf_path, out_dir)
        except Exception as e:
            # Top-level boundary of the script: report and move on to the
            # next document instead of aborting the whole run.
            print(f"❌ {pdf_path.name}: Erreur - {e}")
            passed = False

        if not passed:
            failures += 1

    if failures:
        print(f"\n❌ Test terminé: {failures} échec(s)")
        return 1

    print("\n✅ Test terminé")
    return 0


if __name__ == "__main__":
    sys.exit(main())