# anonymisation/tools/validate_phase1_on_production.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Validation Phase 1 sur corpus production
-----------------------------------------
Teste les 3 corrections sur 5 documents du corpus production.
"""
import sys
from pathlib import Path
import json
sys.path.insert(0, str(Path(__file__).parent.parent))

from anonymizer_core_refactored_onnx import process_pdf

# Production corpus root and the preferred sample documents (OGC 008).
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs")
test_docs = [
    corpus_dir / "008_23001234" / "CRH 23001234.pdf",
    corpus_dir / "008_23001234" / "CRO 23001234.pdf",
]


def select_fallback_docs(corpus_root, limit=5):
    """Return up to *limit* source PDFs, at most one per ``*_*`` sub-directory.

    Sub-directories are visited in sorted order. Within each one, the
    alphabetically first PDF whose name does not end with
    ``.redacted_raster.pdf`` is kept, so the selection is deterministic.

    Args:
        corpus_root: ``Path`` of the directory holding the ``*_*`` folders.
        limit: maximum number of documents to return.

    Returns:
        A list of ``Path`` objects; empty when *corpus_root* is not a
        directory or contains no eligible PDF.
    """
    picked = []
    if not corpus_root.is_dir():
        return picked

    for ogc_dir in sorted(corpus_root.glob("*_*")):
        if len(picked) >= limit:
            break
        if not ogc_dir.is_dir():
            continue
        # Skip already-redacted outputs that may sit next to the source PDFs.
        candidates = sorted(
            pdf
            for pdf in ogc_dir.glob("*.pdf")
            if not pdf.name.endswith(".redacted_raster.pdf")
        )
        if candidates:
            picked.append(candidates[0])
    return picked


# Fallback: when the OGC 008 documents are not available, use the first
# documents that can be found in the corpus instead.
if not test_docs[0].exists():
    test_docs = select_fallback_docs(corpus_dir, limit=5)

# Banner printed once before any document is processed.
print("=" * 80)
print("VALIDATION PHASE 1 - CORPUS PRODUCTION")
print("=" * 80)
print()

# Output directory handed to process_pdf. parents=True lets the script run
# even when the "tests" directory does not exist yet in the working directory.
out_dir = Path("tests/phase1_production_test")
out_dir.mkdir(parents=True, exist_ok=True)

# Per-check counters: "total" is the number of documents on which the check
# was applicable, "passed"/"failed" split that total.
results = {
    "date_correction": {"passed": 0, "failed": 0, "total": 0},
    "medication_preservation": {"passed": 0, "failed": 0, "total": 0},
    "medical_terms_preservation": {"passed": 0, "failed": 0, "total": 0},
}

# Medication names (lower-case) that must remain readable in the output text.
MEDICATIONS = ("idacio", "salazopyrine", "infliximab", "methotrexate",
               "cortancyl", "bisoprolol", "entresto")

# Structural medical phrases (lower-case) that must not be masked.
MEDICAL_TERMS = ("chef de service", "chef de clinique", "praticien hospitalier",
                 "service de", "unité de")


def is_masked_near(text, lowered, term, marker, before, after):
    """Tell whether *marker* occurs around the first occurrence of *term*.

    The position of *term* is looked up in *lowered* (the lower-cased text);
    the window ``[pos - before, pos + len(term) + after)`` is then sliced
    from *text* and searched for *marker* (case-sensitive).

    Only call this for a *term* that is known to be present in *lowered*.
    """
    pos = lowered.find(term)
    window = text[max(0, pos - before):pos + len(term) + after]
    return marker in window


def check_document(text, results):
    """Run the three Phase 1 checks on one output text and update *results*.

    Check 1 is counted for every document. Checks 2 and 3 are counted only
    when at least one of the watched terms appears in the text. A one-line
    outcome is printed for each check that was counted.

    Args:
        text: content of the ``.pseudonymise.txt`` file.
        results: counters dict, mutated in place.
    """
    # Lower-case once; every containment/position lookup below reuses it.
    lowered = text.lower()

    # Check 1: no generic [DATE] placeholder may remain.
    date_count = text.count("[DATE]")
    date_naissance_count = text.count("[DATE_NAISSANCE]")
    results["date_correction"]["total"] += 1
    if date_count == 0:
        print(f"✅ Correction 1: [DATE] = {date_count}, [DATE_NAISSANCE] = {date_naissance_count}")
        results["date_correction"]["passed"] += 1
    else:
        print(f"❌ Correction 1: [DATE] = {date_count} (attendu: 0)")
        results["date_correction"]["failed"] += 1

    # Check 2: medications present in the text have no [NOM] marker within
    # 10 characters on either side of their first occurrence.
    meds_found = [m for m in MEDICATIONS if m in lowered]
    if meds_found:
        results["medication_preservation"]["total"] += 1
        meds_masked = [m for m in meds_found
                       if is_masked_near(text, lowered, m, "[NOM]", 10, 10)]
        if not meds_masked:
            print(f"✅ Correction 2: Médicaments préservés: {', '.join(meds_found[:3])}")
            results["medication_preservation"]["passed"] += 1
        else:
            print(f"❌ Correction 2: Médicaments masqués: {', '.join(meds_masked)}")
            results["medication_preservation"]["failed"] += 1

    # Check 3: structural terms present in the text have no [MASK] marker
    # from 5 characters before to 15 characters after their first occurrence.
    terms_found = [t for t in MEDICAL_TERMS if t in lowered]
    if terms_found:
        results["medical_terms_preservation"]["total"] += 1
        terms_masked = [t for t in terms_found
                        if is_masked_near(text, lowered, t, "[MASK]", 5, 15)]
        if not terms_masked:
            print(f"✅ Correction 3: Termes médicaux préservés: {', '.join(terms_found[:2])}")
            results["medical_terms_preservation"]["passed"] += 1
        else:
            print(f"❌ Correction 3: Termes masqués: {', '.join(terms_masked)}")
            results["medical_terms_preservation"]["failed"] += 1


# Process at most five documents; paths that do not exist are skipped silently.
for pdf_path in test_docs[:5]:
    if not pdf_path.exists():
        continue

    print(f"📄 {pdf_path.parent.name}/{pdf_path.name}")
    print("-" * 80)

    try:
        # The return value is not needed: the checks read the text file
        # that is expected in out_dir after this call.
        process_pdf(
            pdf_path=pdf_path,
            out_dir=out_dir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
            config_path=Path("config/dictionnaires.yml"),
            use_hf=False,
            ner_manager=None,
            vlm_manager=None,
        )

        # Read the anonymised text.
        text_file = out_dir / f"{pdf_path.stem}.pseudonymise.txt"
        if not text_file.exists():
            print("⚠️  Fichier texte non trouvé")
            continue

        check_document(text_file.read_text(encoding='utf-8'), results)
        print()

    except Exception as e:
        # Best effort: report the failure and move on to the next document.
        print(f"❌ Erreur: {e}")
        print()

# Summary: one line per check with its pass ratio, or a neutral line when
# the check was never applicable.
print("=" * 80)
print("RÉSUMÉ")
print("=" * 80)

for test_name, test_results in results.items():
    total = test_results["total"]
    passed = test_results["passed"]
    failed = test_results["failed"]

    if total > 0:
        success_rate = (passed / total) * 100
        status = "✅" if failed == 0 else "❌"
        print(f"{status} {test_name}: {passed}/{total} ({success_rate:.1f}%)")
    else:
        print(f"⚪ {test_name}: Aucun test applicable")

print()

# Verdict. all() is True for an empty iterable, so a run in which no check
# was executed (no document found, or every document failed to process)
# must not be reported as validated.
executed = [r for r in results.values() if r["total"] > 0]
all_passed = bool(executed) and all(r["failed"] == 0 for r in executed)
if all_passed:
    print("✅ PHASE 1 VALIDÉE - Toutes les corrections fonctionnent")
elif not executed:
    print("⚠️  Aucun test exécuté - Vérifier le corpus et les erreurs ci-dessus")
else:
    print("⚠️  Certains tests ont échoué - Vérifier les résultats")