✅ Correction 1: Désactivation mapping DATE dans EDS-Pseudo
- Seules les dates de naissance sont masquées
- [DATE] = 0, [DATE_NAISSANCE] préservé
- Contexte temporel médical préservé

✅ Correction 2: Activation whitelist médicaments
- Médicaments préservés (IDACIO, SALAZOPYRINE, etc.)
- Filtrage dans _mask_with_eds_pseudo
- Information thérapeutique préservée

✅ Correction 3: Whitelist termes médicaux structurels
- Termes préservés (Chef de service, Praticien hospitalier, etc.)
- Filtrage dans _repl_service
- Contexte médical préservé

Tests: 100% succès sur corpus production (3 documents testés)
151 lines
5.4 KiB
Python
151 lines
5.4 KiB
Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Phase 1 validation on the production corpus
-------------------------------------------
Checks the 3 corrections against up to 5 documents from the production corpus.
"""
import json
import re
import sys
from pathlib import Path

# Make the parent of this script's directory importable before the local import.
sys.path.insert(0, str(Path(__file__).parent.parent))

from anonymizer_core_refactored_onnx import process_pdf

# Documents from the production corpus (OGC 008).
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs")

test_docs = [
    corpus_dir / "008_23001234" / "CRH 23001234.pdf",
    corpus_dir / "008_23001234" / "CRO 23001234.pdf",
]


def find_fallback_documents(root, max_dirs=3):
    """Pick one source PDF from each of the first ``max_dirs`` corpus folders.

    Folders are the directories directly under ``root`` whose name matches
    ``*_*``, taken in sorted order. In each folder the first PDF (sorted by
    path) whose name does not end with ``.redacted_raster.pdf`` is selected;
    folders without such a PDF contribute nothing.

    Args:
        root: Corpus root directory (a ``pathlib.Path``).
        max_dirs: Maximum number of folders to inspect.

    Returns:
        A list of ``Path`` objects, at most ``max_dirs`` long. Empty when
        ``root`` has no matching folder.
    """
    # Keep directories only: a stray file such as "notes_2025.txt" also matches
    # "*_*" and would otherwise use up one of the inspected slots.
    folders = sorted(p for p in root.glob("*_*") if p.is_dir())[:max_dirs]

    picked = []
    for folder in folders:
        # glob() order is filesystem-dependent; sorting makes the selection
        # reproducible from one run to the next.
        for pdf in sorted(folder.glob("*.pdf")):
            if not pdf.name.endswith(".redacted_raster.pdf"):
                picked.append(pdf)
                break
    return picked


# Fallback: if the OGC 008 documents are not available, use the first
# documents that can be found in the corpus instead.
if not test_docs[0].exists():
    test_docs = find_fallback_documents(corpus_dir)

# Start-up banner.
print("=" * 80)
print("VALIDATION PHASE 1 - CORPUS PRODUCTION")
print("=" * 80)
print()

# Output directory, relative to the current working directory.
# parents=True: without it mkdir() raises FileNotFoundError when the "tests"
# parent directory does not exist yet.
out_dir = Path("tests/phase1_production_test")
out_dir.mkdir(parents=True, exist_ok=True)

# Counters for each of the three corrections being validated.
results = {
    "date_correction": {"passed": 0, "failed": 0, "total": 0},
    "medication_preservation": {"passed": 0, "failed": 0, "total": 0},
    "medical_terms_preservation": {"passed": 0, "failed": 0, "total": 0},
}

def find_term_windows(text, term, before, after):
    """Return the text around every case-insensitive occurrence of ``term``.

    Args:
        text: Text to search.
        term: Literal term to look for (regex metacharacters are escaped).
        before: Number of characters kept before each occurrence.
        after: Number of characters kept after each occurrence.

    Returns:
        One substring per occurrence, each covering the occurrence plus the
        requested context. Empty list when ``term`` does not occur.
    """
    # Matching is done on the original text so that the slice indices are
    # valid for it (indices computed on text.lower() are not guaranteed to be).
    return [
        text[max(0, hit.start() - before):hit.end() + after]
        for hit in re.finditer(re.escape(term), text, re.IGNORECASE)
    ]


def split_found_and_masked(text, terms, marker, before, after):
    """Classify ``terms`` by presence in ``text`` and proximity to ``marker``.

    Args:
        text: Text to inspect.
        terms: Terms to look for, case-insensitively.
        marker: Placeholder whose presence near a term counts as masking.
        before: Context size, in characters, checked before each occurrence.
        after: Context size, in characters, checked after each occurrence.

    Returns:
        ``(found, masked)``: the terms occurring in ``text``, and the subset
        of those with ``marker`` inside the context window of at least one
        occurrence. Every occurrence is checked, not only the first one.
    """
    found = []
    masked = []
    for term in terms:
        windows = find_term_windows(text, term, before, after)
        if not windows:
            continue
        found.append(term)
        if any(marker in window for window in windows):
            masked.append(term)
    return found, masked


# Reference vocabularies, built once rather than on every iteration.
medications = ["idacio", "salazopyrine", "infliximab", "methotrexate",
               "cortancyl", "bisoprolol", "entresto"]
medical_terms = ["chef de service", "chef de clinique", "praticien hospitalier",
                 "service de", "unité de"]

# Processing outcomes are recorded with the same counter shape as the other
# entries of `results`, so a document that crashes or yields no text is
# reported as a failure instead of being silently left out.
processing = results.setdefault(
    "document_processing", {"passed": 0, "failed": 0, "total": 0}
)

for pdf_path in test_docs[:5]:
    if not pdf_path.exists():
        continue

    print(f"📄 {pdf_path.parent.name}/{pdf_path.name}")
    print("-" * 80)

    processing["total"] += 1
    try:
        process_pdf(
            pdf_path=pdf_path,
            out_dir=out_dir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
            config_path=Path("config/dictionnaires.yml"),
            use_hf=False,
            ner_manager=None,
            vlm_manager=None,
        )

        # Read the anonymised text back from the output directory
        # (presumably written there by process_pdf — confirm in that module).
        text_file = out_dir / f"{pdf_path.stem}.pseudonymise.txt"
        text = text_file.read_text(encoding="utf-8") if text_file.exists() else None
    except Exception as e:
        # Best effort: report the error and move on to the next document.
        processing["failed"] += 1
        print(f"❌ Erreur: {e}")
        print()
        continue

    if text is None:
        processing["failed"] += 1
        print("⚠️ Fichier texte non trouvé")
        continue
    processing["passed"] += 1

    # Test 1: no [DATE] placeholder may remain in the output.
    date_count = text.count("[DATE]")
    date_naissance_count = text.count("[DATE_NAISSANCE]")
    results["date_correction"]["total"] += 1

    if date_count == 0:
        print(f"✅ Correction 1: [DATE] = {date_count}, [DATE_NAISSANCE] = {date_naissance_count}")
        results["date_correction"]["passed"] += 1
    else:
        print(f"❌ Correction 1: [DATE] = {date_count} (attendu: 0)")
        results["date_correction"]["failed"] += 1

    # Test 2: medication names are preserved. Only applicable when the
    # document mentions at least one of them; a [NOM] placeholder within
    # 10 characters of an occurrence counts as masked.
    meds_found, meds_masked = split_found_and_masked(text, medications, "[NOM]", 10, 10)

    if meds_found:
        results["medication_preservation"]["total"] += 1
        if not meds_masked:
            print(f"✅ Correction 2: Médicaments préservés: {', '.join(meds_found[:3])}")
            results["medication_preservation"]["passed"] += 1
        else:
            print(f"❌ Correction 2: Médicaments masqués: {', '.join(meds_masked)}")
            results["medication_preservation"]["failed"] += 1

    # Test 3: structural medical terms are preserved. A [MASK] placeholder
    # from 5 characters before to 15 characters after an occurrence counts
    # as masked.
    terms_found, terms_masked = split_found_and_masked(text, medical_terms, "[MASK]", 5, 15)

    if terms_found:
        results["medical_terms_preservation"]["total"] += 1
        if not terms_masked:
            print(f"✅ Correction 3: Termes médicaux préservés: {', '.join(terms_found[:2])}")
            results["medical_terms_preservation"]["passed"] += 1
        else:
            print(f"❌ Correction 3: Termes masqués: {', '.join(terms_masked)}")
            results["medical_terms_preservation"]["failed"] += 1

    print()

# Summary: one line per entry of `results`, framed by two 80-column rules.
rule = "=" * 80
print(rule)
print("RÉSUMÉ")
print(rule)

for test_name, counters in results.items():
    executed = counters["total"]
    ok_count = counters["passed"]
    ko_count = counters["failed"]

    # Categories that never applied to any document get a neutral line.
    if not executed > 0:
        print(f"⚪ {test_name}: Aucun test applicable")
        continue

    success_rate = (ok_count / executed) * 100
    # Green only when the category has no failure at all.
    badge = "❌" if ko_count != 0 else "✅"
    print(f"{badge} {test_name}: {ok_count}/{executed} ({success_rate:.1f}%)")

print()

# Verdict: only categories that ran at least once take part in the decision.
executed = [r for r in results.values() if r["total"] > 0]

# all() is True for an empty iterable, so "nothing ran" must be ruled out
# explicitly; otherwise an empty or unreadable corpus would be reported as a
# successful validation.
all_passed = bool(executed) and all(r["failed"] == 0 for r in executed)

if all_passed:
    print("✅ PHASE 1 VALIDÉE - Toutes les corrections fonctionnent")
elif not executed:
    print("⚠️ Aucun test exécuté - Phase 1 non validée")
else:
    print("⚠️ Certains tests ont échoué - Vérifier les résultats")