145 lines
5.1 KiB
Python
Executable File
145 lines
5.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Test des corrections Phase 1 sur un échantillon de documents.
|
|
Vérifie que:
|
|
1. Les termes médicaux structurels ne sont PAS masqués
|
|
2. Les médicaments ne sont PAS masqués
|
|
3. Les dates de consultation ne sont PAS masquées (seules les dates de naissance)
|
|
"""
|
|
|
|
import sys
|
|
sys.path.insert(0, '.')
|
|
|
|
from pathlib import Path
|
|
import re
|
|
from anonymizer_core_refactored_onnx import process_pdf
|
|
|
|
def test_phase1_corrections():
    """Run the three Phase 1 checks on a small sample of ground-truth PDFs.

    Up to five PDFs found directly in ``tests/ground_truth/pdfs`` are passed
    to ``process_pdf``; the text file it reports under the ``'text'`` key is
    then inspected for:

    1. at least one structural medical term still present,
    2. at least one known medication name still present,
    3. no ``[DATE]`` mask at all (``[DATE_NAISSANCE]`` masks are only counted
       and reported).

    Per-document progress and a final summary are printed. The function
    returns ``None`` in every case. A document whose processing raises is
    reported and excluded from the totals; if no document could be processed
    the summary says so instead of computing a success rate.
    """
    # Look for test documents.
    test_dir = Path("tests/ground_truth/pdfs")

    # Take 5 documents for the test. The "*.pdf" pattern already excludes the
    # ".annotations.json" sidecar files, and sorting makes the selected sample
    # reproducible from one run to the next.
    pdf_files = sorted(test_dir.glob("*.pdf"))[:5]

    if not pdf_files:
        print("❌ Aucun document de test trouvé")
        return

    print(f"Test des corrections Phase 1 sur {len(pdf_files)} documents...")
    print("=" * 80)

    output_dir = Path("tests/ground_truth/pdfs/phase1_test")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Check 1: structural medical terms that should survive anonymization.
    medical_terms = (
        "Chef de service",
        "Chef de clinique",
        "Praticien hospitalier",
        "service de",
        "unité de",
    )

    # Check 2: medication names that should survive anonymization.
    medications = (
        "IDACIO",
        "Salazopyrine",
        "Infliximab",
        "Apranax",
    )

    results = {
        'medical_terms_preserved': 0,
        'medications_preserved': 0,
        'dates_reduced': 0,
        'total_docs': 0,
    }

    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}")

        # Per-document boundary: any failure while anonymizing or reading the
        # output is reported and the document is left out of the totals.
        try:
            result = process_pdf(
                pdf_path,
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,
                config_path=Path("config/dictionnaires.yml"),
            )
            anonymized_text = Path(result['text']).read_text(encoding='utf-8')
        except Exception as e:
            print(f" ❌ Erreur: {e}")
            continue

        # Lower-case once; both term checks are case-insensitive.
        lowered_text = anonymized_text.lower()
        medical_preserved = _count_present(medical_terms, lowered_text)
        medications_preserved = _count_present(medications, lowered_text)

        # Check 3: count [DATE] masks versus [DATE_NAISSANCE] masks.
        date_masks = len(re.findall(r'\[DATE\]', anonymized_text))
        date_naissance_masks = len(re.findall(r'\[DATE_NAISSANCE\]', anonymized_text))

        print(f" ✓ Termes médicaux préservés: {medical_preserved}/{len(medical_terms)}")
        print(f" ✓ Médicaments préservés: {medications_preserved}/{len(medications)}")
        print(f" ✓ [DATE]: {date_masks}, [DATE_NAISSANCE]: {date_naissance_masks}")

        # The date correction counts as successful only when no [DATE] remains.
        if date_masks == 0:
            results['dates_reduced'] += 1
            print(" ✅ Correction dates: OK (0 [DATE])")
        else:
            print(f" ⚠️ Correction dates: {date_masks} [DATE] restants")

        if medical_preserved > 0:
            results['medical_terms_preserved'] += 1

        if medications_preserved > 0:
            results['medications_preserved'] += 1

        results['total_docs'] += 1

    # Summary
    print("\n" + "=" * 80)
    print("RÉSUMÉ DES CORRECTIONS PHASE 1")
    print("=" * 80)

    total_docs = results['total_docs']
    print(f"\nDocuments testés: {total_docs}")

    if total_docs == 0:
        # Every document failed: there is nothing to average, and dividing by
        # total_docs * 3 would raise ZeroDivisionError.
        print("\n❌ Aucun document traité avec succès - taux de succès non calculable")
        print(f"\n📁 Résultats dans: {output_dir}")
        return

    print("\n✅ Correction 1.1 (Termes médicaux):")
    print(f" Documents avec termes préservés: {results['medical_terms_preserved']}/{total_docs}")

    print("\n✅ Correction 1.2 (Médicaments):")
    print(f" Documents avec médicaments préservés: {results['medications_preserved']}/{total_docs}")

    print("\n✅ Correction 1.3 (Dates):")
    print(f" Documents avec [DATE]=0: {results['dates_reduced']}/{total_docs}")

    # Three checks per document, each worth one point.
    passed_checks = (
        results['medical_terms_preserved']
        + results['medications_preserved']
        + results['dates_reduced']
    )
    success_rate = passed_checks / (total_docs * 3) * 100

    print(f"\n📊 Taux de succès global: {success_rate:.1f}%")

    if success_rate >= 80:
        print("\n✅ PHASE 1 CORRECTIONS VALIDÉES")
    else:
        print("\n⚠️ PHASE 1 CORRECTIONS PARTIELLES - Vérification manuelle requise")

    print(f"\n📁 Résultats dans: {output_dir}")


def _count_present(terms, lowered_text):
    """Return how many of *terms* occur, case-insensitively, in *lowered_text*.

    *lowered_text* must already be lower-cased by the caller.
    """
    return sum(term.lower() in lowered_text for term in terms)
|
|
|
|
# Script entry point: run the Phase 1 check only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    test_phase1_corrections()
|