#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Phase 1 validation on the production corpus
-------------------------------------------
Runs the three Phase 1 correction checks on up to 5 documents of the
production corpus and prints a per-document report, a summary and a verdict.
"""

import sys
from pathlib import Path
import json

# Make the parent directory importable so the anonymizer module resolves.
sys.path.insert(0, str(Path(__file__).parent.parent))

from anonymizer_core_refactored_onnx import process_pdf

# Maximum number of documents exercised by this script.
MAX_DOCS = 5
# File-name suffix excluded when picking fallback documents.
RASTER_SUFFIX = ".redacted_raster.pdf"


def _masked_near_first_hit(
    text: str, lowered: str, term: str, tag: str, before: int, after: int
) -> bool:
    """Return True if *tag* occurs in a window around the first hit of *term*.

    *lowered* is ``text.lower()`` and *term* must already occur in it. The
    window spans *before* characters ahead of the hit and *after* characters
    past its end, and is sliced from the original-case *text*.
    """
    # NOTE: the offset comes from the lowercased text and is applied to the
    # original text, which assumes lowercasing does not change its length.
    idx = lowered.find(term)
    return tag in text[max(0, idx - before):idx + len(term) + after]


# Preferred documents from the production corpus (OGC 008).
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs")
test_docs = [
    corpus_dir / "008_23001234" / "CRH 23001234.pdf",
    corpus_dir / "008_23001234" / "CRO 23001234.pdf",
]

# Fallback: if the OGC 008 documents are missing, take the first eligible PDF
# of each corpus sub-directory until MAX_DOCS documents are collected.
if not test_docs[0].exists():
    test_docs = []
    for ogc_dir in sorted(corpus_dir.glob("*_*")):
        # Sorted so the selected document is reproducible between runs.
        first_pdf = next(
            (
                pdf
                for pdf in sorted(ogc_dir.glob("*.pdf"))
                if not pdf.name.endswith(RASTER_SUFFIX)
            ),
            None,
        )
        if first_pdf is not None:
            test_docs.append(first_pdf)
        if len(test_docs) >= MAX_DOCS:
            break

print("=" * 80)
print("VALIDATION PHASE 1 - CORPUS PRODUCTION")
print("=" * 80)
print()

out_dir = Path("tests/phase1_production_test")
# parents=True: the "tests" directory may not exist in the working directory.
out_dir.mkdir(parents=True, exist_ok=True)

results = {
    "date_correction": {"passed": 0, "failed": 0, "total": 0},
    "medication_preservation": {"passed": 0, "failed": 0, "total": 0},
    "medical_terms_preservation": {"passed": 0, "failed": 0, "total": 0},
}
# Documents that raised during processing or produced no text output; they
# must prevent a "validated" verdict even though no check ran on them.
doc_errors = 0

for pdf_path in test_docs[:MAX_DOCS]:
    if not pdf_path.exists():
        continue

    print(f"📄 {pdf_path.parent.name}/{pdf_path.name}")
    print("-" * 80)

    # Only the pipeline call and the file read are guarded; the checks below
    # are plain string operations.
    try:
        process_pdf(
            pdf_path=pdf_path,
            out_dir=out_dir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
            config_path=Path("config/dictionnaires.yml"),
            use_hf=False,
            ner_manager=None,
            vlm_manager=None,
        )

        # Read the anonymized text written next to the other outputs.
        text_file = out_dir / f"{pdf_path.stem}.pseudonymise.txt"
        if not text_file.exists():
            print("⚠️ Fichier texte non trouvé")
            doc_errors += 1
            continue

        text = text_file.read_text(encoding="utf-8")
    except Exception as e:
        # Top-level boundary: report the failure and move on to the next file.
        print(f"❌ Erreur: {e}")
        print()
        doc_errors += 1
        continue

    lowered = text.lower()

    # Check 1: no generic [DATE] placeholder may remain.
    date_count = text.count("[DATE]")
    date_naissance_count = text.count("[DATE_NAISSANCE]")

    results["date_correction"]["total"] += 1
    if date_count == 0:
        print(f"✅ Correction 1: [DATE] = {date_count}, [DATE_NAISSANCE] = {date_naissance_count}")
        results["date_correction"]["passed"] += 1
    else:
        print(f"❌ Correction 1: [DATE] = {date_count} (attendu: 0)")
        results["date_correction"]["failed"] += 1

    # Check 2: medication names are preserved (only applicable when at least
    # one known medication appears in the text).
    medications = ["idacio", "salazopyrine", "infliximab", "methotrexate",
                   "cortancyl", "bisoprolol", "entresto"]
    meds_found = [m for m in medications if m in lowered]

    if meds_found:
        results["medication_preservation"]["total"] += 1
        # A [NOM] tag close to the first hit is treated as a masked medication.
        meds_masked = [
            m for m in meds_found
            if _masked_near_first_hit(text, lowered, m, "[NOM]", 10, 10)
        ]

        if not meds_masked:
            print(f"✅ Correction 2: Médicaments préservés: {', '.join(meds_found[:3])}")
            results["medication_preservation"]["passed"] += 1
        else:
            print(f"❌ Correction 2: Médicaments masqués: {', '.join(meds_masked)}")
            results["medication_preservation"]["failed"] += 1

    # Check 3: structural medical terms are preserved.
    medical_terms = ["chef de service", "chef de clinique", "praticien hospitalier",
                     "service de", "unité de"]
    terms_found = [t for t in medical_terms if t in lowered]

    if terms_found:
        results["medical_terms_preservation"]["total"] += 1
        # A [MASK] tag close to the first hit is treated as a masked term.
        terms_masked = [
            t for t in terms_found
            if _masked_near_first_hit(text, lowered, t, "[MASK]", 5, 15)
        ]

        if not terms_masked:
            print(f"✅ Correction 3: Termes médicaux préservés: {', '.join(terms_found[:2])}")
            results["medical_terms_preservation"]["passed"] += 1
        else:
            print(f"❌ Correction 3: Termes masqués: {', '.join(terms_masked)}")
            results["medical_terms_preservation"]["failed"] += 1

    print()

# Summary
print("=" * 80)
print("RÉSUMÉ")
print("=" * 80)

for test_name, test_results in results.items():
    total = test_results["total"]
    passed = test_results["passed"]
    failed = test_results["failed"]

    if total > 0:
        success_rate = (passed / total) * 100
        status = "✅" if failed == 0 else "❌"
        print(f"{status} {test_name}: {passed}/{total} ({success_rate:.1f}%)")
    else:
        print(f"⚪ {test_name}: Aucun test applicable")

if doc_errors:
    print(f"❌ Documents en erreur: {doc_errors}")

print()

# Verdict: all() over an empty selection is True, so success additionally
# requires that at least one check ran and that no document errored out.
checks_ran = any(r["total"] > 0 for r in results.values())
all_passed = doc_errors == 0 and all(
    r["failed"] == 0 for r in results.values() if r["total"] > 0
)

if not checks_ran and doc_errors == 0:
    print("⚠️ Aucun document testé - Vérifier le chemin du corpus")
elif all_passed:
    print("✅ PHASE 1 VALIDÉE - Toutes les corrections fonctionnent")
else:
    print("⚠️ Certains tests ont échoué - Vérifier les résultats")