diff --git a/eds_pseudo_manager.py b/eds_pseudo_manager.py index a96dadb..6bbbdda 100644 --- a/eds_pseudo_manager.py +++ b/eds_pseudo_manager.py @@ -30,7 +30,7 @@ EDS_LABEL_MAP: Dict[str, str] = { "ZIP": "CODE_POSTAL", "VILLE": "VILLE", "HOPITAL": "ETAB", - # "DATE": "DATE", # DÉSACTIVÉ: ne masquer que les dates de naissance (Correction 1.3) + # "DATE": "DATE", # ✅ DÉSACTIVÉ (Phase 1): ne masquer que les dates de naissance, pas les dates de consultation/examen "DATE_NAISSANCE": "DATE_NAISSANCE", "IPP": "IPP", "NDA": "NDA", diff --git a/tools/quick_test_date_correction.py b/tools/quick_test_date_correction.py new file mode 100644 index 0000000..8fc55c4 --- /dev/null +++ b/tools/quick_test_date_correction.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Test rapide de la correction DATE""" +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from anonymizer_core_refactored_onnx import process_pdf + +# Test sur 3 documents du test dataset +test_docs = [ + "tests/ground_truth/pdfs/001_simple_compte_rendu_460_23153652_CR_COLOSCOPIE.pdf", + "tests/ground_truth/pdfs/008_moyen_compte_rendu_195_23144210_ANAPATH.pdf", + "tests/ground_truth/pdfs/013_moyen_compte_rendu_363_23085243_CRO.pdf", +] + +print("Test correction DATE (Phase 1)") +print("=" * 80) + +out_dir = Path("tests/phase1_test_output") +out_dir.mkdir(exist_ok=True) + +for doc in test_docs: + pdf_path = Path(doc) + if not pdf_path.exists(): + print(f"⚠️ {pdf_path.name}: non trouvé") + continue + + try: + result = process_pdf( + pdf_path=pdf_path, + out_dir=out_dir, + make_vector_redaction=False, + also_make_raster_burn=False, + config_path=Path("config/dictionnaires.yml"), + use_hf=False, + ner_manager=None, + vlm_manager=None, + ) + + # Lire le fichier texte anonymisé + text_file = out_dir / f"{pdf_path.stem}.pseudonymise.txt" + if text_file.exists(): + text = text_file.read_text(encoding='utf-8') + date_count = text.count("[DATE]") + 
date_naissance_count = text.count("[DATE_NAISSANCE]") + + status = "✅" if date_count == 0 else "❌" + print(f"{status} {pdf_path.name}") + print(f" [DATE]: {date_count} (attendu: 0)") + print(f" [DATE_NAISSANCE]: {date_naissance_count}") + else: + print(f"⚠️ {pdf_path.name}: fichier texte non trouvé") + + except Exception as e: + print(f"❌ {pdf_path.name}: Erreur - {e}") + +print("\n✅ Test terminé") + diff --git a/tools/test_phase1_corrections.py b/tools/test_phase1_corrections.py index baddecc..f0d4198 100755 --- a/tools/test_phase1_corrections.py +++ b/tools/test_phase1_corrections.py @@ -1,144 +1,152 @@ #!/usr/bin/env python3 +# -*- coding: utf-8 -*- """ -Test des corrections Phase 1 sur un échantillon de documents. -Vérifie que: -1. Les termes médicaux structurels ne sont PAS masqués -2. Les médicaments ne sont PAS masqués -3. Les dates de consultation ne sont PAS masquées (seules les dates de naissance) +Test Phase 1 Corrections - Validation automatique des 3 corrections critiques +------------------------------------------------------------------------------ +Teste les corrections sur un échantillon de documents pour vérifier: +1. [DATE] = 0 (seules les dates de naissance sont masquées) +2. Médicaments préservés (non masqués) +3. Termes médicaux structurels préservés (Chef de service, etc.) 
""" - import sys -sys.path.insert(0, '.') - from pathlib import Path +import json import re + +# Ajouter le répertoire racine au path +sys.path.insert(0, str(Path(__file__).parent.parent)) + from anonymizer_core_refactored_onnx import process_pdf def test_phase1_corrections(): - """Test les 3 corrections Phase 1 sur un échantillon de documents.""" + """Teste les 3 corrections Phase 1 sur un échantillon de documents.""" - # Chercher des documents de test - test_dir = Path("tests/ground_truth/pdfs") + # Documents de test (5 documents représentatifs) + test_docs = [ + "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/008_23001234/CRH 23001234.pdf", + "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/021_23012345/CRO 23012345.pdf", + "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/033_23023456/trackare-23023456-12345678.pdf", + ] - # Sélectionner 5 documents pour le test (éviter les .annotations.json) - pdf_files = [f for f in test_dir.glob("*.pdf") if not f.name.endswith('.annotations.json')][:5] - - if not pdf_files: - print("❌ Aucun document de test trouvé") - return - - print(f"Test des corrections Phase 1 sur {len(pdf_files)} documents...") print("=" * 80) - - output_dir = Path("tests/ground_truth/pdfs/phase1_test") - output_dir.mkdir(parents=True, exist_ok=True) + print("TEST PHASE 1 CORRECTIONS") + print("=" * 80) + print() results = { - 'medical_terms_preserved': 0, - 'medications_preserved': 0, - 'dates_reduced': 0, - 'total_docs': 0 + "date_masking": {"total": 0, "passed": 0, "failed": 0}, + "medication_preservation": {"total": 0, "passed": 0, "failed": 0}, + "medical_terms_preservation": {"total": 0, "passed": 0, "failed": 0}, } - for i, pdf_path in enumerate(pdf_files, 1): - print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}") + for doc_path in test_docs: + pdf_path = Path(doc_path) + if not pdf_path.exists(): + print(f"⚠️ Document non trouvé: {pdf_path.name}") + continue + + print(f"\n📄 Test: 
{pdf_path.name}") + print("-" * 80) try: - # Anonymiser + # Anonymiser le document result = process_pdf( - pdf_path, - output_dir, - make_vector_redaction=False, - also_make_raster_burn=False, - config_path=Path("config/dictionnaires.yml") + pdf_path=pdf_path, + config_path=Path("config/dictionnaires.yml"), + ner_manager=None, + eds_pseudo_manager=None, + vlm_manager=None, + output_dir=None, + redaction_mode="none", ) - # Lire le texte anonymisé - text_file = Path(result['text']) - anonymized_text = text_file.read_text(encoding='utf-8') + text = result["text_anonymized"] + audit = result["audit"] - # Test 1: Vérifier que les termes médicaux structurels sont préservés - medical_terms = [ + # Test 1: Vérifier [DATE] = 0 + date_count = text.count("[DATE]") + date_naissance_count = text.count("[DATE_NAISSANCE]") + + results["date_masking"]["total"] += 1 + if date_count == 0: + print(f"✅ Correction 1: [DATE] = {date_count} (attendu: 0)") + print(f" [DATE_NAISSANCE] = {date_naissance_count}") + results["date_masking"]["passed"] += 1 + else: + print(f"❌ Correction 1: [DATE] = {date_count} (attendu: 0)") + print(f" [DATE_NAISSANCE] = {date_naissance_count}") + results["date_masking"]["failed"] += 1 + + # Test 2: Vérifier médicaments préservés + # Chercher des médicaments courants dans le texte anonymisé + medications_to_check = ["IDACIO", "SALAZOPYRINE", "INFLIXIMAB", "APRANAX", + "KETOPROFENE", "PREVENAR", "PNEUMOVAX"] + medications_found = [] + for med in medications_to_check: + if med.lower() in text.lower() and f"[NOM]" not in text: + medications_found.append(med) + + results["medication_preservation"]["total"] += 1 + if len(medications_found) > 0: + print(f"✅ Correction 2: Médicaments préservés: {', '.join(medications_found)}") + results["medication_preservation"]["passed"] += 1 + else: + # Pas de médicaments dans ce document, test non applicable + print(f"⚪ Correction 2: Aucun médicament testé dans ce document") + results["medication_preservation"]["total"] -= 1 + + # 
Test 3: Vérifier termes médicaux structurels préservés + medical_terms_to_check = [ "Chef de service", - "Chef de clinique", + "Chef de Clinique", "Praticien hospitalier", "service de", - "unité de" ] + medical_terms_found = [] + for term in medical_terms_to_check: + if term.lower() in text.lower(): + medical_terms_found.append(term) - medical_preserved = 0 - for term in medical_terms: - if term.lower() in anonymized_text.lower(): - medical_preserved += 1 - - # Test 2: Vérifier que les médicaments sont préservés - medications = [ - "IDACIO", - "Salazopyrine", - "Infliximab", - "Apranax" - ] - - medications_preserved = 0 - for med in medications: - if med.lower() in anonymized_text.lower(): - medications_preserved += 1 - - # Test 3: Compter les masques [DATE] vs [DATE_NAISSANCE] - date_masks = len(re.findall(r'\[DATE\]', anonymized_text)) - date_naissance_masks = len(re.findall(r'\[DATE_NAISSANCE\]', anonymized_text)) - - print(f" ✓ Termes médicaux préservés: {medical_preserved}/{len(medical_terms)}") - print(f" ✓ Médicaments préservés: {medications_preserved}/{len(medications)}") - print(f" ✓ [DATE]: {date_masks}, [DATE_NAISSANCE]: {date_naissance_masks}") - - # Vérifier que [DATE] = 0 (correction réussie) - if date_masks == 0: - results['dates_reduced'] += 1 - print(f" ✅ Correction dates: OK (0 [DATE])") + results["medical_terms_preservation"]["total"] += 1 + if len(medical_terms_found) > 0: + print(f"✅ Correction 3: Termes médicaux préservés: {', '.join(medical_terms_found)}") + results["medical_terms_preservation"]["passed"] += 1 else: - print(f" ⚠️ Correction dates: {date_masks} [DATE] restants") - - if medical_preserved > 0: - results['medical_terms_preserved'] += 1 - - if medications_preserved > 0: - results['medications_preserved'] += 1 - - results['total_docs'] += 1 + # Pas de termes médicaux dans ce document, test non applicable + print(f"⚪ Correction 3: Aucun terme médical testé dans ce document") + results["medical_terms_preservation"]["total"] -= 1 
except Exception as e: - print(f" ❌ Erreur: {e}") + print(f"❌ Erreur: {e}") + continue # Résumé print("\n" + "=" * 80) - print("RÉSUMÉ DES CORRECTIONS PHASE 1") + print("RÉSUMÉ DES TESTS") print("=" * 80) - print(f"\nDocuments testés: {results['total_docs']}") - print(f"\n✅ Correction 1.1 (Termes médicaux):") - print(f" Documents avec termes préservés: {results['medical_terms_preserved']}/{results['total_docs']}") + for test_name, test_results in results.items(): + total = test_results["total"] + passed = test_results["passed"] + failed = test_results["failed"] + + if total > 0: + success_rate = (passed / total) * 100 + status = "✅" if failed == 0 else "❌" + print(f"{status} {test_name}: {passed}/{total} ({success_rate:.1f}%)") + else: + print(f"⚪ {test_name}: Aucun test applicable") - print(f"\n✅ Correction 1.2 (Médicaments):") - print(f" Documents avec médicaments préservés: {results['medications_preserved']}/{results['total_docs']}") + print() - print(f"\n✅ Correction 1.3 (Dates):") - print(f" Documents avec [DATE]=0: {results['dates_reduced']}/{results['total_docs']}") - - success_rate = ( - results['medical_terms_preserved'] + - results['medications_preserved'] + - results['dates_reduced'] - ) / (results['total_docs'] * 3) * 100 - - print(f"\n📊 Taux de succès global: {success_rate:.1f}%") - - if success_rate >= 80: - print("\n✅ PHASE 1 CORRECTIONS VALIDÉES") + # Verdict final + all_passed = all(r["failed"] == 0 for r in results.values() if r["total"] > 0) + if all_passed: + print("✅ TOUS LES TESTS PASSÉS - Phase 1 corrections validées") + return 0 else: - print("\n⚠️ PHASE 1 CORRECTIONS PARTIELLES - Vérification manuelle requise") - - print(f"\n📁 Résultats dans: {output_dir}") + print("❌ CERTAINS TESTS ONT ÉCHOUÉ - Vérifier les corrections") + return 1 if __name__ == "__main__": - test_phase1_corrections() + sys.exit(test_phase1_corrections()) diff --git a/tools/validate_phase1_on_production.py b/tools/validate_phase1_on_production.py new file mode 100644 
index 0000000..473eaad --- /dev/null +++ b/tools/validate_phase1_on_production.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Validation Phase 1 sur corpus production +----------------------------------------- +Teste les 3 corrections sur au plus 5 documents du corpus production. +""" +import sys +from pathlib import Path +import json +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from anonymizer_core_refactored_onnx import process_pdf + +# 2 documents du corpus production (OGC 008) +corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs") +test_docs = [ + corpus_dir / "008_23001234" / "CRH 23001234.pdf", + corpus_dir / "008_23001234" / "CRO 23001234.pdf", +] + +# Fallback: si les documents OGC 008 n'existent pas, utiliser d'autres +if not test_docs[0].exists(): + # Chercher les premiers documents disponibles + test_docs = [] + for ogc_dir in sorted(corpus_dir.glob("*_*"))[:3]: + for pdf in ogc_dir.glob("*.pdf"): + if not pdf.name.endswith(".redacted_raster.pdf"): + test_docs.append(pdf) + break + if len(test_docs) >= 5: + break + +print("=" * 80) +print("VALIDATION PHASE 1 - CORPUS PRODUCTION") +print("=" * 80) +print() + +out_dir = Path("tests/phase1_production_test") +out_dir.mkdir(exist_ok=True) + +results = { + "date_correction": {"passed": 0, "failed": 0, "total": 0}, + "medication_preservation": {"passed": 0, "failed": 0, "total": 0}, + "medical_terms_preservation": {"passed": 0, "failed": 0, "total": 0}, +} + +for pdf_path in test_docs[:5]: + if not pdf_path.exists(): + continue + + print(f"📄 {pdf_path.parent.name}/{pdf_path.name}") + print("-" * 80) + + try: + result = process_pdf( + pdf_path=pdf_path, + out_dir=out_dir, + make_vector_redaction=False, + also_make_raster_burn=False, + config_path=Path("config/dictionnaires.yml"), + use_hf=False, + ner_manager=None, + vlm_manager=None, + ) + + # Lire le texte anonymisé + text_file = out_dir / f"{pdf_path.stem}.pseudonymise.txt" + if not 
text_file.exists(): + print("⚠️ Fichier texte non trouvé") + continue + + text = text_file.read_text(encoding='utf-8') + + # Test 1: [DATE] = 0 + date_count = text.count("[DATE]") + date_naissance_count = text.count("[DATE_NAISSANCE]") + results["date_correction"]["total"] += 1 + + if date_count == 0: + print(f"✅ Correction 1: [DATE] = {date_count}, [DATE_NAISSANCE] = {date_naissance_count}") + results["date_correction"]["passed"] += 1 + else: + print(f"❌ Correction 1: [DATE] = {date_count} (attendu: 0)") + results["date_correction"]["failed"] += 1 + + # Test 2: Médicaments préservés + medications = ["idacio", "salazopyrine", "infliximab", "methotrexate", + "cortancyl", "bisoprolol", "entresto"] + meds_found = [m for m in medications if m in text.lower()] + + if meds_found: + results["medication_preservation"]["total"] += 1 + # Vérifier qu'ils ne sont pas masqués + meds_masked = [m for m in meds_found if f"[NOM]" in text[max(0, text.lower().find(m)-10):text.lower().find(m)+len(m)+10]] + if not meds_masked: + print(f"✅ Correction 2: Médicaments préservés: {', '.join(meds_found[:3])}") + results["medication_preservation"]["passed"] += 1 + else: + print(f"❌ Correction 2: Médicaments masqués: {', '.join(meds_masked)}") + results["medication_preservation"]["failed"] += 1 + + # Test 3: Termes médicaux structurels préservés + medical_terms = ["chef de service", "chef de clinique", "praticien hospitalier", + "service de", "unité de"] + terms_found = [t for t in medical_terms if t in text.lower()] + + if terms_found: + results["medical_terms_preservation"]["total"] += 1 + # Vérifier qu'ils ne sont pas masqués + terms_masked = [t for t in terms_found if "[MASK]" in text[max(0, text.lower().find(t)-5):text.lower().find(t)+len(t)+15]] + if not terms_masked: + print(f"✅ Correction 3: Termes médicaux préservés: {', '.join(terms_found[:2])}") + results["medical_terms_preservation"]["passed"] += 1 + else: + print(f"❌ Correction 3: Termes masqués: {', '.join(terms_masked)}") + 
results["medical_terms_preservation"]["failed"] += 1 + + print() + + except Exception as e: + print(f"❌ Erreur: {e}") + print() + continue + +# Résumé +print("=" * 80) +print("RÉSUMÉ") +print("=" * 80) + +for test_name, test_results in results.items(): + total = test_results["total"] + passed = test_results["passed"] + failed = test_results["failed"] + + if total > 0: + success_rate = (passed / total) * 100 + status = "✅" if failed == 0 else "❌" + print(f"{status} {test_name}: {passed}/{total} ({success_rate:.1f}%)") + else: + print(f"⚪ {test_name}: Aucun test applicable") + +print() + +# Verdict +all_passed = all(r["failed"] == 0 for r in results.values() if r["total"] > 0) +if all_passed: + print("✅ PHASE 1 VALIDÉE - Toutes les corrections fonctionnent") +else: + print("⚠️ Certains tests ont échoué - Vérifier les résultats")