#!/usr/bin/env python3
"""
Test selective global propagation on CRO documents that had date leaks.

Also exercises a lightweight post-anonymization validation: the anonymized
text of each document is scanned for residual birth dates and for the
establishment acronym in clear text.
"""
import sys
sys.path.insert(0, '.')

from pathlib import Path
import re
from anonymizer_core_refactored_onnx import process_pdf

# Directory holding the OGC source PDFs that are searched for CRO files.
_OGC_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
# Maximum number of CRO files processed in one run.
_MAX_CRO_FILES = 5
# Where process_pdf is asked to write its outputs.
_OUTPUT_DIR = Path("tests/ground_truth/pdfs/test_propagation")
# Dictionary configuration handed to process_pdf.
_CONFIG_PATH = Path("config/dictionnaires.yml")

# Birth date introduced by "Né le", "Née le" or the literal "Né(e) le".
# The parenthesised form must be escaped explicitly: a bare "(?:e)?" only
# covers "Né"/"Née" and silently misses "Né(e) le".
_DATE_CONTEXT_PATTERN = re.compile(
    r'Né(?:e|\(e\))?\s+le\s+(\d{1,2}[\s/.\-]+\d{1,2}[\s/.\-]+\d{2,4})',
    re.IGNORECASE,
)
# Any dd/mm/yyyy-like date, regardless of context.
_DATE_STANDALONE_PATTERN = re.compile(r'\b(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{4})\b')
# Placeholders that mark a date as already anonymized.
_PLACEHOLDER_PATTERN = re.compile(r'\[DATE_NAISSANCE\]|\[DATE\]')
# Establishment acronym that must not remain in clear text.
_CHCB_PATTERN = re.compile(r'\bCHCB\b')


def _find_cro_files(root: Path, limit: int) -> list:
    """Return up to *limit* regular files under *root* matching ``*CRO*.pdf``."""
    found = []
    for pdf in root.rglob("*CRO*.pdf"):
        if pdf.is_file():
            found.append(pdf)
            if len(found) >= limit:
                break
    return found


def _scan_leaks(anonymized_text: str) -> tuple:
    """Scan anonymized text for residual sensitive data.

    Returns a ``(context_leaks, standalone_leaks, chcb_leaks)`` tuple of lists:

    * ``context_leaks``: dates still preceded by a "Né(e) le" style marker;
    * ``standalone_leaks``: context-free dates that do not appear on any line
      containing a date placeholder (informational, not counted as failures);
    * ``chcb_leaks``: occurrences of "CHCB" in clear text.
    """
    context_leaks = _DATE_CONTEXT_PATTERN.findall(anonymized_text)

    standalone_dates = _DATE_STANDALONE_PATTERN.findall(anonymized_text)
    lines_with_placeholders = [
        line for line in anonymized_text.split('\n')
        if _PLACEHOLDER_PATTERN.search(line)
    ]
    # A date sharing a line with a placeholder is ignored here.
    standalone_leaks = [
        d for d in standalone_dates
        if not any(d in line for line in lines_with_placeholders)
    ]

    chcb_leaks = _CHCB_PATTERN.findall(anonymized_text)
    return context_leaks, standalone_leaks, chcb_leaks


def _print_summary(results: list, output_dir: Path) -> None:
    """Print aggregated statistics for a non-empty list of per-file results."""
    print("\n" + "=" * 80)
    print("RÉSUMÉ")
    print("=" * 80)

    success_count = sum(1 for r in results if r.get('success', False))
    total_context_leaks = sum(r.get('context_leaks', 0) for r in results)
    total_chcb_leaks = sum(r.get('chcb_leaks', 0) for r in results)
    total_standalone = sum(r.get('standalone_dates', 0) for r in results)

    print(f"Documents testés: {len(results)}")
    print(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)")
    print(f"Fuites 'Né(e) le' totales: {total_context_leaks}")
    print(f"Fuites CHCB totales: {total_chcb_leaks}")
    print(f"Dates standalone (info): {total_standalone}")

    if success_count == len(results):
        print("\n✅ TOUS LES TESTS PASSENT - Propagation globale sélective fonctionne!")
    else:
        print(f"\n⚠️ {len(results) - success_count} documents ont encore des fuites")

    print(f"\n📁 Résultats dans: {output_dir}")
    print("\n💡 Pour validation complète, exécutez:")
    print(f" python3 tools/validate_anonymization.py {output_dir}/*.txt")


def test_date_propagation():
    """Anonymize a few CRO PDFs and report residual date / CHCB leaks.

    Up to ``_MAX_CRO_FILES`` files matching ``*CRO*.pdf`` are taken from
    ``_OGC_DIR``. Each one is passed to ``process_pdf``; the text file named
    by the ``'text'`` entry of its result is then scanned for leaks. A
    per-file status line and a final summary are printed. A document counts
    as a success when it has no "Né(e) le" date leak and no "CHCB" leak;
    standalone dates are reported for information only.

    Returns None. Errors raised while processing a file are printed and
    recorded as a failure for that file so the remaining files still run.
    """
    cro_files = _find_cro_files(_OGC_DIR, _MAX_CRO_FILES)
    if not cro_files:
        print("❌ Aucun CRO trouvé")
        return

    print(f"Test de propagation sur {len(cro_files)} CRO...")
    print("=" * 80)

    output_dir = _OUTPUT_DIR
    output_dir.mkdir(parents=True, exist_ok=True)

    results = []
    for i, pdf_path in enumerate(cro_files, 1):
        print(f"\n[{i}/{len(cro_files)}] {pdf_path.name}")
        try:
            # Anonymize using the dictionary configuration.
            result = process_pdf(
                pdf_path,
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,
                config_path=_CONFIG_PATH,
            )

            # Read back the anonymized text produced for this document.
            anonymized_text = Path(result['text']).read_text(encoding='utf-8')
            context_leaks, standalone_leaks, chcb_leaks = _scan_leaks(anonymized_text)

            # Only contextual dates and CHCB occurrences count as real leaks.
            total_leaks = len(context_leaks) + len(chcb_leaks)
            status = "✅" if total_leaks == 0 else "❌"
            print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHCB: {len(chcb_leaks)}")
            if context_leaks:
                print(f" Exemples dates: {context_leaks[:3]}")
            if chcb_leaks:
                print(f" Exemples CHCB: {chcb_leaks[:3]}")
            # Informational: standalone dates are not necessarily leaks.
            if standalone_leaks:
                print(f" ℹ️ Dates standalone (à vérifier): {len(standalone_leaks)}")

            results.append({
                'file': pdf_path.name,
                'context_leaks': len(context_leaks),
                'chcb_leaks': len(chcb_leaks),
                'standalone_dates': len(standalone_leaks),
                'success': total_leaks == 0,
            })
        except Exception as e:
            # Best-effort run: one failing document must not abort the others.
            print(f" ❌ Erreur: {e}")
            results.append({
                'file': pdf_path.name,
                'error': str(e),
                'success': False,
            })

    _print_summary(results, output_dir)


if __name__ == "__main__":
    test_date_propagation()