#!/usr/bin/env python3
"""
Check selective global propagation on CRO documents that had date leaks.

Runs the anonymizer on a few CRO (operative report) PDFs and scans the
resulting text for birth dates and for the literal token "CHCB".
"""
import sys
sys.path.insert(0, '.')

from itertools import islice
from pathlib import Path
import re
from anonymizer_core_refactored_onnx import process_pdf

# Directory that is searched recursively for CRO PDFs (machine-specific path).
OGC_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")

# Directory handed to process_pdf as its output location.
OUTPUT_DIR = Path("tests/ground_truth/pdfs/test_propagation")

# Maximum number of CRO files processed in one run.
MAX_CRO_FILES = 3

# "Né le <date>" / "Née le <date>" with /, . or - separators, any letter case.
DATE_LEAK_PATTERN = re.compile(
    r'Né(?:e)?\s+le\s+\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}', re.IGNORECASE
)

# The token "CHCB" as a whole word (case-sensitive).
CHCB_LEAK_PATTERN = re.compile(r'\bCHCB\b')


def _find_cro_files(root, limit):
    """Return up to *limit* regular files matching ``*CRO*.pdf`` under *root*."""
    candidates = (pdf for pdf in root.rglob("*CRO*.pdf") if pdf.is_file())
    # islice stops pulling from rglob as soon as `limit` files were found.
    return list(islice(candidates, limit))


def _scan_leaks(text):
    """Return ``(date_leaks, chcb_leaks)``: the lists of matches found in *text*."""
    return DATE_LEAK_PATTERN.findall(text), CHCB_LEAK_PATTERN.findall(text)


def _print_summary(results, output_dir):
    """Print aggregate counts for *results* (a non-empty list of result dicts)."""
    print("\n" + "=" * 80)
    print("RÉSUMÉ")
    print("=" * 80)

    success_count = sum(1 for r in results if r.get('success', False))
    # Entries recorded after an error carry no leak counters, hence the default 0.
    total_date_leaks = sum(r.get('date_leaks', 0) for r in results)
    total_chcb_leaks = sum(r.get('chcb_leaks', 0) for r in results)

    print(f"Documents testés: {len(results)}")
    print(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)")
    print(f"Fuites dates totales: {total_date_leaks}")
    print(f"Fuites CHCB totales: {total_chcb_leaks}")

    if success_count == len(results):
        print("\n✅ TOUS LES TESTS PASSENT - Propagation globale sélective fonctionne!")
    else:
        print(f"\n⚠️ {len(results) - success_count} documents ont encore des fuites")

    print(f"\n📁 Résultats dans: {output_dir}")


def test_date_propagation():
    """Anonymize a few CRO PDFs and report remaining date / "CHCB" leaks.

    Up to ``MAX_CRO_FILES`` PDFs matching ``*CRO*.pdf`` are taken from
    ``OGC_DIR``. Each one is passed to ``process_pdf``; the file referenced by
    the ``'text'`` key of its result is read and scanned with the two leak
    patterns. Per-file results and a summary are printed.

    Returns:
        None. If no CRO file is found, a message is printed and the function
        returns early.

    NOTE(review): this function contains no assertion and swallows per-file
    exceptions, so when collected by pytest it passes even if leaks are
    reported. Confirm whether it is meant as a diagnostic script only.
    """
    cro_files = _find_cro_files(OGC_DIR, MAX_CRO_FILES)

    if not cro_files:
        print("❌ Aucun CRO trouvé")
        return

    print(f"Test de propagation sur {len(cro_files)} CRO...")
    print("=" * 80)

    output_dir = OUTPUT_DIR
    output_dir.mkdir(parents=True, exist_ok=True)

    results = []

    for i, pdf_path in enumerate(cro_files, 1):
        print(f"\n[{i}/{len(cro_files)}] {pdf_path.name}")

        # Best-effort per file: any failure is reported and recorded as a
        # non-success so the remaining files are still processed.
        try:
            result = process_pdf(
                pdf_path,
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False
            )

            # result['text'] is used as the path of the anonymized text file.
            anonymized_text = Path(result['text']).read_text(encoding='utf-8')

            leaks, chcb_leaks = _scan_leaks(anonymized_text)

            status = "✅" if not leaks and not chcb_leaks else "❌"
            print(f" {status} Fuites dates: {len(leaks)}, Fuites CHCB: {len(chcb_leaks)}")

            if leaks:
                print(f" Exemples: {leaks[:3]}")

            results.append({
                'file': pdf_path.name,
                'date_leaks': len(leaks),
                'chcb_leaks': len(chcb_leaks),
                'success': len(leaks) == 0 and len(chcb_leaks) == 0
            })

        except Exception as e:
            print(f" ❌ Erreur: {e}")
            results.append({
                'file': pdf_path.name,
                'error': str(e),
                'success': False
            })

    _print_summary(results, output_dir)


if __name__ == "__main__":
    test_date_propagation()