#!/usr/bin/env python3
"""
Check selective global propagation on ALL CRO documents of the 59-OGC corpus.

Every PDF whose name contains "CRO" is anonymized with ``process_pdf`` and the
resulting text is scanned for two kinds of residual leaks:

* a date following a "Né le" / "Née le" / "Né(e) le" label;
* the literal token "CHCB".

A summary is printed to the console and saved as ``test_report.txt`` in the
output directory.
"""

import sys
sys.path.insert(0, '.')  # must run before the project import below

from pathlib import Path
import re
from anonymizer_core_refactored_onnx import process_pdf
import time

# Locations used by the run (machine-specific corpus path kept from the
# original script).
OGC_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTPUT_DIR = Path("tests/ground_truth/pdfs/test_all_cro")
CONFIG_PATH = Path("config/dictionnaires.yml")

# Date following a birth label. The literal "(e)" variant has to be spelled
# out: a bare optional "e" stops at the opening parenthesis, so "Né(e) le"
# would never be detected.
_BIRTH_DATE_LEAK_RE = re.compile(
    r'Né(?:e|\(e\))?\s+le\s+(\d{1,2}[\s/.\-]+\d{1,2}[\s/.\-]+\d{2,4})',
    re.IGNORECASE,
)
# "CHCB" left in clear text (case-sensitive, whole word).
_CHCB_LEAK_RE = re.compile(r'\bCHCB\b')

_RULE = "=" * 80


def _check_document(pdf_path):
    """Anonymize one PDF, print its status line and return its result record.

    The record always holds 'file', 'path' and 'success'. It additionally
    holds 'context_leaks' and 'chcb_leaks' (counts) when the document could be
    processed, or 'error' (message string) when it could not.
    """
    try:
        # Only these two calls are expected to fail (processing error, missing
        # 'text' key, unreadable output); one bad document must not abort the
        # whole batch, so the failure is recorded instead of propagated.
        result = process_pdf(
            pdf_path,
            OUTPUT_DIR,
            make_vector_redaction=False,
            also_make_raster_burn=False,
            config_path=CONFIG_PATH,
        )
        anonymized_text = Path(result['text']).read_text(encoding='utf-8')
    except Exception as e:
        print(f" ❌ Erreur: {e}")
        return {
            'file': pdf_path.name,
            'path': str(pdf_path),
            'error': str(e),
            'success': False,
        }

    context_leaks = _BIRTH_DATE_LEAK_RE.findall(anonymized_text)
    chcb_leaks = _CHCB_LEAK_RE.findall(anonymized_text)
    total_leaks = len(context_leaks) + len(chcb_leaks)

    status = "✅" if total_leaks == 0 else "❌"
    print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHCB: {len(chcb_leaks)}")
    if context_leaks:
        print(f" Exemples dates: {context_leaks[:3]}")
    if chcb_leaks:
        print(f" Exemples CHCB: {chcb_leaks[:3]}")

    return {
        'file': pdf_path.name,
        'path': str(pdf_path),
        'context_leaks': len(context_leaks),
        'chcb_leaks': len(chcb_leaks),
        'success': total_leaks == 0,
    }


def _leak_doc_lines(doc):
    """Return the report lines describing one document that still leaks."""
    return [
        f"{doc['file']}",
        f" Path: {doc['path']}",
        f" Fuites dates: {doc.get('context_leaks', 0)}",
        f" Fuites CHCB: {doc.get('chcb_leaks', 0)}",
    ]


def _error_doc_lines(doc):
    """Return the report lines describing one document that raised an error."""
    return [
        f"{doc['file']}",
        f" Erreur: {doc['error']}",
    ]


def _print_section(title, docs, doc_lines):
    """Print a titled section to the console, one paragraph per document."""
    print("\n" + _RULE)
    print(title)
    print(_RULE)
    for doc in docs:
        print("\n" + "\n".join(doc_lines(doc)))


def _write_section(f, title, docs, doc_lines):
    """Write a titled section to the report file, one paragraph per document."""
    f.write(_RULE + "\n")
    f.write(title + "\n")
    f.write(_RULE + "\n\n")
    for doc in docs:
        f.write("\n".join(doc_lines(doc)) + "\n\n")


def test_all_cro():
    """Run the leak check on every CRO PDF of the corpus and write a report.

    Takes no argument and returns None. Progress and the summary are printed;
    the same summary is written to ``OUTPUT_DIR / "test_report.txt"``. When no
    CRO file is found, a message is printed and nothing else is done.
    """
    print("Recherche de tous les CRO dans le corpus...")
    # Sorted so that the [i/N] numbering and the report order are stable
    # between runs (rglob order depends on the filesystem).
    cro_files = sorted(pdf for pdf in OGC_DIR.rglob("*CRO*.pdf") if pdf.is_file())

    if not cro_files:
        print("❌ Aucun CRO trouvé")
        return

    print(f"Trouvé {len(cro_files)} CRO dans le corpus")
    print(_RULE)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    results = []
    start_time = time.perf_counter()  # monotonic: immune to wall-clock jumps
    for i, pdf_path in enumerate(cro_files, 1):
        print(f"\n[{i}/{len(cro_files)}] {pdf_path.name}")
        results.append(_check_document(pdf_path))
    elapsed_time = time.perf_counter() - start_time

    # Aggregate figures; `results` is non-empty here because of the early
    # return above, so the divisions below are safe.
    success_count = sum(1 for r in results if r.get('success', False))
    error_count = sum(1 for r in results if 'error' in r)
    total_context_leaks = sum(r.get('context_leaks', 0) for r in results)
    total_chcb_leaks = sum(r.get('chcb_leaks', 0) for r in results)

    stats_lines = [
        f"Documents testés: {len(results)}",
        f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)",
        f"Erreurs: {error_count}",
        f"Fuites 'Né(e) le' totales: {total_context_leaks}",
        f"Fuites CHCB totales: {total_chcb_leaks}",
        f"Temps total: {elapsed_time:.1f}s ({elapsed_time/len(results):.1f}s/doc)",
    ]

    # Documents processed without error but still leaking, then documents
    # that raised; each section is emitted only when it has entries.
    failed_docs = [r for r in results if not r.get('success', False) and 'error' not in r]
    error_docs = [r for r in results if 'error' in r]
    sections = [
        (f"DOCUMENTS AVEC FUITES ({len(failed_docs)})", failed_docs, _leak_doc_lines),
        (f"DOCUMENTS EN ERREUR ({len(error_docs)})", error_docs, _error_doc_lines),
    ]

    # Console summary
    print("\n" + _RULE)
    print("RÉSUMÉ GLOBAL")
    print(_RULE)
    print("\n".join(stats_lines))
    for title, docs, doc_lines in sections:
        if docs:
            _print_section(title, docs, doc_lines)

    if success_count == len(results):
        print("\n✅ TOUS LES TESTS PASSENT - Propagation globale sélective fonctionne sur TOUS les CRO!")
    else:
        print(f"\n⚠️ {len(results) - success_count} documents ont encore des fuites ou erreurs")

    print(f"\n📁 Résultats dans: {OUTPUT_DIR}")

    # Persist the same report to disk
    report_file = OUTPUT_DIR / "test_report.txt"
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(_RULE + "\n")
        f.write("RAPPORT DE TEST - TOUS LES CRO\n")
        f.write(_RULE + "\n\n")
        f.write("\n".join(stats_lines) + "\n\n")
        for title, docs, doc_lines in sections:
            if docs:
                _write_section(f, title, docs, doc_lines)

    print(f"📄 Rapport sauvegardé: {report_file}")


if __name__ == "__main__":
    test_all_cro()