Files
anonymisation/tools/test_all_cro.py
Domi31tls 1c44a26eb3 chore(rgpd): replace CHCB/Bayonne/Saint-Denis/Réunion refs in source + configs (D-12)
Anonymise toutes les références à des entités réelles (CHCB, Bayonne, Saint-Denis,
Réunion, etc.) dans le code source, les configurations YAML, les scripts/outils,
et les tests unitaires. Conserve les tests synthétiques (cases) intentionnels.

- profile key chcb_strict → chuxx_strict
- CHCB → CHUXX, Bayonne → Chicago, Saint-Denis → Springfield,
  Réunion → Province Bêta, 64100/97400 → 12345, FINESS → 999999999,
  préfixe tél 05.59.44 → 0X.XX.XX
- renomme tools/test_chcb_leak.py → tools/test_force_term_leak.py

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-02 14:39:21 +02:00

176 lines
6.6 KiB
Python

#!/usr/bin/env python3
"""
Test selective global propagation on ALL the CROs of the 59-OGC corpus.
"""
import sys
# Put the current working directory first on the import path. Presumably this
# is what lets the two project-local imports below resolve when the script is
# launched from the repository root — confirm against how it is invoked.
# It must stay ahead of those imports.
sys.path.insert(0, '.')
from pathlib import Path
import re
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf
import time
# Locations used when the caller does not override them (original hard-coded values).
_DEFAULT_OGC_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
_DEFAULT_OUTPUT_DIR = Path("tests/ground_truth/pdfs/test_all_cro")

# Compiled once at import time instead of once per processed document.
# Captures the date that follows a "Né le" / "Née le" marker.
_BIRTH_DATE_CONTEXT_RE = re.compile(
    r'Né(?:e)?\s+le\s+(\d{1,2}[\s/.\-]+\d{1,2}[\s/.\-]+\d{2,4})', re.IGNORECASE
)
# NOTE(review): this looks for the literal token "CHUXX" in the anonymized
# text — confirm that this is still the term expected to be absent.
_CHUXX_RE = re.compile(r'\bCHUXX\b')


def _scan_leaks(text):
    """Return ``(birth_date_leaks, chuxx_leaks)``: the regex matches found in *text*."""
    return _BIRTH_DATE_CONTEXT_RE.findall(text), _CHUXX_RE.findall(text)


def _write_report(report_file, summary_lines, failed_docs, error_docs):
    """Write the plain-text run report (summary, leaking docs, failing docs) as UTF-8."""
    rule = "=" * 80
    parts = [
        rule + "\n",
        "RAPPORT DE TEST - TOUS LES CRO\n",
        rule + "\n\n",
        "\n".join(summary_lines) + "\n\n",
    ]
    if failed_docs:
        parts.append(rule + "\n")
        parts.append(f"DOCUMENTS AVEC FUITES ({len(failed_docs)})\n")
        parts.append(rule + "\n\n")
        for doc in failed_docs:
            parts.append(f"{doc['file']}\n")
            parts.append(f" Path: {doc['path']}\n")
            parts.append(f" Fuites dates: {doc.get('context_leaks', 0)}\n")
            parts.append(f" Fuites CHUXX: {doc.get('chuxx_leaks', 0)}\n\n")
    if error_docs:
        parts.append(rule + "\n")
        parts.append(f"DOCUMENTS EN ERREUR ({len(error_docs)})\n")
        parts.append(rule + "\n\n")
        for doc in error_docs:
            parts.append(f"{doc['file']}\n")
            parts.append(f" Erreur: {doc['error']}\n\n")
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))


def test_all_cro(ogc_dir=None, output_dir=None, processor=None, config_path=None):
    """Anonymize every CRO PDF of a corpus and report residual leaks.

    Every file matching ``*CRO*.pdf`` under *ogc_dir* (searched recursively,
    processed in sorted order) is passed to *processor*. The text file named
    by the ``'text'`` key of the returned mapping is then scanned for dates
    following a "Né(e) le" marker and for the literal token "CHUXX".
    Per-document results and a global summary are printed, and the summary is
    also written to ``test_report.txt`` inside *output_dir*.

    Args:
        ogc_dir: Corpus root to search. Defaults to the original hard-coded
            corpus directory.
        output_dir: Directory handed to *processor* and used for the report.
            Defaults to ``tests/ground_truth/pdfs/test_all_cro``. It is only
            created when at least one CRO is found.
        processor: Callable invoked as ``processor(pdf_path, output_dir,
            make_vector_redaction=False, also_make_raster_burn=False,
            config_path=...)``. Defaults to the module-level ``process_pdf``.
        config_path: Value forwarded as ``config_path``. Defaults to
            ``RUNTIME_DICTIONARIES_CONFIG_PATH``.

    Returns:
        None. Outcomes are reported on stdout and in the report file only.
    """
    ogc_dir = _DEFAULT_OGC_DIR if ogc_dir is None else Path(ogc_dir)
    output_dir = _DEFAULT_OUTPUT_DIR if output_dir is None else Path(output_dir)
    rule = "=" * 80

    # CRO = "compte rendu opératoire" (operative report).
    print("Recherche de tous les CRO dans le corpus...")
    # Sorted so that console output and report order do not depend on the
    # file system's enumeration order.
    cro_files = sorted(pdf for pdf in ogc_dir.rglob("*CRO*.pdf") if pdf.is_file())
    if not cro_files:
        print("❌ Aucun CRO trouvé")
        return

    # Project-level defaults are resolved only once there is work to do.
    if processor is None:
        processor = process_pdf
    if config_path is None:
        config_path = RUNTIME_DICTIONARIES_CONFIG_PATH

    print(f"Trouvé {len(cro_files)} CRO dans le corpus")
    print(rule)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = []
    # perf_counter is monotonic, so the elapsed time cannot be skewed by a
    # wall-clock adjustment during a long run.
    start_time = time.perf_counter()
    for i, pdf_path in enumerate(cro_files, 1):
        print(f"\n[{i}/{len(cro_files)}] {pdf_path.name}")
        try:
            # Anonymize using the configured dictionaries, then load the
            # anonymized text that the processor reports.
            result = processor(
                pdf_path,
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,
                config_path=config_path,
            )
            anonymized_text = Path(result['text']).read_text(encoding='utf-8')
        except Exception as e:
            # Deliberately broad: one failing document must not abort the
            # whole corpus run; the failure is recorded and reported instead.
            print(f" ❌ Erreur: {e}")
            results.append({
                'file': pdf_path.name,
                'path': str(pdf_path),
                'error': str(e),
                'success': False,
            })
            continue

        context_leaks, chuxx_leaks = _scan_leaks(anonymized_text)
        total_leaks = len(context_leaks) + len(chuxx_leaks)
        # The original used the same empty string in both branches, so the
        # marker carried no information.
        status = "✅" if total_leaks == 0 else "❌"
        print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHUXX: {len(chuxx_leaks)}")
        if context_leaks:
            print(f" Exemples dates: {context_leaks[:3]}")
        if chuxx_leaks:
            print(f" Exemples CHUXX: {chuxx_leaks[:3]}")
        results.append({
            'file': pdf_path.name,
            'path': str(pdf_path),
            'context_leaks': len(context_leaks),
            'chuxx_leaks': len(chuxx_leaks),
            'success': total_leaks == 0,
        })
    elapsed_time = time.perf_counter() - start_time

    # Aggregate once; the same lines feed both the console and the report.
    success_count = sum(1 for r in results if r['success'])
    error_docs = [r for r in results if 'error' in r]
    failed_docs = [r for r in results if not r['success'] and 'error' not in r]
    error_count = len(error_docs)
    total_context_leaks = sum(r.get('context_leaks', 0) for r in results)
    total_chuxx_leaks = sum(r.get('chuxx_leaks', 0) for r in results)
    # len(results) == len(cro_files) > 0 here, so the divisions are safe.
    summary_lines = [
        f"Documents testés: {len(results)}",
        f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)",
        f"Erreurs: {error_count}",
        f"Fuites 'Né(e) le' totales: {total_context_leaks}",
        f"Fuites CHUXX totales: {total_chuxx_leaks}",
        f"Temps total: {elapsed_time:.1f}s ({elapsed_time/len(results):.1f}s/doc)",
    ]

    # Console summary.
    print("\n" + rule)
    print("RÉSUMÉ GLOBAL")
    print(rule)
    for line in summary_lines:
        print(line)

    # Documents that were processed but still contain leaks.
    if failed_docs:
        print("\n" + rule)
        print(f"DOCUMENTS AVEC FUITES ({len(failed_docs)})")
        print(rule)
        for doc in failed_docs:
            print(f"\n{doc['file']}")
            print(f" Path: {doc['path']}")
            print(f" Fuites dates: {doc.get('context_leaks', 0)}")
            print(f" Fuites CHUXX: {doc.get('chuxx_leaks', 0)}")

    # Documents whose processing raised.
    if error_docs:
        print("\n" + rule)
        print(f"DOCUMENTS EN ERREUR ({len(error_docs)})")
        print(rule)
        for doc in error_docs:
            print(f"\n{doc['file']}")
            print(f" Erreur: {doc['error']}")

    if success_count == len(results):
        print("\n✅ TOUS LES TESTS PASSENT - Propagation globale sélective fonctionne sur TOUS les CRO!")
    else:
        print(f"\n⚠️ {len(results) - success_count} documents ont encore des fuites ou erreurs")
    print(f"\n📁 Résultats dans: {output_dir}")

    # Persist the same summary next to the anonymized outputs.
    report_file = output_dir / "test_report.txt"
    _write_report(report_file, summary_lines, failed_docs, error_docs)
    print(f"📄 Rapport sauvegardé: {report_file}")
if __name__ == "__main__":
    # Script entry point: run the corpus-wide CRO leak check with its defaults.
    test_all_cro()