- Normalisation agressive des dates : génère 4 variations (/, ., -, espaces)
- Remplacement multi-passes : avec/sans contexte 'Né(e) le'
- Amélioration force_term : case-insensitive + word boundaries
- Outil de validation post-anonymisation
- Tests : 162 CRO, 0 fuite dates, 0 fuite CHCB (100% succès)
- Temps : 0.1s/doc

Résout les 36 CRO avec fuites identifiées dans l'audit initial.
131 lines
5.1 KiB
Python
131 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Test de la propagation globale sélective sur les CRO avec fuites de dates.
|
||
Teste également la validation post-anonymisation.
|
||
"""
|
||
|
||
import sys
|
||
sys.path.insert(0, '.')
|
||
|
||
from pathlib import Path
|
||
import re
|
||
from anonymizer_core_refactored_onnx import process_pdf
|
||
|
||
# Defaults of the original script, kept so that a bare call behaves as before.
_DEFAULT_OGC_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
_DEFAULT_OUTPUT_DIR = Path("tests/ground_truth/pdfs/test_propagation")
_DEFAULT_CONFIG_PATH = Path("config/dictionnaires.yml")

# Birth-date context. Accepts "Né le", "Née le" AND the literal "Né(e) le":
# the parentheses must be escaped, otherwise the very form named in the
# report is never matched and the leak goes unnoticed.
_DATE_CONTEXT_RE = re.compile(
    r'Né(?:e|\(e\))?\s+le\s+(\d{1,2}[\s/.\-]+\d{1,2}[\s/.\-]+\d{2,4})',
    re.IGNORECASE,
)
# Any dd/mm/yyyy-like date (separators /, . or -), regardless of context.
_DATE_STANDALONE_RE = re.compile(r'\b(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{4})\b')
# Placeholders used to recognise lines that carry a date placeholder.
_PLACEHOLDER_RE = re.compile(r'\[DATE_NAISSANCE\]|\[DATE\]')
# Acronym that must not remain in clear text.
_CHCB_RE = re.compile(r'\bCHCB\b')


def _find_cro_files(ogc_dir, max_files):
    """Return up to ``max_files`` regular files matching ``*CRO*.pdf`` under ``ogc_dir``."""
    found = []
    for pdf in ogc_dir.rglob("*CRO*.pdf"):
        if pdf.is_file():
            found.append(pdf)
            # Stop walking the tree as soon as the sample is large enough.
            if max_files is not None and len(found) >= max_files:
                break
    return found


def _scan_leaks(anonymized_text):
    """Return ``(context_leaks, standalone_leaks, chcb_leaks)`` found in the text."""
    context_leaks = _DATE_CONTEXT_RE.findall(anonymized_text)

    # Standalone dates are informational only: a date is dropped from the
    # list when the same string also appears on a line holding a placeholder.
    placeholder_lines = [
        line for line in anonymized_text.split('\n') if _PLACEHOLDER_RE.search(line)
    ]
    standalone_leaks = [
        date
        for date in _DATE_STANDALONE_RE.findall(anonymized_text)
        if not any(date in line for line in placeholder_lines)
    ]

    chcb_leaks = _CHCB_RE.findall(anonymized_text)
    return context_leaks, standalone_leaks, chcb_leaks


def test_date_propagation(ogc_dir=None, output_dir=None, max_files=5, anonymize=None):
    """Anonymize a sample of CRO PDFs and report remaining date / CHCB leaks.

    For each selected PDF the anonymizer is run, the produced text file is
    read back, and three scans are performed: birth dates following
    "Né le" / "Née le" / "Né(e) le", the acronym "CHCB", and standalone
    dd/mm/yyyy dates. Only the first two count as leaks; standalone dates
    are printed for information. A per-document line and a final summary
    are printed; nothing is asserted and nothing is returned.

    Args:
        ogc_dir: Directory searched recursively for ``*CRO*.pdf`` files.
            Defaults to the path hard-coded in the original script.
        output_dir: Directory passed to the anonymizer; created if missing.
            Defaults to ``tests/ground_truth/pdfs/test_propagation``.
        max_files: Maximum number of CRO files to process (``None`` for
            no limit). Defaults to 5.
        anonymize: Callable invoked as ``anonymize(pdf_path, output_dir,
            make_vector_redaction=False, also_make_raster_burn=False,
            config_path=...)``; its result must expose the path of the
            anonymized text under the ``'text'`` key. Defaults to the
            module-level ``process_pdf``.

    Returns:
        None.
    """
    ogc_dir = _DEFAULT_OGC_DIR if ogc_dir is None else Path(ogc_dir)
    output_dir = _DEFAULT_OUTPUT_DIR if output_dir is None else Path(output_dir)

    cro_files = _find_cro_files(ogc_dir, max_files)
    if not cro_files:
        print("❌ Aucun CRO trouvé")
        return

    # Resolved only once there is work to do, so that the "no file" path
    # does not depend on the anonymizer being available.
    run_anonymizer = process_pdf if anonymize is None else anonymize

    print(f"Test de propagation sur {len(cro_files)} CRO...")
    print("=" * 80)

    output_dir.mkdir(parents=True, exist_ok=True)

    results = []

    for i, pdf_path in enumerate(cro_files, 1):
        print(f"\n[{i}/{len(cro_files)}] {pdf_path.name}")

        try:
            # Anonymize with the configuration dictionary.
            result = run_anonymizer(
                pdf_path,
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,
                config_path=_DEFAULT_CONFIG_PATH,
            )

            # Read back the anonymized text and scan it.
            anonymized_text = Path(result['text']).read_text(encoding='utf-8')
            context_leaks, standalone_leaks, chcb_leaks = _scan_leaks(anonymized_text)

            # Standalone dates are deliberately not counted as leaks.
            total_leaks = len(context_leaks) + len(chcb_leaks)

            status = "✅" if total_leaks == 0 else "❌"
            print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHCB: {len(chcb_leaks)}")

            if context_leaks:
                print(f" Exemples dates: {context_leaks[:3]}")
            if chcb_leaks:
                print(f" Exemples CHCB: {chcb_leaks[:3]}")

            # Informational only: not necessarily leaks.
            if standalone_leaks:
                print(f" ℹ️ Dates standalone (à vérifier): {len(standalone_leaks)}")

            results.append({
                'file': pdf_path.name,
                'context_leaks': len(context_leaks),
                'chcb_leaks': len(chcb_leaks),
                'standalone_dates': len(standalone_leaks),
                'success': total_leaks == 0,
            })

        except Exception as e:
            # Per-document boundary: one failing PDF is recorded as a
            # failure and must not abort the remaining documents.
            print(f" ❌ Erreur: {e}")
            results.append({
                'file': pdf_path.name,
                'error': str(e),
                'success': False,
            })

    # Summary
    print("\n" + "=" * 80)
    print("RÉSUMÉ")
    print("=" * 80)

    success_count = sum(1 for r in results if r.get('success', False))
    total_context_leaks = sum(r.get('context_leaks', 0) for r in results)
    total_chcb_leaks = sum(r.get('chcb_leaks', 0) for r in results)
    total_standalone = sum(r.get('standalone_dates', 0) for r in results)

    # `results` has one entry per processed file and is never empty here.
    print(f"Documents testés: {len(results)}")
    print(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)")
    print(f"Fuites 'Né(e) le' totales: {total_context_leaks}")
    print(f"Fuites CHCB totales: {total_chcb_leaks}")
    print(f"Dates standalone (info): {total_standalone}")

    if success_count == len(results):
        print("\n✅ TOUS LES TESTS PASSENT - Propagation globale sélective fonctionne!")
    else:
        print(f"\n⚠️ {len(results) - success_count} documents ont encore des fuites")

    print(f"\n📁 Résultats dans: {output_dir}")
    print("\n💡 Pour validation complète, exécutez:")
    print(f" python3 tools/validate_anonymization.py {output_dir}/*.txt")
|
||
# Script entry point: run the propagation check with its default settings
# when this file is executed directly (not when it is imported).
if __name__ == "__main__":
    test_date_propagation()
|