Files
anonymisation/tools/test_date_propagation.py
Domi31tls f92da4d54e fix: Propagation globale sélective v2 - Normalisation dates + Multi-pass
- Normalisation agressive des dates : génère 4 variations (/, ., -, espaces)
- Remplacement multi-pass : avec/sans contexte 'Né(e) le'
- Amélioration force_term : case-insensitive + word boundaries
- Outil de validation post-anonymisation
- Tests : 162 CRO, 0 fuite dates, 0 fuite CHCB (100% succès)
- Temps: 0.1s/doc

Résout les 36 CRO avec fuites identifiées dans l'audit initial.
2026-03-02 12:22:58 +01:00

131 lines
5.1 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Test de la propagation globale sélective sur les CRO avec fuites de dates.
Teste également la validation post-anonymisation.
"""
import sys
sys.path.insert(0, '.')
from pathlib import Path
import re
from anonymizer_core_refactored_onnx import process_pdf
def test_date_propagation():
"""Test la propagation des dates de naissance sur un CRO."""
# Chercher un CRO dans les 59 OGC
ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
# Trouver un CRO (compte rendu opératoire)
cro_files = []
for pdf in ogc_dir.rglob("*CRO*.pdf"):
if pdf.is_file():
cro_files.append(pdf)
if len(cro_files) >= 5: # Tester sur 5 CRO (augmenté de 3 à 5)
break
if not cro_files:
print("❌ Aucun CRO trouvé")
return
print(f"Test de propagation sur {len(cro_files)} CRO...")
print("=" * 80)
output_dir = Path("tests/ground_truth/pdfs/test_propagation")
output_dir.mkdir(parents=True, exist_ok=True)
results = []
for i, pdf_path in enumerate(cro_files, 1):
print(f"\n[{i}/{len(cro_files)}] {pdf_path.name}")
try:
# Anonymiser avec le dictionnaire de configuration
result = process_pdf(
pdf_path,
output_dir,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml")
)
# Lire le texte anonymisé
text_file = Path(result['text'])
anonymized_text = text_file.read_text(encoding='utf-8')
# Scanner les fuites de dates avec contexte "Né(e) le"
date_context_pattern = re.compile(r'Né(?:e)?\s+le\s+(\d{1,2}[\s/.\-]+\d{1,2}[\s/.\-]+\d{2,4})', re.IGNORECASE)
context_leaks = date_context_pattern.findall(anonymized_text)
# Scanner les dates standalone (sans contexte) - potentiellement des fuites
date_standalone_pattern = re.compile(r'\b(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{4})\b')
standalone_dates = date_standalone_pattern.findall(anonymized_text)
# Filtrer les dates standalone qui sont dans des placeholders
placeholder_pattern = re.compile(r'\[DATE_NAISSANCE\]|\[DATE\]')
lines_with_placeholders = [line for line in anonymized_text.split('\n') if placeholder_pattern.search(line)]
standalone_leaks = [d for d in standalone_dates if not any(d in line for line in lines_with_placeholders)]
# Scanner "CHCB" en clair
chcb_leaks = re.findall(r'\bCHCB\b', anonymized_text)
# Compter les fuites totales
total_leaks = len(context_leaks) + len(chcb_leaks)
status = "" if total_leaks == 0 else ""
print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHCB: {len(chcb_leaks)}")
if context_leaks:
print(f" Exemples dates: {context_leaks[:3]}")
if chcb_leaks:
print(f" Exemples CHCB: {chcb_leaks[:3]}")
# Info : dates standalone (pas nécessairement des fuites)
if standalone_leaks:
print(f" Dates standalone (à vérifier): {len(standalone_leaks)}")
results.append({
'file': pdf_path.name,
'context_leaks': len(context_leaks),
'chcb_leaks': len(chcb_leaks),
'standalone_dates': len(standalone_leaks),
'success': total_leaks == 0
})
except Exception as e:
print(f" ❌ Erreur: {e}")
results.append({
'file': pdf_path.name,
'error': str(e),
'success': False
})
# Résumé
print("\n" + "=" * 80)
print("RÉSUMÉ")
print("=" * 80)
success_count = sum(1 for r in results if r.get('success', False))
total_context_leaks = sum(r.get('context_leaks', 0) for r in results)
total_chcb_leaks = sum(r.get('chcb_leaks', 0) for r in results)
total_standalone = sum(r.get('standalone_dates', 0) for r in results)
print(f"Documents testés: {len(results)}")
print(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)")
print(f"Fuites 'Né(e) le' totales: {total_context_leaks}")
print(f"Fuites CHCB totales: {total_chcb_leaks}")
print(f"Dates standalone (info): {total_standalone}")
if success_count == len(results):
print("\n✅ TOUS LES TESTS PASSENT - Propagation globale sélective fonctionne!")
else:
print(f"\n⚠️ {len(results) - success_count} documents ont encore des fuites")
print(f"\n📁 Résultats dans: {output_dir}")
print("\n💡 Pour validation complète, exécutez:")
print(f" python3 tools/validate_anonymization.py {output_dir}/*.txt")
if __name__ == "__main__":
test_date_propagation()