#!/usr/bin/env python3
"""Analyse the outputs of an anonymisation run.

Prints a console report for one hard-coded test document: a summary of the
JSON Lines audit file, an excerpt of the pseudonymised text, and the result
of a leak scan on the redacted (raster) PDF.
"""

import json
from collections import Counter
from pathlib import Path
from typing import Any, Dict, List

from evaluation import LeakScanner

# Display limits used by the report.
_EXCERPT_CHARS = 200          # characters of pseudonymised text to preview
_EXCERPT_LINES = 5            # maximum preview lines printed
_ORIGINAL_PREVIEW_CHARS = 60  # truncation applied to each page-0 PII value
_MAX_LEAKS_SHOWN = 10         # leaks listed in detail before "... et N autres"

# Label printed for audit records that carry no "kind" field.
_MISSING_KIND_LABEL = "INCONNU"


def _load_audit(audit_path: Path) -> List[Dict[str, Any]]:
    """Parse a JSON Lines audit file, ignoring blank lines.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(audit_path, "r", encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def _kind_of(entry: Dict[str, Any]) -> str:
    """Return the record's ``kind`` as text, or a placeholder when absent."""
    return str(entry.get("kind", _MISSING_KIND_LABEL))


def _original_of(entry: Dict[str, Any]) -> str:
    """Return the record's ``original`` value, or ``""`` if missing/non-text."""
    original = entry.get("original")
    return original if isinstance(original, str) else ""


def _print_audit_summary(audit_path: Path) -> None:
    """Print totals, per-kind counts, page-0 PII and globally propagated names."""
    print("\n📊 ANALYSE DE L'AUDIT")
    print(f" Fichier: {audit_path.name}")

    entries = _load_audit(audit_path)
    print(f"\n Total PII détectés: {len(entries)}")

    # Count per kind; most_common() sorts by descending count and keeps
    # first-seen order among ties.
    kind_counts = Counter(_kind_of(entry) for entry in entries)
    print("\n Répartition par type:")
    for kind, count in kind_counts.most_common():
        print(f" {kind:20s} : {count:3d}")

    # PII found on page 0 only.
    first_page = [entry for entry in entries if entry.get("page") == 0]
    if first_page:
        print("\n PII détectés sur la page principale:")
        for entry in first_page:
            preview = _original_of(entry)[:_ORIGINAL_PREVIEW_CHARS]
            print(f" • {_kind_of(entry):20s} : {preview}")

    # Extracted names (global propagation): one counting pass instead of
    # re-scanning the list for every unique name. Records without a textual
    # "original" are skipped rather than aborting the report.
    name_counts = Counter(
        entry["original"]
        for entry in entries
        if entry.get("kind") == "NOM_EXTRACTED"
        and isinstance(entry.get("original"), str)
    )
    if name_counts:
        print(f"\n Noms propagés globalement ({len(name_counts)} uniques):")
        for name in sorted(name_counts):
            print(f" • {name:20s} : {name_counts[name]} occurrences")


def _print_text_excerpt(text_path: Path) -> None:
    """Print the first lines of the pseudonymised text between two rules."""
    print("\n📝 TEXTE ANONYMISÉ")
    print(f" Fichier: {text_path.name}")

    # Only the excerpt is needed, so avoid loading the whole file.
    with open(text_path, "r", encoding="utf-8") as handle:
        excerpt = handle.read(_EXCERPT_CHARS)

    rule = " " + "-" * 76
    print(f"\n Extrait ({_EXCERPT_CHARS} premiers caractères):")
    print(rule)
    for line in excerpt.split("\n")[:_EXCERPT_LINES]:
        print(f" {line}")
    print(rule)


def _print_leak_scan(redacted_pdf: Path, audit_path: Path) -> None:
    """Run ``LeakScanner`` on the redacted PDF and print its report.

    Relies on the returned report exposing ``is_safe``, ``leak_count``,
    ``severity_counts`` and ``leaks`` (mappings with ``severity``, ``type``
    and ``message`` keys), as used below.
    """
    print("\n🔒 SCAN DE FUITE")
    print(f" PDF anonymisé: {redacted_pdf.name}")

    report = LeakScanner().scan(redacted_pdf, audit_path)

    if report.is_safe:
        print("\n ✓ DOCUMENT SÛR")
        print(" Aucune fuite détectée")
        return

    print(f"\n ✗ ATTENTION - {report.leak_count} fuite(s)")

    # Breakdown by severity.
    print("\n Fuites par sévérité:")
    for severity, count in sorted(report.severity_counts.items()):
        print(f" {severity:10s} : {count}")

    # Details, capped to keep the report readable.
    print("\n Détails des fuites:")
    for index, leak in enumerate(report.leaks[:_MAX_LEAKS_SHOWN], 1):
        print(f" {index}. [{leak['severity']}] {leak['type']}")
        print(f" {leak['message']}")
    if report.leak_count > _MAX_LEAKS_SHOWN:
        print(f" ... et {report.leak_count - _MAX_LEAKS_SHOWN} autres")


def main() -> None:
    """Print the anonymisation report for the hard-coded test document.

    Each section is printed only if the file it depends on exists; the
    closing list of file names is printed unconditionally.
    """
    # Generated files.
    base_name = "003_simple_compte_rendu_CRO_23155084"
    output_dir = Path("tests/ground_truth/pdfs/anonymized_test")

    audit_path = output_dir / f"{base_name}.audit.jsonl"
    redacted_pdf = output_dir / f"{base_name}.redacted_raster.pdf"
    text_path = output_dir / f"{base_name}.pseudonymise.txt"

    banner = "=" * 80
    print(banner)
    print("ANALYSE DES RÉSULTATS D'ANONYMISATION")
    print(banner)

    print(f"\n📄 Document: {base_name}.pdf")
    print(" Type: Compte-rendu opératoire (CRO)")

    if audit_path.exists():
        _print_audit_summary(audit_path)

    if text_path.exists():
        _print_text_excerpt(text_path)

    # The scan needs both the redacted PDF and the audit file.
    if redacted_pdf.exists() and audit_path.exists():
        _print_leak_scan(redacted_pdf, audit_path)

    print("\n" + banner)
    print("✨ Analyse terminée")
    print(banner)

    print("\n💡 Fichiers disponibles:")
    print(f" - PDF anonymisé (raster): {redacted_pdf.name}")
    print(f" - PDF anonymisé (vector): {base_name}.redacted_vector.pdf")
    print(f" - Texte anonymisé: {text_path.name}")
    print(f" - Audit complet: {audit_path.name}")

    print(f"\n📂 Répertoire: {output_dir}")
    print("\n🔍 Pour voir le PDF:")
    print(f" xdg-open {redacted_pdf}")


if __name__ == "__main__":
    main()