- Test sur 003_simple_compte_rendu_CRO_23155084.pdf
- 25 PII détectés (4 sur page principale + propagation globale)
- Types : NOM, ADRESSE, CODE_POSTAL, DATE_NAISSANCE
- Validation : AUCUNE FUITE détectée ✓
- Scripts d'analyse : analyze_anonymization_result.py, demo_complete_anonymization.py
- Résultats dans tests/ground_truth/pdfs/anonymized_test/
123 lines
4.4 KiB
Python
123 lines
4.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Analyse des résultats d'anonymisation.
|
|
"""
|
|
import json
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
from evaluation import LeakScanner
|
|
|
|
def _load_audit_records(audit_path):
    """Return the records of a JSONL audit file as a list.

    Blank (whitespace-only) lines are skipped; every other line is parsed
    with ``json.loads``, so a malformed line raises ``json.JSONDecodeError``.
    """
    with open(audit_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def _print_audit_summary(audit_path):
    """Print totals, per-type counts, page-0 entries and propagated names."""
    print("\n📊 ANALYSE DE L'AUDIT")
    print(f" Fichier: {audit_path.name}")

    pii_list = _load_audit_records(audit_path)
    print(f"\n Total PII détectés: {len(pii_list)}")

    # Count by type. A record without a 'kind' key is grouped under a
    # placeholder label instead of aborting the whole report.
    type_counts = Counter(pii.get('kind', 'INCONNU') for pii in pii_list)

    print("\n Répartition par type:")
    # most_common() sorts by descending count and keeps first-seen order for
    # ties, i.e. the same order as a stable sort on the negated count.
    for pii_type, count in type_counts.most_common():
        print(f" {pii_type:20s} : {count:3d}")

    # Show the entries recorded for page 0 only.
    page0_pii = [p for p in pii_list if p.get('page') == 0]
    if page0_pii:
        print("\n PII détectés sur la page principale:")
        for pii in page0_pii:
            # Truncate long originals so each entry stays on one line.
            original = pii.get('original', '')[:60]
            print(f" • {pii.get('kind', 'INCONNU'):20s} : {original}")

    # Show the extracted names (kind == 'NOM_EXTRACTED') with how often each
    # one occurs; a single Counter pass replaces a rescan per unique name.
    name_counts = Counter(
        p.get('original', '')
        for p in pii_list
        if p.get('kind') == 'NOM_EXTRACTED'
    )
    if name_counts:
        print(f"\n Noms propagés globalement ({len(name_counts)} uniques):")
        for name in sorted(name_counts):
            print(f" • {name:20s} : {name_counts[name]} occurrences")


def _print_text_excerpt(text_path):
    """Print the first lines found in the first 200 characters of the text."""
    print("\n📝 TEXTE ANONYMISÉ")
    print(f" Fichier: {text_path.name}")

    # Only the first 200 characters are displayed, so only those are read.
    with open(text_path, 'r', encoding='utf-8') as f:
        excerpt = f.read(200)

    print("\n Extrait (200 premiers caractères):")
    print(" " + "-"*76)
    for line in excerpt.split('\n')[:5]:
        print(f" {line}")
    print(" " + "-"*76)


def _print_leak_scan(redacted_pdf, audit_path):
    """Run ``LeakScanner().scan(redacted_pdf, audit_path)`` and print the report.

    The report object is only read through its ``is_safe``, ``leak_count``,
    ``severity_counts`` and ``leaks`` attributes.
    """
    max_shown = 10  # number of individual leaks listed before summarising

    print("\n🔒 SCAN DE FUITE")
    print(f" PDF anonymisé: {redacted_pdf.name}")

    scanner = LeakScanner()
    leak_report = scanner.scan(redacted_pdf, audit_path)

    if leak_report.is_safe:
        print("\n ✓ DOCUMENT SÛR")
        print(" Aucune fuite détectée")
        return

    print(f"\n ✗ ATTENTION - {leak_report.leak_count} fuite(s)")

    # By severity
    print("\n Fuites par sévérité:")
    for severity, count in sorted(leak_report.severity_counts.items()):
        print(f" {severity:10s} : {count}")

    # Details
    print("\n Détails des fuites:")
    for i, leak in enumerate(leak_report.leaks[:max_shown], 1):
        print(f" {i}. [{leak['severity']}] {leak['type']}")
        print(f" {leak['message']}")

    if leak_report.leak_count > max_shown:
        print(f" ... et {leak_report.leak_count - max_shown} autres")


def main():
    """Print an analysis report for one anonymised document.

    The document base name and output directory are fixed in the body.
    Each section is printed only when the file it needs exists: the audit
    summary needs the ``.audit.jsonl`` file, the text excerpt needs the
    ``.pseudonymise.txt`` file, and the leak scan needs both the
    ``.redacted_raster.pdf`` and the audit file. The closing list of file
    names is always printed.
    """
    # Generated files
    base_name = "003_simple_compte_rendu_CRO_23155084"
    output_dir = Path("tests/ground_truth/pdfs/anonymized_test")

    audit_path = output_dir / f"{base_name}.audit.jsonl"
    redacted_pdf = output_dir / f"{base_name}.redacted_raster.pdf"
    text_path = output_dir / f"{base_name}.pseudonymise.txt"

    print("="*80)
    print("ANALYSE DES RÉSULTATS D'ANONYMISATION")
    print("="*80)
    print(f"\n📄 Document: {base_name}.pdf")
    print(" Type: Compte-rendu opératoire (CRO)")

    # Analyse the audit
    if audit_path.exists():
        _print_audit_summary(audit_path)

    # Show the anonymised text
    if text_path.exists():
        _print_text_excerpt(text_path)

    # Scan for leaks
    if redacted_pdf.exists() and audit_path.exists():
        _print_leak_scan(redacted_pdf, audit_path)

    print("\n" + "="*80)
    print("✨ Analyse terminée")
    print("="*80)

    print("\n💡 Fichiers disponibles:")
    print(f" - PDF anonymisé (raster): {redacted_pdf.name}")
    print(f" - PDF anonymisé (vector): {base_name}.redacted_vector.pdf")
    print(f" - Texte anonymisé: {text_path.name}")
    print(f" - Audit complet: {audit_path.name}")

    print(f"\n📂 Répertoire: {output_dir}")

    print("\n🔍 Pour voir le PDF:")
    print(f" xdg-open {redacted_pdf}")
|
|
|
|
# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()
|