# =============================================================================
# diff --git a/analyze_anonymization_result.py b/analyze_anonymization_result.py
# new file mode 100644
# =============================================================================
#!/usr/bin/env python3
"""Analyse the artefacts produced by one anonymization run."""
import json
from collections import Counter
from pathlib import Path


def main(
    base_name="003_simple_compte_rendu_CRO_23155084",
    output_dir="tests/ground_truth/pdfs/anonymized_test",
):
    """Print a console report about one anonymized document.

    The report covers, when the corresponding file exists in ``output_dir``:
    the audit trail (``<base_name>.audit.jsonl``), the pseudonymised text
    (``<base_name>.pseudonymise.txt``) and a leak scan of the raster PDF
    (``<base_name>.redacted_raster.pdf``).

    Args:
        base_name: File stem shared by all artefacts of the document.
        output_dir: Directory (``str`` or ``Path``) that holds the artefacts.

    Returns:
        None. Everything is written to standard output.
    """
    output_dir = Path(output_dir)
    audit_path = output_dir / f"{base_name}.audit.jsonl"
    redacted_pdf = output_dir / f"{base_name}.redacted_raster.pdf"
    text_path = output_dir / f"{base_name}.pseudonymise.txt"

    print("=" * 80)
    print("ANALYSE DES RÉSULTATS D'ANONYMISATION")
    print("=" * 80)
    print(f"\n📄 Document: {base_name}.pdf")
    print(" Type: Compte-rendu opératoire (CRO)")

    # --- Audit trail --------------------------------------------------------
    if audit_path.exists():
        print("\n📊 ANALYSE DE L'AUDIT")
        print(f" Fichier: {audit_path.name}")

        # One JSON object per non-blank line.
        with open(audit_path, encoding="utf-8") as f:
            pii_list = [json.loads(line) for line in f if line.strip()]

        print(f"\n Total PII détectés: {len(pii_list)}")

        # .get() keeps the report alive when a record has no "kind" field.
        type_counts = Counter(pii.get("kind", "UNKNOWN") for pii in pii_list)

        print("\n Répartition par type:")
        for pii_type, count in type_counts.most_common():
            print(f" {pii_type:20s} : {count:3d}")

        # Only entries attached to page 0 are listed individually.
        page0_pii = [p for p in pii_list if p.get("page") == 0]
        if page0_pii:
            print("\n PII détectés sur la page principale:")
            for pii in page0_pii:
                original = str(pii.get("original") or "")[:60]
                print(f" • {pii.get('kind', 'UNKNOWN'):20s} : {original}")

        # A single Counter pass replaces one full rescan per unique name.
        name_counts = Counter(
            str(p.get("original") or "")
            for p in pii_list
            if p.get("kind") == "NOM_EXTRACTED"
        )
        if name_counts:
            print(f"\n Noms propagés globalement ({len(name_counts)} uniques):")
            for name in sorted(name_counts):
                print(f" • {name:20s} : {name_counts[name]} occurrences")

    # --- Pseudonymised text -------------------------------------------------
    if text_path.exists():
        print("\n📝 TEXTE ANONYMISÉ")
        print(f" Fichier: {text_path.name}")

        # errors="replace": the text artefact is not guaranteed to be clean
        # UTF-8 (version control reports the sample file as binary), and a
        # preview must not abort the whole report.
        with open(text_path, encoding="utf-8", errors="replace") as f:
            excerpt = f.read(200)

        print("\n Extrait (200 premiers caractères):")
        print(" " + "-" * 76)
        for line in excerpt.split("\n")[:5]:
            print(f" {line}")
        print(" " + "-" * 76)

    # --- Leak scan ----------------------------------------------------------
    if redacted_pdf.exists() and audit_path.exists():
        # Imported at the point of use so that the audit/text report above
        # does not require the evaluation package.
        from evaluation import LeakScanner

        print("\n🔒 SCAN DE FUITE")
        print(f" PDF anonymisé: {redacted_pdf.name}")

        leak_report = LeakScanner().scan(redacted_pdf, audit_path)

        if leak_report.is_safe:
            print("\n ✓ DOCUMENT SÛR")
            print(" Aucune fuite détectée")
        else:
            print(f"\n ✗ ATTENTION - {leak_report.leak_count} fuite(s)")

            print("\n Fuites par sévérité:")
            for severity, count in sorted(leak_report.severity_counts.items()):
                print(f" {severity:10s} : {count}")

            # Show at most ten individual leaks, then summarise the rest.
            print("\n Détails des fuites:")
            for i, leak in enumerate(leak_report.leaks[:10], 1):
                print(f" {i}. [{leak['severity']}] {leak['type']}")
                print(f" {leak['message']}")

            if leak_report.leak_count > 10:
                print(f" ... et {leak_report.leak_count - 10} autres")

    print("\n" + "=" * 80)
    print("✨ Analyse terminée")
    print("=" * 80)

    print("\n💡 Fichiers disponibles:")
    print(f" - PDF anonymisé (raster): {redacted_pdf.name}")
    print(f" - PDF anonymisé (vector): {base_name}.redacted_vector.pdf")
    print(f" - Texte anonymisé: {text_path.name}")
    print(f" - Audit complet: {audit_path.name}")

    print(f"\n📂 Répertoire: {output_dir}")

    print("\n🔍 Pour voir le PDF:")
    print(f" xdg-open {redacted_pdf}")


if __name__ == "__main__":
    main()


# =============================================================================
# diff --git a/demo_complete_anonymization.py b/demo_complete_anonymization.py
# new file mode 100644
# =============================================================================
#!/usr/bin/env python3
"""Full demonstration: anonymization followed by an analysis of the outputs."""
import json
from collections import Counter
from pathlib import Path


def show_comparison(
    base_name="003_simple_compte_rendu_CRO_23155084",
    pdf_dir="tests/ground_truth/pdfs",
    output_dir=None,
):
    """Print a before/after comparison for one anonymized document.

    Each section is printed only when its input file exists; the final summary
    reflects what was actually verified instead of assuming success.

    Args:
        base_name: File stem shared by the source PDF and its artefacts.
        pdf_dir: Directory (``str`` or ``Path``) containing ``<base_name>.pdf``.
        output_dir: Directory holding the anonymization artefacts. Defaults to
            ``<pdf_dir>/anonymized_test``.

    Returns:
        None. Everything is written to standard output.
    """
    pdf_dir = Path(pdf_dir)
    output_dir = pdf_dir / "anonymized_test" if output_dir is None else Path(output_dir)

    original_pdf = pdf_dir / f"{base_name}.pdf"
    audit_path = output_dir / f"{base_name}.audit.jsonl"
    redacted_pdf = output_dir / f"{base_name}.redacted_raster.pdf"
    text_path = output_dir / f"{base_name}.pseudonymise.txt"

    print("\n" + "=" * 80)
    print("DÉMONSTRATION COMPLÈTE : ANONYMISATION D'UN DOCUMENT RÉEL")
    print("=" * 80)

    print("\n📄 DOCUMENT TRAITÉ")
    print(f" Original: {original_pdf.name}")
    print(" Type: Compte-rendu opératoire (CRO)")
    print(" Complexité: Simple (1 page)")

    # --- Original text preview (best effort) --------------------------------
    # Broad catch on purpose: a missing PyMuPDF install, a missing PDF or an
    # empty document must only skip the preview, not stop the demonstration.
    try:
        import fitz

        doc = fitz.open(original_pdf)
        try:
            original_text = doc[0].get_text()
        finally:
            # Release the document even when text extraction fails.
            doc.close()

        print("\n📝 TEXTE ORIGINAL (extrait):")
        print(" " + "-" * 76)
        for line in original_text.split("\n")[:8]:
            if line.strip():
                print(f" {line[:76]}")
        print(" " + "-" * 76)
    except Exception as e:
        print(f" ⚠ Impossible d'extraire le texte: {e}")

    # --- Audit trail --------------------------------------------------------
    # Bound up-front: the summary below reads these even without an audit.
    pii_list = []
    page0_pii = []
    if audit_path.exists():
        print("\n🔍 PII DÉTECTÉS PAR LE SYSTÈME")

        with open(audit_path, encoding="utf-8") as f:
            pii_list = [json.loads(line) for line in f if line.strip()]

        page0_pii = [p for p in pii_list if p.get("page") == 0]

        print(f"\n Sur la page principale ({len(page0_pii)} PII):")
        for pii in page0_pii:
            kind = pii.get("kind", "UNKNOWN")
            original = pii.get("original", "")
            print(f" ✓ {kind:20s} : {original}")

        # Entries produced by the propagation passes (…_EXTRACTED / …_GLOBAL).
        extracted = [
            p
            for p in pii_list
            if "EXTRACTED" in p.get("kind", "") or "GLOBAL" in p.get("kind", "")
        ]
        if extracted:
            unique_names = {
                str(p.get("original") or "")
                for p in extracted
                if "NOM" in p.get("kind", "")
            }
            print(f"\n Noms propagés sur tout le document ({len(unique_names)} uniques):")
            for name in sorted(unique_names):
                print(f" → {name}")

        type_counts = Counter(pii.get("kind", "UNKNOWN") for pii in pii_list)
        print("\n 📊 STATISTIQUES:")
        print(f" Total PII: {len(pii_list)}")
        print(f" Types différents: {len(type_counts)}")

        print("\n Top 3 des types:")
        for pii_type, count in type_counts.most_common(3):
            print(f" {pii_type:20s} : {count}")

    # --- Pseudonymised text preview -----------------------------------------
    if text_path.exists():
        # errors="replace": a preview must survive undecodable bytes.
        with open(text_path, encoding="utf-8", errors="replace") as f:
            anon_text = f.read()

        print("\n📝 TEXTE ANONYMISÉ (extrait):")
        print(" " + "-" * 76)
        for line in anon_text.split("\n")[:8]:
            if line.strip():
                print(f" {line[:76]}")
        print(" " + "-" * 76)

    # --- Leak scan ----------------------------------------------------------
    leak_report = None  # stays None when the scan could not be run
    if redacted_pdf.exists() and audit_path.exists():
        # Imported at the point of use so the sections above do not require
        # the evaluation package.
        from evaluation import LeakScanner

        print("\n🔒 VALIDATION DE SÉCURITÉ")

        leak_report = LeakScanner().scan(redacted_pdf, audit_path)

        if leak_report.is_safe:
            print(" ✅ DOCUMENT SÛR")
            print(" Aucune fuite de PII détectée")
            print(" Le document peut être diffusé en toute sécurité")
        else:
            print(f" ⚠️ ATTENTION - {leak_report.leak_count} fuite(s)")
            for severity, count in leak_report.severity_counts.items():
                print(f" {severity}: {count}")

    # --- Summary ------------------------------------------------------------
    print("\n" + "=" * 80)
    print("✨ RÉSUMÉ")
    print("=" * 80)

    # The success claims are printed only when the leak scan actually ran and
    # reported a safe document; otherwise the summary says what is missing.
    if leak_report is None:
        print("\n⚠ Validation de sécurité : NON EFFECTUÉE (PDF anonymisé ou audit introuvable)")
        if pii_list:
            print(f"✓ {len(page0_pii)} PII détectés sur la page principale")
    elif leak_report.is_safe:
        print("\n✓ Document anonymisé avec succès")
        print(f"✓ {len(page0_pii)} PII détectés et masqués")
        print("✓ Propagation globale des noms sur tout le document")
        print("✓ Validation de sécurité : AUCUNE FUITE")
    else:
        print(f"\n✗ Validation de sécurité : {leak_report.leak_count} fuite(s) détectée(s)")
        print("✗ Le document ne doit pas être diffusé en l'état")

    print("\n📂 Fichiers générés:")
    print(f" • PDF anonymisé (raster): {redacted_pdf.name}")
    print(f" • PDF anonymisé (vector): {base_name}.redacted_vector.pdf")
    print(f" • Texte anonymisé: {base_name}.pseudonymise.txt")
    print(f" • Audit détaillé: {base_name}.audit.jsonl")

    print(f"\n💡 Répertoire: {output_dir}")

    print("\n🎯 PROCHAINES ÉTAPES:")
    print(" 1. Annoter manuellement ce document")
    print(" 2. Comparer avec l'évaluateur de qualité")
    print(" 3. Calculer Précision, Rappel, F1-Score")
    print(" 4. Identifier les améliorations possibles")

    print("\n" + "=" * 80)


if __name__ == "__main__":
    show_comparison()


# =============================================================================
# Verbatim residue of the next file header that shares this line of the dump:
# diff --git a/test_anonymization_example.py b/test_anonymization_example.py
# new file mode 100755
# +#!/usr/bin/env python3
# +"""
# +Test d'anonymisation sur un document réel avec analyse des résultats.
# =============================================================================
# (closes the module docstring that opens on the previous line of this dump)
import json
import sys
import traceback
from pathlib import Path


def _print_audit_preview(audit_path, limit=10):
    """Print the number of audit records and the first ``limit`` of them."""
    # Read once; blank lines are dropped so numbering follows the records.
    with open(audit_path, encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    print(f"\n🔍 PII détectés: {len(records)}")

    print("\n📋 Premiers PII détectés:")
    for i, pii in enumerate(records[:limit], 1):
        original = str(pii.get("original") or "")[:50]
        print(f" {i}. {pii.get('kind', 'UNKNOWN'):15s} : {original}")

    if len(records) > limit:
        print(f" ... et {len(records) - limit} autres")


def main(
    pdf_path="tests/ground_truth/pdfs/003_simple_compte_rendu_CRO_23155084.pdf",
    output_dir="tests/ground_truth/pdfs/anonymized_test",
):
    """Anonymize one PDF, preview its audit trail and scan the result for leaks.

    Args:
        pdf_path: Source PDF (``str`` or ``Path``).
        output_dir: Directory (``str`` or ``Path``) receiving the artefacts;
            created with its parents when missing.

    Returns:
        Process exit code: ``0`` only when the anonymized PDF was produced and
        the leak scan reported it safe; ``1`` when the input is missing, the
        anonymization fails or raises, no redacted PDF is found, or leaks are
        detected.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        print(f"✗ Document introuvable: {pdf_path}")
        return 1

    # Project imports happen after the input check, so a missing document is
    # reported without loading the anonymizer (presumably costly: the module
    # name suggests ONNX models, TODO confirm).
    from anonymizer_core_refactored_onnx import anonymize_pdf_file
    from evaluation import LeakScanner

    print("=" * 80)
    print("TEST D'ANONYMISATION SUR UN DOCUMENT RÉEL")
    print("=" * 80)
    print(f"\n📄 Document: {pdf_path.name}")
    print(" Type: Compte-rendu opératoire (CRO)")
    print(" Complexité: Simple (1 page)")

    output_dir = Path(output_dir)
    # parents=True: the directory may be nested in folders that do not exist.
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n📁 Répertoire de sortie: {output_dir}")

    print("\n🔄 Anonymisation en cours...")
    print(" (Cela peut prendre quelques secondes)")

    # Top-level boundary of the script: any failure is reported and turned
    # into exit code 1.
    try:
        result = anonymize_pdf_file(
            pdf_path=str(pdf_path),
            output_dir=str(output_dir),
            use_ner=True,  # enable NER
            use_vlm=False,  # VLM disabled for this test (faster)
            raster_dpi=150,
            force_raster=False,
        )

        print("\n✓ Anonymisation terminée !")

        if not result:
            print("\n✗ Erreur lors de l'anonymisation")
            return 1

        audit_path = output_dir / f"{pdf_path.stem}.audit.jsonl"
        redacted_pdf = output_dir / f"{pdf_path.stem}.redacted_raster.pdf"
        # Fall back to the vector output when no raster PDF was written.
        if not redacted_pdf.exists():
            redacted_pdf = output_dir / f"{pdf_path.stem}.redacted_vector.pdf"

        print("\n📊 Fichiers générés:")
        print(f" - PDF anonymisé: {redacted_pdf.name}")
        print(f" - Audit: {audit_path.name}")

        if audit_path.exists():
            _print_audit_preview(audit_path)

        print("\n🔒 Scan de fuite en cours...")
        scanner = LeakScanner()

        # Only a completed scan that reports a safe document counts as success.
        is_safe = False
        if redacted_pdf.exists():
            leak_report = scanner.scan(redacted_pdf, audit_path)
            is_safe = bool(leak_report.is_safe)

            if is_safe:
                print(" ✓ DOCUMENT SÛR - Aucune fuite détectée")
            else:
                print(f" ✗ ATTENTION - {leak_report.leak_count} fuite(s) détectée(s)")

                for severity, count in sorted(leak_report.severity_counts.items()):
                    print(f" - {severity}: {count}")

                print("\n Détails des fuites:")
                for i, leak in enumerate(leak_report.leaks[:5], 1):
                    print(f" {i}. [{leak['severity']}] {leak['message']}")

                if leak_report.leak_count > 5:
                    print(f" ... et {leak_report.leak_count - 5} autres")
        else:
            print(" ⚠ PDF anonymisé introuvable, impossible de scanner")

        print("\n" + "=" * 80)
        if is_safe:
            print("✨ Test terminé avec succès !")
        else:
            print("✗ Test terminé : document non validé")
        print("=" * 80)

        print(f"\n📂 Fichiers disponibles dans: {output_dir}")
        print("\n💡 Pour voir le PDF anonymisé:")
        print(f" xdg-open {redacted_pdf}")

        return 0 if is_safe else 1

    except Exception as e:
        print(f"\n✗ Erreur: {e}")
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())


# =============================================================================
# Verbatim residue of the next file header that shares this line of the dump:
# diff --git a/tests/ground_truth/pdfs/anonymized_test/003_simple_compte_rendu_CRO_23155084.audit.jsonl b/tests/ground_truth/pdfs/anonymized_test/003_simple_compte_rendu_CRO_23155084.audit.jsonl
# new file mode 100644
#
# NOTE(review): the audit records that follow keep the un-redacted `original`
# value of every detected item next to its placeholder. If the source document
# is real, this file should not be committed alongside the anonymized outputs.
#
# First record (continues on the following line of the dump):
# {"page": 0, "kind": "NOM", "original": "GASTON GILLES", "placeholder": "[NOM]", "bbox_hint":
# =============================================================================
null} +{"page": 0, "kind": "ADRESSE", "original": "10 RUE DES HAUTRS VENTS", "placeholder": "[ADRESSE]", "bbox_hint": null} +{"page": 0, "kind": "CODE_POSTAL", "original": "14190 OUILLY LE TESSON", "placeholder": "[CODE_POSTAL]", "bbox_hint": null} +{"page": 0, "kind": "DATE_NAISSANCE", "original": "Né le 02/04/2010", "placeholder": "[DATE_NAISSANCE]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GONTIER", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GONTIER", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "QUEANT", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_EXTRACTED", "original": "QUEANT", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_GLOBAL", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_GLOBAL", "original": "GONTIER", 
"placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_GLOBAL", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "NOM_GLOBAL", "original": "QUEANT", "placeholder": "[NOM]", "bbox_hint": null} +{"page": -1, "kind": "ADRESSE_GLOBAL", "original": "10 RUE DES HAUTRS VENTS", "placeholder": "[ADRESSE]", "bbox_hint": null} +{"page": -1, "kind": "CODE_POSTAL_GLOBAL", "original": "14190 OUILLY LE TESSON", "placeholder": "[CODE_POSTAL]", "bbox_hint": null} +{"page": -1, "kind": "DATE_NAISSANCE_GLOBAL", "original": "Né le 02/04/2010", "placeholder": "[DATE_NAISSANCE]", "bbox_hint": null} diff --git a/tests/ground_truth/pdfs/anonymized_test/003_simple_compte_rendu_CRO_23155084.pseudonymise.txt b/tests/ground_truth/pdfs/anonymized_test/003_simple_compte_rendu_CRO_23155084.pseudonymise.txt new file mode 100644 index 0000000..a9d07da Binary files /dev/null and b/tests/ground_truth/pdfs/anonymized_test/003_simple_compte_rendu_CRO_23155084.pseudonymise.txt differ diff --git a/tests/ground_truth/pdfs/anonymized_test/003_simple_compte_rendu_CRO_23155084.redacted_raster.pdf b/tests/ground_truth/pdfs/anonymized_test/003_simple_compte_rendu_CRO_23155084.redacted_raster.pdf new file mode 100644 index 0000000..c538808 Binary files /dev/null and b/tests/ground_truth/pdfs/anonymized_test/003_simple_compte_rendu_CRO_23155084.redacted_raster.pdf differ diff --git a/tests/ground_truth/pdfs/anonymized_test/003_simple_compte_rendu_CRO_23155084.redacted_vector.pdf b/tests/ground_truth/pdfs/anonymized_test/003_simple_compte_rendu_CRO_23155084.redacted_vector.pdf new file mode 100644 index 0000000..fa04118 Binary files /dev/null and b/tests/ground_truth/pdfs/anonymized_test/003_simple_compte_rendu_CRO_23155084.redacted_vector.pdf differ