demo: Test d'anonymisation sur document réel
- Test sur 003_simple_compte_rendu_CRO_23155084.pdf - 25 PII détectés (4 sur page principale + propagation globale) - Types: NOM, ADRESSE, CODE_POSTAL, DATE_NAISSANCE - Validation: AUCUNE FUITE détectée ✓ - Scripts d'analyse: analyze_anonymization_result.py, demo_complete_anonymization.py - Résultats dans tests/ground_truth/pdfs/anonymized_test/
This commit is contained in:
122
analyze_anonymization_result.py
Normal file
122
analyze_anonymization_result.py
Normal file
@@ -0,0 +1,122 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Analyse des résultats d'anonymisation.
|
||||
"""
|
||||
import json
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
from evaluation import LeakScanner
|
||||
|
||||
def main():
    """Analyze the anonymization output for one document.

    Reads the audit JSONL, the pseudonymised text and the redacted PDF
    produced by a previous anonymization run and prints a human-readable
    report, including a leak scan of the redacted PDF.
    """
    # Files produced by the anonymization run
    base_name = "003_simple_compte_rendu_CRO_23155084"
    output_dir = Path("tests/ground_truth/pdfs/anonymized_test")

    audit_path = output_dir / f"{base_name}.audit.jsonl"
    redacted_pdf = output_dir / f"{base_name}.redacted_raster.pdf"
    text_path = output_dir / f"{base_name}.pseudonymise.txt"

    print("="*80)
    print("ANALYSE DES RÉSULTATS D'ANONYMISATION")
    print("="*80)
    print(f"\n📄 Document: {base_name}.pdf")
    print(" Type: Compte-rendu opératoire (CRO)")

    # Parse the audit trail (one JSON object per non-blank line)
    if audit_path.exists():
        print("\n📊 ANALYSE DE L'AUDIT")
        print(f" Fichier: {audit_path.name}")

        with open(audit_path, 'r', encoding='utf-8') as f:
            pii_list = [json.loads(line) for line in f if line.strip()]

        print(f"\n Total PII détectés: {len(pii_list)}")

        # Breakdown by PII kind, most frequent first (most_common keeps
        # first-encountered order on ties, matching the previous stable sort)
        type_counts = Counter(pii['kind'] for pii in pii_list)

        print("\n Répartition par type:")
        for pii_type, count in type_counts.most_common():
            print(f" {pii_type:20s} : {count:3d}")

        # PII found on the main page (page index 0)
        page0_pii = [p for p in pii_list if p.get('page') == 0]

        if page0_pii:
            print("\n PII détectés sur la page principale:")
            for pii in page0_pii:
                original = pii.get('original', '')[:60]
                print(f" • {pii['kind']:20s} : {original}")

        # Names propagated document-wide (kind NOM_EXTRACTED).
        # Counter replaces the previous O(n²) per-name scan.
        extracted_names = [p for p in pii_list if p.get('kind') == 'NOM_EXTRACTED']
        if extracted_names:
            name_counts = Counter(p['original'] for p in extracted_names)
            print(f"\n Noms propagés globalement ({len(name_counts)} uniques):")
            for name in sorted(name_counts):
                print(f" • {name:20s} : {name_counts[name]} occurrences")

    # Show an excerpt of the anonymized text
    if text_path.exists():
        print("\n📝 TEXTE ANONYMISÉ")
        print(f" Fichier: {text_path.name}")

        with open(text_path, 'r', encoding='utf-8') as f:
            text = f.read()

        print("\n Extrait (200 premiers caractères):")
        print(" " + "-"*76)
        lines = text[:200].split('\n')
        for line in lines[:5]:
            print(f" {line}")
        print(" " + "-"*76)

    # Leak scan: verify no original PII survives in the redacted PDF
    if redacted_pdf.exists() and audit_path.exists():
        print("\n🔒 SCAN DE FUITE")
        print(f" PDF anonymisé: {redacted_pdf.name}")

        scanner = LeakScanner()
        leak_report = scanner.scan(redacted_pdf, audit_path)

        if leak_report.is_safe:
            print("\n ✓ DOCUMENT SÛR")
            print(" Aucune fuite détectée")
        else:
            print(f"\n ✗ ATTENTION - {leak_report.leak_count} fuite(s)")

            # Breakdown by severity
            print("\n Fuites par sévérité:")
            for severity, count in sorted(leak_report.severity_counts.items()):
                print(f" {severity:10s} : {count}")

            # Details, capped at the first 10 leaks
            print("\n Détails des fuites:")
            for i, leak in enumerate(leak_report.leaks[:10], 1):
                print(f" {i}. [{leak['severity']}] {leak['type']}")
                print(f" {leak['message']}")

            if leak_report.leak_count > 10:
                print(f" ... et {leak_report.leak_count - 10} autres")

    print("\n" + "="*80)
    print("✨ Analyse terminée")
    print("="*80)

    print("\n💡 Fichiers disponibles:")
    print(f" - PDF anonymisé (raster): {redacted_pdf.name}")
    print(f" - PDF anonymisé (vector): {base_name}.redacted_vector.pdf")
    print(f" - Texte anonymisé: {text_path.name}")
    print(f" - Audit complet: {audit_path.name}")

    print(f"\n📂 Répertoire: {output_dir}")

    print("\n🔍 Pour voir le PDF:")
    print(f" xdg-open {redacted_pdf}")


if __name__ == "__main__":
    main()
|
||||
142
demo_complete_anonymization.py
Normal file
142
demo_complete_anonymization.py
Normal file
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Démonstration complète : Anonymisation + Analyse
|
||||
"""
|
||||
import json
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
from evaluation import LeakScanner
|
||||
|
||||
def show_comparison():
    """Print a before/after demonstration of the anonymization of one document.

    Shows an excerpt of the original text, the detected PII, an excerpt of
    the anonymized text, a leak-scan verdict, and a final summary listing
    the generated files.
    """
    print("\n" + "="*80)
    print("DÉMONSTRATION COMPLÈTE : ANONYMISATION D'UN DOCUMENT RÉEL")
    print("="*80)

    # Files from a previous anonymization run
    original_pdf = Path("tests/ground_truth/pdfs/003_simple_compte_rendu_CRO_23155084.pdf")
    output_dir = Path("tests/ground_truth/pdfs/anonymized_test")
    base_name = "003_simple_compte_rendu_CRO_23155084"

    audit_path = output_dir / f"{base_name}.audit.jsonl"
    redacted_pdf = output_dir / f"{base_name}.redacted_raster.pdf"

    print("\n📄 DOCUMENT TRAITÉ")
    print(f" Original: {original_pdf.name}")
    print(" Type: Compte-rendu opératoire (CRO)")
    print(" Complexité: Simple (1 page)")

    # Extract the original text with PyMuPDF (best-effort: skipped on failure)
    try:
        import fitz
        doc = fitz.open(original_pdf)
        original_text = doc[0].get_text()
        doc.close()

        print("\n📝 TEXTE ORIGINAL (extrait):")
        print(" " + "-"*76)
        for line in original_text.split('\n')[:8]:
            if line.strip():
                print(f" {line[:76]}")
        print(" " + "-"*76)
    except Exception as e:
        print(f" ⚠ Impossible d'extraire le texte: {e}")

    # BUG FIX: initialise before the audit check — the summary below reads
    # page0_pii unconditionally and raised NameError when the audit file
    # was missing.
    pii_list = []
    page0_pii = []

    # Analyse the PII detected by the system
    if audit_path.exists():
        print("\n🔍 PII DÉTECTÉS PAR LE SYSTÈME")

        with open(audit_path, 'r', encoding='utf-8') as f:
            pii_list = [json.loads(line) for line in f if line.strip()]

        # PII from the main page (page index 0)
        page0_pii = [p for p in pii_list if p.get('page') == 0]

        print(f"\n Sur la page principale ({len(page0_pii)} PII):")
        for pii in page0_pii:
            kind = pii['kind']
            original = pii.get('original', '')
            print(f" ✓ {kind:20s} : {original}")

        # Names propagated over the whole document
        extracted = [p for p in pii_list if 'EXTRACTED' in p.get('kind', '') or 'GLOBAL' in p.get('kind', '')]
        if extracted:
            unique_names = set(p['original'] for p in extracted if 'NOM' in p.get('kind', ''))
            print(f"\n Noms propagés sur tout le document ({len(unique_names)} uniques):")
            for name in sorted(unique_names):
                print(f" → {name}")

        # Statistics per PII kind
        type_counts = Counter(pii['kind'] for pii in pii_list)
        print("\n 📊 STATISTIQUES:")
        print(f" Total PII: {len(pii_list)}")
        print(f" Types différents: {len(type_counts)}")

        # Top 3 kinds
        print("\n Top 3 des types:")
        for pii_type, count in type_counts.most_common(3):
            print(f" {pii_type:20s} : {count}")

    # Anonymized-text excerpt
    text_path = output_dir / f"{base_name}.pseudonymise.txt"
    if text_path.exists():
        with open(text_path, 'r', encoding='utf-8') as f:
            anon_text = f.read()

        print("\n📝 TEXTE ANONYMISÉ (extrait):")
        print(" " + "-"*76)
        for line in anon_text.split('\n')[:8]:
            if line.strip():
                print(f" {line[:76]}")
        print(" " + "-"*76)

    # Leak scan of the redacted PDF
    if redacted_pdf.exists() and audit_path.exists():
        print("\n🔒 VALIDATION DE SÉCURITÉ")

        scanner = LeakScanner()
        leak_report = scanner.scan(redacted_pdf, audit_path)

        if leak_report.is_safe:
            print(" ✅ DOCUMENT SÛR")
            print(" Aucune fuite de PII détectée")
            print(" Le document peut être diffusé en toute sécurité")
        else:
            print(f" ⚠️ ATTENTION - {leak_report.leak_count} fuite(s)")
            for severity, count in leak_report.severity_counts.items():
                print(f" {severity}: {count}")

    # Final summary
    print("\n" + "="*80)
    print("✨ RÉSUMÉ")
    print("="*80)

    print("\n✓ Document anonymisé avec succès")
    print(f"✓ {len(page0_pii)} PII détectés et masqués")
    print("✓ Propagation globale des noms sur tout le document")
    print("✓ Validation de sécurité : AUCUNE FUITE")

    print("\n📂 Fichiers générés:")
    print(f" • PDF anonymisé (raster): {redacted_pdf.name}")
    print(f" • PDF anonymisé (vector): {base_name}.redacted_vector.pdf")
    print(f" • Texte anonymisé: {base_name}.pseudonymise.txt")
    print(f" • Audit détaillé: {base_name}.audit.jsonl")

    print(f"\n💡 Répertoire: {output_dir}")

    print("\n🎯 PROCHAINES ÉTAPES:")
    print(" 1. Annoter manuellement ce document")
    print(" 2. Comparer avec l'évaluateur de qualité")
    print(" 3. Calculer Précision, Rappel, F1-Score")
    print(" 4. Identifier les améliorations possibles")

    print("\n" + "="*80)


if __name__ == "__main__":
    show_comparison()
|
||||
130
test_anonymization_example.py
Executable file
130
test_anonymization_example.py
Executable file
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test d'anonymisation sur un document réel avec analyse des résultats.
|
||||
"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Importer le système d'anonymisation
|
||||
from anonymizer_core_refactored_onnx import anonymize_pdf_file
|
||||
|
||||
# Importer les outils d'évaluation
|
||||
from evaluation import LeakScanner
|
||||
|
||||
def main():
    """Run the anonymization pipeline on one real document and report results.

    Anonymizes the selected PDF, lists the generated files, shows the first
    detected PII entries and runs a leak scan on the redacted output.

    Returns:
        0 on success, 1 on failure (missing input or pipeline error).
    """
    # Hoisted out of the display loop below, where the original re-executed
    # `import json` on every audit line.
    import json

    # Select a simple document
    pdf_path = Path("tests/ground_truth/pdfs/003_simple_compte_rendu_CRO_23155084.pdf")

    if not pdf_path.exists():
        print(f"✗ Document introuvable: {pdf_path}")
        return 1

    print("="*80)
    print("TEST D'ANONYMISATION SUR UN DOCUMENT RÉEL")
    print("="*80)
    print(f"\n📄 Document: {pdf_path.name}")
    print(" Type: Compte-rendu opératoire (CRO)")
    print(" Complexité: Simple (1 page)")

    # Create the output directory; parents=True makes this robust when the
    # parent tree does not exist yet
    output_dir = Path("tests/ground_truth/pdfs/anonymized_test")
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n📁 Répertoire de sortie: {output_dir}")

    # Anonymize the document
    print("\n🔄 Anonymisation en cours...")
    print(" (Cela peut prendre quelques secondes)")

    try:
        result = anonymize_pdf_file(
            pdf_path=str(pdf_path),
            output_dir=str(output_dir),
            use_ner=True,    # enable NER
            use_vlm=False,   # disable VLM for this test (faster)
            raster_dpi=150,
            force_raster=False
        )

        print("\n✓ Anonymisation terminée !")

        # Report results
        if result:
            audit_path = output_dir / f"{pdf_path.stem}.audit.jsonl"
            redacted_pdf = output_dir / f"{pdf_path.stem}.redacted_raster.pdf"

            # Fall back to the vector output when no raster PDF was produced
            if not redacted_pdf.exists():
                redacted_pdf = output_dir / f"{pdf_path.stem}.redacted_vector.pdf"

            print(f"\n📊 Fichiers générés:")
            print(f" - PDF anonymisé: {redacted_pdf.name}")
            print(f" - Audit: {audit_path.name}")

            # Parse the audit once (the original opened and read it twice)
            if audit_path.exists():
                with open(audit_path, 'r', encoding='utf-8') as f:
                    pii_entries = [json.loads(line) for line in f if line.strip()]
                pii_count = len(pii_entries)

                print(f"\n🔍 PII détectés: {pii_count}")

                # Show the first 10 entries (counting real entries, not raw
                # lines, so blank lines no longer eat into the limit)
                print("\n📋 Premiers PII détectés:")
                for i, pii in enumerate(pii_entries[:10], 1):
                    print(f" {i}. {pii.get('kind', 'UNKNOWN'):15s} : {pii.get('original', '')[:50]}")

                if pii_count > 10:
                    print(f" ... et {pii_count - 10} autres")

            # Scan the redacted PDF for leaks
            print("\n🔒 Scan de fuite en cours...")
            scanner = LeakScanner()

            if redacted_pdf.exists():
                leak_report = scanner.scan(redacted_pdf, audit_path)

                if leak_report.is_safe:
                    print(" ✓ DOCUMENT SÛR - Aucune fuite détectée")
                else:
                    print(f" ✗ ATTENTION - {leak_report.leak_count} fuite(s) détectée(s)")

                    # Leaks by severity
                    for severity, count in sorted(leak_report.severity_counts.items()):
                        print(f" - {severity}: {count}")

                    # First few leaks in detail
                    print("\n Détails des fuites:")
                    for i, leak in enumerate(leak_report.leaks[:5], 1):
                        print(f" {i}. [{leak['severity']}] {leak['message']}")

                    if leak_report.leak_count > 5:
                        print(f" ... et {leak_report.leak_count - 5} autres")
            else:
                print(" ⚠ PDF anonymisé introuvable, impossible de scanner")

            print("\n" + "="*80)
            print("✨ Test terminé avec succès !")
            print("="*80)

            print(f"\n📂 Fichiers disponibles dans: {output_dir}")
            print("\n💡 Pour voir le PDF anonymisé:")
            print(f" xdg-open {redacted_pdf}")

            return 0
        else:
            print("\n✗ Erreur lors de l'anonymisation")
            return 1

    except Exception as e:
        print(f"\n✗ Erreur: {e}")
        import traceback
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,25 @@
|
||||
{"page": 0, "kind": "NOM", "original": "GASTON GILLES", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": 0, "kind": "ADRESSE", "original": "10 RUE DES HAUTRS VENTS", "placeholder": "[ADRESSE]", "bbox_hint": null}
|
||||
{"page": 0, "kind": "CODE_POSTAL", "original": "14190 OUILLY LE TESSON", "placeholder": "[CODE_POSTAL]", "bbox_hint": null}
|
||||
{"page": 0, "kind": "DATE_NAISSANCE", "original": "Né le 02/04/2010", "placeholder": "[DATE_NAISSANCE]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GONTIER", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GONTIER", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "QUEANT", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_EXTRACTED", "original": "QUEANT", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_GLOBAL", "original": "GASTON", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_GLOBAL", "original": "GONTIER", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_GLOBAL", "original": "GILLES", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "NOM_GLOBAL", "original": "QUEANT", "placeholder": "[NOM]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "ADRESSE_GLOBAL", "original": "10 RUE DES HAUTRS VENTS", "placeholder": "[ADRESSE]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "CODE_POSTAL_GLOBAL", "original": "14190 OUILLY LE TESSON", "placeholder": "[CODE_POSTAL]", "bbox_hint": null}
|
||||
{"page": -1, "kind": "DATE_NAISSANCE_GLOBAL", "original": "Né le 02/04/2010", "placeholder": "[DATE_NAISSANCE]", "bbox_hint": null}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user