feat: Benchmark de performance baseline - 2.62s/doc moyen, 92% dans objectif
This commit is contained in:
123
tools/show_anonymization_example.py
Executable file
123
tools/show_anonymization_example.py
Executable file
@@ -0,0 +1,123 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Affiche un exemple d'anonymisation avec statistiques détaillées.
|
||||
"""
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
|
||||
_DEFAULT_BASELINE_DIR = Path("tests/ground_truth/pdfs/baseline_anonymized")


def _page_sort_key(item):
    """Order (page, count) pairs: numeric pages ascending, other labels last."""
    page = item[0]
    if isinstance(page, (int, float)):
        return (0, page, "")
    # Non-numeric labels (e.g. the 'unknown' fallback) cannot be compared with
    # ints, so they are grouped after the numeric pages and ordered as text.
    return (1, 0, str(page))


def show_example(pdf_name: str = None, baseline_dir: Path = _DEFAULT_BASELINE_DIR):
    """Print a detailed anonymization report for one document of a batch run.

    The function reads ``batch_results.json`` from ``baseline_dir``, selects one
    entry of its ``results`` list, loads the matching ``<stem>.audit.jsonl``
    file (one JSON object per non-blank line) and prints the processing time,
    the PII count, breakdowns by type / detection method / page, the first
    five detections and the presence of the expected output files.

    Args:
        pdf_name: Value of the ``pdf`` field of the entry to display. When
            falsy, the successful entry with the highest ``pii_count`` is
            used (the first one wins on ties).
        baseline_dir: Directory holding ``batch_results.json`` and the
            per-document output files. Strings are accepted as well.

    Returns:
        0 when the report was printed, 1 when the results file, the document,
        or its audit file is missing, or when the document failed processing.

    Raises:
        json.JSONDecodeError: If the results file or an audit line is not
            valid JSON.
    """
    baseline_dir = Path(baseline_dir)

    # Load the batch results.
    results_file = baseline_dir / "batch_results.json"
    if not results_file.exists():
        print(f"✗ Fichier de résultats non trouvé: {results_file}")
        return 1

    with open(results_file, 'r', encoding='utf-8') as f:
        batch_data = json.load(f)
    results = batch_data.get('results', [])

    # No PDF requested: pick the successful document with the most PII.
    if not pdf_name:
        # Entries without a 'pdf' name cannot be looked up afterwards.
        successful = [r for r in results if r.get('success') and r.get('pdf')]
        if not successful:
            print("✗ Aucun document traité avec succès")
            return 1
        # max() returns the first maximal element, which is the same element a
        # stable descending sort would put at index 0.
        pdf_name = max(successful, key=lambda r: r.get('pii_count', 0))['pdf']

    # Locate the requested entry.
    result = next((r for r in results if r.get('pdf') == pdf_name), None)
    if not result:
        print(f"✗ Document non trouvé: {pdf_name}")
        return 1

    if not result.get('success'):
        print(f"✗ Document en échec: {pdf_name}")
        print(f" Erreur: {result.get('error', 'Unknown')}")
        return 1

    # Load the audit trail.
    stem = Path(pdf_name).stem
    audit_file = baseline_dir / f"{stem}.audit.jsonl"
    if not audit_file.exists():
        print(f"✗ Fichier d'audit non trouvé: {audit_file}")
        return 1

    with open(audit_file, 'r', encoding='utf-8') as f:
        detections = [json.loads(line) for line in f if line.strip()]

    # Aggregate the detections. Every field falls back to 'unknown' so that a
    # partially filled audit record does not abort the whole report.
    types_counter = Counter(d.get('kind', d.get('type', 'unknown')) for d in detections)
    methods_counter = Counter(d.get('method', 'unknown') for d in detections)
    pages_counter = Counter(d.get('page', 'unknown') for d in detections)

    # Report header.
    print("=" * 80)
    print(f"EXEMPLE D'ANONYMISATION: {pdf_name}")
    print("=" * 80)

    time_s = result.get('time_s')
    time_text = f"{time_s:.2f}s" if isinstance(time_s, (int, float)) else "n/a"
    print(f"\n📄 Document: {pdf_name}")
    print(f"⏱️ Temps de traitement: {time_text}")
    # Fall back to the number of audit records when the batch entry has no count.
    print(f"🔍 PII détectés: {result.get('pii_count', len(detections))}")

    print("\n📊 Répartition par type:")
    for pii_type, count in types_counter.most_common():
        print(f" - {pii_type}: {count}")

    print("\n🔬 Répartition par méthode de détection:")
    for method, count in methods_counter.most_common():
        print(f" - {method}: {count}")

    print("\n📖 Répartition par page:")
    for page, count in sorted(pages_counter.items(), key=_page_sort_key):
        print(f" - Page {page}: {count} PII")

    # Sample detections.
    print("\n🔍 Exemples de détections (5 premiers):")
    for i, det in enumerate(detections[:5], 1):
        # 'original' is preferred, 'text' is the alternate key; a null value
        # is shown as an empty string instead of crashing on len().
        text = str(det.get('original') or det.get('text') or '')
        if len(text) > 40:
            text = text[:37] + "..."
        pii_type = det.get('kind', det.get('type', 'unknown'))
        print(
            f" {i}. [{pii_type}] \"{text}\" "
            f"(page {det.get('page', 'unknown')}, méthode: {det.get('method', 'unknown')})"
        )

    # Expected output files and whether each one exists on disk.
    print("\n📂 Fichiers générés:")
    suffixes = (
        ".pseudonymise.txt",
        ".redacted_vector.pdf",
        ".redacted_raster.pdf",
        ".audit.jsonl",
    )
    for suffix in suffixes:
        generated = baseline_dir / f"{stem}{suffix}"
        status = "✓" if generated.exists() else "✗"
        print(f" {status} {generated.name}")

    print("\n" + "=" * 80)

    return 0
|
||||
|
||||
|
||||
def main():
    """Parse the command line and display the report for the chosen PDF.

    Accepts one optional positional argument, the PDF name, and forwards it
    to ``show_example``.

    Returns:
        The status code returned by ``show_example``.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Afficher un exemple d'anonymisation")
    cli.add_argument(
        "pdf",
        nargs="?",
        help="Nom du PDF (optionnel, par défaut le plus complexe)",
    )
    options = cli.parse_args()
    return show_example(options.pdf)
|
||||
|
||||
|
||||
# Script entry point: exit the process with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user