feat: Benchmark de performance baseline - 2.62s/doc moyen, 92% dans objectif

This commit is contained in:
2026-03-02 10:42:15 +01:00
parent f61e767ee6
commit 30a6ebcc19
108 changed files with 33195 additions and 6 deletions

85
tools/show_batch_summary.py Executable file
View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""
Affiche un résumé des résultats du batch d'anonymisation.
"""
import json
import statistics
import sys
from collections import Counter
from pathlib import Path
def show_summary(results_file=None) -> int:
    """Print a human-readable summary of an anonymization batch run.

    Loads the batch results JSON and prints, in order: the global totals,
    time and PII distribution statistics for successful documents, the five
    documents with the most PII, the five fastest documents, and the list of
    failed documents with their error message.

    Args:
        results_file: Path (``str`` or ``Path``) of the results JSON file.
            When ``None`` (the default), ``batch_results.json`` inside
            ``tests/ground_truth/pdfs/baseline_anonymized`` is used.

    Returns:
        ``0`` when the summary was printed, ``1`` when the results file is
        missing, unreadable, or not valid JSON.

    Raises:
        KeyError: If the JSON lacks one of the keys read below (``date``,
            ``success_count``, ``total_documents``, ``total_pii``,
            ``total_time_s``, ``avg_time_s``, ``results``, or the per-result
            ``pdf`` / ``time_s`` / ``pii_count`` fields).
    """
    if results_file is None:
        baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
        results_file = baseline_dir / "batch_results.json"
    else:
        results_file = Path(results_file)

    if not results_file.exists():
        print(f"✗ Fichier de résultats non trouvé: {results_file}")
        return 1

    # A truncated or corrupt results file should yield the same kind of
    # one-line error and exit code as a missing one, not a traceback.
    try:
        with open(results_file, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"✗ Fichier de résultats illisible: {results_file} ({exc})")
        return 1

    separator = "=" * 80

    # Global statistics
    print(separator)
    print("RÉSUMÉ DU BATCH D'ANONYMISATION")
    print(separator)
    print(f"\n📅 Date: {data['date']}")
    print(f"📄 Documents traités: {data['success_count']}/{data['total_documents']}")
    print(f"🔍 PII détectés: {data['total_pii']:,}")
    print(f"⏱️ Temps total: {data['total_time_s']:.2f}s")
    print(f"⏱️ Temps moyen: {data['avg_time_s']:.2f}s par document")

    # Split the per-document results by outcome; a missing or falsy
    # 'success' flag counts as a failure.
    successful = [r for r in data['results'] if r.get('success')]
    failed = [r for r in data['results'] if not r.get('success')]

    if successful:
        times = [r['time_s'] for r in successful]
        pii_counts = [r['pii_count'] for r in successful]

        print("\n📊 Statistiques de temps:")
        print(f" - Min: {min(times):.2f}s")
        print(f" - Max: {max(times):.2f}s")
        # statistics.median averages the two middle values when the number
        # of documents is even, instead of picking the upper one.
        print(f" - Médiane: {statistics.median(times):.2f}s")

        print("\n📊 Statistiques de PII:")
        print(f" - Min: {min(pii_counts)}")
        print(f" - Max: {max(pii_counts):,}")
        # The median of an even number of integer counts may be fractional,
        # so it is shown with one decimal like the mean.
        print(f" - Médiane: {statistics.median(pii_counts):.1f}")
        print(f" - Moyenne: {sum(pii_counts) / len(pii_counts):.1f}")

        # Top 5 most complex documents (highest PII count first)
        print("\n🏆 Top 5 documents les plus complexes (par PII):")
        top5 = sorted(successful, key=lambda x: x['pii_count'], reverse=True)[:5]
        for i, r in enumerate(top5, 1):
            print(f" {i}. {r['pdf']}")
            print(f"{r['pii_count']:,} PII en {r['time_s']:.2f}s")

        # Top 5 fastest documents (shortest processing time first)
        print("\n⚡ Top 5 documents les plus rapides:")
        fastest = sorted(successful, key=lambda x: x['time_s'])[:5]
        for i, r in enumerate(fastest, 1):
            print(f" {i}. {r['pdf']}")
            print(f"{r['time_s']:.2f}s ({r['pii_count']} PII)")

    # Failures
    if failed:
        print(f"\n⚠️ Échecs ({len(failed)}):")
        for r in failed:
            error = r.get('error', 'Unknown error')
            if not error:
                # An 'error' key that is present but empty is reported as a
                # password-protected PDF.
                error = "PDF protégé par mot de passe"
            print(f" - {r['pdf']}")
            print(f"{error}")

    print("\n" + separator)
    return 0
if __name__ == "__main__":
    # Use the summary's return value as the process exit status.
    exit_code = show_summary()
    sys.exit(exit_code)