feat: Baseline performance benchmark - 2.62s/doc average, 92% within target
154
tools/batch_anonymize_test_dataset.py
Normal file
@@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""
Batch anonymization of the test dataset (27 documents).

This script anonymizes all the selected documents to build the baseline.
"""
import sys
import json
import time
from pathlib import Path
from datetime import datetime

# Import the anonymization system
sys.path.insert(0, str(Path(__file__).parent.parent))
from anonymizer_core_refactored_onnx import process_pdf


def anonymize_test_dataset(use_ner: bool = True, use_vlm: bool = False):
    """
    Anonymize every document in the test dataset.

    Args:
        use_ner: Enable NER (EDS-Pseudo or CamemBERT)
        use_vlm: Enable the VLM (Ollama) - slower
    """
    # Directories
    input_dir = Path("tests/ground_truth/pdfs")
    output_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    output_dir.mkdir(exist_ok=True)

    # List the PDFs
    pdf_files = sorted(input_dir.glob("*.pdf"))

    if not pdf_files:
        print(f"✗ No PDF found in {input_dir}")
        return 1

    print("="*80)
    print("BATCH ANONYMIZATION OF THE TEST DATASET")
    print("="*80)
    print(f"\n📁 Input directory: {input_dir}")
    print(f"📁 Output directory: {output_dir}")
    print(f"\n📄 Documents to process: {len(pdf_files)}")
    print("\n⚙️ Configuration:")
    print(f" - NER: {'✓ Enabled' if use_ner else '✗ Disabled'}")
    print(f" - VLM: {'✓ Enabled' if use_vlm else '✗ Disabled'}")

    # Statistics
    results = []
    start_time = time.time()

    # Process each document
    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}")

        doc_start = time.time()

        try:
            # Anonymize
            result = process_pdf(
                pdf_path=pdf_path,
                out_dir=output_dir,
                make_vector_redaction=True,
                also_make_raster_burn=True,
                use_hf=use_ner,
                ner_manager=None,  # Loaded automatically when use_hf=True
            )

            doc_time = time.time() - doc_start

            # Count the PII: one non-empty audit line per detection
            audit_path = output_dir / f"{pdf_path.stem}.audit.jsonl"
            pii_count = 0
            if audit_path.exists():
                with open(audit_path, 'r', encoding='utf-8') as f:
                    pii_count = sum(1 for line in f if line.strip())

            print(f" ✓ Done in {doc_time:.2f}s - {pii_count} PII detected")

            results.append({
                "pdf": pdf_path.name,
                "success": True,
                "time_s": doc_time,
                "pii_count": pii_count,
                "files": result
            })

        except Exception as e:
            doc_time = time.time() - doc_start
            print(f" ✗ Error: {e}")

            results.append({
                "pdf": pdf_path.name,
                "success": False,
                "time_s": doc_time,
                "error": str(e)
            })

    # Summary
    total_time = time.time() - start_time
    success_count = sum(1 for r in results if r.get("success"))
    total_pii = sum(r.get("pii_count", 0) for r in results if r.get("success"))

    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    print(f"\n✓ Documents processed: {success_count}/{len(pdf_files)}")
    print(f"✓ PII detected: {total_pii}")
    print(f"✓ Total time: {total_time:.2f}s")
    print(f"✓ Average time: {total_time/len(pdf_files):.2f}s per document")

    if success_count < len(pdf_files):
        failed = [r for r in results if not r.get("success")]
        print(f"\n⚠ Failures: {len(failed)}")
        for r in failed:
            print(f" - {r['pdf']}: {r.get('error', 'Unknown error')}")

    # Save the results
    results_file = output_dir / "batch_results.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "date": datetime.now().isoformat(),
            "total_documents": len(pdf_files),
            "success_count": success_count,
            "total_pii": total_pii,
            "total_time_s": total_time,
            "avg_time_s": total_time / len(pdf_files),
            "use_ner": use_ner,
            "use_vlm": use_vlm,
            "results": results
        }, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Results saved: {results_file}")
    print(f"\n📂 Files generated in: {output_dir}")

    return 0 if success_count == len(pdf_files) else 1


def main():
    import argparse

    parser = argparse.ArgumentParser(description="Batch-anonymize the test dataset")
    parser.add_argument("--no-ner", action="store_true", help="Disable NER")
    parser.add_argument("--vlm", action="store_true", help="Enable the VLM (slower)")

    args = parser.parse_args()

    return anonymize_test_dataset(
        use_ner=not args.no_ner,
        use_vlm=args.vlm
    )


if __name__ == "__main__":
    sys.exit(main())
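A note on the `process_pdf` dependency: the keyword interface above is the only contract this script relies on. For dry runs without the ONNX models installed, a stand-in with the same call shape can be swapped in. The sketch below infers the signature from the call site; the return value is only assumed to be a mapping of output labels to paths, since the script stores it verbatim under "files".

# Hypothetical stand-in for anonymizer_core_refactored_onnx.process_pdf,
# inferred from the call site in batch_anonymize_test_dataset.py.
from pathlib import Path

def process_pdf(pdf_path: Path, out_dir: Path,
                make_vector_redaction: bool = True,
                also_make_raster_burn: bool = False,
                use_hf: bool = True, ner_manager=None):
    """Stub: emits an empty audit trail and reports the files a real run would write."""
    out_dir.mkdir(parents=True, exist_ok=True)
    stem = pdf_path.stem
    # An empty .audit.jsonl yields pii_count == 0 in the batch loop above.
    (out_dir / f"{stem}.audit.jsonl").write_text("", encoding="utf-8")
    return {
        "vector": str(out_dir / f"{stem}.redacted_vector.pdf"),
        "raster": str(out_dir / f"{stem}.redacted_raster.pdf"),
    }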
199
tools/run_baseline_benchmark.py
Executable file
@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Performance benchmark of the anonymization system on the test dataset.

Analyzes the batch results to produce a performance report.
"""
import sys
import json
from pathlib import Path
from datetime import datetime
import statistics


def run_baseline_benchmark():
    """Generates the benchmark report from the batch results."""

    # Directories
    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    results_file = baseline_dir / "batch_results.json"

    if not results_file.exists():
        print(f"✗ Results file not found: {results_file}")
        print(" Run first: python3 tools/batch_anonymize_test_dataset.py")
        return 1

    # Load the batch results
    with open(results_file, 'r', encoding='utf-8') as f:
        batch_data = json.load(f)

    successful = [r for r in batch_data['results'] if r.get('success')]

    if not successful:
        print("✗ No document processed successfully")
        return 1

    print("="*80)
    print("PERFORMANCE BENCHMARK - BASELINE")
    print("="*80)
    print(f"\n📅 Batch date: {batch_data['date']}")
    print(f"📄 Documents: {len(successful)}/{batch_data['total_documents']}")
    print(f"🔍 PII detected: {batch_data['total_pii']:,}")

    # Extract the metrics
    times = [r['time_s'] for r in successful]
    pii_counts = [r['pii_count'] for r in successful]

    # Compute the statistics
    stats = {
        "total_documents": len(successful),
        "total_time_s": sum(times),
        "avg_time_s": statistics.mean(times),
        "median_time_s": statistics.median(times),
        "min_time_s": min(times),
        "max_time_s": max(times),
        "stdev_time_s": statistics.stdev(times) if len(times) > 1 else 0.0,
        "total_pii": sum(pii_counts),
        "avg_pii": statistics.mean(pii_counts),
        "median_pii": statistics.median(pii_counts),
        "min_pii": min(pii_counts),
        "max_pii": max(pii_counts),
        "docs_per_second": len(successful) / sum(times),
        "pii_per_second": sum(pii_counts) / sum(times)
    }

    # Display the statistics
    print("\n" + "="*80)
    print("PERFORMANCE STATISTICS")
    print("="*80)

    print("\n⏱️ Processing time:")
    print(f" - Total: {stats['total_time_s']:.2f}s")
    print(f" - Mean: {stats['avg_time_s']:.2f}s per document")
    print(f" - Median: {stats['median_time_s']:.2f}s")
    print(f" - Min: {stats['min_time_s']:.2f}s")
    print(f" - Max: {stats['max_time_s']:.2f}s")
    print(f" - Std dev: {stats['stdev_time_s']:.2f}s")

    print("\n🔍 PII detected:")
    print(f" - Total: {stats['total_pii']:,}")
    print(f" - Mean: {stats['avg_pii']:.1f} per document")
    print(f" - Median: {stats['median_pii']:.0f}")
    print(f" - Min: {stats['min_pii']}")
    print(f" - Max: {stats['max_pii']:,}")

    print("\n📊 Throughput:")
    print(f" - Documents/second: {stats['docs_per_second']:.2f}")
    print(f" - PII/second: {stats['pii_per_second']:.1f}")

    # Identify slow documents (> 2x the mean)
    slow_threshold = stats['avg_time_s'] * 2
    slow_docs = [r for r in successful if r['time_s'] > slow_threshold]
    if slow_docs:
        print(f"\n⚠️ Slow documents (> {slow_threshold:.2f}s):")
        for doc in sorted(slow_docs, key=lambda x: x['time_s'], reverse=True)[:5]:
            print(f" - {doc['pdf']}: {doc['time_s']:.2f}s ({doc['pii_count']} PII)")

    # Identify fast documents (< 0.5x the mean)
    fast_threshold = stats['avg_time_s'] * 0.5
    fast_docs = [r for r in successful if r['time_s'] < fast_threshold]
    if fast_docs:
        print(f"\n⚡ Fast documents (< {fast_threshold:.2f}s):")
        for doc in sorted(fast_docs, key=lambda x: x['time_s'])[:5]:
            print(f" - {doc['pdf']}: {doc['time_s']:.2f}s ({doc['pii_count']} PII)")

    # Analyze the PII count / time relationship
    print("\n📈 Correlation analysis:")
    # Documents with many PII
    high_pii_docs = [r for r in successful if r['pii_count'] > stats['avg_pii'] * 2]
    if high_pii_docs:
        avg_time_high_pii = statistics.mean([r['time_s'] for r in high_pii_docs])
        print(f" - Documents with many PII (>{stats['avg_pii']*2:.0f}): {len(high_pii_docs)}")
        print(f"   Mean time: {avg_time_high_pii:.2f}s")

    # Documents with few PII
    low_pii_docs = [r for r in successful if r['pii_count'] < stats['avg_pii'] * 0.5]
    if low_pii_docs:
        avg_time_low_pii = statistics.mean([r['time_s'] for r in low_pii_docs])
        print(f" - Documents with few PII (<{stats['avg_pii']*0.5:.0f}): {len(low_pii_docs)}")
        print(f"   Mean time: {avg_time_low_pii:.2f}s")

    # Save the results
    output_dir = Path("tests/ground_truth/benchmarks")
    output_dir.mkdir(exist_ok=True)

    benchmark_data = {
        "date": datetime.now().isoformat(),
        "batch_date": batch_data['date'],
        "configuration": {
            "use_ner": batch_data.get('use_ner', True),
            "use_vlm": batch_data.get('use_vlm', False)
        },
        "statistics": stats,
        "documents": [
            {
                "pdf": r['pdf'],
                "time_s": r['time_s'],
                "pii_count": r['pii_count']
            }
            for r in successful
        ]
    }

    json_file = output_dir / "baseline_benchmark.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(benchmark_data, f, indent=2, ensure_ascii=False)
    print(f"\n📊 JSON results: {json_file}")

    # CSV export
    csv_file = output_dir / "baseline_benchmark.csv"
    with open(csv_file, 'w', encoding='utf-8') as f:
        f.write("pdf,time_s,pii_count\n")
        for r in successful:
            f.write(f"{r['pdf']},{r['time_s']},{r['pii_count']}\n")
    print(f"📊 CSV results: {csv_file}")

    # Check the performance targets
    print("\n" + "="*80)
    print("TARGET VALIDATION")
    print("="*80)

    target_time_no_vlm = 10.0  # < 10s per PDF (without VLM)
    target_time_with_vlm = 30.0  # < 30s per PDF (with VLM)

    # Pick the target matching the batch configuration
    target = target_time_no_vlm
    use_vlm = batch_data.get('use_vlm', False)

    if use_vlm:
        target = target_time_with_vlm

    print(f"\n🎯 Target: < {target}s per document (VLM: {'✓' if use_vlm else '✗'})")

    if stats['avg_time_s'] <= target:
        print(f"✅ Mean time within target: {stats['avg_time_s']:.2f}s ≤ {target}s")
    else:
        print(f"⚠️ Mean time above target: {stats['avg_time_s']:.2f}s > {target}s")
        print(f"   Gap: +{stats['avg_time_s'] - target:.2f}s ({(stats['avg_time_s']/target - 1)*100:.1f}%)")

    if stats['max_time_s'] <= target * 3:
        print(f"✅ Max time acceptable: {stats['max_time_s']:.2f}s ≤ {target * 3}s")
    else:
        print(f"⚠️ Max time too high: {stats['max_time_s']:.2f}s > {target * 3}s")

    # Percentage of documents within target
    docs_in_target = sum(1 for r in successful if r['time_s'] <= target)
    pct_in_target = (docs_in_target / len(successful)) * 100
    print(f"\n📊 Documents within target: {docs_in_target}/{len(successful)} ({pct_in_target:.1f}%)")

    if pct_in_target >= 80:
        print("✅ Coverage target met (≥80%)")
    else:
        print("⚠️ Coverage target not met (<80%)")

    print("\n" + "="*80)

    return 0


if __name__ == "__main__":
    sys.exit(run_baseline_benchmark())
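The "correlation analysis" above compares group means rather than computing a coefficient. If an actual number is wanted, a dependency-free Pearson r over the same `times` and `pii_counts` lists is a small addition; this is a sketch, not part of the committed script.

import statistics

def pearson_r(xs, ys):
    """Pearson correlation between two equal-length, non-constant samples (n >= 2)."""
    mx, my = statistics.mean(xs), statistics.mean(ys)
    cov = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    sx = sum((x - mx) ** 2 for x in xs) ** 0.5
    sy = sum((y - my) ** 2 for y in ys) ** 0.5
    return cov / (sx * sy)

# Hypothetical usage inside run_baseline_benchmark(), once times/pii_counts exist:
# print(f" - Pearson r (PII vs time): {pearson_r(pii_counts, times):.2f}")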
123
tools/show_anonymization_example.py
Executable file
@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Displays an anonymization example with detailed statistics.
"""
import json
import sys
from pathlib import Path
from collections import Counter
from typing import Optional


def show_example(pdf_name: Optional[str] = None):
    """Displays the details of an anonymized document."""

    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")

    # Load the batch results
    results_file = baseline_dir / "batch_results.json"
    if not results_file.exists():
        print(f"✗ Results file not found: {results_file}")
        return 1

    with open(results_file, 'r', encoding='utf-8') as f:
        batch_data = json.load(f)

    # If no PDF was specified, take the one with the most PII
    if not pdf_name:
        successful = [r for r in batch_data['results'] if r.get('success')]
        if not successful:
            print("✗ No document processed successfully")
            return 1

        # Sort by PII count (descending)
        successful.sort(key=lambda x: x.get('pii_count', 0), reverse=True)
        pdf_name = successful[0]['pdf']

    # Find the result
    result = next((r for r in batch_data['results'] if r['pdf'] == pdf_name), None)
    if not result:
        print(f"✗ Document not found: {pdf_name}")
        return 1

    if not result.get('success'):
        print(f"✗ Document failed: {pdf_name}")
        print(f" Error: {result.get('error', 'Unknown')}")
        return 1

    # Load the audit trail
    audit_file = baseline_dir / f"{Path(pdf_name).stem}.audit.jsonl"
    if not audit_file.exists():
        print(f"✗ Audit file not found: {audit_file}")
        return 1

    detections = []
    with open(audit_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                detections.append(json.loads(line))

    # Analyze the detections
    types_counter = Counter(d.get('kind', d.get('type', 'unknown')) for d in detections)
    methods_counter = Counter(d.get('method', 'unknown') for d in detections)
    pages_counter = Counter(d['page'] for d in detections)

    # Display
    print("="*80)
    print(f"ANONYMIZATION EXAMPLE: {pdf_name}")
    print("="*80)

    print(f"\n📄 Document: {pdf_name}")
    print(f"⏱️ Processing time: {result['time_s']:.2f}s")
    print(f"🔍 PII detected: {result['pii_count']}")

    print("\n📊 Breakdown by type:")
    for pii_type, count in types_counter.most_common():
        print(f" - {pii_type}: {count}")

    print("\n🔬 Breakdown by detection method:")
    for method, count in methods_counter.most_common():
        print(f" - {method}: {count}")

    print("\n📖 Breakdown by page:")
    for page, count in sorted(pages_counter.items()):
        print(f" - Page {page}: {count} PII")

    # Sample detections
    print("\n🔍 Sample detections (first 5):")
    for i, det in enumerate(detections[:5], 1):
        text = det.get('original', det.get('text', ''))
        if len(text) > 40:
            text = text[:37] + "..."
        pii_type = det.get('kind', det.get('type', 'unknown'))
        print(f" {i}. [{pii_type}] \"{text}\" (page {det['page']}, method: {det.get('method', 'unknown')})")

    # Generated files
    print("\n📂 Generated files:")
    stem = Path(pdf_name).stem
    files = [
        baseline_dir / f"{stem}.pseudonymise.txt",
        baseline_dir / f"{stem}.redacted_vector.pdf",
        baseline_dir / f"{stem}.redacted_raster.pdf",
        baseline_dir / f"{stem}.audit.jsonl"
    ]
    for path in files:
        status = "✓" if path.exists() else "✗"
        print(f" {status} {path.name}")

    print("\n" + "="*80)

    return 0


def main():
    import argparse

    parser = argparse.ArgumentParser(description="Display an anonymization example")
    parser.add_argument("pdf", nargs="?", help="PDF name (optional, defaults to the most complex one)")

    args = parser.parse_args()

    return show_example(args.pdf)


if __name__ == "__main__":
    sys.exit(main())
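Note that the parser accepts either `kind` or `type` for the category and either `original` or `text` for the matched string, so the audit schema evidently varies across detectors. For reference, a line this script would consume could look like the following; the field values are illustrative, not taken from a real audit file.

import json

# Illustrative detection record; only the keys read by show_example() matter:
# 'kind'/'type', 'method', 'page', and 'original'/'text'.
sample_line = json.dumps({
    "kind": "PERSON",
    "original": "Jean Dupont",
    "method": "ner",
    "page": 1,
}, ensure_ascii=False)
print(sample_line)  # one such JSON object per line in <stem>.audit.jsonl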
85
tools/show_batch_summary.py
Executable file
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""
Displays a summary of the batch anonymization results.
"""
import json
import sys
import statistics
from pathlib import Path


def show_summary():
    """Displays the batch summary."""

    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    results_file = baseline_dir / "batch_results.json"

    if not results_file.exists():
        print(f"✗ Results file not found: {results_file}")
        return 1

    with open(results_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Global statistics
    print("="*80)
    print("BATCH ANONYMIZATION SUMMARY")
    print("="*80)

    print(f"\n📅 Date: {data['date']}")
    print(f"📄 Documents processed: {data['success_count']}/{data['total_documents']}")
    print(f"🔍 PII detected: {data['total_pii']:,}")
    print(f"⏱️ Total time: {data['total_time_s']:.2f}s")
    print(f"⏱️ Average time: {data['avg_time_s']:.2f}s per document")

    # Analyze the results
    successful = [r for r in data['results'] if r.get('success')]
    failed = [r for r in data['results'] if not r.get('success')]

    if successful:
        times = [r['time_s'] for r in successful]
        pii_counts = [r['pii_count'] for r in successful]

        print("\n📊 Time statistics:")
        print(f" - Min: {min(times):.2f}s")
        print(f" - Max: {max(times):.2f}s")
        print(f" - Median: {statistics.median(times):.2f}s")

        print("\n📊 PII statistics:")
        print(f" - Min: {min(pii_counts)}")
        print(f" - Max: {max(pii_counts):,}")
        print(f" - Median: {statistics.median(pii_counts):g}")
        print(f" - Mean: {statistics.mean(pii_counts):.1f}")

    # Top 5 most complex documents
    if successful:
        print("\n🏆 Top 5 most complex documents (by PII count):")
        top5 = sorted(successful, key=lambda x: x['pii_count'], reverse=True)[:5]
        for i, r in enumerate(top5, 1):
            print(f" {i}. {r['pdf']}")
            print(f"    → {r['pii_count']:,} PII in {r['time_s']:.2f}s")

    # Top 5 fastest documents
    if successful:
        print("\n⚡ Top 5 fastest documents:")
        fastest = sorted(successful, key=lambda x: x['time_s'])[:5]
        for i, r in enumerate(fastest, 1):
            print(f" {i}. {r['pdf']}")
            print(f"    → {r['time_s']:.2f}s ({r['pii_count']} PII)")

    # Failures
    if failed:
        print(f"\n⚠️ Failures ({len(failed)}):")
        for r in failed:
            error = r.get('error', 'Unknown error')
            if not error:
                error = "Password-protected PDF"
            print(f" - {r['pdf']}")
            print(f"   → {error}")

    print("\n" + "="*80)

    return 0


if __name__ == "__main__":
    sys.exit(show_summary())
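The four tools are designed to run in sequence: the batch writer first, then the benchmark and the two viewers, which all read `batch_results.json`. A minimal driver chaining them could look like this (a convenience sketch, not part of this commit):

import subprocess
import sys

# Run the baseline pipeline end to end; each tool exits non-zero on failure.
STEPS = [
    ["python3", "tools/batch_anonymize_test_dataset.py"],
    ["python3", "tools/run_baseline_benchmark.py"],
    ["python3", "tools/show_batch_summary.py"],
    ["python3", "tools/show_anonymization_example.py"],
]

for cmd in STEPS:
    print(f"→ {' '.join(cmd)}")
    if subprocess.run(cmd).returncode != 0:
        sys.exit(1)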