#!/usr/bin/env python3
"""
Performance benchmark for the anonymization system.

Measures processing times, CPU/RAM usage, and quality metrics.
"""
import csv
import json
import platform
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import psutil


@dataclass
class BenchmarkResult:
    """Benchmark result for a single document."""
    pdf_path: str
    processing_time_s: float = 0.0
    time_per_page_s: float = 0.0
    cpu_usage_percent: float = 0.0
    ram_usage_mb: float = 0.0
    pii_detected: int = 0
    quality_metrics: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Convert to a dictionary."""
        return {
            "pdf_path": self.pdf_path,
            "processing_time_s": round(self.processing_time_s, 2),
            "time_per_page_s": round(self.time_per_page_s, 2),
            "cpu_usage_percent": round(self.cpu_usage_percent, 2),
            "ram_usage_mb": round(self.ram_usage_mb, 2),
            "pii_detected": self.pii_detected,
            "quality_metrics": self.quality_metrics,
        }


class Benchmark:
    """Performance benchmark."""

    def __init__(self, test_data_dir: Path):
        """
        Initialize the benchmark.

        Args:
            test_data_dir: Directory containing the test data
        """
        self.test_data_dir = Path(test_data_dir)
        self.process = psutil.Process()

    def get_system_info(self) -> Dict:
        """
        Collect system information.

        Returns:
            Dictionary of system information
        """
        return {
            "os": platform.system(),
            "os_version": platform.version(),
            "cpu": platform.processor(),
            "cpu_count": psutil.cpu_count(logical=False),
            "cpu_count_logical": psutil.cpu_count(logical=True),
            "ram_gb": round(psutil.virtual_memory().total / (1024 ** 3), 2),
            "python_version": platform.python_version(),
        }

    def measure_cpu_ram(self, duration_s: float = 1.0) -> Tuple[float, float]:
        """
        Measure CPU and RAM usage over a given duration.

        Args:
            duration_s: Measurement duration in seconds

        Returns:
            Tuple (cpu_percent, ram_mb)
        """
        # Sample CPU usage over the interval (blocking call)
        cpu_percent = self.process.cpu_percent(interval=duration_s)
        # Resident set size, converted to MB
        ram_mb = self.process.memory_info().rss / (1024 * 1024)
        return cpu_percent, ram_mb
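    # --- Hedged helper sketch (not in the original script): derive page counts so
    # callers don't have to pass them in. Assumes the `pypdf` package is installed;
    # if it is missing or the file is unreadable, report "unknown" (None) instead
    # of crashing the benchmark run.
    @staticmethod
    def count_pages(pdf_path: Path) -> Optional[int]:
        """Return the number of pages in a PDF, or None if it cannot be read."""
        try:
            from pypdf import PdfReader  # assumption: pypdf is available
            return len(PdfReader(str(pdf_path)).pages)
        except Exception:
            return None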
    def benchmark_document(
        self,
        pdf_path: Path,
        anonymize_func,
        page_count: Optional[int] = None
    ) -> BenchmarkResult:
        """
        Benchmark a single document.

        Args:
            pdf_path: Path to the PDF
            anonymize_func: Anonymization function to benchmark
            page_count: Number of pages (optional)

        Returns:
            Benchmark result
        """
        # Prime psutil's CPU counter: the first cpu_percent() call always
        # returns 0.0, so discard it and let the post-run call report usage
        # averaged over the processing interval.
        self.process.cpu_percent()
        start_ram = self.process.memory_info().rss / (1024 * 1024)
        start_time = time.perf_counter()  # monotonic clock for durations

        # Run the anonymization
        try:
            audit_path = anonymize_func(pdf_path)
        except Exception as e:
            print(f"✗ Error while anonymizing {pdf_path.name}: {e}")
            return BenchmarkResult(pdf_path=str(pdf_path))

        # Measure after processing
        processing_time = time.perf_counter() - start_time
        cpu_usage = self.process.cpu_percent()  # average over the run
        end_ram = self.process.memory_info().rss / (1024 * 1024)
        ram_usage = end_ram - start_ram  # RAM growth during the run

        # Count detected PII: one non-empty line per entry in the audit file
        pii_count = 0
        if audit_path and audit_path.exists():
            try:
                with open(audit_path, 'r', encoding='utf-8') as f:
                    pii_count = sum(1 for line in f if line.strip())
            except Exception:
                pass

        # Compute time per page
        time_per_page = processing_time / page_count if page_count and page_count > 0 else 0.0

        return BenchmarkResult(
            pdf_path=str(pdf_path),
            processing_time_s=processing_time,
            time_per_page_s=time_per_page,
            cpu_usage_percent=cpu_usage,
            ram_usage_mb=ram_usage,
            pii_detected=pii_count
        )

    def run(
        self,
        pdf_list: List[Path],
        anonymize_func,
        page_counts: Optional[List[int]] = None
    ) -> List[BenchmarkResult]:
        """
        Run the benchmark on a list of documents.

        Args:
            pdf_list: List of PDFs to benchmark
            anonymize_func: Anonymization function
            page_counts: List of page counts (optional)

        Returns:
            List of results
        """
        results = []
        if page_counts is None:
            page_counts = [None] * len(pdf_list)

        for i, (pdf_path, page_count) in enumerate(zip(pdf_list, page_counts), 1):
            print(f"[{i}/{len(pdf_list)}] Benchmark: {pdf_path.name}")
            result = self.benchmark_document(pdf_path, anonymize_func, page_count)
            results.append(result)
            print(f"  Time: {result.processing_time_s:.2f}s "
                  f"CPU: {result.cpu_usage_percent:.1f}% "
                  f"RAM: {result.ram_usage_mb:.1f}MB "
                  f"PII: {result.pii_detected}")

        return results

    def calculate_summary(self, results: List[BenchmarkResult]) -> Dict:
        """
        Compute summary statistics.

        Args:
            results: List of results

        Returns:
            Dictionary of statistics
        """
        if not results:
            return {}

        processing_times = [r.processing_time_s for r in results]
        cpu_usages = [r.cpu_usage_percent for r in results]
        ram_usages = [r.ram_usage_mb for r in results]
        pii_counts = [r.pii_detected for r in results]

        return {
            "documents_count": len(results),
            "avg_time_per_doc": round(sum(processing_times) / len(processing_times), 2),
            "min_time": round(min(processing_times), 2),
            "max_time": round(max(processing_times), 2),
            "avg_cpu_percent": round(sum(cpu_usages) / len(cpu_usages), 2),
            "avg_ram_mb": round(sum(ram_usages) / len(ram_usages), 2),
            "total_pii_detected": sum(pii_counts),
            "avg_pii_per_doc": round(sum(pii_counts) / len(pii_counts), 2),
        }
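    # --- Optional extension sketch (not in the original script): robust latency
    # stats. Averages hide stragglers; median and p95 make slow outliers visible.
    # Pure standard library; `statistics.quantiles` needs at least two samples.
    def calculate_latency_percentiles(self, results: List[BenchmarkResult]) -> Dict:
        """Return median and p95 processing time, or {} if too few samples."""
        import statistics

        times = [r.processing_time_s for r in results]
        if len(times) < 2:
            return {}
        cuts = statistics.quantiles(times, n=20)  # 19 cut points: 5%, 10%, ..., 95%
        return {
            "median_time_s": round(statistics.median(times), 2),
            "p95_time_s": round(cuts[18], 2),  # index 18 is the 95th percentile
        }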
    def generate_report(self, results: List[BenchmarkResult]) -> str:
        """
        Generate a plain-text report.

        Args:
            results: List of results

        Returns:
            Text report
        """
        if not results:
            return "No results to display."

        summary = self.calculate_summary(results)
        system_info = self.get_system_info()

        lines = []
        lines.append("=" * 80)
        lines.append("BENCHMARK REPORT - ANONYMIZATION PERFORMANCE")
        lines.append("=" * 80)
        lines.append("")

        # System information
        lines.append("SYSTEM:")
        lines.append(f"  OS: {system_info['os']} {system_info['os_version']}")
        lines.append(f"  CPU: {system_info['cpu']}")
        lines.append(f"  Cores: {system_info['cpu_count']} physical / "
                     f"{system_info['cpu_count_logical']} logical")
        lines.append(f"  RAM: {system_info['ram_gb']} GB")
        lines.append(f"  Python: {system_info['python_version']}")
        lines.append("")

        # Summary
        lines.append("SUMMARY:")
        lines.append(f"  Documents: {summary['documents_count']}")
        lines.append(f"  Average time: {summary['avg_time_per_doc']}s")
        lines.append(f"  Min/max time: {summary['min_time']}s / {summary['max_time']}s")
        lines.append(f"  Average CPU: {summary['avg_cpu_percent']}%")
        lines.append(f"  Average RAM: {summary['avg_ram_mb']} MB")
        lines.append(f"  PII detected: {summary['total_pii_detected']} "
                     f"(avg: {summary['avg_pii_per_doc']})")
        lines.append("")

        # Per-document details
        lines.append("PER-DOCUMENT DETAILS:")
        lines.append("")
        for result in results:
            pdf_name = Path(result.pdf_path).name
            lines.append(f"  {pdf_name}")
            lines.append(f"    Time: {result.processing_time_s:.2f}s "
                         f"CPU: {result.cpu_usage_percent:.1f}% "
                         f"RAM: {result.ram_usage_mb:.1f}MB "
                         f"PII: {result.pii_detected}")
            lines.append("")

        lines.append("=" * 80)
        return "\n".join(lines)

    def export_json(self, results: List[BenchmarkResult], output_path: Path):
        """
        Export the results as JSON.

        Args:
            results: List of results
            output_path: Output file path
        """
        data = {
            "benchmark_date": datetime.now().isoformat(),
            "system_info": self.get_system_info(),
            "results": [r.to_dict() for r in results],
            "summary": self.calculate_summary(results),
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"✓ Results exported: {output_path}")

    def export_csv(self, results: List[BenchmarkResult], output_path: Path):
        """
        Export the results as CSV.

        Args:
            results: List of results
            output_path: Output file path
        """
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            # Header row
            writer.writerow([
                "pdf_path", "processing_time_s", "time_per_page_s",
                "cpu_usage_percent", "ram_usage_mb", "pii_detected"
            ])
            # Data rows
            for result in results:
                writer.writerow([
                    result.pdf_path,
                    result.processing_time_s,
                    result.time_per_page_s,
                    result.cpu_usage_percent,
                    result.ram_usage_mb,
                    result.pii_detected
                ])
        print(f"✓ Results exported: {output_path}")


if __name__ == "__main__":
    # Basic smoke test
    benchmark = Benchmark(Path("tests/ground_truth/pdfs"))

    # Print system information
    system_info = benchmark.get_system_info()
    print("System information:")
    for key, value in system_info.items():
        print(f"  {key}: {value}")
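
    # --- Hedged usage sketch (not in the original script): run the benchmark end
    # to end with a stand-in anonymizer, since the real anonymization function
    # lives elsewhere in the project. `dummy_anonymize` is a hypothetical
    # placeholder; swap in the real function to benchmark actual processing.
    def dummy_anonymize(pdf_path: Path) -> Optional[Path]:
        """Pretend to anonymize: burn a little time, produce no audit file."""
        time.sleep(0.1)  # stand-in for real processing work
        return None  # no audit file, so pii_detected stays 0

    sample_pdfs = sorted(benchmark.test_data_dir.glob("*.pdf"))
    if sample_pdfs:
        results = benchmark.run(sample_pdfs, dummy_anonymize)
        print(benchmark.generate_report(results))
        benchmark.export_json(results, Path("benchmark_results.json"))
        benchmark.export_csv(results, Path("benchmark_results.csv"))
    else:
        print("No test PDFs found; skipping the demo run.")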