feat: Phase 1 - Quality evaluation system

- Selected and copied 27 representative documents (10 simple, 12 medium, 5 complex)
- Full CLI annotation tool (tools/annotation_tool.py)
- Detailed annotation guide (docs/annotation_guide.md)
- Quality evaluator (evaluation/quality_evaluator.py)
  * Computes precision, recall and F1-score (illustrated in the sketch after this list)
  * Identifies false positives/negatives
  * Per-PII-type metrics
  * JSON export and text reports
- Leak scanner (evaluation/leak_scanner.py)
  * Detects residual PII (CRITICAL)
  * Detects newly introduced PII (HIGH)
  * Scans PDF metadata (MEDIUM)
- Performance benchmark (evaluation/benchmark.py)
  * Measures processing time
  * Measures CPU/RAM usage
  * JSON/CSV export
- Full unit tests for all components
- Complete documentation for the evaluation module
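
For reference, these are the metric definitions the quality evaluator is built
around (a minimal sketch in plain Python; the counts and variable names are
illustrative only, not the module's actual API):

    tp, fp, fn = 42, 3, 5                                 # true positives, false positives, false negatives
    precision = tp / (tp + fp)                            # ~0.933: share of redactions that were real PII
    recall = tp / (tp + fn)                               # ~0.894: share of real PII that got redacted
    f1 = 2 * precision * recall / (precision + recall)    # ~0.913: harmonic mean of the two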

Completed tasks:
- 1.1.1 Selection of 27 documents (instead of 30)
- 1.1.2 CLI annotation tool
- 1.2.1 Quality evaluator
- 1.2.2 Leak scanner
- 1.2.3 Performance benchmark

Next steps:
- 1.1.3 Annotation of the 27 documents (manual)
- 1.1.4 Enrichment of the medical stopword list
- 1.3 Baseline measurement
2026-03-02 10:07:41 +01:00
parent 0067738df6
commit 340348b820
86 changed files with 35587 additions and 40 deletions

evaluation/benchmark.py

@@ -0,0 +1,339 @@
#!/usr/bin/env python3
"""
Performance benchmark for the anonymization system.
Measures processing times, CPU/RAM usage, and quality metrics.
"""
import json
import platform
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import psutil


@dataclass
class BenchmarkResult:
    """Benchmark result for a single document."""
    pdf_path: str
    processing_time_s: float = 0.0
    time_per_page_s: float = 0.0
    cpu_usage_percent: float = 0.0
    ram_usage_mb: float = 0.0
    pii_detected: int = 0
    quality_metrics: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Convert to a dictionary."""
        return {
            "pdf_path": self.pdf_path,
            "processing_time_s": round(self.processing_time_s, 2),
            "time_per_page_s": round(self.time_per_page_s, 2),
            "cpu_usage_percent": round(self.cpu_usage_percent, 2),
            "ram_usage_mb": round(self.ram_usage_mb, 2),
            "pii_detected": self.pii_detected,
            "quality_metrics": self.quality_metrics
        }


class Benchmark:
    """Performance benchmark."""

    def __init__(self, test_data_dir: Path):
        """
        Initialize the benchmark.

        Args:
            test_data_dir: Directory containing the test data
        """
        self.test_data_dir = Path(test_data_dir)
        self.process = psutil.Process()

    def get_system_info(self) -> Dict:
        """
        Collect system information.

        Returns:
            Dictionary of system information
        """
        return {
            "os": platform.system(),
            "os_version": platform.version(),
            "cpu": platform.processor(),
            "cpu_count": psutil.cpu_count(logical=False),
            "cpu_count_logical": psutil.cpu_count(logical=True),
            "ram_gb": round(psutil.virtual_memory().total / (1024**3), 2),
            "python_version": platform.python_version()
        }

    def measure_cpu_ram(self, duration_s: float = 1.0) -> tuple:
        """
        Measure CPU and RAM usage over a given duration.

        Args:
            duration_s: Measurement duration in seconds

        Returns:
            Tuple (cpu_percent, ram_mb)
        """
        # Measure CPU usage over the given interval
        cpu_percent = self.process.cpu_percent(interval=duration_s)
        # Measure RAM (resident set size)
        ram_mb = self.process.memory_info().rss / (1024 * 1024)
        return cpu_percent, ram_mb

    def benchmark_document(
        self,
        pdf_path: Path,
        anonymize_func,
        page_count: Optional[int] = None
    ) -> BenchmarkResult:
        """
        Benchmark a single document.

        Args:
            pdf_path: Path to the PDF
            anonymize_func: Anonymization function to benchmark
            page_count: Number of pages (optional)

        Returns:
            Benchmark result
        """
        # Measure processing time
        start_time = time.time()
        # Note: cpu_percent() without an interval reports usage since the
        # previous call, so the very first reading may be 0.0.
        start_cpu = self.process.cpu_percent()
        start_ram = self.process.memory_info().rss / (1024 * 1024)

        # Run the anonymization
        try:
            audit_path = anonymize_func(pdf_path)
        except Exception as e:
            print(f"✗ Error while anonymizing {pdf_path.name}: {e}")
            return BenchmarkResult(pdf_path=str(pdf_path))

        # Measure after processing
        end_time = time.time()
        end_cpu = self.process.cpu_percent()
        end_ram = self.process.memory_info().rss / (1024 * 1024)

        processing_time = end_time - start_time
        cpu_usage = (start_cpu + end_cpu) / 2
        ram_usage = end_ram - start_ram

        # Count detected PII (one entry per non-empty line of the audit file)
        pii_count = 0
        if audit_path and audit_path.exists():
            try:
                with open(audit_path, 'r', encoding='utf-8') as f:
                    pii_count = sum(1 for line in f if line.strip())
            except Exception:
                pass

        # Compute time per page
        time_per_page = processing_time / page_count if page_count and page_count > 0 else 0.0

        # Build the result
        result = BenchmarkResult(
            pdf_path=str(pdf_path),
            processing_time_s=processing_time,
            time_per_page_s=time_per_page,
            cpu_usage_percent=cpu_usage,
            ram_usage_mb=ram_usage,
            pii_detected=pii_count
        )
        return result

    def run(
        self,
        pdf_list: List[Path],
        anonymize_func,
        page_counts: Optional[List[int]] = None
    ) -> List[BenchmarkResult]:
        """
        Run the benchmark over a list of documents.

        Args:
            pdf_list: List of PDFs to benchmark
            anonymize_func: Anonymization function
            page_counts: List of page counts (optional)

        Returns:
            List of results
        """
        results = []
        if page_counts is None:
            page_counts = [None] * len(pdf_list)

        for i, (pdf_path, page_count) in enumerate(zip(pdf_list, page_counts), 1):
            print(f"[{i}/{len(pdf_list)}] Benchmark: {pdf_path.name}")
            result = self.benchmark_document(pdf_path, anonymize_func, page_count)
            results.append(result)
            print(f"  Time: {result.processing_time_s:.2f}s "
                  f"CPU: {result.cpu_usage_percent:.1f}% "
                  f"RAM: {result.ram_usage_mb:.1f}MB "
                  f"PII: {result.pii_detected}")

        return results

    def calculate_summary(self, results: List[BenchmarkResult]) -> Dict:
        """
        Compute summary statistics.

        Args:
            results: List of results

        Returns:
            Dictionary of statistics
        """
        if not results:
            return {}

        processing_times = [r.processing_time_s for r in results]
        cpu_usages = [r.cpu_usage_percent for r in results]
        ram_usages = [r.ram_usage_mb for r in results]
        pii_counts = [r.pii_detected for r in results]

        return {
            "documents_count": len(results),
            "avg_time_per_doc": round(sum(processing_times) / len(processing_times), 2),
            "min_time": round(min(processing_times), 2),
            "max_time": round(max(processing_times), 2),
            "avg_cpu_percent": round(sum(cpu_usages) / len(cpu_usages), 2),
            "avg_ram_mb": round(sum(ram_usages) / len(ram_usages), 2),
            "total_pii_detected": sum(pii_counts),
            "avg_pii_per_doc": round(sum(pii_counts) / len(pii_counts), 2)
        }

    def generate_report(self, results: List[BenchmarkResult]) -> str:
        """
        Generate a text report.

        Args:
            results: List of results

        Returns:
            Text report
        """
        if not results:
            return "No results to display."

        summary = self.calculate_summary(results)
        system_info = self.get_system_info()

        lines = []
        lines.append("=" * 80)
        lines.append("BENCHMARK REPORT - ANONYMIZATION PERFORMANCE")
        lines.append("=" * 80)
        lines.append("")

        # System information
        lines.append("SYSTEM:")
        lines.append(f"  OS: {system_info['os']} {system_info['os_version']}")
        lines.append(f"  CPU: {system_info['cpu']}")
        lines.append(f"  Cores: {system_info['cpu_count']} physical / {system_info['cpu_count_logical']} logical")
        lines.append(f"  RAM: {system_info['ram_gb']} GB")
        lines.append(f"  Python: {system_info['python_version']}")
        lines.append("")

        # Summary
        lines.append("SUMMARY:")
        lines.append(f"  Documents: {summary['documents_count']}")
        lines.append(f"  Average time: {summary['avg_time_per_doc']}s")
        lines.append(f"  Min/max time: {summary['min_time']}s / {summary['max_time']}s")
        lines.append(f"  Average CPU: {summary['avg_cpu_percent']}%")
        lines.append(f"  Average RAM: {summary['avg_ram_mb']} MB")
        lines.append(f"  PII detected: {summary['total_pii_detected']} (avg: {summary['avg_pii_per_doc']})")
        lines.append("")

        # Per-document details
        lines.append("PER-DOCUMENT DETAILS:")
        lines.append("")
        for result in results:
            pdf_name = Path(result.pdf_path).name
            lines.append(f"  {pdf_name}")
            lines.append(f"    Time: {result.processing_time_s:.2f}s "
                         f"CPU: {result.cpu_usage_percent:.1f}% "
                         f"RAM: {result.ram_usage_mb:.1f}MB "
                         f"PII: {result.pii_detected}")
            lines.append("")

        lines.append("=" * 80)
        return "\n".join(lines)

    def export_json(self, results: List[BenchmarkResult], output_path: Path):
        """
        Export the results as JSON.

        Args:
            results: List of results
            output_path: Output file path
        """
        data = {
            "benchmark_date": datetime.now().isoformat(),
            "system_info": self.get_system_info(),
            "results": [r.to_dict() for r in results],
            "summary": self.calculate_summary(results)
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"✓ Results exported: {output_path}")

    def export_csv(self, results: List[BenchmarkResult], output_path: Path):
        """
        Export the results as CSV.

        Args:
            results: List of results
            output_path: Output file path
        """
        import csv

        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            # Header row
            writer.writerow([
                "pdf_path",
                "processing_time_s",
                "time_per_page_s",
                "cpu_usage_percent",
                "ram_usage_mb",
                "pii_detected"
            ])
            # Data rows
            for result in results:
                writer.writerow([
                    result.pdf_path,
                    result.processing_time_s,
                    result.time_per_page_s,
                    result.cpu_usage_percent,
                    result.ram_usage_mb,
                    result.pii_detected
                ])
        print(f"✓ Results exported: {output_path}")


if __name__ == "__main__":
    # Basic smoke test
    benchmark = Benchmark(Path("tests/ground_truth/pdfs"))

    # Print system information
    system_info = benchmark.get_system_info()
    print("System information:")
    for key, value in system_info.items():
        print(f"  {key}: {value}")