feat: Phase 1 - Système d'évaluation de la qualité
- Sélection et copie de 27 documents représentatifs (10 simples, 12 moyens, 5 complexes)
- Outil d'annotation CLI complet (tools/annotation_tool.py)
- Guide d'annotation détaillé (docs/annotation_guide.md)
- Évaluateur de qualité (evaluation/quality_evaluator.py)
  * Calcul Précision, Rappel, F1-Score
  * Identification faux positifs/négatifs
  * Métriques par type de PII
  * Export JSON et rapports texte
- Scanner de fuite (evaluation/leak_scanner.py)
  * Détection PII résiduels (CRITIQUE)
  * Détection nouveaux PII (HAUTE)
  * Scan métadonnées PDF (MOYENNE)
- Benchmark de performance (evaluation/benchmark.py)
  * Mesure temps de traitement
  * Mesure CPU/RAM
  * Export JSON/CSV
- Tests unitaires complets pour tous les composants
- Documentation complète du module d'évaluation

Tâches complétées:
- 1.1.1 Sélection de 27 documents (au lieu de 30)
- 1.1.2 Outil d'annotation CLI
- 1.2.1 Évaluateur de qualité
- 1.2.2 Scanner de fuite
- 1.2.3 Benchmark de performance

Prochaines étapes:
- 1.1.3 Annotation des 27 documents (manuel)
- 1.1.4 Enrichissement stopwords médicaux
- 1.3 Mesure de la baseline
This commit is contained in:
110
tests/unit/test_leak_scanner.py
Normal file
110
tests/unit/test_leak_scanner.py
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests unitaires pour le scanner de fuite.
|
||||
"""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from evaluation.leak_scanner import LeakScanner, LeakReport
|
||||
|
||||
|
||||
class TestLeakScanner:
    """Unit tests for ``LeakScanner.scan_text`` and the ``LeakReport`` container."""

    @staticmethod
    def _first_with_pii_type(found, wanted):
        """Return the first entry of *found* whose "pii_type" equals *wanted*, else None."""
        return next(
            (entry for entry in found if entry["pii_type"] == wanted),
            None,
        )

    def test_scan_text_no_leak(self):
        """Text in which the original values do not appear yields no leak."""
        known_pii = [
            {"kind": "NOM", "original": "DUPONT"},
            {"kind": "DATE", "original": "15/01/2024"},
        ]
        redacted = "Le patient a été examiné par le Dr. [NOM] le [DATE]."

        found = LeakScanner().scan_text(redacted, known_pii)

        assert len(found) == 0

    def test_scan_text_original_pii_present(self):
        """Each original value still present in the text is reported as a leak."""
        known_pii = [
            {"kind": "NOM", "original": "DUPONT"},
            {"kind": "DATE", "original": "15/01/2024"},
        ]
        raw = "Le patient DUPONT a été examiné le 15/01/2024."

        found = LeakScanner().scan_text(raw, known_pii)

        # One leak per surviving original value.
        assert len(found) == 2
        for entry in found:
            assert entry["severity"] == "CRITIQUE"
        for entry in found:
            assert entry["type"] == "original_pii_present"

    def test_scan_text_new_pii_detected(self):
        """PII that was never listed as original is still detected in the text."""
        contact_line = "Contact: jean.dupont@example.com ou 01 23 45 67 89"

        found = LeakScanner().scan_text(contact_line, [])

        # Both the email address and the phone number should be reported.
        assert len(found) >= 2

        email_entry = self._first_with_pii_type(found, "EMAIL")
        assert email_entry is not None
        assert email_entry["severity"] == "HAUTE"

        phone_entry = self._first_with_pii_type(found, "TEL")
        assert phone_entry is not None
        assert phone_entry["severity"] == "HAUTE"

    def test_leak_report_is_safe(self):
        """A report built as safe exposes ``is_safe`` true and a zero leak count."""
        safe_report = LeakReport(
            is_safe=True,
            leak_count=0,
            leaks=[],
            severity_counts={},
        )

        assert safe_report.is_safe
        assert safe_report.leak_count == 0

    def test_leak_report_not_safe(self):
        """A report built as unsafe exposes its leak count and per-severity counts."""
        recorded = [
            {"severity": "CRITIQUE", "type": "original_pii_present"},
            {"severity": "HAUTE", "type": "new_pii_detected"},
        ]
        per_severity = {"CRITIQUE": 1, "HAUTE": 1}

        unsafe_report = LeakReport(
            is_safe=False,
            leak_count=2,
            leaks=recorded,
            severity_counts=per_severity,
        )

        assert not unsafe_report.is_safe
        assert unsafe_report.leak_count == 2
        assert unsafe_report.severity_counts["CRITIQUE"] == 1
        assert unsafe_report.severity_counts["HAUTE"] == 1

    def test_leak_report_to_dict(self):
        """``to_dict`` returns a mapping carrying the report's fields."""
        unsafe_report = LeakReport(
            is_safe=False,
            leak_count=1,
            leaks=[{"severity": "CRITIQUE"}],
            severity_counts={"CRITIQUE": 1},
        )

        as_dict = unsafe_report.to_dict()

        assert as_dict["is_safe"] is False
        assert as_dict["leak_count"] == 1
        assert len(as_dict["leaks"]) == 1
        assert as_dict["severity_counts"]["CRITIQUE"] == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests verbosely when the file is executed directly.
    # pytest.main() returns the run's exit code instead of exiting itself, so
    # pass it on to the interpreter; otherwise a failing run would exit with 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
Reference in New Issue
Block a user