#!/usr/bin/env python3
"""
Evaluate anonymization quality on the annotated dataset.

Compares the annotations (ground truth) against the system's detections
to compute Precision, Recall, and F1-score.
"""
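# NOTE (usage sketch, assuming the layout implied by the paths below): run this
# script from the repository root so that the relative tests/ground_truth/...
# paths resolve; the sys.path tweak after the imports makes the local
# `evaluation` package importable without installing it.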
import sys
import json
from datetime import date
from pathlib import Path
from collections import defaultdict

# Make the directory containing the `evaluation` package importable
sys.path.insert(0, str(Path(__file__).parent.parent))
from evaluation.quality_evaluator import QualityEvaluator


def run_quality_evaluation():
    """Run the quality evaluation on all annotated documents."""

    # Directories
    annotations_dir = Path("tests/ground_truth/annotations")
    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    pdfs_dir = Path("tests/ground_truth/pdfs")
    results_dir = Path("tests/ground_truth/quality_evaluation")
    results_dir.mkdir(parents=True, exist_ok=True)

    # List the annotation files (the dataset statistics file is not a document)
    annotation_files = sorted(annotations_dir.glob("*.json"))
    annotation_files = [f for f in annotation_files if f.name != "dataset_statistics.json"]

    if not annotation_files:
        print(f"✗ No annotations found in {annotations_dir}")
        return 1
print("="*80)
|
|
print("ÉVALUATION DE LA QUALITÉ D'ANONYMISATION")
|
|
print("="*80)
|
|
print(f"\n📁 Annotations: {annotations_dir}")
|
|
print(f"📁 Détections: {baseline_dir}")
|
|
print(f"📁 Résultats: {results_dir}")
|
|
print(f"\n📄 Documents à évaluer: {len(annotation_files)}")
|
|
|
|
# Créer l'évaluateur
|
|
evaluator = QualityEvaluator(annotations_dir)
|
|
|
|
# Statistiques globales
|
|
all_results = []
|
|
total_tp = 0
|
|
total_fp = 0
|
|
total_fn = 0
|
|
by_type_stats = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
|
|
|
|

    # Evaluate each document
    for i, annotation_file in enumerate(annotation_files, 1):
        pdf_name = annotation_file.stem

        print(f"\n[{i}/{len(annotation_files)}] {pdf_name}")

        # Locate the PDF
        pdf_path = pdfs_dir / f"{pdf_name}.pdf"
        if not pdf_path.exists():
            print(f"  ⚠️ PDF not found: {pdf_path.name}")
            continue

        # Locate the audit file
        audit_path = baseline_dir / f"{pdf_name}.audit.jsonl"
        if not audit_path.exists():
            # Try the suffixed variants
            for suffix in ['.redacted_raster', '.redacted_vector']:
                audit_path_alt = baseline_dir / f"{pdf_name}{suffix}.audit.jsonl"
                if audit_path_alt.exists():
                    audit_path = audit_path_alt
                    break

        if not audit_path.exists():
            print(f"  ⚠️ Audit file not found: {audit_path.name}")
            continue

        # Evaluate
        result = evaluator.evaluate(pdf_path, audit_path)

        if result is None:
            print("  ⚠️ Evaluation failed")
            continue

        all_results.append({
            "pdf": pdf_name,
            "result": result
        })

        # Display per-document metrics
        print(f"  Precision: {result.precision:.2%} "
              f"Recall: {result.recall:.2%} "
              f"F1: {result.f1_score:.2%}")
        print(f"  TP: {result.true_positives} "
              f"FP: {result.false_positives} "
              f"FN: {result.false_negatives}")

        # Accumulate the global counts
        total_tp += result.true_positives
        total_fp += result.false_positives
        total_fn += result.false_negatives

        # ... and the per-type counts
        for pii_type, stats in result.by_type.items():
            by_type_stats[pii_type]["tp"] += stats["tp"]
            by_type_stats[pii_type]["fp"] += stats["fp"]
            by_type_stats[pii_type]["fn"] += stats["fn"]

    if not all_results:
        print("\n✗ No documents were evaluated successfully")
        return 1

    # Compute the global metrics
    print("\n" + "="*80)
    print("GLOBAL RESULTS")
    print("="*80)

    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
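
    # Quick sanity check of the formulas above with made-up counts (purely
    # illustrative, not real results): TP=90, FP=10, FN=30 gives
    # precision = 90/100 = 0.90, recall = 90/120 = 0.75 and
    # F1 = 2 * (0.90 * 0.75) / (0.90 + 0.75) = 1.35 / 1.65 ≈ 0.818,
    # i.e. F1 is the harmonic mean of precision and recall.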
print(f"\n📊 Métriques:")
|
|
print(f" - Précision: {precision:.2%}")
|
|
print(f" - Rappel: {recall:.2%}")
|
|
print(f" - F1-Score: {f1:.2%}")
|
|
|
|
print(f"\n📊 Détails:")
|
|
print(f" - Vrais positifs (TP): {total_tp}")
|
|
print(f" - Faux positifs (FP): {total_fp}")
|
|
print(f" - Faux négatifs (FN): {total_fn}")
|
|
|
|
# Métriques par type
|
|
print(f"\n📊 Métriques par type de PII:")
|
|
for pii_type in sorted(by_type_stats.keys()):
|
|
stats = by_type_stats[pii_type]
|
|
tp = stats["tp"]
|
|
fp = stats["fp"]
|
|
fn = stats["fn"]
|
|
|
|
prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
|
|
rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
|
|
f1_type = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0.0
|
|
|
|
print(f" - {pii_type}:")
|
|
print(f" Précision: {prec:.2%} Rappel: {rec:.2%} F1: {f1_type:.2%}")
|
|
print(f" TP: {tp} FP: {fp} FN: {fn}")
|
|
|
|

    # Check the results against the targets
    print("\n" + "="*80)
    print("TARGET VALIDATION")
    print("="*80)

    target_recall = 0.995    # ≥ 99.5%
    target_precision = 0.97  # ≥ 97%
    target_f1 = 0.98         # ≥ 0.98

    print("\n🎯 Targets:")
    print(f"  - Recall: ≥ {target_recall:.1%}")
    print(f"  - Precision: ≥ {target_precision:.1%}")
    print(f"  - F1-score: ≥ {target_f1:.2%}")

    print("\n📊 Results:")

    if recall >= target_recall:
        print(f"  ✅ Recall target met: {recall:.2%} ≥ {target_recall:.1%}")
    else:
        print(f"  ⚠️ Recall target missed: {recall:.2%} < {target_recall:.1%}")
        print(f"     Gap: {(target_recall - recall)*100:.2f} points")

    if precision >= target_precision:
        print(f"  ✅ Precision target met: {precision:.2%} ≥ {target_precision:.1%}")
    else:
        print(f"  ⚠️ Precision target missed: {precision:.2%} < {target_precision:.1%}")
        print(f"     Gap: {(target_precision - precision)*100:.2f} points")

    if f1 >= target_f1:
        print(f"  ✅ F1-score target met: {f1:.2%} ≥ {target_f1:.2%}")
    else:
        print(f"  ⚠️ F1-score target missed: {f1:.2%} < {target_f1:.2%}")
        print(f"     Gap: {(target_f1 - f1)*100:.2f} points")

    # Save the results.
    # Per-type metrics are precomputed here so that a type with tp == 0 but
    # fp/fn > 0 cannot trigger a division by zero in the F1 computation.
    by_type_output = {}
    for pii_type, stats in by_type_stats.items():
        tp, fp, fn = stats["tp"], stats["fp"], stats["fn"]
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1_type = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0.0
        by_type_output[pii_type] = {
            "precision": round(prec, 4),
            "recall": round(rec, 4),
            "f1_score": round(f1_type, 4),
            "true_positives": tp,
            "false_positives": fp,
            "false_negatives": fn
        }

    output_data = {
        "evaluation_date": date.today().isoformat(),
        "total_documents": len(all_results),
        "global_metrics": {
            "precision": round(precision, 4),
            "recall": round(recall, 4),
            "f1_score": round(f1, 4),
            "true_positives": total_tp,
            "false_positives": total_fp,
            "false_negatives": total_fn
        },
        "by_type": by_type_output,
        "per_document": [
            {
                "pdf": r["pdf"],
                "precision": round(r["result"].precision, 4),
                "recall": round(r["result"].recall, 4),
                "f1_score": round(r["result"].f1_score, 4),
                "true_positives": r["result"].true_positives,
                "false_positives": r["result"].false_positives,
                "false_negatives": r["result"].false_negatives
            }
            for r in all_results
        ]
    }
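
    # For reference, the report written below therefore has the top-level shape:
    #   {"evaluation_date", "total_documents", "global_metrics",
    #    "by_type": {<PII type>: {...}}, "per_document": [{...}, ...]}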

    json_file = results_dir / "baseline_quality_evaluation.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Results saved to: {json_file}")

    print("\n" + "="*80)

    return 0


if __name__ == "__main__":
    sys.exit(run_quality_evaluation())