#!/usr/bin/env python3
"""
Evaluate anonymization quality on the annotated dataset.

Compares the annotations (ground truth) against the system's detections
to compute Precision, Recall, and F1-score.
"""
import sys
import json
from datetime import date
from pathlib import Path
from collections import defaultdict

# Make the project root importable when running this script directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

from evaluation.quality_evaluator import QualityEvaluator


def compute_prf(tp, fp, fn):
    """Return (precision, recall, f1) with zero-safe denominators."""
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1


def run_quality_evaluation():
    """Run the quality evaluation over every annotated document."""
    # Directories
    annotations_dir = Path("tests/ground_truth/annotations")
    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    pdfs_dir = Path("tests/ground_truth/pdfs")
    results_dir = Path("tests/ground_truth/quality_evaluation")
    results_dir.mkdir(parents=True, exist_ok=True)

    # List the annotation files (the statistics file is not an annotation)
    annotation_files = sorted(annotations_dir.glob("*.json"))
    annotation_files = [f for f in annotation_files if f.name != "dataset_statistics.json"]

    if not annotation_files:
        print(f"✗ No annotations found in {annotations_dir}")
        return 1

    print("=" * 80)
    print("ANONYMIZATION QUALITY EVALUATION")
    print("=" * 80)
    print(f"\n📁 Annotations: {annotations_dir}")
    print(f"📁 Detections: {baseline_dir}")
    print(f"📁 Results: {results_dir}")
    print(f"\n📄 Documents to evaluate: {len(annotation_files)}")

    # Create the evaluator
    evaluator = QualityEvaluator(annotations_dir)

    # Global statistics
    all_results = []
    total_tp = 0
    total_fp = 0
    total_fn = 0
    by_type_stats = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})

    # Evaluate each document
    for i, annotation_file in enumerate(annotation_files, 1):
        pdf_name = annotation_file.stem
        print(f"\n[{i}/{len(annotation_files)}] {pdf_name}")

        # Locate the source PDF
        pdf_path = pdfs_dir / f"{pdf_name}.pdf"
        if not pdf_path.exists():
            print(f"  ⚠️ PDF not found: {pdf_path.name}")
            continue

        # Locate the audit file, falling back to the redaction suffixes
        audit_path = baseline_dir / f"{pdf_name}.audit.jsonl"
        if not audit_path.exists():
            for suffix in ('.redacted_raster', '.redacted_vector'):
                audit_path_alt = baseline_dir / f"{pdf_name}{suffix}.audit.jsonl"
                if audit_path_alt.exists():
                    audit_path = audit_path_alt
                    break

        if not audit_path.exists():
            print(f"  ⚠️ Audit file not found: {audit_path.name}")
            continue

        # Evaluate
        result = evaluator.evaluate(pdf_path, audit_path)
        if result is None:
            print("  ⚠️ Evaluation failed")
            continue

        all_results.append({"pdf": pdf_name, "result": result})

        print(f"  Precision: {result.precision:.2%}  "
              f"Recall: {result.recall:.2%}  "
              f"F1: {result.f1_score:.2%}")
        print(f"  TP: {result.true_positives}  "
              f"FP: {result.false_positives}  "
              f"FN: {result.false_negatives}")

        # Accumulate global counts
        total_tp += result.true_positives
        total_fp += result.false_positives
        total_fn += result.false_negatives

        # Accumulate per-type counts
        for pii_type, stats in result.by_type.items():
            by_type_stats[pii_type]["tp"] += stats["tp"]
            by_type_stats[pii_type]["fp"] += stats["fp"]
            by_type_stats[pii_type]["fn"] += stats["fn"]

    if not all_results:
        print("\n✗ No document was evaluated successfully")
        return 1

    # Compute the global (micro-averaged) metrics
    print("\n" + "=" * 80)
    print("GLOBAL RESULTS")
    print("=" * 80)

    precision, recall, f1 = compute_prf(total_tp, total_fp, total_fn)

    print("\n📊 Metrics:")
    print(f"   - Precision: {precision:.2%}")
    print(f"   - Recall: {recall:.2%}")
    print(f"   - F1-score: {f1:.2%}")
    print("\n📊 Details:")
    print(f"   - True positives (TP): {total_tp}")
    print(f"   - False positives (FP): {total_fp}")
    print(f"   - False negatives (FN): {total_fn}")

    # Per-type metrics
    print("\n📊 Metrics per PII type:")
    for pii_type in sorted(by_type_stats.keys()):
        stats = by_type_stats[pii_type]
        tp, fp, fn = stats["tp"], stats["fp"], stats["fn"]
        prec, rec, f1_type = compute_prf(tp, fp, fn)
        print(f"   - {pii_type}:")
        print(f"     Precision: {prec:.2%}  Recall: {rec:.2%}  F1: {f1_type:.2%}")
        print(f"     TP: {tp}  FP: {fp}  FN: {fn}")

    # Check the quality targets
    print("\n" + "=" * 80)
    print("TARGET VALIDATION")
    print("=" * 80)

    target_recall = 0.995     # ≥ 99.5%
    target_precision = 0.97   # ≥ 97%
    target_f1 = 0.98          # ≥ 0.98

    print("\n🎯 Targets:")
    print(f"   - Recall: ≥ {target_recall:.1%}")
    print(f"   - Precision: ≥ {target_precision:.1%}")
    print(f"   - F1-score: ≥ {target_f1:.2%}")

    print("\n📊 Results:")
    if recall >= target_recall:
        print(f"   ✅ Recall target met: {recall:.2%} ≥ {target_recall:.1%}")
    else:
        print(f"   ⚠️ Recall target missed: {recall:.2%} < {target_recall:.1%}")
        print(f"      Gap: {(target_recall - recall) * 100:.2f} points")

    if precision >= target_precision:
        print(f"   ✅ Precision target met: {precision:.2%} ≥ {target_precision:.1%}")
    else:
        print(f"   ⚠️ Precision target missed: {precision:.2%} < {target_precision:.1%}")
        print(f"      Gap: {(target_precision - precision) * 100:.2f} points")

    if f1 >= target_f1:
        print(f"   ✅ F1-score target met: {f1:.2%} ≥ {target_f1:.2%}")
    else:
        print(f"   ⚠️ F1-score target missed: {f1:.2%} < {target_f1:.2%}")
        print(f"      Gap: {(target_f1 - f1) * 100:.2f} points")

    # Save the results. Per-type metrics reuse the same zero-safe helper,
    # which also avoids the division-by-zero the inline formula hit when
    # a type had tp == 0 with nonzero fp and fn.
    by_type_metrics = {}
    for pii_type, stats in by_type_stats.items():
        prec, rec, f1_type = compute_prf(stats["tp"], stats["fp"], stats["fn"])
        by_type_metrics[pii_type] = {
            "precision": round(prec, 4),
            "recall": round(rec, 4),
            "f1_score": round(f1_type, 4),
            "true_positives": stats["tp"],
            "false_positives": stats["fp"],
            "false_negatives": stats["fn"],
        }

    output_data = {
        # Record when this evaluation actually ran instead of a hardcoded date.
        "evaluation_date": date.today().isoformat(),
        "total_documents": len(all_results),
        "global_metrics": {
            "precision": round(precision, 4),
            "recall": round(recall, 4),
            "f1_score": round(f1, 4),
            "true_positives": total_tp,
            "false_positives": total_fp,
            "false_negatives": total_fn,
        },
        "by_type": by_type_metrics,
        "per_document": [
            {
                "pdf": r["pdf"],
                "precision": round(r["result"].precision, 4),
                "recall": round(r["result"].recall, 4),
                "f1_score": round(r["result"].f1_score, 4),
                "true_positives": r["result"].true_positives,
                "false_positives": r["result"].false_positives,
                "false_negatives": r["result"].false_negatives,
            }
            for r in all_results
        ],
    }

    json_file = results_dir / "baseline_quality_evaluation.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Results saved to: {json_file}")
    print("\n" + "=" * 80)

    return 0


if __name__ == "__main__":
    sys.exit(run_quality_evaluation())