# anonymisation/tools/run_quality_evaluation.py

#!/usr/bin/env python3
"""
Évaluation de la qualité d'anonymisation sur le dataset annoté.

Compare les annotations (ground truth) avec les détections du système
pour calculer Précision, Rappel, F1-Score.
"""
import sys
import json
from pathlib import Path
from collections import defaultdict

sys.path.insert(0, str(Path(__file__).parent.parent))
from evaluation.quality_evaluator import QualityEvaluator

def _precision_recall_f1(tp, fp, fn):
    """Return ``(precision, recall, f1)`` for the given confusion counts.

    Each ratio falls back to 0.0 when its denominator is zero, so this never
    raises ZeroDivisionError -- including the case ``tp == 0`` with ``fp`` and
    ``fn`` both positive, where precision + recall is 0.
    """
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1


def _find_audit_file(baseline_dir, pdf_name):
    """Return the audit log of *pdf_name* inside *baseline_dir*, or None.

    ``<name>.audit.jsonl`` is tried first, then the ``.redacted_raster`` and
    ``.redacted_vector`` variants, in that order.
    """
    for suffix in ("", ".redacted_raster", ".redacted_vector"):
        candidate = baseline_dir / f"{pdf_name}{suffix}.audit.jsonl"
        if candidate.exists():
            return candidate
    return None


def _print_target_check(label, reached_word, value, target, target_spec):
    """Print whether *value* reaches *target*, and the gap when it does not.

    *reached_word* carries the French agreement ("atteint"/"atteinte") and
    *target_spec* is the format spec used to display the target.
    """
    shown_target = format(target, target_spec)
    if value >= target:
        print(f"   ✅ {label} {reached_word}: {value:.2%} ≥ {shown_target}")
    else:
        print(f"   ⚠️  {label} non {reached_word}: {value:.2%} < {shown_target}")
        print(f"      Écart: {(target - value)*100:.2f} points")


def run_quality_evaluation():
    """Evaluate anonymisation quality over every annotated document.

    For each ``*.json`` annotation (``dataset_statistics.json`` excluded) the
    matching PDF and audit log are looked up; documents missing either one, or
    for which the evaluator returns ``None``, are skipped with a warning.
    Per-document, global and per-PII-type precision/recall/F1 are printed, the
    global figures are compared with the target thresholds, and a JSON report
    is written to the results directory.

    All directories are relative to the current working directory.

    Returns:
        int: 0 when a report was written; 1 when no annotation file was found
        or no document could be evaluated.
    """
    # Function-scope import keeps this block self-contained.
    from datetime import date

    # Directories
    annotations_dir = Path("tests/ground_truth/annotations")
    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    pdfs_dir = Path("tests/ground_truth/pdfs")
    results_dir = Path("tests/ground_truth/quality_evaluation")

    # List the annotation files (the dataset statistics file is not one).
    annotation_files = sorted(
        f for f in annotations_dir.glob("*.json")
        if f.name != "dataset_statistics.json"
    )

    if not annotation_files:
        print(f"✗ Aucune annotation trouvée dans {annotations_dir}")
        return 1

    print("="*80)
    print("ÉVALUATION DE LA QUALITÉ D'ANONYMISATION")
    print("="*80)
    print(f"\n📁 Annotations: {annotations_dir}")
    print(f"📁 Détections: {baseline_dir}")
    print(f"📁 Résultats: {results_dir}")
    print(f"\n📄 Documents à évaluer: {len(annotation_files)}")

    evaluator = QualityEvaluator(annotations_dir)

    # Global accumulators
    all_results = []
    total_tp = 0
    total_fp = 0
    total_fn = 0
    by_type_stats = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})

    # Evaluate each document
    for i, annotation_file in enumerate(annotation_files, 1):
        pdf_name = annotation_file.stem

        print(f"\n[{i}/{len(annotation_files)}] {pdf_name}")

        pdf_path = pdfs_dir / f"{pdf_name}.pdf"
        if not pdf_path.exists():
            print(f"   ⚠️  PDF non trouvé: {pdf_path.name}")
            continue

        audit_path = _find_audit_file(baseline_dir, pdf_name)
        if audit_path is None:
            # The warning names the un-suffixed file.
            print(f"   ⚠️  Fichier audit non trouvé: {pdf_name}.audit.jsonl")
            continue

        result = evaluator.evaluate(pdf_path, audit_path)

        if result is None:
            print("   ⚠️  Échec de l'évaluation")
            continue

        all_results.append({
            "pdf": pdf_name,
            "result": result
        })

        print(f"   Précision: {result.precision:.2%}  "
              f"Rappel: {result.recall:.2%}  "
              f"F1: {result.f1_score:.2%}")
        print(f"   TP: {result.true_positives}  "
              f"FP: {result.false_positives}  "
              f"FN: {result.false_negatives}")

        # Accumulate global counts
        total_tp += result.true_positives
        total_fp += result.false_positives
        total_fn += result.false_negatives

        # Accumulate per-type counts
        for pii_type, stats in result.by_type.items():
            for key in ("tp", "fp", "fn"):
                by_type_stats[pii_type][key] += stats[key]

    if not all_results:
        print("\n✗ Aucun document évalué avec succès")
        return 1

    # Global metrics
    print("\n" + "="*80)
    print("RÉSULTATS GLOBAUX")
    print("="*80)

    precision, recall, f1 = _precision_recall_f1(total_tp, total_fp, total_fn)

    print(f"\n📊 Métriques:")
    print(f"   - Précision: {precision:.2%}")
    print(f"   - Rappel: {recall:.2%}")
    print(f"   - F1-Score: {f1:.2%}")

    print(f"\n📊 Détails:")
    print(f"   - Vrais positifs (TP): {total_tp}")
    print(f"   - Faux positifs (FP): {total_fp}")
    print(f"   - Faux négatifs (FN): {total_fn}")

    # Per-type metrics, computed once and reused for both the console output
    # and the JSON report so the two can never disagree.
    type_metrics = {
        pii_type: _precision_recall_f1(stats["tp"], stats["fp"], stats["fn"])
        for pii_type, stats in by_type_stats.items()
    }

    print(f"\n📊 Métriques par type de PII:")
    for pii_type in sorted(by_type_stats.keys()):
        stats = by_type_stats[pii_type]
        prec, rec, f1_type = type_metrics[pii_type]

        print(f"   - {pii_type}:")
        print(f"     Précision: {prec:.2%}  Rappel: {rec:.2%}  F1: {f1_type:.2%}")
        print(f"     TP: {stats['tp']}  FP: {stats['fp']}  FN: {stats['fn']}")

    # Check the results against the objectives
    print("\n" + "="*80)
    print("VALIDATION DES OBJECTIFS")
    print("="*80)

    target_recall = 0.995  # >= 99.5%
    target_precision = 0.97  # >= 97%
    target_f1 = 0.98  # >= 0.98

    print(f"\n🎯 Objectifs:")
    print(f"   - Rappel: ≥ {target_recall:.1%}")
    print(f"   - Précision: ≥ {target_precision:.1%}")
    print(f"   - F1-Score: ≥ {target_f1:.2%}")

    print(f"\n📊 Résultats:")

    _print_target_check("Rappel", "atteint", recall, target_recall, ".1%")
    _print_target_check("Précision", "atteinte", precision, target_precision, ".1%")
    _print_target_check("F1-Score", "atteint", f1, target_f1, ".2%")

    # Build the JSON report
    by_type_report = {}
    for pii_type, stats in by_type_stats.items():
        prec, rec, f1_type = type_metrics[pii_type]
        by_type_report[pii_type] = {
            "precision": round(prec, 4),
            "recall": round(rec, 4),
            "f1_score": round(f1_type, 4),
            "true_positives": stats["tp"],
            "false_positives": stats["fp"],
            "false_negatives": stats["fn"]
        }

    output_data = {
        # Stamp the report with the day the evaluation actually ran.
        "evaluation_date": date.today().isoformat(),
        "total_documents": len(all_results),
        "global_metrics": {
            "precision": round(precision, 4),
            "recall": round(recall, 4),
            "f1_score": round(f1, 4),
            "true_positives": total_tp,
            "false_positives": total_fp,
            "false_negatives": total_fn
        },
        "by_type": by_type_report,
        "per_document": [
            {
                "pdf": r["pdf"],
                "precision": round(r["result"].precision, 4),
                "recall": round(r["result"].recall, 4),
                "f1_score": round(r["result"].f1_score, 4),
                "true_positives": r["result"].true_positives,
                "false_positives": r["result"].false_positives,
                "false_negatives": r["result"].false_negatives
            }
            for r in all_results
        ]
    }

    # Create the output directory only once there is something to write;
    # parents=True also covers a missing intermediate directory.
    results_dir.mkdir(parents=True, exist_ok=True)

    json_file = results_dir / "baseline_quality_evaluation.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Résultats sauvegardés: {json_file}")

    print("\n" + "="*80)

    return 0


if __name__ == "__main__":
    # Hand the evaluation's return value to the shell as the exit status.
    exit_code = run_quality_evaluation()
    sys.exit(exit_code)