anonymisation/tools/analyze_false_positives.py

#!/usr/bin/env python3
"""
Analyse détaillée des faux positifs pour identifier les patterns problématiques.
"""

import json
from pathlib import Path
from collections import defaultdict
import sys

def _collect_detection_examples(audit_dir, max_examples):
    """Gather up to *max_examples* detections per PII type from audit files.

    Every ``*.audit.jsonl`` file directly inside *audit_dir* is read line by
    line; each line is expected to hold one JSON object describing a detection.

    Args:
        audit_dir: ``Path`` of the directory holding the audit files. A
            directory that does not exist simply yields no examples.
        max_examples: Maximum number of examples retained for each type.

    Returns:
        A ``defaultdict(list)`` mapping the detection ``type`` (``'UNKNOWN'``
        when absent) to a list of ``{'text', 'file', 'page'}`` dicts.
    """
    suffix = ".audit.jsonl"
    examples = defaultdict(list)

    # Sorted so that the retained examples are the same from run to run;
    # glob() order otherwise depends on the filesystem.
    for audit_file in sorted(audit_dir.glob(f"*{suffix}")):
        # Strip only the trailing suffix, never a '.audit' inside the name.
        source_name = audit_file.name[:-len(suffix)]

        with open(audit_file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, 1):
                # Blank lines (e.g. a trailing newline) are not records.
                if not line.strip():
                    continue

                try:
                    detection = json.loads(line)
                except json.JSONDecodeError as exc:
                    # One corrupt record must not abort the whole analysis.
                    print(
                        f"⚠️  {audit_file}:{line_no}: ligne JSON invalide ignorée ({exc})",
                        file=sys.stderr,
                    )
                    continue

                if not isinstance(detection, dict):
                    print(
                        f"⚠️  {audit_file}:{line_no}: entrée non-objet ignorée",
                        file=sys.stderr,
                    )
                    continue

                bucket = examples[detection.get('type', 'UNKNOWN')]
                if len(bucket) < max_examples:
                    bucket.append({
                        'text': detection.get('text', ''),
                        'file': source_name,
                        'page': detection.get('page', 0),
                    })

    return examples


def analyze_false_positives(eval_file=None, audit_dir=None, max_examples=20):
    """Print a per-type report of false positives with sample detections.

    The evaluation JSON supplies the false-positive count and precision for
    each type; the audit files supply sample detections of that type. The
    samples are raw detections taken from the audit files, not detections
    confirmed as false positives.

    Args:
        eval_file: Path (``str`` or ``Path``) to the quality-evaluation JSON.
            Defaults to
            ``tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json``.
        audit_dir: Directory (``str`` or ``Path``) containing the
            ``*.audit.jsonl`` files. Defaults to
            ``tests/ground_truth/pdfs/baseline_anonymized``.
        max_examples: Maximum number of sample detections shown per type.

    Returns:
        None. The report is written to standard output. When *eval_file* does
        not exist, a message is printed and nothing else happens.

    Raises:
        KeyError: If the evaluation JSON lacks ``by_type``, ``global_metrics``
            or one of the global metric fields.
        json.JSONDecodeError: If the evaluation file is not valid JSON.
    """
    if eval_file is None:
        eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
    else:
        eval_file = Path(eval_file)

    if audit_dir is None:
        audit_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    else:
        audit_dir = Path(audit_dir)

    # Load the evaluation results
    if not eval_file.exists():
        print(f"❌ Fichier non trouvé: {eval_file}")
        return

    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)

    # Load sample detections from the audit files
    fp_examples = _collect_detection_examples(audit_dir, max_examples)

    # Report header
    print("=" * 80)
    print("ANALYSE DES FAUX POSITIFS")
    print("=" * 80)
    print()

    # Only the types known to be problematic are reported
    problematic_types = ['EPISODE', 'VILLE', 'CODE_POSTAL', 'ADRESSE', 'TEL']

    for pii_type in problematic_types:
        type_metrics = eval_data['by_type'].get(pii_type, {})
        fp_count = type_metrics.get('false_positives', 0)
        precision = type_metrics.get('precision', 0)

        # Types without any false positive need no investigation
        if fp_count == 0:
            continue

        print(f"\n{'=' * 80}")
        print(f"Type: {pii_type}")
        print(f"Faux positifs: {fp_count}")
        print(f"Précision: {precision:.2%}")
        print(f"{'=' * 80}")

        examples = fp_examples.get(pii_type, [])
        if examples:
            print(f"\nExemples de détections (premiers {max_examples}):")
            for i, ex in enumerate(examples, 1):
                print(f"  {i:2d}. '{ex['text']}' (page {ex['page']})")
        else:
            print("\n⚠️  Aucun exemple trouvé dans les fichiers audit")

    # Global statistics
    print(f"\n{'=' * 80}")
    print("STATISTIQUES GLOBALES")
    print(f"{'=' * 80}")
    global_metrics = eval_data['global_metrics']
    print(f"Précision: {global_metrics['precision']:.2%}")
    print(f"Rappel: {global_metrics['recall']:.2%}")
    print(f"F1-Score: {global_metrics['f1_score']:.2%}")
    print(f"Faux positifs totaux: {global_metrics['false_positives']}")
    print()

# Run the analysis only when the file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    analyze_false_positives()