# anonymisation/tools/extract_false_positives.py

#!/usr/bin/env python3
"""
Extrait les exemples de faux positifs en comparant annotations et détections.
"""

import json
from collections import Counter, defaultdict
from pathlib import Path

def load_annotations(pdf_name):
    """Load the ground-truth annotations for a PDF.

    The annotation file is searched under ``tests/ground_truth/annotations/``
    (relative to the current working directory) using, in order: the name as
    given, the name with every ``.redacted_raster`` occurrence removed, and
    the part of the name that precedes its first dot.

    Args:
        pdf_name: Name of the PDF, as a string.

    Returns:
        The parsed JSON content of the first candidate file that exists, or
        ``None`` when no candidate file exists.
    """
    annotations_dir = "tests/ground_truth/annotations"
    candidates = (
        pdf_name,
        pdf_name.replace('.redacted_raster', ''),
        pdf_name.split('.')[0],
    )

    for candidate in candidates:
        path = Path(f"{annotations_dir}/{candidate}.json")
        if not path.exists():
            continue
        with path.open('r', encoding='utf-8') as handle:
            return json.load(handle)

    return None

def load_detections(pdf_name):
    """Load the detections recorded in the audit log of a PDF.

    Reads ``tests/ground_truth/pdfs/baseline_anonymized/<pdf_name>.audit.jsonl``
    (relative to the current working directory) and parses every non-blank
    line as one JSON value.

    Args:
        pdf_name: Name of the PDF, as a string.

    Returns:
        A list with one parsed entry per non-blank line, in file order, or an
        empty list when the audit file does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    audit_file = Path(f"tests/ground_truth/pdfs/baseline_anonymized/{pdf_name}.audit.jsonl")
    if not audit_file.exists():
        return []

    with open(audit_file, 'r', encoding='utf-8') as f:
        # Blank lines (for instance a trailing empty line) carry no record;
        # passing them to json.loads would raise and abort the whole load.
        return [json.loads(line) for line in f if line.strip()]

def normalize_text(text):
    """Normalize text for comparison.

    Args:
        text: The string to normalize, or ``None``.

    Returns:
        The text lowercased with leading and trailing whitespace removed.
        ``None`` (e.g. a JSON ``null`` value) yields an empty string.
    """
    # A record whose text is JSON null arrives here as None; treat it as empty
    # instead of failing on None.lower().
    if text is None:
        return ''
    return text.lower().strip()

def is_match(detection, annotation, tolerance=5):
    """Check whether a detection corresponds to an annotation.

    A detection matches an annotation when all of the following hold:

    * both have the same ``page`` value;
    * their ``type`` values are equal, or both are one of ``NOM``/``PRENOM``
      (these two types are treated as interchangeable);
    * their normalized texts are equal, or both are non-empty and one
      contains the other.

    Args:
        detection: Mapping describing a detection (``page``, ``type``,
            ``text`` keys are read).
        annotation: Mapping describing an annotation (same keys are read).
        tolerance: Currently unused; kept so existing callers keep working.

    Returns:
        ``True`` if the detection matches the annotation, ``False`` otherwise.
    """
    # Same page
    if detection.get('page') != annotation.get('page'):
        return False

    # Same (or compatible) type
    det_type = detection.get('type', '')
    ann_type = annotation.get('type', '')

    # NOM and PRENOM are considered compatible with each other.
    type_mapping = {
        'NOM': ['NOM', 'PRENOM'],
        'PRENOM': ['NOM', 'PRENOM'],
    }

    det_types = type_mapping.get(det_type, [det_type])
    ann_types = type_mapping.get(ann_type, [ann_type])

    if not any(dt in ann_types for dt in det_types):
        return False

    # Similar text; `or ''` also covers an explicit null text value.
    det_text = normalize_text(detection.get('text') or '')
    ann_text = normalize_text(annotation.get('text') or '')

    if det_text == ann_text:
        return True

    # The empty string is contained in every string, so a record without text
    # would otherwise match any record on the same page with a compatible type.
    if not det_text or not ann_text:
        return False

    return det_text in ann_text or ann_text in det_text

def _collect_false_positives(eval_data):
    """Group the detections that match no annotation by their PII type.

    Args:
        eval_data: Parsed evaluation data; ``eval_data['per_document']`` is
            iterated and each entry's ``'pdf'`` value is used as the PDF name.

    Returns:
        A ``defaultdict(list)`` mapping each PII type to a list of dicts with
        the keys ``text``, ``page``, ``file`` and ``method``.
    """
    false_positives = defaultdict(list)

    for doc_result in eval_data['per_document']:
        pdf_name = doc_result['pdf']

        annotations = load_annotations(pdf_name)
        detections = load_detections(pdf_name)

        # Without both annotations and detections there is nothing to compare.
        if not annotations or not detections:
            continue

        # The annotation list does not change per detection: look it up once.
        annotated_pii = annotations.get('pii', [])

        for detection in detections:
            # A detection matching any annotation is a true positive.
            if any(is_match(detection, annotation) for annotation in annotated_pii):
                continue

            pii_type = detection.get('type', 'UNKNOWN')
            false_positives[pii_type].append({
                'text': detection.get('text', ''),
                'page': detection.get('page', 0),
                'file': pdf_name,
                'method': detection.get('method', 'unknown')
            })

    return false_positives


def _print_false_positive_report(false_positives):
    """Print the most frequent texts and a few examples per reported type."""
    separator = "=" * 80

    print(separator)
    print("EXEMPLES DE FAUX POSITIFS")
    print(separator)
    print()

    problematic_types = ['EPISODE', 'VILLE', 'CODE_POSTAL', 'ADRESSE', 'TEL']

    for pii_type in problematic_types:
        # .get() avoids inserting empty entries into the defaultdict.
        fps = false_positives.get(pii_type, [])
        if not fps:
            continue

        print(f"\n{separator}")
        print(f"Type: {pii_type} ({len(fps)} faux positifs)")
        print(separator)

        # Count identical texts to expose recurring patterns; most_common
        # keeps first-encountered order among equal counts.
        text_counts = Counter(fp['text'] for fp in fps)

        print("\nTextes les plus fréquents:")
        for text, count in text_counts.most_common(20):
            print(f"  {count:3d}x '{text}'")

        # Show the first few false positives with their location.
        print("\nExemples avec contexte:")
        for i, fp in enumerate(fps[:10], 1):
            print(f"  {i:2d}. '{fp['text']}' (page {fp['page']}, méthode: {fp['method']})")
            print(f"      Fichier: {fp['file']}")


def _save_false_positives(false_positives, output_file):
    """Write the false positives as JSON to output_file, creating parent dirs."""
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(dict(false_positives), f, indent=2, ensure_ascii=False)


def extract_false_positives():
    """Extract the false positives of every evaluated document.

    Reads ``tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json``,
    compares each listed document's detections with its annotations, prints a
    report for a fixed list of PII types, and saves all false positives
    (every type) to ``tests/ground_truth/analysis/false_positives_examples.json``.
    All paths are relative to the current working directory.

    Raises:
        FileNotFoundError: If the evaluation file does not exist.
    """
    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)

    false_positives = _collect_false_positives(eval_data)

    _print_false_positive_report(false_positives)

    output_file = Path("tests/ground_truth/analysis/false_positives_examples.json")
    _save_false_positives(false_positives, output_file)

    print(f"\n✅ Résultats sauvegardés dans: {output_file}")

# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    extract_false_positives()