# anonymisation/tools/auto_annotate_dataset.py

#!/usr/bin/env python3
"""
Annotation automatique du dataset de test.

Ce script utilise les résultats d'anonymisation (audit.jsonl) pour générer
automatiquement les annotations au format attendu par l'évaluateur.

L'idée: Les détections du système actuel deviennent la "ground truth" pour
mesurer les améliorations futures. On pourra ensuite corriger manuellement
les faux positifs/négatifs identifiés.
"""
import json
import sys
from collections import defaultdict
from datetime import date
from pathlib import Path
from typing import Optional

# Maps the `kind` label found in audit records to the label written in the
# annotation. Kinds that are not listed here are passed through unchanged.
_PII_TYPE_MAPPING = {
    'NOM': 'NOM',
    'NOM_GLOBAL': 'NOM',
    'NOM_EXTRACTED': 'NOM',
    'PRENOM': 'PRENOM',
    'PRENOM_GLOBAL': 'PRENOM',
    'DATE_NAISSANCE': 'DATE_NAISSANCE',
    'DATE_NAISSANCE_GLOBAL': 'DATE_NAISSANCE',
    'ADRESSE': 'ADRESSE',
    'ADRESSE_GLOBAL': 'ADRESSE',
    'CODE_POSTAL': 'CODE_POSTAL',
    'CODE_POSTAL_GLOBAL': 'CODE_POSTAL',
    'VILLE': 'VILLE',
    'VILLE_GLOBAL': 'VILLE',
    'TEL': 'TEL',
    'TEL_GLOBAL': 'TEL',
    'EMAIL': 'EMAIL',
    'EMAIL_GLOBAL': 'EMAIL',
    'NIR': 'NIR',
    'NIR_GLOBAL': 'NIR',
    'IPP': 'IPP',
    'IPP_GLOBAL': 'IPP',
    'EPISODE': 'EPISODE',
    'EPISODE_GLOBAL': 'EPISODE',
    'ETAB': 'ETABLISSEMENT',
    'MEDECIN': 'MEDECIN',
    'HOPITAL': 'HOPITAL',
    'SERVICE': 'SERVICE',
}


def convert_audit_to_annotation(
    audit_path: Path,
    pdf_path: Path,
    annotation_date: Optional[str] = None,
) -> dict:
    """
    Convert an audit.jsonl file into an annotation dictionary.

    Each non-blank line of the audit file is parsed as one JSON detection
    record. Records are grouped by their `page` value; within a page the
    `original` texts are grouped by mapped `kind` and de-duplicated while
    keeping first-seen order.

    Args:
        audit_path: Path to the audit.jsonl file. A missing file is treated
            as "no detections".
        pdf_path: Path to the source PDF; only its file name is stored.
        annotation_date: Date string stored under "annotation_date".
            Defaults to today's date in ISO format (YYYY-MM-DD).

    Returns:
        Annotation dictionary with the keys "pdf_path", "total_pages",
        "annotated_by", "annotation_date" and "pages".

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Load the detections, skipping blank lines.
    detections = []
    if audit_path.exists():
        with open(audit_path, 'r', encoding='utf-8') as f:
            detections = [json.loads(line) for line in f if line.strip()]

    # Group by page. Negative pages mark global detections and are ignored.
    # A null or non-integer page cannot be compared with 0, so it is skipped
    # the same way instead of raising TypeError.
    by_page = defaultdict(list)
    for det in detections:
        page = det.get('page', -1)
        if isinstance(page, int) and page >= 0:
            by_page[page].append(det)

    if annotation_date is None:
        annotation_date = date.today().isoformat()

    annotation = {
        "pdf_path": str(pdf_path.name),
        # Highest page index seen + 1; falls back to 1 when nothing was kept.
        "total_pages": max(by_page) + 1 if by_page else 1,
        "annotated_by": "auto-annotation-v1",
        "annotation_date": annotation_date,
        "pages": [],
    }

    for page_num in sorted(by_page):
        # Group the detected texts by mapped PII type.
        by_type = defaultdict(list)
        for det in by_page[page_num]:
            text = det.get('original', '')
            if not text:  # Ignore empty detections
                continue
            kind = det.get('kind', 'UNKNOWN')
            by_type[_PII_TYPE_MAPPING.get(kind, kind)].append(text)

        annotation["pages"].append({
            "page_number": page_num,
            # dict.fromkeys drops duplicates while preserving first-seen order.
            "pii": {
                pii_type: list(dict.fromkeys(texts))
                for pii_type, texts in by_type.items()
            },
        })

    return annotation


def _resolve_source_pdf(audit_path: Path, pdfs_dir: Path):
    """Return (pdf_name, pdf_path) for the PDF that an audit file refers to."""
    audit_suffix = ".audit.jsonl"
    audit_name = audit_path.name
    # Strip only the trailing ".audit.jsonl"; a plain str.replace would also
    # remove ".audit" occurring elsewhere in the file name.
    if audit_name.endswith(audit_suffix):
        base_name = audit_name[:-len(audit_suffix)]
    else:
        base_name = audit_path.stem
    pdf_name = base_name + '.pdf'

    pdf_path = pdfs_dir / pdf_name
    if not pdf_path.exists():
        # Retry without the .redacted_raster / .redacted_vector markers.
        # The fallback path is used even if it does not exist either.
        pdf_name_clean = pdf_name.replace('.redacted_raster', '').replace('.redacted_vector', '')
        pdf_path = pdfs_dir / pdf_name_clean

    return pdf_name, pdf_path


def auto_annotate_dataset():
    """
    Generate annotations automatically for every audited document.

    Reads every `*.audit.jsonl` file in
    `tests/ground_truth/pdfs/baseline_anonymized` (relative to the current
    working directory), converts each one with
    `convert_audit_to_annotation`, and writes `<pdf stem>.json` into
    `tests/ground_truth/annotations`. A `dataset_statistics.json` summary is
    written to the same directory and progress is printed to stdout.

    Returns:
        0 on success, 1 when no audit file was found.
    """
    # Directories
    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    annotations_dir = Path("tests/ground_truth/annotations")
    pdfs_dir = Path("tests/ground_truth/pdfs")

    # List the audit files
    audit_files = sorted(baseline_dir.glob("*.audit.jsonl"))

    if not audit_files:
        print(f"✗ Aucun fichier audit trouvé dans {baseline_dir}")
        return 1

    # Create the output directory only once there is something to write.
    # parents=True avoids FileNotFoundError when intermediate directories
    # are missing.
    annotations_dir.mkdir(parents=True, exist_ok=True)

    print("="*80)
    print("ANNOTATION AUTOMATIQUE DU DATASET")
    print("="*80)
    print(f"\n📁 Répertoire audit: {baseline_dir}")
    print(f"📁 Répertoire annotations: {annotations_dir}")
    print(f"\n📄 Fichiers à annoter: {len(audit_files)}")

    # Running totals
    total_annotations = 0
    total_pages = 0
    by_type = defaultdict(int)

    # Process each audit file
    for i, audit_path in enumerate(audit_files, 1):
        pdf_name, pdf_path = _resolve_source_pdf(audit_path, pdfs_dir)

        print(f"\n[{i}/{len(audit_files)}] {pdf_name}")

        annotation = convert_audit_to_annotation(audit_path, pdf_path)

        # Count pages and PII entries, overall and per type, in one pass.
        page_count = len(annotation['pages'])
        pii_count = 0
        for page in annotation['pages']:
            for pii_type, texts in page['pii'].items():
                by_type[pii_type] += len(texts)
                pii_count += len(texts)

        total_annotations += pii_count
        total_pages += page_count

        print(f"   Pages: {page_count}  PII: {pii_count}")

        # Save the annotation.
        # NOTE(review): when the suffix-stripping fallback is taken, raster
        # and vector audits of the same document resolve to the same output
        # file, so the later one overwrites the earlier one. Confirm this
        # cannot happen in the baseline directory.
        output_path = annotations_dir / f"{pdf_path.stem}.json"
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(annotation, f, indent=2, ensure_ascii=False)

    # Summary
    print("\n" + "="*80)
    print("RÉSUMÉ")
    print("="*80)
    print(f"\n✓ Documents annotés: {len(audit_files)}")
    print(f"✓ Pages annotées: {total_pages}")
    print(f"✓ PII annotés: {total_annotations}")

    print("\n📊 Répartition par type:")
    for pii_type, count in sorted(by_type.items(), key=lambda x: x[1], reverse=True):
        print(f"   - {pii_type}: {count}")

    # Write a statistics file. audit_files is non-empty here, so the
    # averages cannot divide by zero.
    stats = {
        "total_documents": len(audit_files),
        "total_pages": total_pages,
        "total_pii": total_annotations,
        "by_type": dict(by_type),
        "avg_pii_per_doc": round(total_annotations / len(audit_files), 1),
        "avg_pages_per_doc": round(total_pages / len(audit_files), 1)
    }

    stats_path = annotations_dir / "dataset_statistics.json"
    with open(stats_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Statistiques sauvegardées: {stats_path}")
    print(f"\n📂 Annotations générées dans: {annotations_dir}")

    print("\n" + "="*80)
    print("NOTE")
    print("="*80)
    print("""
Ces annotations sont générées automatiquement à partir des détections
du système actuel. Elles servent de baseline pour mesurer les améliorations.

Pour affiner la qualité:
1. Utiliser l'évaluateur pour identifier les faux positifs/négatifs
2. Corriger manuellement les annotations problématiques
3. Ré-exécuter l'évaluation

Commande pour corriger une annotation:
  python3 tools/annotation_tool.py --resume <pdf_name>
""")

    return 0


if __name__ == "__main__":
    # Script entry point: run the annotation pass and use its return value
    # as the process exit status.
    exit_status = auto_annotate_dataset()
    sys.exit(exit_status)