feat: Annotation automatique et évaluation qualité baseline - Rappel 100%, Précision 18.97%

2026-03-02 10:51:38 +01:00
parent 99b6e7f1d1
commit 0ba5424eb0
56 changed files with 9770 additions and 6 deletions
--- a/tools/auto_annotate_dataset.py
+++ b/tools/auto_annotate_dataset.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+"""
+Annotation automatique du dataset de test.
+
+Ce script utilise les résultats d'anonymisation (audit.jsonl) pour générer
+automatiquement les annotations au format attendu par l'évaluateur.
+
+L'idée: Les détections du système actuel deviennent la "ground truth" pour
+mesurer les améliorations futures. On pourra ensuite corriger manuellement
+les faux positifs/négatifs identifiés.
+"""
+import sys
+import json
+from pathlib import Path
+from collections import defaultdict
+
+# Raw detector kinds (including *_GLOBAL / *_EXTRACTED variants) mapped to the
+# annotation type names. Kinds missing from this table are kept unchanged.
+_TYPE_MAPPING = {
+    'NOM': 'NOM',
+    'NOM_GLOBAL': 'NOM',
+    'NOM_EXTRACTED': 'NOM',
+    'PRENOM': 'PRENOM',
+    'PRENOM_GLOBAL': 'PRENOM',
+    'DATE_NAISSANCE': 'DATE_NAISSANCE',
+    'DATE_NAISSANCE_GLOBAL': 'DATE_NAISSANCE',
+    'ADRESSE': 'ADRESSE',
+    'ADRESSE_GLOBAL': 'ADRESSE',
+    'CODE_POSTAL': 'CODE_POSTAL',
+    'CODE_POSTAL_GLOBAL': 'CODE_POSTAL',
+    'VILLE': 'VILLE',
+    'VILLE_GLOBAL': 'VILLE',
+    'TEL': 'TEL',
+    'TEL_GLOBAL': 'TEL',
+    'EMAIL': 'EMAIL',
+    'EMAIL_GLOBAL': 'EMAIL',
+    'NIR': 'NIR',
+    'NIR_GLOBAL': 'NIR',
+    'IPP': 'IPP',
+    'IPP_GLOBAL': 'IPP',
+    'EPISODE': 'EPISODE',
+    'EPISODE_GLOBAL': 'EPISODE',
+    'ETAB': 'ETABLISSEMENT',
+    'MEDECIN': 'MEDECIN',
+    'HOPITAL': 'HOPITAL',
+    'SERVICE': 'SERVICE',
+}
+
+
+def convert_audit_to_annotation(audit_path: Path, pdf_path: Path,
+                                annotation_date: "str | None" = None) -> dict:
+    """
+    Convert an audit.jsonl file into an annotation dictionary.
+
+    Each non-blank line of the audit file is parsed as JSON. Detections are
+    grouped by page and by mapped PII type; within a page and type, duplicate
+    texts are removed while keeping first-seen order.
+
+    Args:
+        audit_path: Path to the audit.jsonl file. A missing file yields an
+            annotation with no pages.
+        pdf_path: Path to the source PDF; only its file name is stored.
+        annotation_date: Value stored under "annotation_date". Defaults to
+            today's date in ISO format.
+
+    Returns:
+        Dict with the keys "pdf_path", "total_pages", "annotated_by",
+        "annotation_date" and "pages" (one entry per page that has at least
+        one detection, sorted by page number).
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    # Local import: the module-level imports do not provide `date`.
+    from datetime import date
+
+    # page -> mapped type -> {text: None}; the inner dict acts as an
+    # insertion-ordered set, which de-duplicates texts for free.
+    by_page = defaultdict(lambda: defaultdict(dict))
+
+    if audit_path.exists():
+        with open(audit_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                det = json.loads(line)
+                if not isinstance(det, dict):
+                    continue  # Not a detection record
+
+                # Skip global detections (page -1) and records whose page is
+                # missing or not an integer (e.g. null), which previously
+                # crashed the comparison.
+                page = det.get('page', -1)
+                if not isinstance(page, int) or isinstance(page, bool) or page < 0:
+                    continue
+
+                # Touching by_page[page] registers the page even when the
+                # detection text turns out to be empty.
+                page_types = by_page[page]
+                text = det.get('original', '')
+                if text:  # Ignore empty detections
+                    kind = det.get('kind', 'UNKNOWN')
+                    page_types[_TYPE_MAPPING.get(kind, kind)][text] = None
+
+    if annotation_date is None:
+        annotation_date = date.today().isoformat()
+
+    return {
+        "pdf_path": str(pdf_path.name),
+        # Pages are 0-based, so the highest page seen + 1 is the best estimate.
+        "total_pages": max(by_page) + 1 if by_page else 1,
+        "annotated_by": "auto-annotation-v1",
+        "annotation_date": annotation_date,
+        "pages": [
+            {
+                "page_number": page_num,
+                "pii": {
+                    pii_type: list(texts)
+                    for pii_type, texts in by_page[page_num].items()
+                },
+            }
+            for page_num in sorted(by_page)
+        ],
+    }
+
+
+def auto_annotate_dataset(baseline_dir=None, annotations_dir=None, pdfs_dir=None):
+    """
+    Generate annotations for every audit file of the baseline run.
+
+    For each "*.audit.jsonl" file in the baseline directory, the matching PDF
+    name is derived, the audit is converted with convert_audit_to_annotation()
+    and the result is written as "<pdf stem>.json" in the annotations
+    directory. Aggregate counts are printed and saved to
+    "dataset_statistics.json" in the same directory.
+
+    Args:
+        baseline_dir: Directory holding the audit files. Defaults to
+            "tests/ground_truth/pdfs/baseline_anonymized".
+        annotations_dir: Output directory, created if missing. Defaults to
+            "tests/ground_truth/annotations".
+        pdfs_dir: Directory holding the source PDFs. Defaults to
+            "tests/ground_truth/pdfs".
+
+    Returns:
+        0 on success, 1 when no audit file was found.
+    """
+    # Directories (relative defaults are resolved against the working directory)
+    baseline_dir = Path(baseline_dir or "tests/ground_truth/pdfs/baseline_anonymized")
+    annotations_dir = Path(annotations_dir or "tests/ground_truth/annotations")
+    pdfs_dir = Path(pdfs_dir or "tests/ground_truth/pdfs")
+
+    # parents=True so a missing parent tree does not raise before the
+    # "no audit file" message below can be shown.
+    annotations_dir.mkdir(parents=True, exist_ok=True)
+
+    # List the audit files
+    audit_suffix = ".audit.jsonl"
+    audit_files = sorted(baseline_dir.glob("*" + audit_suffix))
+
+    if not audit_files:
+        print(f"✗ Aucun fichier audit trouvé dans {baseline_dir}")
+        return 1
+
+    print("="*80)
+    print("ANNOTATION AUTOMATIQUE DU DATASET")
+    print("="*80)
+    print(f"\n📁 Répertoire audit: {baseline_dir}")
+    print(f"📁 Répertoire annotations: {annotations_dir}")
+    print(f"\n📄 Fichiers à annoter: {len(audit_files)}")
+
+    # Statistics
+    total_annotations = 0
+    total_pages = 0
+    by_type = defaultdict(int)
+
+    # Process each file
+    for i, audit_path in enumerate(audit_files, 1):
+        # Derive the PDF name by dropping only the trailing ".audit.jsonl";
+        # a blanket replace('.audit', '') would also mangle names that
+        # contain ".audit" elsewhere.
+        pdf_name = audit_path.name[:-len(audit_suffix)] + '.pdf'
+
+        # Look for the PDF under that exact name first
+        pdf_path = pdfs_dir / pdf_name
+        if not pdf_path.exists():
+            # Fall back to the name without the redaction-mode suffix
+            pdf_name_clean = pdf_name.replace('.redacted_raster', '').replace('.redacted_vector', '')
+            pdf_path = pdfs_dir / pdf_name_clean
+
+        print(f"\n[{i}/{len(audit_files)}] {pdf_name}")
+
+        # Convert
+        annotation = convert_audit_to_annotation(audit_path, pdf_path)
+
+        # Count pages and PII (overall and per type) in a single pass
+        page_count = len(annotation['pages'])
+        pii_count = 0
+        for page in annotation['pages']:
+            for pii_type, texts in page['pii'].items():
+                by_type[pii_type] += len(texts)
+                pii_count += len(texts)
+
+        total_annotations += pii_count
+        total_pages += page_count
+
+        print(f"   Pages: {page_count}  PII: {pii_count}")
+
+        # Save
+        # NOTE(review): two audits that resolve to the same PDF stem (e.g. a
+        # raster and a vector variant) write to the same file; the later one
+        # overwrites the earlier - confirm this is intended.
+        output_path = annotations_dir / f"{pdf_path.stem}.json"
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(annotation, f, indent=2, ensure_ascii=False)
+
+    # Summary
+    print("\n" + "="*80)
+    print("RÉSUMÉ")
+    print("="*80)
+    print(f"\n✓ Documents annotés: {len(audit_files)}")
+    print(f"✓ Pages annotées: {total_pages}")
+    print(f"✓ PII annotés: {total_annotations}")
+
+    print("\n📊 Répartition par type:")
+    for pii_type, count in sorted(by_type.items(), key=lambda x: x[1], reverse=True):
+        print(f"   - {pii_type}: {count}")
+
+    # Write the statistics file (audit_files is non-empty here, so the
+    # averages cannot divide by zero)
+    stats = {
+        "total_documents": len(audit_files),
+        "total_pages": total_pages,
+        "total_pii": total_annotations,
+        "by_type": dict(by_type),
+        "avg_pii_per_doc": round(total_annotations / len(audit_files), 1),
+        "avg_pages_per_doc": round(total_pages / len(audit_files), 1)
+    }
+
+    stats_path = annotations_dir / "dataset_statistics.json"
+    with open(stats_path, 'w', encoding='utf-8') as f:
+        json.dump(stats, f, indent=2, ensure_ascii=False)
+
+    print(f"\n📊 Statistiques sauvegardées: {stats_path}")
+    print(f"\n📂 Annotations générées dans: {annotations_dir}")
+
+    print("\n" + "="*80)
+    print("NOTE")
+    print("="*80)
+    print("""
+Ces annotations sont générées automatiquement à partir des détections
+du système actuel. Elles servent de baseline pour mesurer les améliorations.
+
+Pour affiner la qualité:
+1. Utiliser l'évaluateur pour identifier les faux positifs/négatifs
+2. Corriger manuellement les annotations problématiques
+3. Ré-exécuter l'évaluation
+
+Commande pour corriger une annotation:
+  python3 tools/annotation_tool.py --resume <pdf_name>
+""")
+
+    return 0
+
+
+# Script entry point: the return value of auto_annotate_dataset() becomes the
+# process exit status.
+if __name__ == "__main__":
+    sys.exit(auto_annotate_dataset())