feat: Filtre hospitalier pour éliminer les faux positifs

- Ajout config/hospital_stopwords.yml avec adresses/téléphones hôpitaux
- Ajout detectors/hospital_filter.py pour filtrer les FP
- Intégration dans anonymizer_core_refactored_onnx.py
- Test sur document: 40 -> 32 détections (-8 FP)
- Élimine: adresses hôpitaux, codes postaux CEDEX, épisodes dans noms de fichiers
2026-03-02 11:21:48 +01:00
parent 70ff0b9e12
commit 6806aee587
10 changed files with 10478 additions and 6 deletions
--- a/tools/extract_false_positives.py
+++ b/tools/extract_false_positives.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+"""
+Extrait les exemples de faux positifs en comparant annotations et détections.
+"""
+
+import json
+from pathlib import Path
+from collections import defaultdict
+
+def load_annotations(pdf_name):
+    """Load the ground-truth annotations that belong to a PDF.
+
+    The annotation file may be stored under the full PDF name, the name
+    without the ``.redacted_raster`` marker, or only the part before the
+    first dot; the candidates are tried in that order.
+
+    Args:
+        pdf_name: Name of the PDF as it appears in the evaluation report.
+
+    Returns:
+        The parsed JSON content of the first annotation file found, or
+        ``None`` when no candidate file exists.
+    """
+    without_marker = pdf_name.replace('.redacted_raster', '')
+    stem = pdf_name.split('.')[0]
+
+    for name in (pdf_name, without_marker, stem):
+        candidate = Path(f"tests/ground_truth/annotations/{name}.json")
+        if not candidate.exists():
+            continue
+        with candidate.open('r', encoding='utf-8') as handle:
+            return json.load(handle)
+
+    return None
+
+def load_detections(pdf_name):
+    """Load the detections recorded in a PDF's audit trail.
+
+    The audit trail is a JSON Lines file: one JSON document per line.
+
+    Args:
+        pdf_name: Name of the PDF as it appears in the evaluation report.
+
+    Returns:
+        A list with one parsed JSON value per non-blank line, in file
+        order, or an empty list when the audit file does not exist.
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    audit_file = Path(f"tests/ground_truth/pdfs/baseline_anonymized/{pdf_name}.audit.jsonl")
+    if not audit_file.exists():
+        return []
+
+    with open(audit_file, 'r', encoding='utf-8') as f:
+        # Blank lines (typically a trailing one) are not JSON documents and
+        # would make json.loads raise, so they are skipped.
+        return [json.loads(line) for line in f if line.strip()]
+
+def normalize_text(text):
+    """Return *text* lower-cased and stripped of surrounding whitespace.
+
+    ``None`` (for instance a JSON ``null`` text field) normalizes to the
+    empty string instead of raising ``AttributeError``.
+    """
+    if text is None:
+        return ''
+    return text.lower().strip()
+
+def is_match(detection, annotation, tolerance=5):
+    """Tell whether a detection corresponds to a ground-truth annotation.
+
+    A detection matches when it is on the same page, has a compatible type
+    (``NOM`` and ``PRENOM`` are interchangeable, any other type must be
+    equal) and its normalized text equals, contains, or is contained in
+    the annotation's normalized text.
+
+    Args:
+        detection: Mapping with optional ``page``, ``type`` and ``text`` keys.
+        annotation: Mapping with optional ``page``, ``type`` and ``text`` keys.
+        tolerance: Currently unused; kept so existing callers keep working.
+
+    Returns:
+        ``True`` when the detection matches the annotation, else ``False``.
+    """
+    # Same page
+    if detection.get('page') != annotation.get('page'):
+        return False
+
+    # Same type, or both are person-name types
+    name_types = ('NOM', 'PRENOM')
+    det_type = detection.get('type', '')
+    ann_type = annotation.get('type', '')
+    both_names = det_type in name_types and ann_type in name_types
+    if det_type != ann_type and not both_names:
+        return False
+
+    det_text = normalize_text(detection.get('text'))
+    ann_text = normalize_text(annotation.get('text'))
+
+    # The empty string is a substring of every string, so without this guard
+    # a detection or annotation with no text would match anything and hide
+    # genuine false positives.
+    if not det_text or not ann_text:
+        return False
+
+    # Equality is covered by the containment test.
+    return det_text in ann_text or ann_text in det_text
+
+def extract_false_positives():
+    """Report detections that no ground-truth annotation accounts for.
+
+    Reads the per-document list from the baseline quality evaluation, loads
+    each document's annotations and detections, and treats every detection
+    that matches no annotation as a false positive. A summary is printed
+    for a fixed list of PII types, and all false positives, grouped by
+    type, are written to
+    ``tests/ground_truth/analysis/false_positives_examples.json``.
+    """
+    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
+    with open(eval_file, 'r', encoding='utf-8') as handle:
+        evaluation = json.load(handle)
+
+    false_positives = defaultdict(list)
+
+    for document in evaluation['per_document']:
+        pdf_name = document['pdf']
+        annotations = load_annotations(pdf_name)
+        detections = load_detections(pdf_name)
+
+        # Documents lacking either side of the comparison are skipped.
+        if not (annotations and detections):
+            continue
+
+        known_pii = annotations.get('pii', [])
+        for detection in detections:
+            # A detection backed by at least one annotation is a true positive.
+            if any(is_match(detection, annotation) for annotation in known_pii):
+                continue
+
+            false_positives[detection.get('type', 'UNKNOWN')].append({
+                'text': detection.get('text', ''),
+                'page': detection.get('page', 0),
+                'file': pdf_name,
+                'method': detection.get('method', 'unknown')
+            })
+
+    # Console report
+    print("=" * 80)
+    print("EXEMPLES DE FAUX POSITIFS")
+    print("=" * 80)
+    print()
+
+    for pii_type in ['EPISODE', 'VILLE', 'CODE_POSTAL', 'ADRESSE', 'TEL']:
+        # .get() keeps the defaultdict from growing empty entries that would
+        # otherwise end up in the saved JSON.
+        fps = false_positives.get(pii_type, [])
+        if not fps:
+            continue
+
+        print(f"\n{'=' * 80}")
+        print(f"Type: {pii_type} ({len(fps)} faux positifs)")
+        print(f"{'=' * 80}")
+
+        # Count identical texts to expose recurring patterns.
+        occurrences = {}
+        for fp in fps:
+            occurrences[fp['text']] = occurrences.get(fp['text'], 0) + 1
+
+        ranked = sorted(occurrences.items(), key=lambda item: item[1], reverse=True)
+
+        print(f"\nTextes les plus fréquents:")
+        for text, count in ranked[:20]:
+            print(f"  {count:3d}x '{text}'")
+
+        # A few individual examples with their location.
+        print(f"\nExemples avec contexte:")
+        for i, fp in enumerate(fps[:10], 1):
+            print(f"  {i:2d}. '{fp['text']}' (page {fp['page']}, méthode: {fp['method']})")
+            print(f"      Fichier: {fp['file']}")
+
+    # Persist the complete result set.
+    output_file = Path("tests/ground_truth/analysis/false_positives_examples.json")
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(output_file, 'w', encoding='utf-8') as handle:
+        json.dump(dict(false_positives), handle, indent=2, ensure_ascii=False)
+
+    print(f"\n✅ Résultats sauvegardés dans: {output_file}")
+
+# Script entry point: run the extraction only when executed directly.
+if __name__ == "__main__":
+    extract_false_positives()