feat: Filtre hospitalier pour éliminer les faux positifs

- Ajout config/hospital_stopwords.yml avec adresses/téléphones hôpitaux
- Ajout detectors/hospital_filter.py pour filtrer les FP
- Intégration dans anonymizer_core_refactored_onnx.py
- Test sur document: 40 -> 32 détections (-8 FP)
- Élimine: adresses hôpitaux, codes postaux CEDEX, épisodes dans noms de fichiers
2026-03-02 11:21:48 +01:00
parent 70ff0b9e12
commit 6806aee587
10 changed files with 10478 additions and 6 deletions
--- a/tools/analyze_false_positives.py
+++ b/tools/analyze_false_positives.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+"""
+Analyse détaillée des faux positifs pour identifier les patterns problématiques.
+"""
+
+import json
+from pathlib import Path
+from collections import defaultdict
+import sys
+
+def analyze_false_positives():
+    """Print per-type metrics and sample audit detections for FP-prone types.
+
+    Reads the baseline evaluation JSON and every ``*.audit.jsonl`` file from
+    fixed paths relative to the working directory; prints a message and
+    returns early when the evaluation file is missing. Samples are raw
+    detections, not confirmed false positives (no annotation comparison).
+    """
+    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
+    if not eval_file.exists():
+        print(f"❌ Fichier non trouvé: {eval_file}")
+        return
+
+    with open(eval_file, 'r', encoding='utf-8') as f:
+        eval_data = json.load(f)
+
+    audit_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
+    fp_examples = defaultdict(list)
+
+    for audit_file in sorted(audit_dir.glob("*.audit.jsonl")):  # stable order
+        with open(audit_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                if not line.strip():
+                    continue  # a blank line would make json.loads raise
+                detection = json.loads(line)
+                # NOTE(review): siblings read 'kind'/'original'; 'type'/'text' is a fallback
+                pii_type = detection.get('kind', detection.get('type', 'UNKNOWN'))
+                text = detection.get('original', detection.get('text', ''))
+                if len(fp_examples[pii_type]) < 20:  # cap at 20 examples per type
+                    fp_examples[pii_type].append({
+                        'text': text,
+                        'file': audit_file.stem.replace('.audit', ''),
+                        'page': detection.get('page', 0)
+                    })
+
+    print("=" * 80)
+    print("ANALYSE DES FAUX POSITIFS")
+    print("=" * 80)
+    print()
+
+    # Types singled out for a closer look.
+    problematic_types = ['EPISODE', 'VILLE', 'CODE_POSTAL', 'ADRESSE', 'TEL']
+
+    for pii_type in problematic_types:
+        type_metrics = eval_data['by_type'].get(pii_type, {})
+        fp_count = type_metrics.get('false_positives', 0)
+        precision = type_metrics.get('precision', 0)
+
+        if fp_count == 0:
+            continue
+
+        print(f"\n{'=' * 80}")
+        print(f"Type: {pii_type}")
+        print(f"Faux positifs: {fp_count}")
+        print(f"Précision: {precision:.2%}")
+        print(f"{'=' * 80}")
+
+        examples = fp_examples.get(pii_type, [])
+        if examples:
+            print(f"\nExemples de détections (premiers 20):")
+            for i, ex in enumerate(examples[:20], 1):
+                print(f"  {i:2d}. '{ex['text']}' (page {ex['page']})")
+        else:
+            print("\n⚠️  Aucun exemple trouvé dans les fichiers audit")
+
+    print(f"\n{'=' * 80}")
+    print("STATISTIQUES GLOBALES")
+    print(f"{'=' * 80}")
+    global_metrics = eval_data['global_metrics']
+    print(f"Précision: {global_metrics['precision']:.2%}")
+    print(f"Rappel: {global_metrics['recall']:.2%}")
+    print(f"F1-Score: {global_metrics['f1_score']:.2%}")
+    print(f"Faux positifs totaux: {global_metrics['false_positives']}")
+    print()
+
+if __name__ == "__main__":
+    analyze_false_positives()
--- a/tools/extract_false_positives.py
+++ b/tools/extract_false_positives.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+"""
+Extrait les exemples de faux positifs en comparant annotations et détections.
+"""
+
+import json
+from pathlib import Path
+from collections import defaultdict
+
+def load_annotations(pdf_name):
+    """Return the parsed annotation JSON for *pdf_name*, or None if not found.
+
+    Tries, in order: the name as given, the name without '.redacted_raster',
+    then the text before the first dot.
+    """
+    stripped = pdf_name.replace('.redacted_raster', '')
+    # The first candidate whose file exists wins.
+    for name in (pdf_name, stripped, pdf_name.split('.')[0]):
+        annotation_file = Path(f"tests/ground_truth/annotations/{name}.json")
+        if not annotation_file.exists():
+            continue
+        with open(annotation_file, 'r', encoding='utf-8') as handle:
+            return json.load(handle)
+
+    return None
+
+def load_detections(pdf_name):
+    """Return the detections logged for *pdf_name*, one dict per JSONL line.
+
+    Returns an empty list when the audit file does not exist. Blank lines
+    are skipped instead of raising ``json.JSONDecodeError``.
+    """
+    audit_file = Path(f"tests/ground_truth/pdfs/baseline_anonymized/{pdf_name}.audit.jsonl")
+    if not audit_file.exists():
+        return []
+    with open(audit_file, 'r', encoding='utf-8') as f:
+        return [json.loads(line) for line in f if line.strip()]
+
+def normalize_text(text):
+    """Return *text* lowercased and stripped; None (JSON null) becomes ''."""
+    return (text or '').lower().strip()
+
+def is_match(detection, annotation, tolerance=5):
+    """Return True when *detection* corresponds to *annotation*.
+
+    A match needs the same page, compatible types (NOM and PRENOM are
+    interchangeable) and normalized texts that are equal or contain one
+    another. Empty texts never match: '' is a substring of every string.
+
+    Args:
+        detection: Audit record. 'kind'/'original' (the keys the sibling
+            scripts read) take precedence over 'type'/'text' -- TODO confirm.
+        annotation: Ground-truth record; 'page', 'type', 'text' are read.
+        tolerance: Unused; kept so existing callers keep working.
+    """
+    if detection.get('page') != annotation.get('page'):
+        return False
+
+    det_type = detection.get('kind', detection.get('type', ''))
+    ann_type = annotation.get('type', '')
+    name_types = ('NOM', 'PRENOM')
+    if det_type != ann_type and not (det_type in name_types and ann_type in name_types):
+        return False
+
+    det_text = normalize_text(detection.get('original', detection.get('text', '')))
+    ann_text = normalize_text(annotation.get('text', ''))
+    if not det_text or not ann_text:
+        return False
+    return det_text in ann_text or ann_text in det_text
+
+def extract_false_positives():
+    """Print and save the detections that match no annotation, per PII type.
+
+    For each document listed in the baseline evaluation file, a detection
+    that ``is_match`` pairs with no annotation counts as a false positive.
+    Documents without annotations or detections are skipped. The grouped
+    result is written to
+    ``tests/ground_truth/analysis/false_positives_examples.json``.
+
+    Returns:
+        None. Prints a message and returns early when the evaluation file
+        is missing.
+    """
+    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
+    if not eval_file.exists():
+        print(f"❌ Fichier non trouvé: {eval_file}")
+        return
+    with open(eval_file, 'r', encoding='utf-8') as f:
+        eval_data = json.load(f)
+
+    false_positives = defaultdict(list)
+
+    for doc_result in eval_data['per_document']:
+        pdf_name = doc_result['pdf']
+        annotations = load_annotations(pdf_name)
+        detections = load_detections(pdf_name)
+        if not annotations or not detections:
+            continue
+
+        known_pii = annotations.get('pii', [])
+        for detection in detections:
+            # NOTE(review): sibling scripts read 'kind'/'original' from the audit
+            # files; mirror them onto the 'type'/'text' keys is_match compares.
+            candidate = {
+                **detection,
+                'type': detection.get('kind', detection.get('type', 'UNKNOWN')),
+                'text': detection.get('original', detection.get('text', '')),
+            }
+            if any(is_match(candidate, annotation) for annotation in known_pii):
+                continue
+            false_positives[candidate['type']].append({
+                'text': candidate['text'],
+                'page': detection.get('page', 0),
+                'file': pdf_name,
+                'method': detection.get('method', 'unknown')
+            })
+
+    print("=" * 80)
+    print("EXEMPLES DE FAUX POSITIFS")
+    print("=" * 80)
+    print()
+
+    problematic_types = ['EPISODE', 'VILLE', 'CODE_POSTAL', 'ADRESSE', 'TEL']
+    for pii_type in problematic_types:
+        fps = false_positives.get(pii_type, [])
+        if not fps:
+            continue
+
+        print(f"\n{'=' * 80}")
+        print(f"Type: {pii_type} ({len(fps)} faux positifs)")
+        print(f"{'=' * 80}")
+
+        # Count identical texts to surface recurring patterns.
+        text_counts = defaultdict(int)
+        for fp in fps:
+            text_counts[fp['text']] += 1
+        sorted_texts = sorted(text_counts.items(), key=lambda x: x[1], reverse=True)
+
+        print(f"\nTextes les plus fréquents:")
+        for text, count in sorted_texts[:20]:
+            print(f"  {count:3d}x '{text}'")
+
+        print(f"\nExemples avec contexte:")
+        for i, fp in enumerate(fps[:10], 1):
+            print(f"  {i:2d}. '{fp['text']}' (page {fp['page']}, méthode: {fp['method']})")
+            print(f"      Fichier: {fp['file']}")
+
+    output_file = Path("tests/ground_truth/analysis/false_positives_examples.json")
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(dict(false_positives), f, indent=2, ensure_ascii=False)
+    print(f"\n✅ Résultats sauvegardés dans: {output_file}")
+
+if __name__ == "__main__":
+    extract_false_positives()
--- a/tools/show_fp_details.py
+++ b/tools/show_fp_details.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""
+Affiche les détails des faux positifs à partir des résultats d'évaluation.
+"""
+
+import json
+from pathlib import Path
+from collections import defaultdict, Counter
+
+# Load the baseline evaluation; its 'per_document' list names the documents
+# whose audit files are inspected below.
+eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
+with open(eval_file, 'r', encoding='utf-8') as eval_handle:
+    eval_data = json.load(eval_handle)
+
+# PII types under review, each with a hard-coded false-positive count that is
+# only echoed in the report.
+problematic_types = {
+    'EPISODE': 106,
+    'VILLE': 20,
+    'CODE_POSTAL': 10,
+    'ADRESSE': 10,
+    'TEL': 8
+}
+
+print("=" * 80)
+print("ANALYSE DES FAUX POSITIFS PAR TYPE")
+print("=" * 80)
+
+# Group every audit record by its 'kind', across all evaluated documents.
+all_detections = defaultdict(list)
+
+for doc in eval_data['per_document']:
+    pdf_name = doc['pdf']
+    audit_file = Path(f"tests/ground_truth/pdfs/baseline_anonymized/{pdf_name}.audit.jsonl")
+
+    # Documents without an audit file are skipped.
+    if audit_file.exists():
+        with open(audit_file, 'r', encoding='utf-8') as audit_handle:
+            for raw_line in audit_handle:
+                record = json.loads(raw_line)
+                entry = {
+                    'text': record.get('original', ''),
+                    'page': record.get('page', -1),
+                    'file': pdf_name
+                }
+                all_detections[record.get('kind', 'UNKNOWN')].append(entry)
+
+# Report, for each type under review, its most frequent texts and a sample.
+for pii_type, expected_fp in problematic_types.items():
+    detections = all_detections.get(pii_type, [])
+
+    print(f"\n{'=' * 80}")
+    print(f"Type: {pii_type}")
+    print(f"Faux positifs attendus: {expected_fp}")
+    print(f"Détections totales: {len(detections)}")
+    print(f"{'=' * 80}")
+
+    # Tally identical texts so recurring patterns stand out.
+    text_counter = Counter([d['text'] for d in detections])
+
+    print(f"\nTextes les plus fréquents:")
+    for text, count in text_counter.most_common(30):
+        print(f"  {count:3d}x '{text}'")
+
+    # Walk the first 20 detections, printing each (text, file) pair once.
+    print(f"\nExemples avec fichier:")
+    seen = set()
+    for d in detections[:20]:
+        key = (d['text'], d['file'])
+        if key in seen:
+            continue
+        seen.add(key)
+        print(f"  '{d['text']}' (page {d['page']}) - {d['file']}")
+
+# Closing rule.
+print("\n" + "=" * 80)
--- a/tools/test_hospital_filter.py
+++ b/tools/test_hospital_filter.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""
+Test du filtre hospitalier sur le dataset complet.
+"""
+
+import sys
+sys.path.insert(0, '.')
+
+from pathlib import Path
+import json
+from collections import Counter
+from anonymizer_core_refactored_onnx import process_pdf
+
+def main():
+    """Run ``process_pdf`` on every ground-truth PDF and print detection totals.
+
+    ``tests/ground_truth/pdfs/filtered_anonymized`` is passed as the output
+    directory. An error on one document is printed and the run continues.
+    """
+    input_dir = Path("tests/ground_truth/pdfs")
+    output_dir = Path("tests/ground_truth/pdfs/filtered_anonymized")
+    output_dir.mkdir(exist_ok=True)
+
+    # Skip hidden files (names starting with a dot).
+    pdf_files = [p for p in sorted(input_dir.glob("*.pdf")) if not p.name.startswith('.')]
+
+    print(f"Anonymisation avec filtre hospitalier sur {len(pdf_files)} documents...")
+    print("=" * 80)
+
+    total_detections = 0
+    total_by_type = Counter()
+
+    for i, pdf_path in enumerate(pdf_files, 1):
+        print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}")
+
+        try:
+            result = process_pdf(
+                pdf_path, output_dir,
+                make_vector_redaction=False, also_make_raster_burn=False,
+            )
+
+            audit_file = Path(result['audit'])
+            if audit_file.exists():
+                # Explicit UTF-8: the platform default may not decode accents.
+                with open(audit_file, 'r', encoding='utf-8') as f:
+                    detections = [json.loads(line) for line in f if line.strip()]
+
+                kinds = [det['kind'] for det in detections]  # raises before totals change
+                total_by_type.update(kinds)
+                total_detections += len(detections)
+                print(f"  ✅ {len(detections)} PII détectés")
+            else:
+                print(f"  ⚠️  Pas de fichier audit")
+
+        except Exception as e:
+            # Deliberate per-document boundary: report and continue.
+            print(f"  ❌ Erreur: {e}")
+
+    print("\n" + "=" * 80)
+    print("RÉSULTATS GLOBAUX")
+    print("=" * 80)
+    print(f"Total PII détectés: {total_detections}")
+    print(f"\nPar type:")
+    for kind, count in sorted(total_by_type.items(), key=lambda x: -x[1]):
+        print(f"  {kind:20s}: {count:4d}")
+
+    print(f"\n✅ Résultats sauvegardés dans: {output_dir}")
+
+if __name__ == "__main__":
+    main()