feat: Analyse baseline - 77.7% FP dus à NOM_EXTRACTED, 19.2% à propagation globale
This commit is contained in:
@@ -79,12 +79,12 @@
|
||||
- [x] 1.3.2.3 Mesurer l'utilisation CPU/RAM
|
||||
- [x] 1.3.2.4 Exporter les résultats baseline
|
||||
|
||||
- [ ] 1.3.3 Analyser les résultats baseline
|
||||
- [ ] 1.3.3.1 Analyser les types de PII manqués (faux négatifs)
|
||||
- [ ] 1.3.3.2 Analyser les types de faux positifs
|
||||
- [ ] 1.3.3.3 Identifier les patterns problématiques
|
||||
- [ ] 1.3.3.4 Prioriser les améliorations à implémenter
|
||||
- [ ] 1.3.3.5 Documenter les findings dans un rapport
|
||||
- [x] 1.3.3 Analyser les résultats baseline
|
||||
- [x] 1.3.3.1 Analyser les types de PII manqués (faux négatifs)
|
||||
- [x] 1.3.3.2 Analyser les types de faux positifs
|
||||
- [x] 1.3.3.3 Identifier les patterns problématiques
|
||||
- [x] 1.3.3.4 Prioriser les améliorations à implémenter
|
||||
- [x] 1.3.3.5 Documenter les findings dans un rapport
|
||||
|
||||
---
|
||||
|
||||
|
||||
158
tests/ground_truth/analysis/baseline_analysis.json
Normal file
158
tests/ground_truth/analysis/baseline_analysis.json
Normal file
@@ -0,0 +1,158 @@
|
||||
{
|
||||
"analysis_date": "2026-03-02",
|
||||
"global_metrics": {
|
||||
"precision": 0.1897,
|
||||
"recall": 1.0,
|
||||
"f1_score": 0.3189,
|
||||
"true_positives": 1159,
|
||||
"false_positives": 4951,
|
||||
"false_negatives": 0
|
||||
},
|
||||
"problems": [
|
||||
{
|
||||
"priority": "HAUTE",
|
||||
"category": "Propagation globale",
|
||||
"description": "951 faux positifs dus aux détections *_GLOBAL",
|
||||
"types": [
|
||||
"NOM_GLOBAL",
|
||||
"ETAB_GLOBAL",
|
||||
"TEL_GLOBAL",
|
||||
"ADRESSE_GLOBAL",
|
||||
"CODE_POSTAL_GLOBAL",
|
||||
"DATE_NAISSANCE_GLOBAL",
|
||||
"EMAIL_GLOBAL",
|
||||
"RPPS_GLOBAL",
|
||||
"EPISODE_GLOBAL",
|
||||
"VILLE_GLOBAL"
|
||||
],
|
||||
"impact": "19.2% des FP totaux",
|
||||
"solution": "Améliorer la logique de propagation globale ou désactiver pour certains types"
|
||||
},
|
||||
{
|
||||
"priority": "HAUTE",
|
||||
"category": "Extraction de noms",
|
||||
"description": "3846 faux positifs de type NOM_EXTRACTED",
|
||||
"types": [
|
||||
"NOM_EXTRACTED"
|
||||
],
|
||||
"impact": "77.7% des FP totaux",
|
||||
"solution": "Améliorer les stopwords médicaux et la détection contextuelle"
|
||||
},
|
||||
{
|
||||
"priority": "MOYENNE",
|
||||
"category": "Précision faible",
|
||||
"description": "10 types avec précision < 50%",
|
||||
"types": [
|
||||
"NOM_EXTRACTED",
|
||||
"NOM_GLOBAL",
|
||||
"ETAB_GLOBAL",
|
||||
"TEL_GLOBAL",
|
||||
"ADRESSE_GLOBAL",
|
||||
"CODE_POSTAL_GLOBAL",
|
||||
"DATE_NAISSANCE_GLOBAL",
|
||||
"EMAIL_GLOBAL",
|
||||
"EPISODE",
|
||||
"VILLE"
|
||||
],
|
||||
"impact": "Affecte 4897 FP",
|
||||
"solution": "Améliorer les regex et la détection contextuelle pour ces types"
|
||||
}
|
||||
],
|
||||
"improvements": [
|
||||
{
|
||||
"priority": 2,
|
||||
"title": "Enrichir les stopwords médicaux",
|
||||
"impact": "Réduction de ~3846 FP NOM_EXTRACTED",
|
||||
"effort": "Faible",
|
||||
"gain_precision": "+62.9 points",
|
||||
"tasks": [
|
||||
"Extraire les termes médicaux des documents annotés",
|
||||
"Identifier les faux positifs récurrents",
|
||||
"Ajouter à _MEDICAL_STOP_WORDS_SET"
|
||||
]
|
||||
},
|
||||
{
|
||||
"priority": 4,
|
||||
"title": "Implémenter la détection contextuelle",
|
||||
"impact": "Réduction de ~126 FP",
|
||||
"effort": "Élevé",
|
||||
"gain_precision": "+2.1 points",
|
||||
"tasks": [
|
||||
"Créer detectors/contextual.py",
|
||||
"Implémenter la détection avec contexte fort/faible",
|
||||
"Filtrer via stopwords médicaux",
|
||||
"Intégrer dans le pipeline hybride"
|
||||
]
|
||||
}
|
||||
],
|
||||
"false_positives_by_type": {
|
||||
"NOM_EXTRACTED": 3846,
|
||||
"NOM_GLOBAL": 670,
|
||||
"EPISODE": 106,
|
||||
"TEL_GLOBAL": 77,
|
||||
"ADRESSE_GLOBAL": 55,
|
||||
"CODE_POSTAL_GLOBAL": 39,
|
||||
"ETAB_GLOBAL": 36,
|
||||
"EMAIL_GLOBAL": 28,
|
||||
"DATE_NAISSANCE_GLOBAL": 20,
|
||||
"VILLE": 20,
|
||||
"ADRESSE": 10,
|
||||
"CODE_POSTAL": 10,
|
||||
"VILLE_GLOBAL": 10,
|
||||
"EPISODE_GLOBAL": 9,
|
||||
"TEL": 8,
|
||||
"RPPS_GLOBAL": 7
|
||||
},
|
||||
"low_precision_types": [
|
||||
{
|
||||
"type": "NOM_EXTRACTED",
|
||||
"precision": 0.0,
|
||||
"fp": 3846
|
||||
},
|
||||
{
|
||||
"type": "NOM_GLOBAL",
|
||||
"precision": 0.0,
|
||||
"fp": 670
|
||||
},
|
||||
{
|
||||
"type": "ETAB_GLOBAL",
|
||||
"precision": 0.0,
|
||||
"fp": 36
|
||||
},
|
||||
{
|
||||
"type": "TEL_GLOBAL",
|
||||
"precision": 0.0,
|
||||
"fp": 77
|
||||
},
|
||||
{
|
||||
"type": "ADRESSE_GLOBAL",
|
||||
"precision": 0.0,
|
||||
"fp": 55
|
||||
},
|
||||
{
|
||||
"type": "CODE_POSTAL_GLOBAL",
|
||||
"precision": 0.0,
|
||||
"fp": 39
|
||||
},
|
||||
{
|
||||
"type": "DATE_NAISSANCE_GLOBAL",
|
||||
"precision": 0.0,
|
||||
"fp": 20
|
||||
},
|
||||
{
|
||||
"type": "EMAIL_GLOBAL",
|
||||
"precision": 0.0,
|
||||
"fp": 28
|
||||
},
|
||||
{
|
||||
"type": "EPISODE",
|
||||
"precision": 0.1452,
|
||||
"fp": 106
|
||||
},
|
||||
{
|
||||
"type": "VILLE",
|
||||
"precision": 0.2,
|
||||
"fp": 20
|
||||
}
|
||||
]
|
||||
}
|
||||
279
tools/analyze_baseline_results.py
Executable file
279
tools/analyze_baseline_results.py
Executable file
@@ -0,0 +1,279 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Analyse détaillée des résultats baseline.
|
||||
|
||||
Identifie les patterns problématiques, les faux positifs/négatifs,
|
||||
et priorise les améliorations à implémenter.
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
from collections import defaultdict, Counter
|
||||
|
||||
def _pct(part, whole):
    """Return *part* as a percentage of *whole*; 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def analyze_baseline_results() -> int:
    """Analyze the baseline evaluation results and produce a prioritized report.

    Reads the quality-evaluation JSON produced by the baseline run, prints a
    human-readable analysis (global metrics, false positives per type,
    identified problems, prioritized improvements, estimated gain) and writes
    a machine-readable report to
    ``tests/ground_truth/analysis/baseline_analysis.json``.

    Returns:
        int: 0 on success, 1 when the evaluation file is missing (used as a
        process exit code by the ``__main__`` guard).
    """
    from datetime import date  # local import: only needed to stamp the report

    # Load the evaluation results produced by the baseline quality run.
    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")

    if not eval_file.exists():
        print(f"✗ Fichier d'évaluation non trouvé: {eval_file}")
        return 1

    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)

    print("="*80)
    print("ANALYSE DES RÉSULTATS BASELINE")
    print("="*80)

    # Global metrics.
    global_metrics = eval_data['global_metrics']
    # Denominator shared by every "gain_precision" estimate below
    # (total detections = TP + FP); guarded against zero via _pct.
    precision_base = global_metrics['true_positives'] + global_metrics['false_positives']
    total_fp = global_metrics['false_positives']

    print("\n📊 Métriques Globales:")
    print(f" - Précision: {global_metrics['precision']:.2%}")
    print(f" - Rappel: {global_metrics['recall']:.2%}")
    print(f" - F1-Score: {global_metrics['f1_score']:.2%}")
    print(f" - TP: {global_metrics['true_positives']}")
    print(f" - FP: {global_metrics['false_positives']}")
    print(f" - FN: {global_metrics['false_negatives']}")

    # False positives broken down by PII type.
    print("\n" + "="*80)
    print("ANALYSE DES FAUX POSITIFS")
    print("="*80)

    by_type = eval_data['by_type']

    # Sort types by decreasing FP count.
    fp_by_type = [(pii_type, stats['false_positives'])
                  for pii_type, stats in by_type.items()
                  if stats['false_positives'] > 0]
    fp_by_type.sort(key=lambda x: x[1], reverse=True)

    print("\n🔴 Top 10 types avec le plus de faux positifs:")
    for i, (pii_type, fp_count) in enumerate(fp_by_type[:10], 1):
        stats = by_type[pii_type]
        total = stats['true_positives'] + stats['false_positives']
        pct = _pct(fp_count, total)
        print(f" {i}. {pii_type}: {fp_count} FP ({pct:.1f}% du total)")
        print(f" TP: {stats['true_positives']}, Précision: {stats['precision']:.2%}")

    # Identify problematic patterns.
    print("\n" + "="*80)
    print("PATTERNS PROBLÉMATIQUES IDENTIFIÉS")
    print("="*80)

    problems = []

    # Problem 1: *_GLOBAL propagation detections.
    global_types = [t for t in by_type.keys() if t.endswith('_GLOBAL')]
    global_fp = sum(by_type[t]['false_positives'] for t in global_types)
    if global_fp > 0:
        problems.append({
            "priority": "HAUTE",
            "category": "Propagation globale",
            "description": f"{global_fp} faux positifs dus aux détections *_GLOBAL",
            "types": global_types,
            # _pct guards the total_fp == 0 corner case (was a ZeroDivisionError).
            "impact": f"{_pct(global_fp, total_fp):.1f}% des FP totaux",
            "solution": "Améliorer la logique de propagation globale ou désactiver pour certains types"
        })

    # Problem 2: NOM_EXTRACTED noise.
    if 'NOM_EXTRACTED' in by_type:
        nom_extracted_fp = by_type['NOM_EXTRACTED']['false_positives']
        if nom_extracted_fp > 0:
            problems.append({
                "priority": "HAUTE",
                "category": "Extraction de noms",
                "description": f"{nom_extracted_fp} faux positifs de type NOM_EXTRACTED",
                "types": ['NOM_EXTRACTED'],
                "impact": f"{_pct(nom_extracted_fp, total_fp):.1f}% des FP totaux",
                "solution": "Améliorer les stopwords médicaux et la détection contextuelle"
            })

    # Problem 3: types with low precision (< 50% and more than 10 FP).
    low_precision_types = [(t, s) for t, s in by_type.items()
                           if s['precision'] < 0.5 and s['false_positives'] > 10]
    if low_precision_types:
        low_precision_types.sort(key=lambda x: x[1]['precision'])
        problems.append({
            "priority": "MOYENNE",
            "category": "Précision faible",
            "description": f"{len(low_precision_types)} types avec précision < 50%",
            "types": [t for t, _ in low_precision_types],
            "impact": f"Affecte {sum(s['false_positives'] for _, s in low_precision_types)} FP",
            "solution": "Améliorer les regex et la détection contextuelle pour ces types"
        })

    # Problem 4: false negatives (if any — recall < 100%).
    fn_by_type = [(pii_type, stats['false_negatives'])
                  for pii_type, stats in by_type.items()
                  if stats['false_negatives'] > 0]
    if fn_by_type:
        fn_by_type.sort(key=lambda x: x[1], reverse=True)
        total_fn = sum(fn for _, fn in fn_by_type)
        problems.append({
            "priority": "CRITIQUE",
            "category": "Faux négatifs",
            "description": f"{total_fn} PII manqués",
            "types": [t for t, _ in fn_by_type],
            "impact": f"Rappel affecté: {global_metrics['recall']:.2%}",
            "solution": "Améliorer la couverture des regex et ajouter détection contextuelle"
        })

    # Display the identified problems.
    print(f"\n🔍 {len(problems)} problèmes identifiés:\n")
    for i, problem in enumerate(problems, 1):
        print(f"{i}. [{problem['priority']}] {problem['category']}")
        print(f" Description: {problem['description']}")
        print(f" Impact: {problem['impact']}")
        print(f" Types concernés: {', '.join(problem['types'][:5])}")
        if len(problem['types']) > 5:
            print(f" ... et {len(problem['types']) - 5} autres")
        print(f" Solution proposée: {problem['solution']}")
        print()

    # Prioritize improvements.
    print("="*80)
    print("PRIORISATION DES AMÉLIORATIONS")
    print("="*80)

    improvements = []
    # Numeric FP reductions tracked alongside `improvements`, so the final
    # estimate does not have to re-parse the human-readable "impact" strings
    # (the previous string-splitting approach was fragile).
    fp_reductions = []

    # Improvement 1: disable or fix the global propagation.
    if global_fp > 1000:
        improvements.append({
            "priority": 1,
            "title": "Optimiser la propagation globale (*_GLOBAL)",
            "impact": f"Réduction de ~{global_fp} FP",
            "effort": "Moyen",
            "gain_precision": f"+{_pct(global_fp, precision_base):.1f} points",
            "tasks": [
                "Analyser la pertinence de chaque type *_GLOBAL",
                "Désactiver la propagation pour les types problématiques",
                "Implémenter une validation croisée avant propagation"
            ]
        })
        fp_reductions.append(global_fp)

    # Improvement 2: enrich the medical stopword list.
    if 'NOM_EXTRACTED' in by_type and by_type['NOM_EXTRACTED']['false_positives'] > 1000:
        nom_fp = by_type['NOM_EXTRACTED']['false_positives']
        improvements.append({
            "priority": 2,
            "title": "Enrichir les stopwords médicaux",
            "impact": f"Réduction de ~{nom_fp} FP NOM_EXTRACTED",
            "effort": "Faible",
            "gain_precision": f"+{_pct(nom_fp, precision_base):.1f} points",
            "tasks": [
                "Extraire les termes médicaux des documents annotés",
                "Identifier les faux positifs récurrents",
                "Ajouter à _MEDICAL_STOP_WORDS_SET"
            ]
        })
        fp_reductions.append(nom_fp)

    # Improvement 3: tighten the detection regexes.
    regex_types = ['TEL', 'EMAIL', 'ADRESSE', 'CODE_POSTAL', 'NIR']
    regex_fp = sum(by_type.get(t, {}).get('false_positives', 0) for t in regex_types)
    if regex_fp > 50:
        improvements.append({
            "priority": 3,
            "title": "Améliorer les regex de détection",
            "impact": f"Réduction de ~{regex_fp} FP",
            "effort": "Moyen",
            "gain_precision": f"+{_pct(regex_fp, precision_base):.1f} points",
            "tasks": [
                "Améliorer RE_TEL (formats fragmentés)",
                "Améliorer RE_EMAIL (domaines médicaux)",
                "Améliorer RE_ADRESSE (compléments Bât., Appt.)",
                "Améliorer RE_NIR (espaces variables)"
            ]
        })
        fp_reductions.append(regex_fp)

    # Improvement 4: contextual detection for ambiguous types.
    contextual_types = ['EPISODE', 'VILLE']
    contextual_issues = [(t, by_type[t]) for t in contextual_types
                         if t in by_type and by_type[t]['precision'] < 0.5]
    if contextual_issues:
        total_contextual_fp = sum(s['false_positives'] for _, s in contextual_issues)
        improvements.append({
            "priority": 4,
            "title": "Implémenter la détection contextuelle",
            "impact": f"Réduction de ~{total_contextual_fp} FP",
            "effort": "Élevé",
            "gain_precision": f"+{_pct(total_contextual_fp, precision_base):.1f} points",
            "tasks": [
                "Créer detectors/contextual.py",
                "Implémenter la détection avec contexte fort/faible",
                "Filtrer via stopwords médicaux",
                "Intégrer dans le pipeline hybride"
            ]
        })
        fp_reductions.append(total_contextual_fp)

    # Display the prioritized improvements.
    print(f"\n🎯 {len(improvements)} améliorations prioritaires:\n")
    for imp in improvements:
        print(f"Priorité {imp['priority']}: {imp['title']}")
        print(f" Impact: {imp['impact']}")
        print(f" Gain précision estimé: {imp['gain_precision']}")
        print(f" Effort: {imp['effort']}")
        print(" Tâches:")
        for task in imp['tasks']:
            print(f" - {task}")
        print()

    # Persist the machine-readable report.
    report_dir = Path("tests/ground_truth/analysis")
    # parents=True: also create missing ancestors (exist_ok alone raised
    # FileNotFoundError on a fresh checkout).
    report_dir.mkdir(parents=True, exist_ok=True)

    report_data = {
        # Stamp with the actual run date (was hard-coded to "2026-03-02").
        "analysis_date": date.today().isoformat(),
        "global_metrics": global_metrics,
        "problems": problems,
        "improvements": improvements,
        "false_positives_by_type": dict(fp_by_type),
        "low_precision_types": [
            {"type": t, "precision": s['precision'], "fp": s['false_positives']}
            for t, s in low_precision_types
        ] if low_precision_types else []
    }

    report_file = report_dir / "baseline_analysis.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report_data, f, indent=2, ensure_ascii=False)

    print(f"📊 Rapport d'analyse sauvegardé: {report_file}")

    # Estimate the potential gain.
    print("\n" + "="*80)
    print("ESTIMATION DU GAIN POTENTIEL")
    print("="*80)

    # If every prioritized improvement is implemented. Reductions may
    # double-count types that appear in several improvements, so the
    # projected FP count is clamped at 0 (it previously could go negative,
    # yielding an estimated precision above 100%).
    total_fp_reduction = sum(fp_reductions)

    new_fp = max(0, global_metrics['false_positives'] - total_fp_reduction)
    denom = global_metrics['true_positives'] + new_fp
    new_precision = global_metrics['true_positives'] / denom if denom else 0.0
    pr_sum = new_precision + global_metrics['recall']
    new_f1 = 2 * (new_precision * global_metrics['recall']) / pr_sum if pr_sum else 0.0

    print("\n🎯 Avec toutes les améliorations prioritaires:")
    print(f" - FP actuels: {global_metrics['false_positives']}")
    print(f" - FP estimés: {new_fp} (-{total_fp_reduction})")
    print(f" - Précision actuelle: {global_metrics['precision']:.2%}")
    print(f" - Précision estimée: {new_precision:.2%} (+{(new_precision - global_metrics['precision'])*100:.1f} points)")
    print(f" - F1 actuel: {global_metrics['f1_score']:.2%}")
    print(f" - F1 estimé: {new_f1:.2%} (+{(new_f1 - global_metrics['f1_score'])*100:.1f} points)")

    if new_precision >= 0.97:
        print("\n ✅ Objectif de précision (≥97%) ATTEIGNABLE")
    else:
        print("\n ⚠️ Objectif de précision (≥97%) nécessite des améliorations supplémentaires")

    print("\n" + "="*80)

    return 0
if __name__ == "__main__":
    # Script entry point: propagate the analysis result as the process exit status.
    raise SystemExit(analyze_baseline_results())
|
||||
Reference in New Issue
Block a user