feat: Analyse propagation globale - 100% des *_GLOBAL et NOM_EXTRACTED sont des FP

This commit is contained in:
2026-03-02 11:01:14 +01:00
parent 4eba826ca5
commit dfa45041d7
6 changed files with 876 additions and 3 deletions

View File

@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
Analyse la propagation globale et propose des optimisations.
La propagation globale crée 4,797 FP (96.9% du total):
- NOM_EXTRACTED: 3,846 FP (77.7%)
- *_GLOBAL: 951 FP (19.2%)
Ce script analyse quels types bénéficient vraiment de la propagation.
"""
import json
import re
import sys
from collections import Counter, defaultdict
from datetime import date
from pathlib import Path
def analyze_global_propagation():
    """Analyze the global-propagation results and propose optimizations.

    Reads the baseline quality-evaluation JSON, prints per-type statistics
    for the ``*_GLOBAL`` types and for ``NOM_EXTRACTED``, derives a set of
    prioritized recommendations, estimates the total precision gain if they
    were all applied, and saves the full analysis to
    ``tests/ground_truth/analysis/global_propagation_analysis.json``.

    Returns:
        int: 0 on success (suitable as a process exit code).

    Raises:
        FileNotFoundError: if the evaluation file is missing.
        KeyError: if the evaluation JSON lacks the expected keys
            (``by_type``, ``global_metrics``).
    """
    print("=" * 80)
    print("ANALYSE DE LA PROPAGATION GLOBALE")
    print("=" * 80)

    # Load the baseline evaluation results.
    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)
    by_type = eval_data['by_type']

    # Partition types: globally-propagated (*_GLOBAL) vs. their base types.
    # NOM_EXTRACTED is analyzed separately below.
    global_types = {k: v for k, v in by_type.items() if k.endswith('_GLOBAL')}
    non_global_types = {
        k: v for k, v in by_type.items()
        if not k.endswith('_GLOBAL') and k != 'NOM_EXTRACTED'
    }
    print(f"\n📊 Types avec propagation globale: {len(global_types)}")
    print(f"📊 Types sans propagation: {len(non_global_types)}")

    # Per-type statistics for *_GLOBAL, paired with their base type's stats
    # so the cost/benefit of propagation is visible side by side.
    print("\n" + "=" * 80)
    print("ANALYSE DES TYPES *_GLOBAL")
    print("=" * 80)
    global_stats = []
    for pii_type, stats in global_types.items():
        base_type = pii_type.replace('_GLOBAL', '')
        base_stats = non_global_types.get(base_type, {})
        global_stats.append({
            "type": pii_type,
            "base_type": base_type,
            "tp": stats['true_positives'],
            "fp": stats['false_positives'],
            "fn": stats['false_negatives'],
            "precision": stats['precision'],
            "base_tp": base_stats.get('true_positives', 0),
            "base_fp": base_stats.get('false_positives', 0),
            "base_precision": base_stats.get('precision', 0.0)
        })

    # Worst offenders first (sorted by FP count, descending).
    global_stats.sort(key=lambda x: x['fp'], reverse=True)
    print(f"\n🔴 Types *_GLOBAL par nombre de faux positifs:\n")
    for i, stat in enumerate(global_stats, 1):
        print(f"{i}. {stat['type']}")
        print(f" TP: {stat['tp']}, FP: {stat['fp']}, Précision: {stat['precision']:.2%}")
        print(f" Type de base ({stat['base_type']}): TP: {stat['base_tp']}, FP: {stat['base_fp']}, Précision: {stat['base_precision']:.2%}")
        # Classify usefulness: no TP at all -> useless; precision < 50% ->
        # problematic; TP with precision >= 80% -> worth keeping.
        if stat['tp'] == 0 and stat['fp'] > 0:
            print(f" ⚠️ INUTILE: Aucun TP, {stat['fp']} FP → À DÉSACTIVER")
        elif stat['precision'] < 0.5:
            print(f" ⚠️ PROBLÉMATIQUE: Précision {stat['precision']:.2%} → À DÉSACTIVER")
        elif stat['tp'] > 0 and stat['precision'] >= 0.8:
            print(f" ✅ UTILE: {stat['tp']} TP avec bonne précision → À CONSERVER")
        else:
            print(f" ⚙️ À ÉVALUER: Bénéfice/coût à analyser")
        print()

    # NOM_EXTRACTED analysis.
    print("=" * 80)
    print("ANALYSE DE NOM_EXTRACTED")
    print("=" * 80)
    nom_extracted_stats = by_type.get('NOM_EXTRACTED', {})
    print(f"\n📊 NOM_EXTRACTED:")
    print(f" TP: {nom_extracted_stats.get('true_positives', 0)}")
    print(f" FP: {nom_extracted_stats.get('false_positives', 0)}")
    print(f" Précision: {nom_extracted_stats.get('precision', 0):.2%}")
    if nom_extracted_stats.get('false_positives', 0) > 1000:
        print(f"\n ⚠️ PROBLÈME MAJEUR: {nom_extracted_stats['false_positives']} FP")
        print(f" Impact: {nom_extracted_stats['false_positives'] / eval_data['global_metrics']['false_positives'] * 100:.1f}% des FP totaux")
        print(f"\n 💡 SOLUTION: Désactiver NOM_EXTRACTED ou améliorer drastiquement le filtrage")

    # Build prioritized recommendations.
    print("\n" + "=" * 80)
    print("RECOMMANDATIONS")
    print("=" * 80)
    recommendations = []

    # Shared denominator for precision-gain estimates: total predictions.
    total_predictions = (eval_data['global_metrics']['true_positives']
                         + eval_data['global_metrics']['false_positives'])

    # Recommendation 1: disable *_GLOBAL types that produce FP but zero TP.
    useless_global = [s for s in global_stats if s['tp'] == 0 and s['fp'] > 0]
    if useless_global:
        total_fp_saved = sum(s['fp'] for s in useless_global)
        recommendations.append({
            "priority": 1,
            "title": "Désactiver les types *_GLOBAL inutiles",
            "types": [s['type'] for s in useless_global],
            "impact": f"Réduction de {total_fp_saved} FP",
            "gain_precision": f"+{total_fp_saved / total_predictions * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Modifier la fonction de propagation globale pour exclure ces types"
        })

    # Recommendation 2: disable NOM_EXTRACTED when it dominates the FP count.
    if nom_extracted_stats.get('false_positives', 0) > 1000:
        recommendations.append({
            "priority": 2,
            "title": "Désactiver NOM_EXTRACTED",
            "types": ['NOM_EXTRACTED'],
            "impact": f"Réduction de {nom_extracted_stats['false_positives']} FP",
            "gain_precision": f"+{nom_extracted_stats['false_positives'] / total_predictions * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Commenter ou supprimer la logique d'extraction de noms"
        })

    # Recommendation 3: better filtering for *_GLOBAL types that do produce
    # some TP but with precision below 50% (assume 70% of their FP fixable).
    problematic_global = [s for s in global_stats if s['tp'] > 0 and s['precision'] < 0.5]
    if problematic_global:
        total_fp = sum(s['fp'] for s in problematic_global)
        recommendations.append({
            "priority": 3,
            "title": "Améliorer le filtrage des types *_GLOBAL problématiques",
            "types": [s['type'] for s in problematic_global],
            "impact": f"Réduction estimée de {int(total_fp * 0.7)} FP (70% des {total_fp})",
            "gain_precision": f"+{int(total_fp * 0.7) / total_predictions * 100:.1f} points",
            "effort": "Moyen",
            "implementation": "Ajouter validation croisée ou seuil de confiance avant propagation"
        })

    print(f"\n🎯 {len(recommendations)} recommandations:\n")
    for rec in recommendations:
        print(f"Priorité {rec['priority']}: {rec['title']}")
        print(f" Types: {', '.join(rec['types'][:5])}")
        if len(rec['types']) > 5:
            print(f" ... et {len(rec['types']) - 5} autres")
        print(f" Impact: {rec['impact']}")
        print(f" Gain précision: {rec['gain_precision']}")
        print(f" Effort: {rec['effort']}")
        print(f" Implémentation: {rec['implementation']}")
        print()

    # Total gain estimate: pull the first integer out of each "impact"
    # string (the FP reduction figure comes first in every template above).
    total_fp_reduction = 0
    for rec in recommendations:
        match = re.search(r'(\d+)', rec['impact'])
        if match:
            total_fp_reduction += int(match.group(1))
    current_fp = eval_data['global_metrics']['false_positives']
    current_tp = eval_data['global_metrics']['true_positives']
    current_precision = eval_data['global_metrics']['precision']
    new_fp = current_fp - total_fp_reduction
    new_precision = current_tp / (current_tp + new_fp) if (current_tp + new_fp) > 0 else 0
    print("=" * 80)
    print("ESTIMATION DU GAIN TOTAL")
    print("=" * 80)
    print(f"\n🎯 Avec toutes les recommandations:")
    print(f" - FP actuels: {current_fp}")
    print(f" - FP estimés: {new_fp} (-{total_fp_reduction})")
    print(f" - Précision actuelle: {current_precision:.2%}")
    print(f" - Précision estimée: {new_precision:.2%} (+{(new_precision - current_precision)*100:.1f} points)")
    if new_precision >= 0.97:
        print(f"\n ✅ Objectif de précision (≥97%) ATTEIGNABLE!")
    else:
        print(f"\n ⚠️ Objectif de précision (≥97%) nécessite {(0.97 - new_precision)*100:.1f} points supplémentaires")

    # Persist the analysis. Create the output directory first: without this,
    # open() raises FileNotFoundError on a fresh checkout.
    output_dir = Path("tests/ground_truth/analysis")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "global_propagation_analysis.json"
    output_data = {
        # Use the actual run date instead of a hard-coded constant.
        "analysis_date": date.today().isoformat(),
        "global_types_stats": global_stats,
        "nom_extracted_stats": nom_extracted_stats,
        "recommendations": recommendations,
        "estimated_gain": {
            "current_fp": current_fp,
            "estimated_fp": new_fp,
            "fp_reduction": total_fp_reduction,
            "current_precision": current_precision,
            "estimated_precision": new_precision,
            "precision_gain": new_precision - current_precision
        }
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print(f"\n📊 Analyse sauvegardée: {output_file}")
    print("\n" + "=" * 80)
    return 0
if __name__ == "__main__":
    # Propagate the function's return value (0 on success) as the exit code.
    sys.exit(analyze_global_propagation())