#!/usr/bin/env python3
"""Analyze global propagation and propose optimizations.

Global propagation creates 4,797 FP (96.9% of the total):
- NOM_EXTRACTED: 3,846 FP (77.7%)
- *_GLOBAL: 951 FP (19.2%)

This script analyzes which types genuinely benefit from propagation.
"""

import json
import re
import sys
from datetime import date
from pathlib import Path

# Baseline evaluation results produced by a previous run (relative to CWD).
EVAL_FILE = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
# Destination of the JSON analysis report written by this script.
OUTPUT_FILE = Path("tests/ground_truth/analysis/global_propagation_analysis.json")

SEPARATOR = "=" * 80

# The human-readable "impact" strings start with the FP count; this pattern
# extracts the first integer back out of them (hoisted + pre-compiled: the
# original re-imported `re` inside the loop on every iteration).
_FP_COUNT_RE = re.compile(r'(\d+)')


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _load_eval_data(eval_file):
    """Load and return the evaluation results JSON as a dict."""
    with open(eval_file, 'r', encoding='utf-8') as f:
        return json.load(f)


def _collect_global_stats(global_types, non_global_types):
    """Pair each *_GLOBAL type with its base type's metrics.

    Returns a list of stat dicts sorted by false-positive count, descending.
    Missing base types default to 0 counts / 0.0 precision.
    """
    global_stats = []
    for pii_type, stats in global_types.items():
        base_type = pii_type.replace('_GLOBAL', '')
        base_stats = non_global_types.get(base_type, {})
        global_stats.append({
            "type": pii_type,
            "base_type": base_type,
            "tp": stats['true_positives'],
            "fp": stats['false_positives'],
            "fn": stats['false_negatives'],
            "precision": stats['precision'],
            "base_tp": base_stats.get('true_positives', 0),
            "base_fp": base_stats.get('false_positives', 0),
            "base_precision": base_stats.get('precision', 0.0),
        })
    global_stats.sort(key=lambda x: x['fp'], reverse=True)
    return global_stats


def _print_global_stats(global_stats):
    """Print each *_GLOBAL type with a keep/disable verdict."""
    print(f"\n🔴 Types *_GLOBAL par nombre de faux positifs:\n")
    for i, stat in enumerate(global_stats, 1):
        print(f"{i}. {stat['type']}")
        print(f" TP: {stat['tp']}, FP: {stat['fp']}, Précision: {stat['precision']:.2%}")
        print(f" Type de base ({stat['base_type']}): TP: {stat['base_tp']}, FP: {stat['base_fp']}, Précision: {stat['base_precision']:.2%}")
        # Verdict: no TP at all -> useless; precision < 50% -> problematic;
        # has TP with precision >= 80% -> keep; anything else -> needs review.
        if stat['tp'] == 0 and stat['fp'] > 0:
            print(f" ⚠️ INUTILE: Aucun TP, {stat['fp']} FP → À DÉSACTIVER")
        elif stat['precision'] < 0.5:
            print(f" ⚠️ PROBLÉMATIQUE: Précision {stat['precision']:.2%} → À DÉSACTIVER")
        elif stat['tp'] > 0 and stat['precision'] >= 0.8:
            print(f" ✅ UTILE: {stat['tp']} TP avec bonne précision → À CONSERVER")
        else:
            print(f" ⚙️ À ÉVALUER: Bénéfice/coût à analyser")
        print()


def _print_nom_extracted(nom_extracted_stats, eval_data):
    """Print NOM_EXTRACTED statistics and flag it when its FP count is extreme."""
    print(f"\n📊 NOM_EXTRACTED:")
    print(f" TP: {nom_extracted_stats.get('true_positives', 0)}")
    print(f" FP: {nom_extracted_stats.get('false_positives', 0)}")
    print(f" Précision: {nom_extracted_stats.get('precision', 0):.2%}")

    nom_fp = nom_extracted_stats.get('false_positives', 0)
    if nom_fp > 1000:
        total_fp = eval_data['global_metrics']['false_positives']
        print(f"\n ⚠️ PROBLÈME MAJEUR: {nom_fp} FP")
        print(f" Impact: {_safe_ratio(nom_fp, total_fp) * 100:.1f}% des FP totaux")
        print(f"\n 💡 SOLUTION: Désactiver NOM_EXTRACTED ou améliorer drastiquement le filtrage")


def _build_recommendations(global_stats, nom_extracted_stats, eval_data):
    """Derive prioritized recommendation dicts from the collected statistics."""
    metrics = eval_data['global_metrics']
    # Denominator for precision-gain estimates: every detection (TP + FP).
    detections = metrics['true_positives'] + metrics['false_positives']
    recommendations = []

    # Priority 1: disable *_GLOBAL types that produce zero true positives.
    useless_global = [s for s in global_stats if s['tp'] == 0 and s['fp'] > 0]
    if useless_global:
        total_fp_saved = sum(s['fp'] for s in useless_global)
        recommendations.append({
            "priority": 1,
            "title": "Désactiver les types *_GLOBAL inutiles",
            "types": [s['type'] for s in useless_global],
            "impact": f"Réduction de {total_fp_saved} FP",
            "gain_precision": f"+{_safe_ratio(total_fp_saved, detections) * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Modifier la fonction de propagation globale pour exclure ces types"
        })

    # Priority 2: NOM_EXTRACTED alone generates a massive number of FP.
    nom_fp = nom_extracted_stats.get('false_positives', 0)
    if nom_fp > 1000:
        recommendations.append({
            "priority": 2,
            "title": "Désactiver NOM_EXTRACTED",
            "types": ['NOM_EXTRACTED'],
            "impact": f"Réduction de {nom_fp} FP",
            "gain_precision": f"+{_safe_ratio(nom_fp, detections) * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Commenter ou supprimer la logique d'extraction de noms"
        })

    # Priority 3: filter *_GLOBAL types that do yield TP but with poor precision.
    problematic_global = [s for s in global_stats if s['tp'] > 0 and s['precision'] < 0.5]
    if problematic_global:
        total_fp = sum(s['fp'] for s in problematic_global)
        # 70% is a heuristic estimate of how many FP better filtering removes.
        recommendations.append({
            "priority": 3,
            "title": "Améliorer le filtrage des types *_GLOBAL problématiques",
            "types": [s['type'] for s in problematic_global],
            "impact": f"Réduction estimée de {int(total_fp * 0.7)} FP (70% des {total_fp})",
            "gain_precision": f"+{_safe_ratio(int(total_fp * 0.7), detections) * 100:.1f} points",
            "effort": "Moyen",
            "implementation": "Ajouter validation croisée ou seuil de confiance avant propagation"
        })

    return recommendations


def _print_recommendations(recommendations):
    """Print each recommendation (at most 5 type names shown per entry)."""
    print(f"\n🎯 {len(recommendations)} recommandations:\n")
    for rec in recommendations:
        print(f"Priorité {rec['priority']}: {rec['title']}")
        print(f" Types: {', '.join(rec['types'][:5])}")
        if len(rec['types']) > 5:
            print(f" ... et {len(rec['types']) - 5} autres")
        print(f" Impact: {rec['impact']}")
        print(f" Gain précision: {rec['gain_precision']}")
        print(f" Effort: {rec['effort']}")
        print(f" Implémentation: {rec['implementation']}")
        print()


def _estimate_total_gain(recommendations, eval_data):
    """Estimate the combined FP reduction and resulting precision.

    The FP count is parsed back out of each recommendation's human-readable
    "impact" string (first integer found). Returns the estimated-gain dict
    that is persisted in the output report.
    """
    total_fp_reduction = 0
    for rec in recommendations:
        match = _FP_COUNT_RE.search(rec['impact'])
        if match:
            total_fp_reduction += int(match.group(1))

    metrics = eval_data['global_metrics']
    current_fp = metrics['false_positives']
    current_tp = metrics['true_positives']
    current_precision = metrics['precision']

    new_fp = current_fp - total_fp_reduction
    # Keep the original guard (`> 0`): new_fp can in theory go negative when
    # the priority-3 estimate overlaps with the others.
    new_precision = current_tp / (current_tp + new_fp) if (current_tp + new_fp) > 0 else 0

    print(SEPARATOR)
    print("ESTIMATION DU GAIN TOTAL")
    print(SEPARATOR)
    print(f"\n🎯 Avec toutes les recommandations:")
    print(f" - FP actuels: {current_fp}")
    print(f" - FP estimés: {new_fp} (-{total_fp_reduction})")
    print(f" - Précision actuelle: {current_precision:.2%}")
    print(f" - Précision estimée: {new_precision:.2%} (+{(new_precision - current_precision)*100:.1f} points)")

    if new_precision >= 0.97:
        print(f"\n ✅ Objectif de précision (≥97%) ATTEIGNABLE!")
    else:
        print(f"\n ⚠️ Objectif de précision (≥97%) nécessite {(0.97 - new_precision)*100:.1f} points supplémentaires")

    return {
        "current_fp": current_fp,
        "estimated_fp": new_fp,
        "fp_reduction": total_fp_reduction,
        "current_precision": current_precision,
        "estimated_precision": new_precision,
        "precision_gain": new_precision - current_precision,
    }


def _save_analysis(output_file, global_stats, nom_extracted_stats, recommendations, estimated_gain):
    """Write the analysis report as JSON, creating parent directories as needed."""
    # Bug fix: the original wrote into tests/ground_truth/analysis/ without
    # ensuring the directory exists (FileNotFoundError on a fresh checkout).
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_data = {
        # Bug fix: was hard-coded to a stale literal date ("2026-03-02").
        "analysis_date": date.today().isoformat(),
        "global_types_stats": global_stats,
        "nom_extracted_stats": nom_extracted_stats,
        "recommendations": recommendations,
        "estimated_gain": estimated_gain,
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print(f"\n📊 Analyse sauvegardée: {output_file}")


def analyze_global_propagation():
    """Analyze global propagation, print recommendations, and save a report.

    Reads EVAL_FILE, prints the full analysis to stdout, writes the JSON
    report to OUTPUT_FILE, and returns 0 (used as the process exit code).
    """
    print(SEPARATOR)
    print("ANALYSE DE LA PROPAGATION GLOBALE")
    print(SEPARATOR)

    eval_data = _load_eval_data(EVAL_FILE)
    by_type = eval_data['by_type']

    # NOM_EXTRACTED is handled separately, so it is excluded from both maps.
    global_types = {k: v for k, v in by_type.items() if k.endswith('_GLOBAL')}
    non_global_types = {
        k: v for k, v in by_type.items()
        if not k.endswith('_GLOBAL') and k != 'NOM_EXTRACTED'
    }

    print(f"\n📊 Types avec propagation globale: {len(global_types)}")
    print(f"📊 Types sans propagation: {len(non_global_types)}")

    print(f"\n" + SEPARATOR)
    print("ANALYSE DES TYPES *_GLOBAL")
    print(SEPARATOR)
    global_stats = _collect_global_stats(global_types, non_global_types)
    _print_global_stats(global_stats)

    print(SEPARATOR)
    print("ANALYSE DE NOM_EXTRACTED")
    print(SEPARATOR)
    nom_extracted_stats = by_type.get('NOM_EXTRACTED', {})
    _print_nom_extracted(nom_extracted_stats, eval_data)

    print(f"\n" + SEPARATOR)
    print("RECOMMANDATIONS")
    print(SEPARATOR)
    recommendations = _build_recommendations(global_stats, nom_extracted_stats, eval_data)
    _print_recommendations(recommendations)

    estimated_gain = _estimate_total_gain(recommendations, eval_data)
    _save_analysis(OUTPUT_FILE, global_stats, nom_extracted_stats, recommendations, estimated_gain)

    print("\n" + SEPARATOR)
    return 0


if __name__ == "__main__":
    sys.exit(analyze_global_propagation())