feat: Analyse propagation globale - 100% des *_GLOBAL et NOM_EXTRACTED sont des FP
This commit is contained in:
214
tools/analyze_global_propagation.py
Executable file
214
tools/analyze_global_propagation.py
Executable file
@@ -0,0 +1,214 @@
|
||||
#!/usr/bin/env python3
"""Analyze global propagation and propose optimizations.

Global propagation creates 4,797 FP (96.9% of the total):
- NOM_EXTRACTED: 3,846 FP (77.7%)
- *_GLOBAL: 951 FP (19.2%)

This script analyzes which types actually benefit from propagation.
"""
|
||||
import json
import sys
from collections import defaultdict, Counter
from datetime import date
from pathlib import Path
|
||||
|
||||
def analyze_global_propagation():
    """Analyze the impact of global propagation and print recommendations.

    Reads the baseline quality-evaluation report, compares every *_GLOBAL
    type against its base type, inspects NOM_EXTRACTED, builds a prioritized
    list of recommendations, estimates the total precision gain, and writes
    the full analysis to tests/ground_truth/analysis/.

    Returns:
        int: 0 on success (intended for ``sys.exit()``).

    Raises:
        FileNotFoundError: if the baseline evaluation JSON is missing.
        KeyError: if the evaluation JSON lacks the expected keys.
    """
    print("="*80)
    print("ANALYSE DE LA PROPAGATION GLOBALE")
    print("="*80)

    # Load the evaluation results produced by the baseline quality run.
    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")

    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)

    by_type = eval_data['by_type']

    # Split detected types: those coming from global propagation (*_GLOBAL)
    # vs. regular base types. NOM_EXTRACTED is analyzed separately below.
    global_types = {k: v for k, v in by_type.items() if k.endswith('_GLOBAL')}
    non_global_types = {k: v for k, v in by_type.items() if not k.endswith('_GLOBAL') and k != 'NOM_EXTRACTED'}

    print(f"\n📊 Types avec propagation globale: {len(global_types)}")
    print(f"📊 Types sans propagation: {len(non_global_types)}")

    # Per-type statistics for *_GLOBAL, paired with their base type so the
    # added value (or cost) of propagation is visible side by side.
    print("\n" + "="*80)
    print("ANALYSE DES TYPES *_GLOBAL")
    print("="*80)

    global_stats = []
    for pii_type, stats in global_types.items():
        base_type = pii_type.replace('_GLOBAL', '')
        base_stats = non_global_types.get(base_type, {})

        global_stats.append({
            "type": pii_type,
            "base_type": base_type,
            "tp": stats['true_positives'],
            "fp": stats['false_positives'],
            "fn": stats['false_negatives'],
            "precision": stats['precision'],
            "base_tp": base_stats.get('true_positives', 0),
            "base_fp": base_stats.get('false_positives', 0),
            "base_precision": base_stats.get('precision', 0.0)
        })

    # Worst offenders (most false positives) first.
    global_stats.sort(key=lambda x: x['fp'], reverse=True)

    print("\n🔴 Types *_GLOBAL par nombre de faux positifs:\n")
    for i, stat in enumerate(global_stats, 1):
        print(f"{i}. {stat['type']}")
        print(f" TP: {stat['tp']}, FP: {stat['fp']}, Précision: {stat['precision']:.2%}")
        print(f" Type de base ({stat['base_type']}): TP: {stat['base_tp']}, FP: {stat['base_fp']}, Précision: {stat['base_precision']:.2%}")

        # Verdict heuristics: no TP at all -> useless; precision < 50% ->
        # harmful; some TPs with precision >= 80% -> worth keeping.
        if stat['tp'] == 0 and stat['fp'] > 0:
            print(f" ⚠️ INUTILE: Aucun TP, {stat['fp']} FP → À DÉSACTIVER")
        elif stat['precision'] < 0.5:
            print(f" ⚠️ PROBLÉMATIQUE: Précision {stat['precision']:.2%} → À DÉSACTIVER")
        elif stat['tp'] > 0 and stat['precision'] >= 0.8:
            print(f" ✅ UTILE: {stat['tp']} TP avec bonne précision → À CONSERVER")
        else:
            print(" ⚙️ À ÉVALUER: Bénéfice/coût à analyser")
        print()

    # NOM_EXTRACTED is the single largest FP source; dedicated section.
    print("="*80)
    print("ANALYSE DE NOM_EXTRACTED")
    print("="*80)

    nom_extracted_stats = by_type.get('NOM_EXTRACTED', {})
    print("\n📊 NOM_EXTRACTED:")
    print(f" TP: {nom_extracted_stats.get('true_positives', 0)}")
    print(f" FP: {nom_extracted_stats.get('false_positives', 0)}")
    print(f" Précision: {nom_extracted_stats.get('precision', 0):.2%}")

    if nom_extracted_stats.get('false_positives', 0) > 1000:
        print(f"\n ⚠️ PROBLÈME MAJEUR: {nom_extracted_stats['false_positives']} FP")
        print(f" Impact: {nom_extracted_stats['false_positives'] / eval_data['global_metrics']['false_positives'] * 100:.1f}% des FP totaux")
        print("\n 💡 SOLUTION: Désactiver NOM_EXTRACTED ou améliorer drastiquement le filtrage")

    # Recommendations.
    print("\n" + "="*80)
    print("RECOMMANDATIONS")
    print("="*80)

    # Total detections (TP + FP): shared denominator of the estimated
    # precision-gain figures below (hoisted out of each recommendation).
    total_detections = eval_data['global_metrics']['true_positives'] + eval_data['global_metrics']['false_positives']

    recommendations = []

    # Recommendation 1: disable *_GLOBAL types that yield no TP at all.
    useless_global = [s for s in global_stats if s['tp'] == 0 and s['fp'] > 0]
    if useless_global:
        total_fp_saved = sum(s['fp'] for s in useless_global)
        recommendations.append({
            "priority": 1,
            "title": "Désactiver les types *_GLOBAL inutiles",
            "types": [s['type'] for s in useless_global],
            "impact": f"Réduction de {total_fp_saved} FP",
            # Numeric counterpart of "impact": used for the total estimate
            # below (more robust than re-parsing the formatted string).
            "fp_reduction": total_fp_saved,
            "gain_precision": f"+{total_fp_saved / total_detections * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Modifier la fonction de propagation globale pour exclure ces types"
        })

    # Recommendation 2: disable NOM_EXTRACTED when it dominates the FPs.
    if nom_extracted_stats.get('false_positives', 0) > 1000:
        nom_fp = nom_extracted_stats['false_positives']
        recommendations.append({
            "priority": 2,
            "title": "Désactiver NOM_EXTRACTED",
            "types": ['NOM_EXTRACTED'],
            "impact": f"Réduction de {nom_fp} FP",
            "fp_reduction": nom_fp,
            "gain_precision": f"+{nom_fp / total_detections * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Commenter ou supprimer la logique d'extraction de noms"
        })

    # Recommendation 3: better filtering for *_GLOBAL types that do produce
    # TPs but with poor precision (< 50%); assume ~70% of their FPs can go.
    problematic_global = [s for s in global_stats if s['tp'] > 0 and s['precision'] < 0.5]
    if problematic_global:
        total_fp = sum(s['fp'] for s in problematic_global)
        estimated_reduction = int(total_fp * 0.7)
        recommendations.append({
            "priority": 3,
            "title": "Améliorer le filtrage des types *_GLOBAL problématiques",
            "types": [s['type'] for s in problematic_global],
            "impact": f"Réduction estimée de {estimated_reduction} FP (70% des {total_fp})",
            "fp_reduction": estimated_reduction,
            "gain_precision": f"+{estimated_reduction / total_detections * 100:.1f} points",
            "effort": "Moyen",
            "implementation": "Ajouter validation croisée ou seuil de confiance avant propagation"
        })

    print(f"\n🎯 {len(recommendations)} recommandations:\n")
    for i, rec in enumerate(recommendations, 1):
        print(f"Priorité {rec['priority']}: {rec['title']}")
        print(f" Types: {', '.join(rec['types'][:5])}")
        if len(rec['types']) > 5:
            print(f" ... et {len(rec['types']) - 5} autres")
        print(f" Impact: {rec['impact']}")
        print(f" Gain précision: {rec['gain_precision']}")
        print(f" Effort: {rec['effort']}")
        print(f" Implémentation: {rec['implementation']}")
        print()

    # Total estimated gain, summed from the numeric fp_reduction fields.
    # (The original regex-parsed its own formatted "impact" strings, which
    # was fragile and imported `re` inside the loop.)
    total_fp_reduction = sum(rec['fp_reduction'] for rec in recommendations)

    current_fp = eval_data['global_metrics']['false_positives']
    current_tp = eval_data['global_metrics']['true_positives']
    current_precision = eval_data['global_metrics']['precision']

    new_fp = current_fp - total_fp_reduction
    # Guard against a zero denominator in the degenerate no-detection case.
    new_precision = current_tp / (current_tp + new_fp) if (current_tp + new_fp) > 0 else 0

    print("="*80)
    print("ESTIMATION DU GAIN TOTAL")
    print("="*80)
    print("\n🎯 Avec toutes les recommandations:")
    print(f" - FP actuels: {current_fp}")
    print(f" - FP estimés: {new_fp} (-{total_fp_reduction})")
    print(f" - Précision actuelle: {current_precision:.2%}")
    print(f" - Précision estimée: {new_precision:.2%} (+{(new_precision - current_precision)*100:.1f} points)")

    if new_precision >= 0.97:
        print("\n ✅ Objectif de précision (≥97%) ATTEIGNABLE!")
    else:
        print(f"\n ⚠️ Objectif de précision (≥97%) nécessite {(0.97 - new_precision)*100:.1f} points supplémentaires")

    # Persist the analysis next to the other ground-truth reports.
    output_dir = Path("tests/ground_truth/analysis")
    output_dir.mkdir(parents=True, exist_ok=True)  # may not exist on first run
    output_file = output_dir / "global_propagation_analysis.json"

    output_data = {
        # Record the actual run date instead of a hard-coded constant.
        "analysis_date": date.today().isoformat(),
        "global_types_stats": global_stats,
        "nom_extracted_stats": nom_extracted_stats,
        "recommendations": recommendations,
        "estimated_gain": {
            "current_fp": current_fp,
            "estimated_fp": new_fp,
            "fp_reduction": total_fp_reduction,
            "current_precision": current_precision,
            "estimated_precision": new_precision,
            "precision_gain": new_precision - current_precision
        }
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Analyse sauvegardée: {output_file}")
    print("\n" + "="*80)

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate the analysis exit status to the shell.
    raise SystemExit(analyze_global_propagation())
|
||||
Reference in New Issue
Block a user