feat: Analyse propagation globale - 100% des *_GLOBAL et NOM_EXTRACTED sont des FP

This commit is contained in:
2026-03-02 11:01:14 +01:00
parent 4eba826ca5
commit dfa45041d7
6 changed files with 876 additions and 3 deletions

View File

@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
Analyse la propagation globale et propose des optimisations.
La propagation globale crée 4,797 FP (96.9% du total):
- NOM_EXTRACTED: 3,846 FP (77.7%)
- *_GLOBAL: 951 FP (19.2%)
Ce script analyse quels types bénéficient vraiment de la propagation.
"""
import json
import re
import sys
from collections import Counter, defaultdict
from datetime import date
from pathlib import Path
def analyze_global_propagation():
    """Analyze the global-propagation results and propose optimizations.

    Reads the baseline quality-evaluation JSON, prints per-type statistics
    for the ``*_GLOBAL`` types and for ``NOM_EXTRACTED``, derives a set of
    prioritized recommendations, estimates the total precision gain if they
    were all applied, and saves the full analysis to
    ``tests/ground_truth/analysis/global_propagation_analysis.json``.

    Returns:
        int: 0 on success (suitable as a process exit code).

    Raises:
        FileNotFoundError: if the evaluation file is missing.
        KeyError: if the evaluation JSON lacks the expected keys
            (``by_type``, ``global_metrics``).
    """
    print("=" * 80)
    print("ANALYSE DE LA PROPAGATION GLOBALE")
    print("=" * 80)

    # Load the baseline evaluation results.
    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)
    by_type = eval_data['by_type']

    # Partition types: globally-propagated (*_GLOBAL) vs. their base types.
    # NOM_EXTRACTED is analyzed separately below.
    global_types = {k: v for k, v in by_type.items() if k.endswith('_GLOBAL')}
    non_global_types = {
        k: v for k, v in by_type.items()
        if not k.endswith('_GLOBAL') and k != 'NOM_EXTRACTED'
    }
    print(f"\n📊 Types avec propagation globale: {len(global_types)}")
    print(f"📊 Types sans propagation: {len(non_global_types)}")

    # Per-type statistics for *_GLOBAL, paired with their base type's stats
    # so the cost/benefit of propagation is visible side by side.
    print("\n" + "=" * 80)
    print("ANALYSE DES TYPES *_GLOBAL")
    print("=" * 80)
    global_stats = []
    for pii_type, stats in global_types.items():
        base_type = pii_type.replace('_GLOBAL', '')
        base_stats = non_global_types.get(base_type, {})
        global_stats.append({
            "type": pii_type,
            "base_type": base_type,
            "tp": stats['true_positives'],
            "fp": stats['false_positives'],
            "fn": stats['false_negatives'],
            "precision": stats['precision'],
            "base_tp": base_stats.get('true_positives', 0),
            "base_fp": base_stats.get('false_positives', 0),
            "base_precision": base_stats.get('precision', 0.0)
        })

    # Worst offenders first (sorted by FP count, descending).
    global_stats.sort(key=lambda x: x['fp'], reverse=True)
    print(f"\n🔴 Types *_GLOBAL par nombre de faux positifs:\n")
    for i, stat in enumerate(global_stats, 1):
        print(f"{i}. {stat['type']}")
        print(f" TP: {stat['tp']}, FP: {stat['fp']}, Précision: {stat['precision']:.2%}")
        print(f" Type de base ({stat['base_type']}): TP: {stat['base_tp']}, FP: {stat['base_fp']}, Précision: {stat['base_precision']:.2%}")
        # Classify usefulness: no TP at all -> useless; precision < 50% ->
        # problematic; TP with precision >= 80% -> worth keeping.
        if stat['tp'] == 0 and stat['fp'] > 0:
            print(f" ⚠️ INUTILE: Aucun TP, {stat['fp']} FP → À DÉSACTIVER")
        elif stat['precision'] < 0.5:
            print(f" ⚠️ PROBLÉMATIQUE: Précision {stat['precision']:.2%} → À DÉSACTIVER")
        elif stat['tp'] > 0 and stat['precision'] >= 0.8:
            print(f" ✅ UTILE: {stat['tp']} TP avec bonne précision → À CONSERVER")
        else:
            print(f" ⚙️ À ÉVALUER: Bénéfice/coût à analyser")
        print()

    # NOM_EXTRACTED analysis.
    print("=" * 80)
    print("ANALYSE DE NOM_EXTRACTED")
    print("=" * 80)
    nom_extracted_stats = by_type.get('NOM_EXTRACTED', {})
    print(f"\n📊 NOM_EXTRACTED:")
    print(f" TP: {nom_extracted_stats.get('true_positives', 0)}")
    print(f" FP: {nom_extracted_stats.get('false_positives', 0)}")
    print(f" Précision: {nom_extracted_stats.get('precision', 0):.2%}")
    if nom_extracted_stats.get('false_positives', 0) > 1000:
        print(f"\n ⚠️ PROBLÈME MAJEUR: {nom_extracted_stats['false_positives']} FP")
        print(f" Impact: {nom_extracted_stats['false_positives'] / eval_data['global_metrics']['false_positives'] * 100:.1f}% des FP totaux")
        print(f"\n 💡 SOLUTION: Désactiver NOM_EXTRACTED ou améliorer drastiquement le filtrage")

    # Build prioritized recommendations.
    print("\n" + "=" * 80)
    print("RECOMMANDATIONS")
    print("=" * 80)
    recommendations = []

    # Shared denominator for precision-gain estimates: total predictions.
    total_predictions = (eval_data['global_metrics']['true_positives']
                         + eval_data['global_metrics']['false_positives'])

    # Recommendation 1: disable *_GLOBAL types that produce FP but zero TP.
    useless_global = [s for s in global_stats if s['tp'] == 0 and s['fp'] > 0]
    if useless_global:
        total_fp_saved = sum(s['fp'] for s in useless_global)
        recommendations.append({
            "priority": 1,
            "title": "Désactiver les types *_GLOBAL inutiles",
            "types": [s['type'] for s in useless_global],
            "impact": f"Réduction de {total_fp_saved} FP",
            "gain_precision": f"+{total_fp_saved / total_predictions * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Modifier la fonction de propagation globale pour exclure ces types"
        })

    # Recommendation 2: disable NOM_EXTRACTED when it dominates the FP count.
    if nom_extracted_stats.get('false_positives', 0) > 1000:
        recommendations.append({
            "priority": 2,
            "title": "Désactiver NOM_EXTRACTED",
            "types": ['NOM_EXTRACTED'],
            "impact": f"Réduction de {nom_extracted_stats['false_positives']} FP",
            "gain_precision": f"+{nom_extracted_stats['false_positives'] / total_predictions * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Commenter ou supprimer la logique d'extraction de noms"
        })

    # Recommendation 3: better filtering for *_GLOBAL types that do produce
    # some TP but with precision below 50% (assume 70% of their FP fixable).
    problematic_global = [s for s in global_stats if s['tp'] > 0 and s['precision'] < 0.5]
    if problematic_global:
        total_fp = sum(s['fp'] for s in problematic_global)
        recommendations.append({
            "priority": 3,
            "title": "Améliorer le filtrage des types *_GLOBAL problématiques",
            "types": [s['type'] for s in problematic_global],
            "impact": f"Réduction estimée de {int(total_fp * 0.7)} FP (70% des {total_fp})",
            "gain_precision": f"+{int(total_fp * 0.7) / total_predictions * 100:.1f} points",
            "effort": "Moyen",
            "implementation": "Ajouter validation croisée ou seuil de confiance avant propagation"
        })

    print(f"\n🎯 {len(recommendations)} recommandations:\n")
    for rec in recommendations:
        print(f"Priorité {rec['priority']}: {rec['title']}")
        print(f" Types: {', '.join(rec['types'][:5])}")
        if len(rec['types']) > 5:
            print(f" ... et {len(rec['types']) - 5} autres")
        print(f" Impact: {rec['impact']}")
        print(f" Gain précision: {rec['gain_precision']}")
        print(f" Effort: {rec['effort']}")
        print(f" Implémentation: {rec['implementation']}")
        print()

    # Total gain estimate: pull the first integer out of each "impact"
    # string (the FP reduction figure comes first in every template above).
    total_fp_reduction = 0
    for rec in recommendations:
        match = re.search(r'(\d+)', rec['impact'])
        if match:
            total_fp_reduction += int(match.group(1))
    current_fp = eval_data['global_metrics']['false_positives']
    current_tp = eval_data['global_metrics']['true_positives']
    current_precision = eval_data['global_metrics']['precision']
    new_fp = current_fp - total_fp_reduction
    new_precision = current_tp / (current_tp + new_fp) if (current_tp + new_fp) > 0 else 0
    print("=" * 80)
    print("ESTIMATION DU GAIN TOTAL")
    print("=" * 80)
    print(f"\n🎯 Avec toutes les recommandations:")
    print(f" - FP actuels: {current_fp}")
    print(f" - FP estimés: {new_fp} (-{total_fp_reduction})")
    print(f" - Précision actuelle: {current_precision:.2%}")
    print(f" - Précision estimée: {new_precision:.2%} (+{(new_precision - current_precision)*100:.1f} points)")
    if new_precision >= 0.97:
        print(f"\n ✅ Objectif de précision (≥97%) ATTEIGNABLE!")
    else:
        print(f"\n ⚠️ Objectif de précision (≥97%) nécessite {(0.97 - new_precision)*100:.1f} points supplémentaires")

    # Persist the analysis. Create the output directory first: without this,
    # open() raises FileNotFoundError on a fresh checkout.
    output_dir = Path("tests/ground_truth/analysis")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "global_propagation_analysis.json"
    output_data = {
        # Use the actual run date instead of a hard-coded constant.
        "analysis_date": date.today().isoformat(),
        "global_types_stats": global_stats,
        "nom_extracted_stats": nom_extracted_stats,
        "recommendations": recommendations,
        "estimated_gain": {
            "current_fp": current_fp,
            "estimated_fp": new_fp,
            "fp_reduction": total_fp_reduction,
            "current_precision": current_precision,
            "estimated_precision": new_precision,
            "precision_gain": new_precision - current_precision
        }
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print(f"\n📊 Analyse sauvegardée: {output_file}")
    print("\n" + "=" * 80)
    return 0
if __name__ == "__main__":
    # Propagate the function's return value (0 on success) as the exit code.
    sys.exit(analyze_global_propagation())