#!/usr/bin/env python3
"""
Analyse la propagation globale et propose des optimisations.

La propagation globale crée 4,797 FP (96.9% du total):
- NOM_EXTRACTED: 3,846 FP (77.7%)
- *_GLOBAL: 951 FP (19.2%)

Ce script analyse quels types bénéficient vraiment de la propagation.
"""
import json
import sys
from collections import Counter, defaultdict
from datetime import date
from pathlib import Path
def _precision_points(fp_count, global_metrics):
    """Precision points gained by removing `fp_count` false positives.

    Computed against the current TP+FP denominator; returns 0.0 instead of
    raising when the denominator is zero (empty evaluation).
    """
    denom = global_metrics['true_positives'] + global_metrics['false_positives']
    return fp_count / denom * 100 if denom else 0.0


def _collect_global_stats(global_types, non_global_types):
    """Pair each *_GLOBAL type with its base type's metrics.

    Returns a list of dicts sorted by false-positive count (descending).
    Base-type metrics default to 0 when the base type is absent.
    """
    stats = []
    for pii_type, type_metrics in global_types.items():
        base_type = pii_type.replace('_GLOBAL', '')
        base_metrics = non_global_types.get(base_type, {})
        stats.append({
            "type": pii_type,
            "base_type": base_type,
            "tp": type_metrics['true_positives'],
            "fp": type_metrics['false_positives'],
            "fn": type_metrics['false_negatives'],
            "precision": type_metrics['precision'],
            "base_tp": base_metrics.get('true_positives', 0),
            "base_fp": base_metrics.get('false_positives', 0),
            "base_precision": base_metrics.get('precision', 0.0),
        })
    stats.sort(key=lambda s: s['fp'], reverse=True)
    return stats


def _print_global_stats(global_stats):
    """Print each *_GLOBAL type with a keep/disable/evaluate verdict."""
    print(f"\n🔴 Types *_GLOBAL par nombre de faux positifs:\n")
    for i, stat in enumerate(global_stats, 1):
        print(f"{i}. {stat['type']}")
        print(f" TP: {stat['tp']}, FP: {stat['fp']}, Précision: {stat['precision']:.2%}")
        print(f" Type de base ({stat['base_type']}): TP: {stat['base_tp']}, FP: {stat['base_fp']}, Précision: {stat['base_precision']:.2%}")

        # Verdict: no TP at all -> useless; poor precision -> problematic;
        # some TP with good precision -> keep; anything else needs review.
        if stat['tp'] == 0 and stat['fp'] > 0:
            print(f" ⚠️ INUTILE: Aucun TP, {stat['fp']} FP → À DÉSACTIVER")
        elif stat['precision'] < 0.5:
            print(f" ⚠️ PROBLÉMATIQUE: Précision {stat['precision']:.2%} → À DÉSACTIVER")
        elif stat['tp'] > 0 and stat['precision'] >= 0.8:
            print(f" ✅ UTILE: {stat['tp']} TP avec bonne précision → À CONSERVER")
        else:
            print(f" ⚙️ À ÉVALUER: Bénéfice/coût à analyser")
        print()


def _build_recommendations(global_stats, nom_extracted_stats, global_metrics):
    """Derive prioritised recommendations from the collected statistics.

    Each recommendation carries a numeric "fp_reduction" field so the total
    gain can be summed directly (the previous implementation regex-parsed the
    numbers back out of its own formatted "impact" strings).
    """
    recommendations = []

    # Recommendation 1: disable *_GLOBAL types that only produce FP.
    useless_global = [s for s in global_stats if s['tp'] == 0 and s['fp'] > 0]
    if useless_global:
        total_fp_saved = sum(s['fp'] for s in useless_global)
        recommendations.append({
            "priority": 1,
            "title": "Désactiver les types *_GLOBAL inutiles",
            "types": [s['type'] for s in useless_global],
            "impact": f"Réduction de {total_fp_saved} FP",
            "fp_reduction": total_fp_saved,
            "gain_precision": f"+{_precision_points(total_fp_saved, global_metrics):.1f} points",
            "effort": "Faible",
            "implementation": "Modifier la fonction de propagation globale pour exclure ces types"
        })

    # Recommendation 2: disable NOM_EXTRACTED when its FP volume is extreme.
    nom_fp = nom_extracted_stats.get('false_positives', 0)
    if nom_fp > 1000:
        recommendations.append({
            "priority": 2,
            "title": "Désactiver NOM_EXTRACTED",
            "types": ['NOM_EXTRACTED'],
            "impact": f"Réduction de {nom_fp} FP",
            "fp_reduction": nom_fp,
            "gain_precision": f"+{_precision_points(nom_fp, global_metrics):.1f} points",
            "effort": "Faible",
            "implementation": "Commenter ou supprimer la logique d'extraction de noms"
        })

    # Recommendation 3: better filtering for *_GLOBAL types that have some TP
    # but precision below 50%; assume filtering removes ~70% of their FP.
    problematic_global = [s for s in global_stats if s['tp'] > 0 and s['precision'] < 0.5]
    if problematic_global:
        total_fp = sum(s['fp'] for s in problematic_global)
        estimated_reduction = int(total_fp * 0.7)
        recommendations.append({
            "priority": 3,
            "title": "Améliorer le filtrage des types *_GLOBAL problématiques",
            "types": [s['type'] for s in problematic_global],
            "impact": f"Réduction estimée de {estimated_reduction} FP (70% des {total_fp})",
            "fp_reduction": estimated_reduction,
            "gain_precision": f"+{_precision_points(estimated_reduction, global_metrics):.1f} points",
            "effort": "Moyen",
            "implementation": "Ajouter validation croisée ou seuil de confiance avant propagation"
        })

    return recommendations


def _print_recommendations(recommendations):
    """Print the recommendations, truncating long type lists to 5 entries."""
    print(f"\n🎯 {len(recommendations)} recommandations:\n")
    for rec in recommendations:
        print(f"Priorité {rec['priority']}: {rec['title']}")
        print(f" Types: {', '.join(rec['types'][:5])}")
        if len(rec['types']) > 5:
            print(f" ... et {len(rec['types']) - 5} autres")
        print(f" Impact: {rec['impact']}")
        print(f" Gain précision: {rec['gain_precision']}")
        print(f" Effort: {rec['effort']}")
        print(f" Implémentation: {rec['implementation']}")
        print()


def analyze_global_propagation():
    """Analyse global propagation results and propose optimisations.

    Reads the baseline quality evaluation JSON, scores every *_GLOBAL PII
    type against its base type, reviews NOM_EXTRACTED, derives prioritised
    recommendations, estimates the total precision gain, and saves the whole
    analysis to tests/ground_truth/analysis/global_propagation_analysis.json.

    Returns:
        int: 0 on success (used as the process exit code).

    Raises:
        FileNotFoundError: if the baseline evaluation file is missing.
        KeyError: if the evaluation JSON lacks the expected keys.
    """
    print("="*80)
    print("ANALYSE DE LA PROPAGATION GLOBALE")
    print("="*80)

    # Load the evaluation results produced by the baseline run.
    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)

    by_type = eval_data['by_type']
    global_metrics = eval_data['global_metrics']

    # Split the per-type stats into propagated (*_GLOBAL) and base types;
    # NOM_EXTRACTED is analysed separately below.
    global_types = {k: v for k, v in by_type.items() if k.endswith('_GLOBAL')}
    non_global_types = {k: v for k, v in by_type.items()
                        if not k.endswith('_GLOBAL') and k != 'NOM_EXTRACTED'}

    print(f"\n📊 Types avec propagation globale: {len(global_types)}")
    print(f"📊 Types sans propagation: {len(non_global_types)}")

    # Per-type *_GLOBAL analysis.
    print("\n" + "="*80)
    print("ANALYSE DES TYPES *_GLOBAL")
    print("="*80)

    global_stats = _collect_global_stats(global_types, non_global_types)
    _print_global_stats(global_stats)

    # NOM_EXTRACTED analysis.
    print("="*80)
    print("ANALYSE DE NOM_EXTRACTED")
    print("="*80)

    nom_extracted_stats = by_type.get('NOM_EXTRACTED', {})
    print("\n📊 NOM_EXTRACTED:")
    print(f" TP: {nom_extracted_stats.get('true_positives', 0)}")
    print(f" FP: {nom_extracted_stats.get('false_positives', 0)}")
    print(f" Précision: {nom_extracted_stats.get('precision', 0):.2%}")

    if nom_extracted_stats.get('false_positives', 0) > 1000:
        print(f"\n ⚠️ PROBLÈME MAJEUR: {nom_extracted_stats['false_positives']} FP")
        print(f" Impact: {nom_extracted_stats['false_positives'] / global_metrics['false_positives'] * 100:.1f}% des FP totaux")
        print(f"\n 💡 SOLUTION: Désactiver NOM_EXTRACTED ou améliorer drastiquement le filtrage")

    # Recommendations.
    print("\n" + "="*80)
    print("RECOMMANDATIONS")
    print("="*80)

    recommendations = _build_recommendations(global_stats, nom_extracted_stats, global_metrics)
    _print_recommendations(recommendations)

    # Total gain estimation — summed from the numeric fp_reduction fields
    # instead of regex-parsing the formatted impact strings.
    total_fp_reduction = sum(rec['fp_reduction'] for rec in recommendations)

    current_fp = global_metrics['false_positives']
    current_tp = global_metrics['true_positives']
    current_precision = global_metrics['precision']

    new_fp = current_fp - total_fp_reduction
    new_precision = current_tp / (current_tp + new_fp) if (current_tp + new_fp) > 0 else 0

    print("="*80)
    print("ESTIMATION DU GAIN TOTAL")
    print("="*80)
    print(f"\n🎯 Avec toutes les recommandations:")
    print(f" - FP actuels: {current_fp}")
    print(f" - FP estimés: {new_fp} (-{total_fp_reduction})")
    print(f" - Précision actuelle: {current_precision:.2%}")
    print(f" - Précision estimée: {new_precision:.2%} (+{(new_precision - current_precision)*100:.1f} points)")

    if new_precision >= 0.97:
        print(f"\n ✅ Objectif de précision (≥97%) ATTEIGNABLE!")
    else:
        print(f"\n ⚠️ Objectif de précision (≥97%) nécessite {(0.97 - new_precision)*100:.1f} points supplémentaires")

    # Save the analysis; create the output directory if it does not exist yet
    # (the previous version crashed with FileNotFoundError on a fresh tree).
    output_dir = Path("tests/ground_truth/analysis")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "global_propagation_analysis.json"

    output_data = {
        # Actual run date instead of a hardcoded constant.
        "analysis_date": date.today().isoformat(),
        "global_types_stats": global_stats,
        "nom_extracted_stats": nom_extracted_stats,
        "recommendations": recommendations,
        "estimated_gain": {
            "current_fp": current_fp,
            "estimated_fp": new_fp,
            "fp_reduction": total_fp_reduction,
            "current_precision": current_precision,
            "estimated_precision": new_precision,
            "precision_gain": new_precision - current_precision
        }
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Analyse sauvegardée: {output_file}")
    print("\n" + "="*80)

    return 0
if __name__ == "__main__":
    # Run the analysis and propagate its status code to the shell.
    raise SystemExit(analyze_global_propagation())