#!/usr/bin/env python3
"""
Analyse la propagation globale et propose des optimisations.

La propagation globale crée 4,797 FP (96.9% du total):
- NOM_EXTRACTED: 3,846 FP (77.7%)
- *_GLOBAL: 951 FP (19.2%)

Ce script analyse quels types bénéficient vraiment de la propagation.
"""
import json
import sys
from collections import Counter, defaultdict
from datetime import date
from pathlib import Path
def _precision_points(fp_count, global_metrics):
    """Precision points gained by removing `fp_count` false positives.

    Computed against the current TP+FP denominator; returns 0.0 instead of
    raising when the denominator is zero (empty evaluation).
    """
    denom = global_metrics['true_positives'] + global_metrics['false_positives']
    return fp_count / denom * 100 if denom else 0.0


def _collect_global_stats(global_types, non_global_types):
    """Pair each *_GLOBAL type with its base type's metrics.

    Returns a list of dicts sorted by false-positive count (descending).
    Base-type metrics default to 0 when the base type is absent.
    """
    stats = []
    for pii_type, type_metrics in global_types.items():
        base_type = pii_type.replace('_GLOBAL', '')
        base_metrics = non_global_types.get(base_type, {})
        stats.append({
            "type": pii_type,
            "base_type": base_type,
            "tp": type_metrics['true_positives'],
            "fp": type_metrics['false_positives'],
            "fn": type_metrics['false_negatives'],
            "precision": type_metrics['precision'],
            "base_tp": base_metrics.get('true_positives', 0),
            "base_fp": base_metrics.get('false_positives', 0),
            "base_precision": base_metrics.get('precision', 0.0),
        })
    stats.sort(key=lambda s: s['fp'], reverse=True)
    return stats


def _print_global_stats(global_stats):
    """Print each *_GLOBAL type with a keep/disable/evaluate verdict."""
    print(f"\n🔴 Types *_GLOBAL par nombre de faux positifs:\n")
    for i, stat in enumerate(global_stats, 1):
        print(f"{i}. {stat['type']}")
        print(f" TP: {stat['tp']}, FP: {stat['fp']}, Précision: {stat['precision']:.2%}")
        print(f" Type de base ({stat['base_type']}): TP: {stat['base_tp']}, FP: {stat['base_fp']}, Précision: {stat['base_precision']:.2%}")

        # Verdict: no TP at all -> useless; poor precision -> problematic;
        # some TP with good precision -> keep; anything else needs review.
        if stat['tp'] == 0 and stat['fp'] > 0:
            print(f" ⚠️ INUTILE: Aucun TP, {stat['fp']} FP → À DÉSACTIVER")
        elif stat['precision'] < 0.5:
            print(f" ⚠️ PROBLÉMATIQUE: Précision {stat['precision']:.2%} → À DÉSACTIVER")
        elif stat['tp'] > 0 and stat['precision'] >= 0.8:
            print(f" ✅ UTILE: {stat['tp']} TP avec bonne précision → À CONSERVER")
        else:
            print(f" ⚙️ À ÉVALUER: Bénéfice/coût à analyser")
        print()


def _build_recommendations(global_stats, nom_extracted_stats, global_metrics):
    """Derive prioritised recommendations from the collected statistics.

    Each recommendation carries a numeric "fp_reduction" field so the total
    gain can be summed directly (the previous implementation regex-parsed the
    numbers back out of its own formatted "impact" strings).
    """
    recommendations = []

    # Recommendation 1: disable *_GLOBAL types that only produce FP.
    useless_global = [s for s in global_stats if s['tp'] == 0 and s['fp'] > 0]
    if useless_global:
        total_fp_saved = sum(s['fp'] for s in useless_global)
        recommendations.append({
            "priority": 1,
            "title": "Désactiver les types *_GLOBAL inutiles",
            "types": [s['type'] for s in useless_global],
            "impact": f"Réduction de {total_fp_saved} FP",
            "fp_reduction": total_fp_saved,
            "gain_precision": f"+{_precision_points(total_fp_saved, global_metrics):.1f} points",
            "effort": "Faible",
            "implementation": "Modifier la fonction de propagation globale pour exclure ces types"
        })

    # Recommendation 2: disable NOM_EXTRACTED when its FP volume is extreme.
    nom_fp = nom_extracted_stats.get('false_positives', 0)
    if nom_fp > 1000:
        recommendations.append({
            "priority": 2,
            "title": "Désactiver NOM_EXTRACTED",
            "types": ['NOM_EXTRACTED'],
            "impact": f"Réduction de {nom_fp} FP",
            "fp_reduction": nom_fp,
            "gain_precision": f"+{_precision_points(nom_fp, global_metrics):.1f} points",
            "effort": "Faible",
            "implementation": "Commenter ou supprimer la logique d'extraction de noms"
        })

    # Recommendation 3: better filtering for *_GLOBAL types that have some TP
    # but precision below 50%; assume filtering removes ~70% of their FP.
    problematic_global = [s for s in global_stats if s['tp'] > 0 and s['precision'] < 0.5]
    if problematic_global:
        total_fp = sum(s['fp'] for s in problematic_global)
        estimated_reduction = int(total_fp * 0.7)
        recommendations.append({
            "priority": 3,
            "title": "Améliorer le filtrage des types *_GLOBAL problématiques",
            "types": [s['type'] for s in problematic_global],
            "impact": f"Réduction estimée de {estimated_reduction} FP (70% des {total_fp})",
            "fp_reduction": estimated_reduction,
            "gain_precision": f"+{_precision_points(estimated_reduction, global_metrics):.1f} points",
            "effort": "Moyen",
            "implementation": "Ajouter validation croisée ou seuil de confiance avant propagation"
        })

    return recommendations


def _print_recommendations(recommendations):
    """Print the recommendations, truncating long type lists to 5 entries."""
    print(f"\n🎯 {len(recommendations)} recommandations:\n")
    for rec in recommendations:
        print(f"Priorité {rec['priority']}: {rec['title']}")
        print(f" Types: {', '.join(rec['types'][:5])}")
        if len(rec['types']) > 5:
            print(f" ... et {len(rec['types']) - 5} autres")
        print(f" Impact: {rec['impact']}")
        print(f" Gain précision: {rec['gain_precision']}")
        print(f" Effort: {rec['effort']}")
        print(f" Implémentation: {rec['implementation']}")
        print()


def analyze_global_propagation():
    """Analyse global propagation results and propose optimisations.

    Reads the baseline quality evaluation JSON, scores every *_GLOBAL PII
    type against its base type, reviews NOM_EXTRACTED, derives prioritised
    recommendations, estimates the total precision gain, and saves the whole
    analysis to tests/ground_truth/analysis/global_propagation_analysis.json.

    Returns:
        int: 0 on success (used as the process exit code).

    Raises:
        FileNotFoundError: if the baseline evaluation file is missing.
        KeyError: if the evaluation JSON lacks the expected keys.
    """
    print("="*80)
    print("ANALYSE DE LA PROPAGATION GLOBALE")
    print("="*80)

    # Load the evaluation results produced by the baseline run.
    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)

    by_type = eval_data['by_type']
    global_metrics = eval_data['global_metrics']

    # Split the per-type stats into propagated (*_GLOBAL) and base types;
    # NOM_EXTRACTED is analysed separately below.
    global_types = {k: v for k, v in by_type.items() if k.endswith('_GLOBAL')}
    non_global_types = {k: v for k, v in by_type.items()
                        if not k.endswith('_GLOBAL') and k != 'NOM_EXTRACTED'}

    print(f"\n📊 Types avec propagation globale: {len(global_types)}")
    print(f"📊 Types sans propagation: {len(non_global_types)}")

    # Per-type *_GLOBAL analysis.
    print("\n" + "="*80)
    print("ANALYSE DES TYPES *_GLOBAL")
    print("="*80)

    global_stats = _collect_global_stats(global_types, non_global_types)
    _print_global_stats(global_stats)

    # NOM_EXTRACTED analysis.
    print("="*80)
    print("ANALYSE DE NOM_EXTRACTED")
    print("="*80)

    nom_extracted_stats = by_type.get('NOM_EXTRACTED', {})
    print("\n📊 NOM_EXTRACTED:")
    print(f" TP: {nom_extracted_stats.get('true_positives', 0)}")
    print(f" FP: {nom_extracted_stats.get('false_positives', 0)}")
    print(f" Précision: {nom_extracted_stats.get('precision', 0):.2%}")

    if nom_extracted_stats.get('false_positives', 0) > 1000:
        print(f"\n ⚠️ PROBLÈME MAJEUR: {nom_extracted_stats['false_positives']} FP")
        print(f" Impact: {nom_extracted_stats['false_positives'] / global_metrics['false_positives'] * 100:.1f}% des FP totaux")
        print(f"\n 💡 SOLUTION: Désactiver NOM_EXTRACTED ou améliorer drastiquement le filtrage")

    # Recommendations.
    print("\n" + "="*80)
    print("RECOMMANDATIONS")
    print("="*80)

    recommendations = _build_recommendations(global_stats, nom_extracted_stats, global_metrics)
    _print_recommendations(recommendations)

    # Total gain estimation — summed from the numeric fp_reduction fields
    # instead of regex-parsing the formatted impact strings.
    total_fp_reduction = sum(rec['fp_reduction'] for rec in recommendations)

    current_fp = global_metrics['false_positives']
    current_tp = global_metrics['true_positives']
    current_precision = global_metrics['precision']

    new_fp = current_fp - total_fp_reduction
    new_precision = current_tp / (current_tp + new_fp) if (current_tp + new_fp) > 0 else 0

    print("="*80)
    print("ESTIMATION DU GAIN TOTAL")
    print("="*80)
    print(f"\n🎯 Avec toutes les recommandations:")
    print(f" - FP actuels: {current_fp}")
    print(f" - FP estimés: {new_fp} (-{total_fp_reduction})")
    print(f" - Précision actuelle: {current_precision:.2%}")
    print(f" - Précision estimée: {new_precision:.2%} (+{(new_precision - current_precision)*100:.1f} points)")

    if new_precision >= 0.97:
        print(f"\n ✅ Objectif de précision (≥97%) ATTEIGNABLE!")
    else:
        print(f"\n ⚠️ Objectif de précision (≥97%) nécessite {(0.97 - new_precision)*100:.1f} points supplémentaires")

    # Save the analysis; create the output directory if it does not exist yet
    # (the previous version crashed with FileNotFoundError on a fresh tree).
    output_dir = Path("tests/ground_truth/analysis")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "global_propagation_analysis.json"

    output_data = {
        # Actual run date instead of a hardcoded constant.
        "analysis_date": date.today().isoformat(),
        "global_types_stats": global_stats,
        "nom_extracted_stats": nom_extracted_stats,
        "recommendations": recommendations,
        "estimated_gain": {
            "current_fp": current_fp,
            "estimated_fp": new_fp,
            "fp_reduction": total_fp_reduction,
            "current_precision": current_precision,
            "estimated_precision": new_precision,
            "precision_gain": new_precision - current_precision
        }
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Analyse sauvegardée: {output_file}")
    print("\n" + "="*80)

    return 0
if __name__ == "__main__":
    # Run the analysis and propagate its status code to the shell.
    raise SystemExit(analyze_global_propagation())