#!/usr/bin/env python3
"""
Analyzer for matching failures, used for continuous improvement of the system.

This script analyzes failed-match reports and generates statistics and
recommendations for improving the workflow graph.
"""

import argparse
import json
import sys
from collections import Counter, defaultdict
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional


class FailedMatchAnalyzer:
    """Analyzer for matching failures."""

    def __init__(self, failed_matches_dir: str = "data/failed_matches"):
        self.failed_matches_dir = Path(failed_matches_dir)
        self.reports: List[Dict[str, Any]] = []

    def load_reports(self, last_n: Optional[int] = None, since_hours: Optional[int] = None):
        """
        Load the failure reports.

        Args:
            last_n: Load the N most recent reports
            since_hours: Load the reports from the last X hours
        """
        if not self.failed_matches_dir.exists():
            print(f"⚠️ No failures directory found: {self.failed_matches_dir}")
            return

        # List every failure directory, most recent first
        match_dirs = sorted(
            [d for d in self.failed_matches_dir.iterdir() if d.is_dir()],
            key=lambda x: x.name,
            reverse=True
        )

        if not match_dirs:
            print("⚠️ No matching failures recorded")
            return

        # Filter by date if requested
        if since_hours:
            cutoff = datetime.now() - timedelta(hours=since_hours)
            match_dirs = [
                d for d in match_dirs
                if self._parse_timestamp(d.name) >= cutoff
            ]

        # Cap the number of reports if requested
        if last_n:
            match_dirs = match_dirs[:last_n]

        # Load the reports
        for match_dir in match_dirs:
            report_path = match_dir / "report.json"
            if report_path.exists():
                try:
                    with open(report_path, 'r') as f:
                        report = json.load(f)
                    report['_dir'] = match_dir
                    self.reports.append(report)
                except Exception as e:
                    print(f"⚠️ Error while loading {report_path}: {e}")

        print(f"✓ {len(self.reports)} reports loaded")

    def _parse_timestamp(self, dirname: str) -> datetime:
        """Parse the timestamp from the directory name."""
        try:
            # Format: failed_match_20251123_143052
            timestamp_str = dirname.replace("failed_match_", "")
            return datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S")
        except ValueError:
            # Malformed directory names sort before everything else
            return datetime.min

    def analyze(self) -> Dict[str, Any]:
        """Analyze all loaded reports and generate statistics."""
        if not self.reports:
            return {}

        analysis = {
            'total_failures': len(self.reports),
            'date_range': self._get_date_range(),
            'confidence_stats': self._analyze_confidence(),
            'suggestions_summary': self._analyze_suggestions(),
            'problematic_nodes': self._identify_problematic_nodes(),
            'threshold_recommendations': self._recommend_thresholds(),
            'new_states_detected': self._count_new_states()
        }

        return analysis

    def _get_date_range(self) -> Dict[str, str]:
        """Get the date range covered by the reports."""
        timestamps = [
            datetime.strptime(r['timestamp'], "%Y%m%d_%H%M%S")
            for r in self.reports
        ]
        return {
            'first': min(timestamps).strftime("%Y-%m-%d %H:%M:%S"),
            'last': max(timestamps).strftime("%Y-%m-%d %H:%M:%S")
        }

    def _analyze_confidence(self) -> Dict[str, Any]:
        """Analyze the confidence levels."""
        confidences = [
            r['matching_results']['best_confidence']
            for r in self.reports
        ]

        return {
            'min': min(confidences),
            'max': max(confidences),
            'avg': sum(confidences) / len(confidences),
            'below_70': sum(1 for c in confidences if c < 0.70),
            'between_70_85': sum(1 for c in confidences if 0.70 <= c < 0.85),
            'above_85': sum(1 for c in confidences if c >= 0.85)
        }

    def _analyze_suggestions(self) -> Dict[str, int]:
        """Count the suggestion types."""
        suggestion_types = Counter()

        for report in self.reports:
            for suggestion in report.get('suggestions', []):
                # Extract the suggestion type (the part before the ':')
                suggestion_type = suggestion.split(':')[0]
                suggestion_types[suggestion_type] += 1

        return dict(suggestion_types)

    def _identify_problematic_nodes(self) -> List[Dict[str, Any]]:
        """Identify the nodes that cause the most confusion."""
        node_near_misses = defaultdict(list)

        for report in self.reports:
            similarities = report['matching_results'].get('similarities', [])
            if similarities:
                best = similarities[0]
                confidence = best['similarity']
                # Near miss: between 0.70 and the threshold
                if 0.70 <= confidence < report['matching_results']['threshold']:
                    node_near_misses[best['node_id']].append({
                        'confidence': confidence,
                        'label': best['node_label'],
                        'timestamp': report['timestamp']
                    })

        # Sort by number of near misses
        problematic = [
            {
                'node_id': node_id,
                'node_label': misses[0]['label'],
                'near_miss_count': len(misses),
                'avg_confidence': sum(m['confidence'] for m in misses) / len(misses)
            }
            for node_id, misses in node_near_misses.items()
        ]

        return sorted(problematic, key=lambda x: x['near_miss_count'], reverse=True)

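    # _identify_problematic_nodes above returns entries shaped like this
    # (illustrative values; 'login_screen' is a made-up node id):
    #   [{'node_id': 'login_screen', 'node_label': 'Login Screen',
    #     'near_miss_count': 7, 'avg_confidence': 0.79}, ...]
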
    def _recommend_thresholds(self) -> Dict[str, Any]:
        """Recommend threshold adjustments."""
        confidences = [
            r['matching_results']['best_confidence']
            for r in self.reports
        ]

        # Compute the 90th percentile of the confidences
        sorted_conf = sorted(confidences)
        p90_index = int(len(sorted_conf) * 0.9)
        p90 = sorted_conf[p90_index] if sorted_conf else 0.85

        current_threshold = self.reports[0]['matching_results']['threshold']

        recommendations = {
            'current_threshold': current_threshold,
            'p90_confidence': p90,
            'recommended_threshold': max(0.70, min(0.90, p90 - 0.02))
        }

        if p90 < current_threshold - 0.05:
            recommendations['action'] = "LOWER_THRESHOLD"
            recommendations['reason'] = f"90% of failures have confidence < {p90:.3f}"
        elif p90 > current_threshold + 0.05:
            recommendations['action'] = "RAISE_THRESHOLD"
            recommendations['reason'] = "Many potential false positives"
        else:
            recommendations['action'] = "KEEP_CURRENT"
            recommendations['reason'] = "Current threshold is appropriate"

        return recommendations

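    # Worked example of the heuristic above (illustrative numbers, not real
    # data): with the 10 confidences [0.55, 0.60, 0.62, 0.65, 0.68, 0.70,
    # 0.72, 0.74, 0.76, 0.78], p90_index = int(10 * 0.9) = 9, so p90 = 0.78.
    # With a current threshold of 0.85, p90 < 0.85 - 0.05 = 0.80, so the
    # action is LOWER_THRESHOLD and recommended_threshold =
    # max(0.70, min(0.90, 0.78 - 0.02)) = 0.76.
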
    def _count_new_states(self) -> int:
        """Count newly detected states (confidence < 0.70)."""
        return sum(
            1 for r in self.reports
            if r['matching_results']['best_confidence'] < 0.70
        )

    def print_report(self, analysis: Dict[str, Any]):
        """Print the analysis report."""
        print("\n" + "="*70)
        print("MATCHING FAILURE ANALYSIS REPORT")
        print("="*70)

        print(f"\n📊 General Statistics")
        print(f"  • Total failures: {analysis['total_failures']}")
        print(f"  • Period: {analysis['date_range']['first']} → {analysis['date_range']['last']}")

        print(f"\n📈 Confidence Levels")
        conf = analysis['confidence_stats']
        print(f"  • Minimum: {conf['min']:.3f}")
        print(f"  • Maximum: {conf['max']:.3f}")
        print(f"  • Average: {conf['avg']:.3f}")
        print(f"  • < 0.70 (new states): {conf['below_70']}")
        print(f"  • 0.70-0.85 (near misses): {conf['between_70_85']}")
        print(f"  • > 0.85 (false negatives): {conf['above_85']}")

        print(f"\n💡 Generated Suggestions")
        for suggestion_type, count in analysis['suggestions_summary'].items():
            print(f"  • {suggestion_type}: {count}")

        print(f"\n⚠️ Problematic Nodes (Top 5)")
        for i, node in enumerate(analysis['problematic_nodes'][:5], 1):
            print(f"  {i}. {node['node_label']} (ID: {node['node_id']})")
            print(f"     - Near misses: {node['near_miss_count']}")
            print(f"     - Average confidence: {node['avg_confidence']:.3f}")

        print(f"\n🎯 Threshold Recommendations")
        thresh = analysis['threshold_recommendations']
        print(f"  • Current threshold: {thresh['current_threshold']:.3f}")
        print(f"  • P90 of confidences: {thresh['p90_confidence']:.3f}")
        print(f"  • Recommended threshold: {thresh['recommended_threshold']:.3f}")
        print(f"  • Action: {thresh['action']}")
        print(f"  • Reason: {thresh['reason']}")

        print(f"\n🆕 Newly Detected States")
        print(f"  • {analysis['new_states_detected']} potentially new states")
        print(f"    (confidence < 0.70; these require creating new nodes)")

        print("\n" + "="*70)

    def export_detailed_report(self, output_path: str = "failed_matches_analysis.json"):
        """Export a detailed report as JSON."""
        analysis = self.analyze()

        detailed_report = {
            'analysis': analysis,
            'individual_reports': [
                {
                    'timestamp': r['timestamp'],
                    'confidence': r['matching_results']['best_confidence'],
                    'suggestions': r['suggestions'],
                    'window_title': r['state']['window_title'],
                    'screenshot_path': str(r['_dir'] / "screenshot.png")
                }
                for r in self.reports
            ]
        }

        with open(output_path, 'w') as f:
            json.dump(detailed_report, f, indent=2)

        print(f"\n✓ Detailed report exported: {output_path}")

def main():
    parser = argparse.ArgumentParser(
        description="Analyze matching failures for continuous improvement"
    )
    parser.add_argument(
        '--last',
        type=int,
        help="Analyze the N most recent failures"
    )
    parser.add_argument(
        '--since-hours',
        type=int,
        help="Analyze the failures from the last X hours"
    )
    parser.add_argument(
        '--export',
        type=str,
        help="Export the detailed report as JSON"
    )
    parser.add_argument(
        '--dir',
        type=str,
        default="data/failed_matches",
        help="Directory containing the failures (default: data/failed_matches)"
    )

    args = parser.parse_args()

    # Create the analyzer
    analyzer = FailedMatchAnalyzer(failed_matches_dir=args.dir)

    # Load the reports
    analyzer.load_reports(last_n=args.last, since_hours=args.since_hours)

    if not analyzer.reports:
        print("\n❌ No reports to analyze")
        return 1

    # Analyze
    analysis = analyzer.analyze()

    # Print the report
    analyzer.print_report(analysis)

    # Export if requested
    if args.export:
        analyzer.export_detailed_report(args.export)

    return 0

if __name__ == '__main__':
    sys.exit(main())
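
# Example invocations (the filename 'analyze_failed_matches.py' is assumed,
# not taken from the repository; adjust to wherever this file lives):
#
#   python analyze_failed_matches.py --last 50
#   python analyze_failed_matches.py --since-hours 24 --export analysis.json
#   python analyze_failed_matches.py --dir /tmp/failed_matches --last 100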