#!/usr/bin/env python3
"""Health-monitoring dashboard for the matching system.

Scans a directory of failed-match reports, keeps a bounded history of the
most recent failures, derives a few metrics from it and prints them as a
text dashboard, either once or in a loop.
"""

import argparse
import json
import sys
import time
from collections import deque
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional


class MatchingHealthMonitor:
    """Track recent matching failures and derive health metrics from them.

    Failures are read from sub-directories of ``failed_matches_dir``. Each
    sub-directory name carries a ``%Y%m%d_%H%M%S`` timestamp (optionally
    prefixed with ``failed_match_``) and contains a ``report.json`` file.
    """

    def __init__(
        self,
        failed_matches_dir: str = "data/failed_matches",
        alert_threshold: int = 5,
        window_minutes: int = 10
    ):
        """Configure the monitor.

        Args:
            failed_matches_dir: Directory holding one sub-directory per
                failed match.
            alert_threshold: Number of failures in the last 10 minutes at
                which a HIGH_FAILURE_RATE warning is raised.
            window_minutes: Width of the window used for the per-minute
                failure rate.
        """
        self.failed_matches_dir = Path(failed_matches_dir)
        self.alert_threshold = alert_threshold
        self.window_minutes = window_minutes
        # Bounded history: only the 100 most recent failures are retained.
        self.recent_failures = deque(maxlen=100)
        # Time of the previous check_health() call; None before the first.
        self.last_check: Optional[datetime] = None

    def check_health(self) -> Dict[str, Any]:
        """Load new failures, then compute metrics and alerts.

        Returns:
            A dict with the keys ``timestamp``, ``total_failures_tracked``,
            ``failures_last_10min``, ``failures_last_hour``,
            ``failure_rate_per_min``, ``avg_confidence`` and ``alerts``.
        """
        now = datetime.now()

        # The loader compares at one-second resolution and inclusively, so a
        # report stamped in the same second as the previous check can be
        # returned again; drop anything that is already being tracked.
        loaded = self._load_new_failures(since=self.last_check)
        known = {failure['_timestamp'] for failure in self.recent_failures}
        self.recent_failures.extend(
            failure for failure in loaded
            if failure['_timestamp'] not in known
        )
        self.last_check = now

        metrics = {
            'timestamp': now.isoformat(),
            'total_failures_tracked': len(self.recent_failures),
            'failures_last_10min': self._count_recent_failures(minutes=10),
            'failures_last_hour': self._count_recent_failures(minutes=60),
            'failure_rate_per_min': self._calculate_failure_rate(),
            'avg_confidence': self._calculate_avg_confidence(),
            'alerts': []
        }
        metrics['alerts'] = self._generate_alerts(metrics)

        return metrics

    def _load_new_failures(
        self, since: Optional[datetime] = None
    ) -> List[Dict[str, Any]]:
        """Read failure reports whose directory timestamp is not before ``since``.

        Args:
            since: Lower bound for the directory timestamp. Defaults to one
                hour before the current time. It is truncated to whole
                seconds and compared inclusively, because directory names
                only have one-second resolution.

        Returns:
            The parsed reports sorted by timestamp, each with the parsed
            directory timestamp stored under ``'_timestamp'``. Directories
            with an unparsable name, a missing, unreadable or invalid
            ``report.json``, or a report that is not a JSON object are
            skipped.
        """
        # is_dir() rather than exists(): iterating a plain file would raise.
        if not self.failed_matches_dir.is_dir():
            return []

        if since is None:
            since = datetime.now() - timedelta(hours=1)
        # A strict, microsecond-precise comparison would permanently skip a
        # failure written later in the same second as the previous check.
        since = since.replace(microsecond=0)

        new_failures = []
        for match_dir in self.failed_matches_dir.iterdir():
            if not match_dir.is_dir():
                continue

            timestamp_str = match_dir.name.replace("failed_match_", "")
            try:
                timestamp = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S")
            except ValueError:
                # Not a failed-match directory.
                continue
            if timestamp < since:
                continue

            report_path = match_dir / "report.json"
            try:
                with open(report_path, 'r') as f:
                    report = json.load(f)
            except (OSError, ValueError):
                # Missing or unreadable file, or invalid JSON (decode errors
                # are ValueError subclasses): best effort, skip this report.
                continue
            if not isinstance(report, dict):
                continue

            report['_timestamp'] = timestamp
            new_failures.append(report)

        return sorted(new_failures, key=lambda x: x['_timestamp'])

    def _count_recent_failures(self, minutes: int) -> int:
        """Count tracked failures newer than ``minutes`` minutes ago."""
        cutoff = datetime.now() - timedelta(minutes=minutes)
        return sum(1 for f in self.recent_failures if f['_timestamp'] > cutoff)

    def _calculate_failure_rate(self) -> float:
        """Return failures per minute over the last ``window_minutes``.

        Returns 0.0 when nothing is tracked or the window is not positive.
        """
        if not self.recent_failures or self.window_minutes <= 0:
            return 0.0

        recent_count = self._count_recent_failures(minutes=self.window_minutes)
        return recent_count / self.window_minutes

    def _calculate_avg_confidence(self) -> float:
        """Return the mean ``best_confidence`` over the tracked failures.

        Reports without a numeric ``matching_results.best_confidence`` are
        ignored. Returns 0.0 when no usable value is available.
        """
        confidences = []
        for failure in self.recent_failures:
            results = failure.get('matching_results')
            value = (
                results.get('best_confidence')
                if isinstance(results, dict) else None
            )
            # bool is an int subclass but is not a confidence value.
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                confidences.append(value)

        if not confidences:
            return 0.0
        return sum(confidences) / len(confidences)

    def _generate_alerts(self, metrics: Dict[str, Any]) -> List[Dict[str, str]]:
        """Build the alert list for a metrics dict.

        Args:
            metrics: Metrics as produced by ``check_health``.

        Returns:
            A list of alerts, each a dict with ``level``, ``type`` and
            ``message``.
        """
        alerts = []

        if metrics['failures_last_10min'] >= self.alert_threshold:
            alerts.append({
                'level': 'WARNING',
                'type': 'HIGH_FAILURE_RATE',
                'message': f"{metrics['failures_last_10min']} échecs en 10 min"
            })

        # With nothing tracked, avg_confidence is the 0.0 placeholder rather
        # than a measurement, so it must not raise a CRITICAL alert.
        has_failures = metrics.get('total_failures_tracked', 0) > 0
        if has_failures and metrics['avg_confidence'] < 0.60:
            alerts.append({
                'level': 'CRITICAL',
                'type': 'LOW_CONFIDENCE',
                'message': f"Confiance moyenne: {metrics['avg_confidence']:.3f}"
            })

        return alerts

    def print_dashboard(self, metrics: Dict[str, Any]):
        """Print the metrics and alerts as a text dashboard on stdout."""
        print("\n" + "="*70)
        print("DASHBOARD DE SANTÉ DU MATCHING")
        print("="*70)
        print(f"Timestamp: {metrics['timestamp']}")

        print("\n📊 Métriques")
        print(f" • Échecs suivis: {metrics['total_failures_tracked']}")
        print(f" • Échecs (10 min): {metrics['failures_last_10min']}")
        print(f" • Échecs (1 heure): {metrics['failures_last_hour']}")
        print(f" • Taux: {metrics['failure_rate_per_min']:.2f}/min")
        print(f" • Confiance moy: {metrics['avg_confidence']:.3f}")

        if metrics['alerts']:
            print(f"\n🚨 Alertes ({len(metrics['alerts'])})")
            for alert in metrics['alerts']:
                icon = {'CRITICAL': '🔴', 'WARNING': '🟡', 'INFO': '🔵'}.get(alert['level'], '⚪')
                print(f" {icon} [{alert['level']}] {alert['type']}")
                print(f" {alert['message']}")
        else:
            print("\n✅ Système en bonne santé")

        print("\n" + "="*70)


def main():
    """Run one health check, or loop until interrupted with --continuous.

    Returns:
        The process exit code (always 0).
    """
    parser = argparse.ArgumentParser(description="Monitorer la santé du matching")
    parser.add_argument('--continuous', action='store_true', help="Mode continu")
    parser.add_argument('--interval', type=int, default=60, help="Intervalle (s)")

    args = parser.parse_args()

    monitor = MatchingHealthMonitor()

    if args.continuous:
        print("🔄 Monitoring continu... (Ctrl+C pour arrêter)")
        try:
            while True:
                metrics = monitor.check_health()
                monitor.print_dashboard(metrics)
                time.sleep(args.interval)
        except KeyboardInterrupt:
            print("\n✓ Arrêté")
    else:
        metrics = monitor.check_health()
        monitor.print_dashboard(metrics)

    return 0


if __name__ == '__main__':
    sys.exit(main())