Files
Geniusia_v2/rpa_vision_v3/monitor_matching_health.py
2026-03-05 00:20:25 +01:00

179 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""
Dashboard de monitoring de la santé du système de matching.
"""
import json
import sys
import time
from collections import deque
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional
class MatchingHealthMonitor:
    """Track recent matching failures and derive health metrics from them.

    Failure reports are read from sub-directories of ``failed_matches_dir``
    whose names have the form ``failed_match_<YYYYmmdd_HHMMSS>`` and which
    contain a ``report.json`` file. At most the 100 most recently loaded
    reports are kept in memory.
    """

    def __init__(
        self,
        failed_matches_dir: str = "data/failed_matches",
        alert_threshold: int = 5,
        window_minutes: int = 10
    ):
        """Configure the monitor.

        Args:
            failed_matches_dir: Directory that holds one sub-directory per
                failed match.
            alert_threshold: Number of failures in the last 10 minutes at
                which a HIGH_FAILURE_RATE warning is raised.
            window_minutes: Width of the window used to compute the
                per-minute failure rate.
        """
        self.failed_matches_dir = Path(failed_matches_dir)
        self.alert_threshold = alert_threshold
        self.window_minutes = window_minutes
        self.recent_failures = deque(maxlen=100)
        self.last_check: Optional[datetime] = None
        # Names of directories already loaded that can still pass the time
        # filter; used to avoid loading the same failure twice.
        self._loaded_dirs = set()

    def check_health(self) -> Dict[str, Any]:
        """Load failures recorded since the last call and compute metrics.

        Returns:
            A dict with the keys ``timestamp``, ``total_failures_tracked``,
            ``failures_last_10min``, ``failures_last_hour``,
            ``failure_rate_per_min``, ``avg_confidence``,
            ``confidence_samples`` and ``alerts``.
        """
        now = datetime.now()

        # Pull in whatever appeared since the previous check.
        new_failures = self._load_new_failures(since=self.last_check)
        self.recent_failures.extend(new_failures)
        self.last_check = now

        metrics = {
            'timestamp': now.isoformat(),
            'total_failures_tracked': len(self.recent_failures),
            'failures_last_10min': self._count_recent_failures(minutes=10),
            'failures_last_hour': self._count_recent_failures(minutes=60),
            'failure_rate_per_min': self._calculate_failure_rate(),
            'avg_confidence': self._calculate_avg_confidence(),
            # How many reports actually contributed to avg_confidence; lets
            # consumers tell "no data" apart from "confidence is really 0".
            'confidence_samples': len(self._confidence_values()),
            'alerts': []
        }

        metrics['alerts'] = self._generate_alerts(metrics)
        return metrics

    def _load_new_failures(self, since: Optional[datetime] = None) -> List[Dict[str, Any]]:
        """Read failure reports whose directory timestamp is not older than ``since``.

        Args:
            since: Lower time bound. Defaults to one hour before now.

        Returns:
            The newly loaded reports, each augmented with a ``_timestamp``
            key (parsed from the directory name), sorted by that timestamp.
            Directories with an unparseable name, or with a missing,
            unreadable or non-object ``report.json``, are skipped.
        """
        if not self.failed_matches_dir.exists():
            return []
        if since is None:
            since = datetime.now() - timedelta(hours=1)

        # Directory names only carry whole seconds, so compare against the
        # start of the second of `since`; otherwise a failure recorded in the
        # same second as the previous check (but after it) would never load.
        cutoff = since.replace(microsecond=0)
        already_loaded = getattr(self, '_loaded_dirs', set())
        still_in_window = set()

        new_failures = []
        for match_dir in self.failed_matches_dir.iterdir():
            if not match_dir.is_dir():
                continue
            timestamp_str = match_dir.name.replace("failed_match_", "")
            try:
                timestamp = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S")
            except ValueError:
                # Not a failure directory (name does not match the pattern).
                continue
            if timestamp < cutoff:
                continue
            if match_dir.name in already_loaded:
                # Inclusive cutoff means we may see it again: keep it marked.
                still_in_window.add(match_dir.name)
                continue
            report = self._read_report(match_dir / "report.json")
            if report is None:
                continue
            report['_timestamp'] = timestamp
            new_failures.append(report)
            still_in_window.add(match_dir.name)

        # Names older than the cutoff can no longer pass the time filter, so
        # dropping them keeps this set from growing without bound.
        self._loaded_dirs = still_in_window
        return sorted(new_failures, key=lambda x: x['_timestamp'])

    @staticmethod
    def _read_report(report_path: Path) -> Optional[Dict[str, Any]]:
        """Return the JSON object stored at ``report_path``, or None if unusable."""
        try:
            with open(report_path, 'r', encoding='utf-8') as f:
                report = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file, bad encoding or invalid JSON.
            return None
        return report if isinstance(report, dict) else None

    def _count_recent_failures(self, minutes: int) -> int:
        """Count tracked failures whose timestamp is within the last ``minutes``."""
        cutoff = datetime.now() - timedelta(minutes=minutes)
        return sum(1 for f in self.recent_failures if f['_timestamp'] > cutoff)

    def _calculate_failure_rate(self) -> float:
        """Return failures per minute over the last ``window_minutes``.

        Returns 0.0 when nothing is tracked or the window is not positive.
        """
        if not self.recent_failures or self.window_minutes <= 0:
            return 0.0
        recent_count = self._count_recent_failures(minutes=self.window_minutes)
        return recent_count / self.window_minutes

    def _confidence_values(self) -> List[float]:
        """Collect ``matching_results.best_confidence`` from tracked reports.

        Reports where that value is absent or not numeric are ignored.
        """
        values = []
        for failure in self.recent_failures:
            results = failure.get('matching_results')
            if not isinstance(results, dict):
                continue
            confidence = results.get('best_confidence')
            if isinstance(confidence, bool) or not isinstance(confidence, (int, float)):
                continue
            values.append(confidence)
        return values

    def _calculate_avg_confidence(self) -> float:
        """Return the mean best-confidence of tracked failures (0.0 if none)."""
        confidences = self._confidence_values()
        if not confidences:
            return 0.0
        return sum(confidences) / len(confidences)

    def _generate_alerts(self, metrics: Dict[str, Any]) -> List[Dict[str, str]]:
        """Build the alert list for a metrics dict.

        Args:
            metrics: Must contain ``failures_last_10min`` and
                ``avg_confidence``; ``confidence_samples`` is used when
                present.

        Returns:
            A list of dicts with the keys ``level``, ``type`` and ``message``.
        """
        alerts = []

        if metrics['failures_last_10min'] >= self.alert_threshold:
            alerts.append({
                'level': 'WARNING',
                'type': 'HIGH_FAILURE_RATE',
                'message': f"{metrics['failures_last_10min']} échecs en 10 min"
            })

        # An average of 0.0 with no samples means "no data", not "low
        # confidence": do not raise a CRITICAL alert on an idle system.
        samples = metrics.get('confidence_samples')
        if samples is None:
            samples = len(self._confidence_values())
        if samples > 0 and metrics['avg_confidence'] < 0.60:
            alerts.append({
                'level': 'CRITICAL',
                'type': 'LOW_CONFIDENCE',
                'message': f"Confiance moyenne: {metrics['avg_confidence']:.3f}"
            })

        return alerts

    def print_dashboard(self, metrics: Dict[str, Any]):
        """Print the metrics returned by ``check_health`` to standard output."""
        print("\n" + "="*70)
        print("DASHBOARD DE SANTÉ DU MATCHING")
        print("="*70)
        print(f"Timestamp: {metrics['timestamp']}")

        print("\n📊 Métriques")
        print(f" • Échecs suivis: {metrics['total_failures_tracked']}")
        print(f" • Échecs (10 min): {metrics['failures_last_10min']}")
        print(f" • Échecs (1 heure): {metrics['failures_last_hour']}")
        print(f" • Taux: {metrics['failure_rate_per_min']:.2f}/min")
        print(f" • Confiance moy: {metrics['avg_confidence']:.3f}")

        if metrics['alerts']:
            print(f"\n🚨 Alertes ({len(metrics['alerts'])})")
            for alert in metrics['alerts']:
                icon = {'CRITICAL': '🔴', 'WARNING': '🟡', 'INFO': '🔵'}.get(alert['level'], '')
                print(f" {icon} [{alert['level']}] {alert['type']}")
                print(f" {alert['message']}")
        else:
            print("\n✅ Système en bonne santé")

        print("\n" + "="*70)
def main(argv=None) -> int:
    """Command-line entry point: print the dashboard once or in a loop.

    Args:
        argv: Argument list to parse. When None, argparse reads the
            process arguments, which is the behaviour of a plain ``main()``.

    Returns:
        0 on normal completion. Invalid arguments make argparse exit with
        status 2.
    """
    import argparse

    def positive_int(raw: str) -> int:
        # A zero interval would busy-loop and a negative one makes
        # time.sleep raise, so reject both while parsing.
        try:
            value = int(raw)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
        if value <= 0:
            raise argparse.ArgumentTypeError(
                "l'intervalle doit être un entier strictement positif"
            )
        return value

    parser = argparse.ArgumentParser(description="Monitorer la santé du matching")
    parser.add_argument('--continuous', action='store_true', help="Mode continu")
    parser.add_argument('--interval', type=positive_int, default=60, help="Intervalle (s)")
    args = parser.parse_args(argv)

    monitor = MatchingHealthMonitor()

    if args.continuous:
        print("🔄 Monitoring continu... (Ctrl+C pour arrêter)")
        try:
            while True:
                metrics = monitor.check_health()
                monitor.print_dashboard(metrics)
                time.sleep(args.interval)
        except KeyboardInterrupt:
            # Ctrl+C is the documented way to stop the loop.
            print("\n✓ Arrêté")
    else:
        metrics = monitor.check_health()
        monitor.print_dashboard(metrics)

    return 0
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)