""" core/supervision/supervisor.py Main Production Supervisor - Intelligent coordination brain for Fiche #22 Monitors system health, coordinates recovery strategies, and makes intelligent decisions about when to trigger circuit breakers, rollbacks, or degraded mode. Integrates with: - Fiche #21: Production healthchecks and systemd services - Fiche #19: Failure case recording and analysis - Fiche #18: Persistent learning system - Fiche #10: Precision metrics engine - Core healing engine: Self-healing strategies """ import asyncio import logging import time from dataclasses import dataclass, field from datetime import datetime, timedelta from enum import Enum from pathlib import Path from typing import Dict, List, Optional, Any, Callable from .circuit_breaker import CircuitBreaker, CircuitState from .rollback_manager import RollbackManager from .degraded_mode import DegradedModeManager from .production_policy import ProductionPolicy from ..analytics.engine.performance_analyzer import PerformanceAnalyzer from ..healing.healing_engine import SelfHealingEngine from ..evaluation.failure_case_recorder import FailureCaseRecorder from ..precision.metrics_engine import PrecisionMetricsEngine logger = logging.getLogger(__name__) class SupervisionLevel(Enum): """Levels of supervision intensity""" MINIMAL = "minimal" # Basic monitoring only STANDARD = "standard" # Normal production supervision INTENSIVE = "intensive" # High-frequency monitoring EMERGENCY = "emergency" # Crisis mode with aggressive intervention @dataclass class SystemHealth: """Current system health snapshot""" timestamp: datetime overall_score: float # 0.0 to 1.0 api_healthy: bool dashboard_healthy: bool worker_healthy: bool disk_space_ok: bool failure_rate: float avg_response_time_ms: float circuit_breaker_states: Dict[str, CircuitState] degraded_mode_active: bool active_issues: List[str] = field(default_factory=list) @dataclass class SupervisionEvent: """Event in the supervision system""" timestamp: datetime event_type: str severity: str # info, warning, error, critical component: str message: str metadata: Dict[str, Any] = field(default_factory=dict) class ProductionSupervisor: """ Main production supervisor - the intelligent brain of Fiche #22. Responsibilities: - Monitor system health across all components - Coordinate recovery strategies intelligently - Trigger circuit breakers when needed - Initiate rollbacks for persistent failures - Activate degraded mode for unstable contexts - Learn from supervision decisions """ def __init__( self, policy: Optional[ProductionPolicy] = None, data_dir: Path = Path("data"), supervision_level: SupervisionLevel = SupervisionLevel.STANDARD ): """ Initialize the production supervisor. Args: policy: Production policy configuration data_dir: Data directory for storage supervision_level: Level of supervision intensity """ self.policy = policy or ProductionPolicy() self.data_dir = data_dir self.supervision_level = supervision_level # Core components self.circuit_breaker = CircuitBreaker( failure_threshold=self.policy.circuit_breaker_failure_threshold, recovery_timeout=self.policy.circuit_breaker_recovery_timeout, half_open_max_calls=self.policy.circuit_breaker_half_open_calls ) self.rollback_manager = RollbackManager( data_dir / "supervision" / "rollbacks" ) self.degraded_mode = DegradedModeManager( data_dir / "supervision" / "degraded_mode" ) # Integration with existing systems self.healing_engine: Optional[SelfHealingEngine] = None self.performance_analyzer: Optional[PerformanceAnalyzer] = None self.failure_recorder: Optional[FailureCaseRecorder] = None self.metrics_engine: Optional[PrecisionMetricsEngine] = None # State tracking self.current_health: Optional[SystemHealth] = None self.supervision_events: List[SupervisionEvent] = [] self.last_health_check = datetime.min self.running = False # Callbacks for external integration self.health_check_callback: Optional[Callable[[], Dict[str, Any]]] = None self.restart_service_callback: Optional[Callable[[str], bool]] = None logger.info(f"ProductionSupervisor initialized with {supervision_level.value} level") def integrate_systems( self, healing_engine: Optional[SelfHealingEngine] = None, performance_analyzer: Optional[PerformanceAnalyzer] = None, failure_recorder: Optional[FailureCaseRecorder] = None, metrics_engine: Optional[PrecisionMetricsEngine] = None ): """ Integrate with existing RPA Vision V3 systems. Args: healing_engine: Self-healing engine from core.healing performance_analyzer: Performance analyzer from core.analytics failure_recorder: Failure case recorder from core.evaluation metrics_engine: Precision metrics engine from core.precision """ self.healing_engine = healing_engine self.performance_analyzer = performance_analyzer self.failure_recorder = failure_recorder self.metrics_engine = metrics_engine logger.info("Integrated with existing RPA Vision V3 systems") def set_callbacks( self, health_check_callback: Optional[Callable[[], Dict[str, Any]]] = None, restart_service_callback: Optional[Callable[[str], bool]] = None ): """ Set callbacks for external system integration. Args: health_check_callback: Function to perform health checks restart_service_callback: Function to restart services """ self.health_check_callback = health_check_callback self.restart_service_callback = restart_service_callback async def start_supervision(self): """Start the supervision loop.""" if self.running: logger.warning("Supervision already running") return self.running = True logger.info("Starting production supervision") try: while self.running: await self._supervision_cycle() # Sleep based on supervision level sleep_time = self._get_supervision_interval() await asyncio.sleep(sleep_time) except Exception as e: logger.exception(f"Supervision loop crashed: {e}") self._record_event( "supervision_crash", "critical", "supervisor", f"Supervision loop crashed: {e}" ) finally: self.running = False logger.info("Production supervision stopped") def stop_supervision(self): """Stop the supervision loop.""" self.running = False logger.info("Stopping production supervision") async def _supervision_cycle(self): """Single supervision cycle.""" try: # 1. Assess system health health = await self._assess_system_health() self.current_health = health # 2. Make supervision decisions decisions = self._make_supervision_decisions(health) # 3. Execute decisions await self._execute_decisions(decisions) # 4. Update circuit breaker states self._update_circuit_breakers(health) # 5. Learn from outcomes self._learn_from_supervision_cycle(health, decisions) except Exception as e: logger.exception(f"Error in supervision cycle: {e}") self._record_event( "supervision_error", "error", "supervisor", f"Error in supervision cycle: {e}" ) async def _assess_system_health(self) -> SystemHealth: """Assess overall system health.""" now = datetime.now() # Get basic health from callback or default basic_health = {} if self.health_check_callback: try: basic_health = self.health_check_callback() except Exception as e: logger.warning(f"Health check callback failed: {e}") # Default health values api_healthy = basic_health.get('api_healthy', True) dashboard_healthy = basic_health.get('dashboard_healthy', True) worker_healthy = basic_health.get('worker_healthy', True) disk_space_ok = basic_health.get('disk_space_ok', True) # Calculate failure rate from recent data failure_rate = await self._calculate_recent_failure_rate() # Get average response time avg_response_time = await self._calculate_avg_response_time() # Get circuit breaker states cb_states = { 'main': self.circuit_breaker.state, 'healing': self.circuit_breaker.state # Could have multiple CBs } # Calculate overall health score health_factors = [ 1.0 if api_healthy else 0.0, 1.0 if dashboard_healthy else 0.8, # Dashboard less critical 1.0 if worker_healthy else 0.7, 1.0 if disk_space_ok else 0.5, max(0.0, 1.0 - failure_rate), # Failure rate impact max(0.0, 1.0 - min(1.0, avg_response_time / 5000.0)) # Response time impact ] overall_score = sum(health_factors) / len(health_factors) # Identify active issues active_issues = [] if not api_healthy: active_issues.append("API service unhealthy") if not dashboard_healthy: active_issues.append("Dashboard service unhealthy") if not worker_healthy: active_issues.append("Worker service unhealthy") if not disk_space_ok: active_issues.append("Low disk space") if failure_rate > self.policy.max_acceptable_failure_rate: active_issues.append(f"High failure rate: {failure_rate:.2%}") if avg_response_time > self.policy.max_acceptable_response_time_ms: active_issues.append(f"Slow response time: {avg_response_time:.0f}ms") health = SystemHealth( timestamp=now, overall_score=overall_score, api_healthy=api_healthy, dashboard_healthy=dashboard_healthy, worker_healthy=worker_healthy, disk_space_ok=disk_space_ok, failure_rate=failure_rate, avg_response_time_ms=avg_response_time, circuit_breaker_states=cb_states, degraded_mode_active=self.degraded_mode.is_active(), active_issues=active_issues ) self.last_health_check = now return health def _make_supervision_decisions(self, health: SystemHealth) -> List[Dict[str, Any]]: """Make intelligent supervision decisions based on health.""" decisions = [] # Decision 1: Circuit breaker management if health.failure_rate > self.policy.circuit_breaker_failure_threshold: if self.circuit_breaker.state == CircuitState.CLOSED: decisions.append({ 'type': 'open_circuit_breaker', 'reason': f'High failure rate: {health.failure_rate:.2%}', 'priority': 'high' }) # Decision 2: Service restart if not health.api_healthy and self.restart_service_callback: decisions.append({ 'type': 'restart_service', 'service': 'api', 'reason': 'API service unhealthy', 'priority': 'critical' }) # Decision 3: Degraded mode activation if (health.overall_score < self.policy.degraded_mode_threshold and not self.degraded_mode.is_active()): decisions.append({ 'type': 'activate_degraded_mode', 'reason': f'Low health score: {health.overall_score:.2f}', 'priority': 'medium' }) # Decision 4: Rollback consideration if (health.failure_rate > self.policy.rollback_failure_threshold and self.rollback_manager.has_stable_version()): decisions.append({ 'type': 'consider_rollback', 'reason': f'Persistent high failure rate: {health.failure_rate:.2%}', 'priority': 'high' }) # Decision 5: Supervision level adjustment if health.overall_score < 0.5 and self.supervision_level != SupervisionLevel.INTENSIVE: decisions.append({ 'type': 'increase_supervision', 'level': SupervisionLevel.INTENSIVE, 'reason': 'Critical health issues detected', 'priority': 'medium' }) # Sort by priority priority_order = {'critical': 0, 'high': 1, 'medium': 2, 'low': 3} decisions.sort(key=lambda d: priority_order.get(d.get('priority', 'low'), 3)) return decisions async def _execute_decisions(self, decisions: List[Dict[str, Any]]): """Execute supervision decisions.""" for decision in decisions: try: await self._execute_single_decision(decision) except Exception as e: logger.exception(f"Failed to execute decision {decision}: {e}") self._record_event( "decision_execution_failed", "error", "supervisor", f"Failed to execute {decision['type']}: {e}", {'decision': decision} ) async def _execute_single_decision(self, decision: Dict[str, Any]): """Execute a single supervision decision.""" decision_type = decision['type'] if decision_type == 'open_circuit_breaker': self.circuit_breaker.record_failure() self._record_event( "circuit_breaker_opened", "warning", "circuit_breaker", decision['reason'] ) elif decision_type == 'restart_service': service = decision['service'] if self.restart_service_callback: success = self.restart_service_callback(service) if success: self._record_event( "service_restarted", "info", "supervisor", f"Successfully restarted {service} service" ) else: self._record_event( "service_restart_failed", "error", "supervisor", f"Failed to restart {service} service" ) elif decision_type == 'activate_degraded_mode': await self.degraded_mode.activate(decision['reason']) self._record_event( "degraded_mode_activated", "warning", "degraded_mode", decision['reason'] ) elif decision_type == 'consider_rollback': # This is a consideration, not immediate action # Could trigger user notification or automatic rollback based on policy self._record_event( "rollback_considered", "warning", "rollback_manager", decision['reason'] ) elif decision_type == 'increase_supervision': old_level = self.supervision_level self.supervision_level = decision['level'] self._record_event( "supervision_level_changed", "info", "supervisor", f"Changed supervision level from {old_level.value} to {decision['level'].value}" ) def _update_circuit_breakers(self, health: SystemHealth): """Update circuit breaker states based on health.""" if health.overall_score > 0.8 and health.failure_rate < 0.05: # System is healthy, record success self.circuit_breaker.record_success() def _learn_from_supervision_cycle( self, health: SystemHealth, decisions: List[Dict[str, Any]] ): """Learn from supervision decisions and outcomes.""" # This could integrate with the persistent learning system (Fiche #18) # For now, just log the learning opportunity if decisions: logger.debug(f"Learning opportunity: health={health.overall_score:.2f}, " f"decisions={len(decisions)}") async def _calculate_recent_failure_rate(self) -> float: """Calculate recent failure rate from metrics.""" if not self.performance_analyzer: return 0.0 try: # Look at last hour of data end_time = datetime.now() start_time = end_time - timedelta(hours=1) # This would need to be implemented based on available metrics # For now, return a placeholder return 0.02 # 2% failure rate except Exception as e: logger.warning(f"Could not calculate failure rate: {e}") return 0.0 async def _calculate_avg_response_time(self) -> float: """Calculate average response time from metrics.""" if not self.performance_analyzer: return 1000.0 # Default 1 second try: # This would integrate with the performance analyzer # For now, return a placeholder return 1500.0 # 1.5 seconds except Exception as e: logger.warning(f"Could not calculate response time: {e}") return 1000.0 def _get_supervision_interval(self) -> float: """Get supervision interval based on current level.""" intervals = { SupervisionLevel.MINIMAL: 300.0, # 5 minutes SupervisionLevel.STANDARD: 60.0, # 1 minute SupervisionLevel.INTENSIVE: 15.0, # 15 seconds SupervisionLevel.EMERGENCY: 5.0 # 5 seconds } return intervals.get(self.supervision_level, 60.0) def _record_event( self, event_type: str, severity: str, component: str, message: str, metadata: Optional[Dict[str, Any]] = None ): """Record a supervision event.""" event = SupervisionEvent( timestamp=datetime.now(), event_type=event_type, severity=severity, component=component, message=message, metadata=metadata or {} ) self.supervision_events.append(event) # Keep only recent events to prevent memory bloat max_events = 1000 if len(self.supervision_events) > max_events: self.supervision_events = self.supervision_events[-max_events:] # Log the event log_level = { 'info': logging.INFO, 'warning': logging.WARNING, 'error': logging.ERROR, 'critical': logging.CRITICAL }.get(severity, logging.INFO) logger.log(log_level, f"[{component}] {message}") def get_current_status(self) -> Dict[str, Any]: """Get current supervision status.""" return { 'supervision_level': self.supervision_level.value, 'running': self.running, 'current_health': self.current_health.__dict__ if self.current_health else None, 'circuit_breaker_state': self.circuit_breaker.state.value, 'degraded_mode_active': self.degraded_mode.is_active(), 'recent_events': [ { 'timestamp': e.timestamp.isoformat(), 'type': e.event_type, 'severity': e.severity, 'component': e.component, 'message': e.message } for e in self.supervision_events[-10:] # Last 10 events ] } def force_health_check(self) -> Optional[SystemHealth]: """Force an immediate health check (synchronous).""" try: # Run async health check in sync context loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: health = loop.run_until_complete(self._assess_system_health()) self.current_health = health return health finally: loop.close() except Exception as e: logger.exception(f"Force health check failed: {e}") return None