"""Logging and monitoring for self-healing operations.""" import json import logging from pathlib import Path from typing import Dict, List, Optional from datetime import datetime from .models import RecoveryContext, RecoveryResult class RecoveryLogger: """Logger for self-healing recovery operations.""" def __init__(self, log_path: Optional[Path] = None): """ Initialize recovery logger. Args: log_path: Path for storing recovery logs """ self.log_path = log_path or Path('logs/healing') self.log_path.mkdir(parents=True, exist_ok=True) # Setup file logger self.logger = logging.getLogger('healing') self.logger.setLevel(logging.INFO) # File handler log_file = self.log_path / 'recovery.log' handler = logging.FileHandler(log_file) handler.setFormatter(logging.Formatter( '%(asctime)s - %(levelname)s - %(message)s' )) self.logger.addHandler(handler) # Metrics storage self.metrics_file = self.log_path / 'metrics.json' self.metrics = self._load_metrics() def log_recovery_attempt( self, context: RecoveryContext, result: RecoveryResult ): """ Log a recovery attempt with full details. Args: context: Recovery context result: Recovery result """ log_entry = { 'timestamp': datetime.now().isoformat(), 'workflow_id': context.workflow_id, 'node_id': context.node_id, 'original_action': context.original_action, 'target_element': context.target_element, 'failure_reason': context.failure_reason, 'attempt_count': context.attempt_count, 'strategy_used': result.strategy_used, 'success': result.success, 'confidence_score': result.confidence_score, 'execution_time': result.execution_time, 'new_element': result.new_element, 'requires_user_input': result.requires_user_input, 'error_message': result.error_message } # Log to file if result.success: self.logger.info(f"Recovery SUCCESS: {json.dumps(log_entry)}") else: self.logger.warning(f"Recovery FAILED: {json.dumps(log_entry)}") # Update metrics self._update_metrics(context, result) def log_user_intervention( self, context: RecoveryContext, user_action: str, details: Dict ): """ Log user intervention in recovery process. Args: context: Recovery context user_action: Action taken by user details: Additional details """ log_entry = { 'timestamp': datetime.now().isoformat(), 'workflow_id': context.workflow_id, 'node_id': context.node_id, 'user_action': user_action, 'details': details } self.logger.info(f"User intervention: {json.dumps(log_entry)}") def get_recovery_statistics( self, workflow_id: Optional[str] = None ) -> Dict: """ Get recovery statistics. Args: workflow_id: Optional workflow ID to filter by Returns: Dictionary with statistics """ metrics = self.metrics.copy() if workflow_id and workflow_id in metrics.get('by_workflow', {}): return metrics['by_workflow'][workflow_id] return metrics def generate_insights(self) -> List[str]: """ Generate insights and recommendations from recovery patterns. Returns: List of insight strings """ insights = [] metrics = self.metrics # Overall success rate total = metrics.get('total_attempts', 0) successes = metrics.get('successful_recoveries', 0) if total > 0: success_rate = (successes / total) * 100 insights.append(f"Overall recovery success rate: {success_rate:.1f}%") # Strategy performance strategy_perf = metrics.get('strategy_performance', {}) if strategy_perf: best_strategy = max( strategy_perf.items(), key=lambda x: x[1].get('success_rate', 0) ) insights.append( f"Best performing strategy: {best_strategy[0]} " f"({best_strategy[1].get('success_rate', 0):.1f}% success)" ) # Time savings time_saved = metrics.get('time_saved_hours', 0) if time_saved > 0: insights.append(f"Estimated time saved: {time_saved:.1f} hours") # Repeated failures repeated_failures = self._detect_repeated_failures() if repeated_failures: insights.append( f"Warning: {len(repeated_failures)} workflows have repeated failures" ) return insights def check_for_alerts(self) -> List[Dict]: """ Check for conditions that require administrator attention. Returns: List of alert dictionaries """ alerts = [] # Check for repeated failures repeated_failures = self._detect_repeated_failures() for workflow_id, count in repeated_failures.items(): if count >= 5: alerts.append({ 'severity': 'high', 'type': 'repeated_failures', 'workflow_id': workflow_id, 'count': count, 'message': f'Workflow {workflow_id} has {count} repeated failures' }) # Check for low success rates strategy_perf = self.metrics.get('strategy_performance', {}) for strategy, perf in strategy_perf.items(): success_rate = perf.get('success_rate', 0) attempts = perf.get('attempts', 0) if attempts >= 10 and success_rate < 50: alerts.append({ 'severity': 'medium', 'type': 'low_success_rate', 'strategy': strategy, 'success_rate': success_rate, 'message': f'Strategy {strategy} has low success rate: {success_rate:.1f}%' }) return alerts def _update_metrics(self, context: RecoveryContext, result: RecoveryResult): """Update metrics with recovery result.""" # Total attempts self.metrics['total_attempts'] = self.metrics.get('total_attempts', 0) + 1 # Successful recoveries if result.success: self.metrics['successful_recoveries'] = \ self.metrics.get('successful_recoveries', 0) + 1 # Estimate time saved (assume 5 minutes per manual intervention) time_saved_hours = self.metrics.get('time_saved_hours', 0.0) self.metrics['time_saved_hours'] = time_saved_hours + (5.0 / 60.0) # Strategy performance if 'strategy_performance' not in self.metrics: self.metrics['strategy_performance'] = {} strategy = result.strategy_used if strategy not in self.metrics['strategy_performance']: self.metrics['strategy_performance'][strategy] = { 'attempts': 0, 'successes': 0, 'success_rate': 0.0 } perf = self.metrics['strategy_performance'][strategy] perf['attempts'] += 1 if result.success: perf['successes'] += 1 perf['success_rate'] = (perf['successes'] / perf['attempts']) * 100 # By workflow if 'by_workflow' not in self.metrics: self.metrics['by_workflow'] = {} workflow_id = context.workflow_id if workflow_id not in self.metrics['by_workflow']: self.metrics['by_workflow'][workflow_id] = { 'attempts': 0, 'successes': 0, 'failures': 0 } wf_metrics = self.metrics['by_workflow'][workflow_id] wf_metrics['attempts'] += 1 if result.success: wf_metrics['successes'] += 1 else: wf_metrics['failures'] += 1 # Save metrics self._save_metrics() def _detect_repeated_failures(self) -> Dict[str, int]: """Detect workflows with repeated failures.""" repeated = {} by_workflow = self.metrics.get('by_workflow', {}) for workflow_id, metrics in by_workflow.items(): failures = metrics.get('failures', 0) if failures >= 3: repeated[workflow_id] = failures return repeated def _load_metrics(self) -> Dict: """Load metrics from storage.""" if not self.metrics_file.exists(): return {} try: with open(self.metrics_file, 'r') as f: return json.load(f) except Exception as e: self.logger.error(f"Error loading metrics: {e}") return {} def _save_metrics(self): """Save metrics to storage.""" try: with open(self.metrics_file, 'w') as f: json.dump(self.metrics, f, indent=2) except Exception as e: self.logger.error(f"Error saving metrics: {e}")