- Frontend v4 accessible sur réseau local (192.168.1.40) - Ports ouverts: 3002 (frontend), 5001 (backend), 5004 (dashboard) - Ollama GPU fonctionnel - Self-healing interactif - Dashboard confiance Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
287 lines
9.5 KiB
Python
287 lines
9.5 KiB
Python
"""Logging and monitoring for self-healing operations."""
|
|
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
from datetime import datetime
|
|
from .models import RecoveryContext, RecoveryResult
|
|
|
|
|
|
class RecoveryLogger:
    """Logger for self-healing recovery operations.

    Records every recovery attempt and user intervention to a text log
    under ``log_path`` and maintains aggregate metrics (overall, per
    strategy, per workflow) persisted as JSON in ``metrics.json``.
    """

    def __init__(self, log_path: Optional[Path] = None):
        """
        Initialize recovery logger.

        Args:
            log_path: Path for storing recovery logs. Defaults to
                ``logs/healing``; created (with parents) if missing.
        """
        self.log_path = log_path or Path('logs/healing')
        self.log_path.mkdir(parents=True, exist_ok=True)

        # Setup file logger.
        # NOTE: logging.getLogger('healing') returns a process-wide
        # singleton, so we must guard against attaching a duplicate
        # FileHandler — otherwise every RecoveryLogger instantiation
        # would add another handler and each record would be written
        # multiple times.
        self.logger = logging.getLogger('healing')
        self.logger.setLevel(logging.INFO)

        log_file = self.log_path / 'recovery.log'
        already_attached = any(
            isinstance(h, logging.FileHandler)
            and Path(h.baseFilename).resolve() == log_file.resolve()
            for h in self.logger.handlers
        )
        if not already_attached:
            handler = logging.FileHandler(log_file, encoding='utf-8')
            handler.setFormatter(logging.Formatter(
                '%(asctime)s - %(levelname)s - %(message)s'
            ))
            self.logger.addHandler(handler)

        # Metrics storage (loaded eagerly so counters accumulate across runs)
        self.metrics_file = self.log_path / 'metrics.json'
        self.metrics = self._load_metrics()

    def log_recovery_attempt(
        self,
        context: RecoveryContext,
        result: RecoveryResult
    ):
        """
        Log a recovery attempt with full details and update metrics.

        Args:
            context: Recovery context (workflow/node/action that failed)
            result: Recovery result (strategy outcome)
        """
        log_entry = {
            'timestamp': datetime.now().isoformat(),
            'workflow_id': context.workflow_id,
            'node_id': context.node_id,
            'original_action': context.original_action,
            'target_element': context.target_element,
            'failure_reason': context.failure_reason,
            'attempt_count': context.attempt_count,
            'strategy_used': result.strategy_used,
            'success': result.success,
            'confidence_score': result.confidence_score,
            'execution_time': result.execution_time,
            'new_element': result.new_element,
            'requires_user_input': result.requires_user_input,
            'error_message': result.error_message
        }

        # Log to file (lazy %-args per logging convention)
        if result.success:
            self.logger.info("Recovery SUCCESS: %s", json.dumps(log_entry))
        else:
            self.logger.warning("Recovery FAILED: %s", json.dumps(log_entry))

        # Update metrics
        self._update_metrics(context, result)

    def log_user_intervention(
        self,
        context: RecoveryContext,
        user_action: str,
        details: Dict
    ):
        """
        Log user intervention in recovery process.

        Args:
            context: Recovery context
            user_action: Action taken by user
            details: Additional details (must be JSON-serializable)
        """
        log_entry = {
            'timestamp': datetime.now().isoformat(),
            'workflow_id': context.workflow_id,
            'node_id': context.node_id,
            'user_action': user_action,
            'details': details
        }

        self.logger.info("User intervention: %s", json.dumps(log_entry))

    def get_recovery_statistics(
        self,
        workflow_id: Optional[str] = None
    ) -> Dict:
        """
        Get recovery statistics.

        Args:
            workflow_id: Optional workflow ID to filter by

        Returns:
            Dictionary with statistics. When ``workflow_id`` is given and
            known, only that workflow's counters are returned; otherwise
            a shallow copy of the full metrics dict.
        """
        metrics = self.metrics.copy()

        if workflow_id and workflow_id in metrics.get('by_workflow', {}):
            return metrics['by_workflow'][workflow_id]

        return metrics

    def generate_insights(self) -> List[str]:
        """
        Generate insights and recommendations from recovery patterns.

        Returns:
            List of human-readable insight strings
        """
        insights = []
        metrics = self.metrics

        # Overall success rate
        total = metrics.get('total_attempts', 0)
        successes = metrics.get('successful_recoveries', 0)
        if total > 0:
            success_rate = (successes / total) * 100
            insights.append(f"Overall recovery success rate: {success_rate:.1f}%")

        # Strategy performance: report the best strategy by success rate
        strategy_perf = metrics.get('strategy_performance', {})
        if strategy_perf:
            best_strategy = max(
                strategy_perf.items(),
                key=lambda x: x[1].get('success_rate', 0)
            )
            insights.append(
                f"Best performing strategy: {best_strategy[0]} "
                f"({best_strategy[1].get('success_rate', 0):.1f}% success)"
            )

        # Time savings (accumulated by _update_metrics on each success)
        time_saved = metrics.get('time_saved_hours', 0)
        if time_saved > 0:
            insights.append(f"Estimated time saved: {time_saved:.1f} hours")

        # Repeated failures
        repeated_failures = self._detect_repeated_failures()
        if repeated_failures:
            insights.append(
                f"Warning: {len(repeated_failures)} workflows have repeated failures"
            )

        return insights

    def check_for_alerts(self) -> List[Dict]:
        """
        Check for conditions that require administrator attention.

        Returns:
            List of alert dictionaries with 'severity', 'type' and
            'message' keys (plus context-specific fields).
        """
        alerts = []

        # Check for repeated failures (5+ failures => high-severity alert)
        repeated_failures = self._detect_repeated_failures()
        for workflow_id, count in repeated_failures.items():
            if count >= 5:
                alerts.append({
                    'severity': 'high',
                    'type': 'repeated_failures',
                    'workflow_id': workflow_id,
                    'count': count,
                    'message': f'Workflow {workflow_id} has {count} repeated failures'
                })

        # Check for low success rates (only after 10+ attempts, to avoid
        # flagging strategies on small samples)
        strategy_perf = self.metrics.get('strategy_performance', {})
        for strategy, perf in strategy_perf.items():
            success_rate = perf.get('success_rate', 0)
            attempts = perf.get('attempts', 0)
            if attempts >= 10 and success_rate < 50:
                alerts.append({
                    'severity': 'medium',
                    'type': 'low_success_rate',
                    'strategy': strategy,
                    'success_rate': success_rate,
                    'message': f'Strategy {strategy} has low success rate: {success_rate:.1f}%'
                })

        return alerts

    def _update_metrics(self, context: RecoveryContext, result: RecoveryResult):
        """Update aggregate metrics with a recovery result and persist them."""
        # Total attempts
        self.metrics['total_attempts'] = self.metrics.get('total_attempts', 0) + 1

        # Successful recoveries
        if result.success:
            self.metrics['successful_recoveries'] = \
                self.metrics.get('successful_recoveries', 0) + 1

            # Estimate time saved (assume 5 minutes per manual intervention)
            time_saved_hours = self.metrics.get('time_saved_hours', 0.0)
            self.metrics['time_saved_hours'] = time_saved_hours + (5.0 / 60.0)

        # Strategy performance (setdefault keeps first-seen initialization)
        strategy_perf = self.metrics.setdefault('strategy_performance', {})
        perf = strategy_perf.setdefault(result.strategy_used, {
            'attempts': 0,
            'successes': 0,
            'success_rate': 0.0
        })
        perf['attempts'] += 1
        if result.success:
            perf['successes'] += 1
        perf['success_rate'] = (perf['successes'] / perf['attempts']) * 100

        # Per-workflow counters
        by_workflow = self.metrics.setdefault('by_workflow', {})
        wf_metrics = by_workflow.setdefault(context.workflow_id, {
            'attempts': 0,
            'successes': 0,
            'failures': 0
        })
        wf_metrics['attempts'] += 1
        if result.success:
            wf_metrics['successes'] += 1
        else:
            wf_metrics['failures'] += 1

        # Persist after every update so metrics survive a crash/restart
        self._save_metrics()

    def _detect_repeated_failures(self) -> Dict[str, int]:
        """Detect workflows with repeated failures (3 or more)."""
        by_workflow = self.metrics.get('by_workflow', {})
        return {
            workflow_id: metrics.get('failures', 0)
            for workflow_id, metrics in by_workflow.items()
            if metrics.get('failures', 0) >= 3
        }

    def _load_metrics(self) -> Dict:
        """Load metrics from storage; return {} when absent or unreadable."""
        if not self.metrics_file.exists():
            return {}

        try:
            with open(self.metrics_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        # Narrow catch: file errors or corrupt JSON; anything else is a bug
        # that should surface.
        except (OSError, json.JSONDecodeError) as e:
            self.logger.error("Error loading metrics: %s", e)
            return {}

    def _save_metrics(self):
        """Persist metrics to JSON storage; log (but don't raise) on failure."""
        try:
            with open(self.metrics_file, 'w', encoding='utf-8') as f:
                json.dump(self.metrics, f, indent=2)
        # TypeError/ValueError cover non-serializable metric values;
        # OSError covers disk/permission issues.
        except (OSError, TypeError, ValueError) as e:
            self.logger.error("Error saving metrics: %s", e)