v1.0 - Version stable: multi-PC, détection UI-DETR-1, 3 modes exécution
- Frontend v4 accessible sur réseau local (192.168.1.40) - Ports ouverts: 3002 (frontend), 5001 (backend), 5004 (dashboard) - Ollama GPU fonctionnel - Self-healing interactif - Dashboard confiance Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
12
core/analytics/collection/__init__.py
Normal file
12
core/analytics/collection/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Data collection components for analytics."""
|
||||
|
||||
from .metrics_collector import MetricsCollector, ExecutionMetrics, StepMetrics
|
||||
from .resource_collector import ResourceCollector, ResourceMetrics
|
||||
|
||||
__all__ = [
|
||||
'MetricsCollector',
|
||||
'ExecutionMetrics',
|
||||
'StepMetrics',
|
||||
'ResourceCollector',
|
||||
'ResourceMetrics',
|
||||
]
|
||||
348
core/analytics/collection/metrics_collector.py
Normal file
348
core/analytics/collection/metrics_collector.py
Normal file
@@ -0,0 +1,348 @@
|
||||
"""Metrics collection for workflow executions."""
|
||||
|
||||
import threading
|
||||
import time
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Any, Optional, Union
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ExecutionMetrics:
    """Record of one workflow execution, from start to completion."""
    # Identity: which execution of which workflow this record belongs to.
    execution_id: str
    workflow_id: str
    # Timing: completed_at / duration_ms remain None while still running.
    started_at: datetime
    completed_at: Optional[datetime] = None
    duration_ms: Optional[float] = None
    status: str = 'running'  # 'running', 'completed', 'failed'
    # Step counters, filled in when the execution finishes.
    steps_total: int = 0
    steps_completed: int = 0
    steps_failed: int = 0
    error_message: Optional[str] = None
    # Free-form context supplied by the caller at execution start.
    context: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for storage."""
        finished = self.completed_at
        return {
            'execution_id': self.execution_id,
            'workflow_id': self.workflow_id,
            'started_at': self.started_at.isoformat(),
            'completed_at': None if finished is None else finished.isoformat(),
            'duration_ms': self.duration_ms,
            'status': self.status,
            'steps_total': self.steps_total,
            'steps_completed': self.steps_completed,
            'steps_failed': self.steps_failed,
            'error_message': self.error_message,
            'context': self.context,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ExecutionMetrics':
        """Create from dictionary (inverse of :meth:`to_dict`)."""
        raw_completed = data.get('completed_at')
        return cls(
            execution_id=data['execution_id'],
            workflow_id=data['workflow_id'],
            started_at=datetime.fromisoformat(data['started_at']),
            completed_at=datetime.fromisoformat(raw_completed) if raw_completed else None,
            duration_ms=data.get('duration_ms'),
            status=data.get('status', 'running'),
            steps_total=data.get('steps_total', 0),
            steps_completed=data.get('steps_completed', 0),
            steps_failed=data.get('steps_failed', 0),
            error_message=data.get('error_message'),
            context=data.get('context', {}),
        )
|
||||
|
||||
|
||||
@dataclass
class StepMetrics:
    """Record of a single executed workflow step."""
    # Identity: step within an execution within a workflow.
    step_id: str
    execution_id: str
    workflow_id: str
    node_id: str
    # What the step did and on which UI element.
    action_type: str
    target_element: str
    # Timing — both timestamps are required: a StepMetrics is only created
    # once the step has finished.
    started_at: datetime
    completed_at: datetime
    duration_ms: float
    status: str
    confidence_score: float
    retry_count: int = 0
    error_details: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for storage."""
        payload: Dict[str, Any] = {
            'step_id': self.step_id,
            'execution_id': self.execution_id,
            'workflow_id': self.workflow_id,
            'node_id': self.node_id,
            'action_type': self.action_type,
            'target_element': self.target_element,
            'started_at': self.started_at.isoformat(),
            'completed_at': self.completed_at.isoformat(),
            'duration_ms': self.duration_ms,
            'status': self.status,
            'confidence_score': self.confidence_score,
            'retry_count': self.retry_count,
            'error_details': self.error_details,
        }
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'StepMetrics':
        """Create from dictionary (inverse of :meth:`to_dict`)."""
        begun = datetime.fromisoformat(data['started_at'])
        ended = datetime.fromisoformat(data['completed_at'])
        return cls(
            step_id=data['step_id'],
            execution_id=data['execution_id'],
            workflow_id=data['workflow_id'],
            node_id=data['node_id'],
            action_type=data['action_type'],
            target_element=data['target_element'],
            started_at=begun,
            completed_at=ended,
            duration_ms=data['duration_ms'],
            status=data['status'],
            confidence_score=data['confidence_score'],
            retry_count=data.get('retry_count', 0),
            error_details=data.get('error_details'),
        )
|
||||
|
||||
|
||||
class MetricsCollector:
    """Collects metrics from workflow executions.

    Completed execution/step metrics accumulate in an in-memory buffer and
    are handed to ``storage_callback`` either when the buffer reaches
    ``buffer_size`` or periodically by a background flush thread.
    """

    def __init__(
        self,
        storage_callback: Optional[callable] = None,
        buffer_size: int = 1000,
        flush_interval_sec: float = 5.0
    ):
        """
        Initialize metrics collector.

        Args:
            storage_callback: Callback to persist metrics (receives list of metrics)
            buffer_size: Maximum buffer size before forcing flush
            flush_interval_sec: Interval between automatic flushes
        """
        self.storage_callback = storage_callback
        self.buffer_size = buffer_size
        self.flush_interval = flush_interval_sec

        # The buffer also receives plain dicts (see record_recovery_attempt),
        # so the annotation includes Dict.  Annotations referencing the metric
        # classes are quoted (forward references) so they are not evaluated
        # at definition time.
        self._buffer: "List[Union[ExecutionMetrics, StepMetrics, Dict[str, Any]]]" = []
        self._lock = threading.Lock()
        self._flush_thread: Optional[threading.Thread] = None
        self._running = False

        # Executions that have started but not yet completed, by execution_id.
        self._active_executions: "Dict[str, ExecutionMetrics]" = {}

        logger.info(f"MetricsCollector initialized (buffer_size={buffer_size}, flush_interval={flush_interval_sec}s)")

    def start(self) -> None:
        """Start automatic flushing (idempotent)."""
        if self._running:
            return

        self._running = True
        self._flush_thread = threading.Thread(target=self._auto_flush, daemon=True)
        self._flush_thread.start()
        logger.info("MetricsCollector started")

    def stop(self) -> None:
        """Stop automatic flushing and flush remaining metrics."""
        self._running = False
        if self._flush_thread:
            self._flush_thread.join(timeout=5.0)
        self.flush()
        logger.info("MetricsCollector stopped")

    def record_execution_start(
        self,
        execution_id: str,
        workflow_id: str,
        context: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Record the start of a workflow execution.

        Args:
            execution_id: Unique execution identifier
            workflow_id: Workflow identifier
            context: Additional context information
        """
        metrics = ExecutionMetrics(
            execution_id=execution_id,
            workflow_id=workflow_id,
            started_at=datetime.now(),
            status='running',
            context=context or {}
        )

        with self._lock:
            self._active_executions[execution_id] = metrics

        logger.debug(f"Recorded execution start: {execution_id}")

    def record_execution_complete(
        self,
        execution_id: str,
        status: str,
        steps_total: int = 0,
        steps_completed: int = 0,
        steps_failed: int = 0,
        error_message: Optional[str] = None
    ) -> None:
        """
        Record the completion of a workflow execution.

        Args:
            execution_id: Execution identifier
            status: Final status ('completed' or 'failed')
            steps_total: Total number of steps
            steps_completed: Number of completed steps
            steps_failed: Number of failed steps
            error_message: Error message if failed
        """
        with self._lock:
            if execution_id not in self._active_executions:
                # Unknown id: either never started or already completed.
                logger.warning(f"Execution not found: {execution_id}")
                return

            metrics = self._active_executions[execution_id]
            metrics.completed_at = datetime.now()
            metrics.duration_ms = (metrics.completed_at - metrics.started_at).total_seconds() * 1000
            metrics.status = status
            metrics.steps_total = steps_total
            metrics.steps_completed = steps_completed
            metrics.steps_failed = steps_failed
            metrics.error_message = error_message

            # Move from the active table into the flush buffer.
            self._buffer.append(metrics)
            del self._active_executions[execution_id]

            # Force a flush if the buffer is full.
            if len(self._buffer) >= self.buffer_size:
                self._flush_unlocked()

        logger.debug(f"Recorded execution complete: {execution_id} ({status})")

    def record_step(self, step_metrics: "StepMetrics") -> None:
        """
        Record metrics for a completed step.

        Args:
            step_metrics: Step metrics to record
        """
        with self._lock:
            self._buffer.append(step_metrics)

            # Force a flush if the buffer is full.
            if len(self._buffer) >= self.buffer_size:
                self._flush_unlocked()

        logger.debug(f"Recorded step: {step_metrics.step_id}")

    def flush(self) -> int:
        """
        Flush buffered metrics to storage.

        Returns:
            Number of metrics flushed
        """
        with self._lock:
            return self._flush_unlocked()

    def _flush_unlocked(self) -> int:
        """Flush without acquiring lock (must be called with lock held).

        Returns the number of metrics handed to storage (or discarded when
        no callback is configured); 0 when the callback fails, in which case
        the metrics are restored to the buffer for a later retry.
        """
        if not self._buffer:
            return 0

        if not self.storage_callback:
            logger.warning("No storage callback configured, discarding metrics")
            count = len(self._buffer)
            self._buffer.clear()
            return count

        # Swap the buffer out BEFORE the try block so the except clause can
        # never reference an unbound local if copying itself were to raise.
        metrics_to_flush = self._buffer.copy()
        self._buffer.clear()

        try:
            # NOTE(review): the callback runs while self._lock is held, so a
            # slow storage backend blocks recording threads for the duration.
            self.storage_callback(metrics_to_flush)

            logger.debug(f"Flushed {len(metrics_to_flush)} metrics")
            return len(metrics_to_flush)

        except Exception as e:
            logger.error(f"Error flushing metrics: {e}")
            # Put metrics back in buffer so the next flush retries them.
            self._buffer.extend(metrics_to_flush)
            return 0

    def _auto_flush(self) -> None:
        """Automatic flush loop, run in the background daemon thread."""
        while self._running:
            time.sleep(self.flush_interval)
            # Re-check after sleeping: stop() may have been called meanwhile
            # (stop() performs its own final flush).
            if self._running:
                self.flush()

    def get_active_executions(self) -> "Dict[str, ExecutionMetrics]":
        """Get a snapshot copy of currently active executions."""
        with self._lock:
            return self._active_executions.copy()

    def get_buffer_size(self) -> int:
        """Get current buffer size."""
        with self._lock:
            return len(self._buffer)

    def record_recovery_attempt(
        self,
        workflow_id: str,
        node_id: str,
        failure_reason: str,
        recovery_success: bool,
        strategy_used: Optional[str] = None,
        confidence: float = 0.0
    ) -> None:
        """
        Record a self-healing recovery attempt.

        Args:
            workflow_id: Workflow identifier
            node_id: Node where failure occurred
            failure_reason: Reason for the failure
            recovery_success: Whether recovery was successful
            strategy_used: Strategy used for recovery
            confidence: Confidence score of recovery
        """
        # Recovery attempts are stored as plain dicts (no dedicated dataclass).
        recovery_metrics = {
            'type': 'recovery_attempt',
            'timestamp': datetime.now().isoformat(),
            'workflow_id': workflow_id,
            'node_id': node_id,
            'failure_reason': failure_reason,
            'recovery_success': recovery_success,
            'strategy_used': strategy_used,
            'confidence': confidence
        }

        with self._lock:
            self._buffer.append(recovery_metrics)

            # Force a flush if the buffer is full.
            if len(self._buffer) >= self.buffer_size:
                self._flush_unlocked()

        logger.debug(f"Recorded recovery attempt: {workflow_id}/{node_id} - {'success' if recovery_success else 'failed'}")
|
||||
209
core/analytics/collection/resource_collector.py
Normal file
209
core/analytics/collection/resource_collector.py
Normal file
@@ -0,0 +1,209 @@
|
||||
"""Resource usage collection for analytics."""
|
||||
|
||||
import psutil
|
||||
import threading
|
||||
import time
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Dict, Any, List
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ResourceMetrics:
    """One sample of system resource usage, optionally tied to an execution."""
    timestamp: datetime
    # Execution context this sample belongs to, if any.
    workflow_id: Optional[str] = None
    execution_id: Optional[str] = None
    # Resource readings; all default to 0.0 when unavailable.
    cpu_percent: float = 0.0
    memory_mb: float = 0.0
    gpu_utilization: float = 0.0
    gpu_memory_mb: float = 0.0
    disk_io_mb: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for storage."""
        when = self.timestamp.isoformat()
        return {
            'timestamp': when,
            'workflow_id': self.workflow_id,
            'execution_id': self.execution_id,
            'cpu_percent': self.cpu_percent,
            'memory_mb': self.memory_mb,
            'gpu_utilization': self.gpu_utilization,
            'gpu_memory_mb': self.gpu_memory_mb,
            'disk_io_mb': self.disk_io_mb,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ResourceMetrics':
        """Create from dictionary (inverse of :meth:`to_dict`)."""
        # Numeric readings share the same 0.0 fallback.
        readings = {
            key: data.get(key, 0.0)
            for key in ('cpu_percent', 'memory_mb', 'gpu_utilization',
                        'gpu_memory_mb', 'disk_io_mb')
        }
        return cls(
            timestamp=datetime.fromisoformat(data['timestamp']),
            workflow_id=data.get('workflow_id'),
            execution_id=data.get('execution_id'),
            **readings,
        )
|
||||
|
||||
|
||||
class ResourceCollector:
    """Collects system resource usage metrics.

    A background thread samples CPU/memory/disk (and GPU when pynvml is
    available) for the current process at a fixed interval and forwards
    each sample to ``storage_callback``.
    """

    def __init__(
        self,
        storage_callback: Optional[callable] = None,
        sample_interval_sec: float = 1.0
    ):
        """
        Initialize resource collector.

        Args:
            storage_callback: Callback to persist metrics
            sample_interval_sec: Interval between samples
        """
        self.storage_callback = storage_callback
        self.sample_interval = sample_interval_sec

        self._running = False
        self._thread: Optional[threading.Thread] = None
        # Execution context attached to every sample; guarded by
        # _context_lock because the sampling thread reads it while callers
        # update it.
        self._current_context: Dict[str, Optional[str]] = {
            'workflow_id': None,
            'execution_id': None
        }
        self._context_lock = threading.Lock()

        # Handle on the current process for CPU/memory/disk sampling.
        self._process = psutil.Process()
        # Previous io_counters() snapshot, used to compute per-sample deltas.
        self._last_disk_io = None

        # GPU monitoring is optional: enabled only when pynvml imports and
        # device 0 is reachable.
        self._gpu_available = False
        try:
            import pynvml
            pynvml.nvmlInit()
            self._gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            self._gpu_available = True
            logger.info("GPU monitoring enabled")
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # are not swallowed during startup.
            logger.info("GPU monitoring not available")

        logger.info(f"ResourceCollector initialized (sample_interval={sample_interval_sec}s)")

    @property
    def monitoring_active(self) -> bool:
        """Check if resource monitoring is active."""
        return self._running

    def start(self) -> None:
        """Start collecting resource metrics (idempotent)."""
        if self._running:
            return

        self._running = True
        self._thread = threading.Thread(target=self._collect_loop, daemon=True)
        self._thread.start()
        logger.info("ResourceCollector started")

    def stop(self) -> None:
        """Stop collecting resource metrics."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5.0)
        logger.info("ResourceCollector stopped")

    def set_context(
        self,
        workflow_id: Optional[str] = None,
        execution_id: Optional[str] = None
    ) -> None:
        """
        Set current execution context for resource tracking.

        Args:
            workflow_id: Current workflow ID
            execution_id: Current execution ID
        """
        with self._context_lock:
            self._current_context['workflow_id'] = workflow_id
            self._current_context['execution_id'] = execution_id

    def clear_context(self) -> None:
        """Clear execution context."""
        with self._context_lock:
            self._current_context['workflow_id'] = None
            self._current_context['execution_id'] = None

    def get_current_metrics(self) -> "ResourceMetrics":
        """
        Get current resource usage.

        Returns:
            ResourceMetrics with current usage
        """
        with self._context_lock:
            workflow_id = self._current_context['workflow_id']
            execution_id = self._current_context['execution_id']

        # CPU usage (short blocking sample).
        cpu_percent = self._process.cpu_percent(interval=0.1)

        # Memory usage (resident set size, in MB).
        memory_info = self._process.memory_info()
        memory_mb = memory_info.rss / (1024 * 1024)

        # Disk I/O delta since the previous sample; first sample reports 0.
        disk_io_mb = 0.0
        try:
            disk_io = self._process.io_counters()
            if self._last_disk_io:
                bytes_read = disk_io.read_bytes - self._last_disk_io.read_bytes
                bytes_written = disk_io.write_bytes - self._last_disk_io.write_bytes
                disk_io_mb = (bytes_read + bytes_written) / (1024 * 1024)
            self._last_disk_io = disk_io
        except Exception as e:
            # io_counters() is unavailable on some platforms; best-effort.
            # Narrowed from a bare `except:` so interrupts still propagate.
            logger.debug(f"Disk I/O sampling unavailable: {e}")

        # GPU usage, only when NVML initialized successfully in __init__.
        gpu_utilization = 0.0
        gpu_memory_mb = 0.0
        if self._gpu_available:
            try:
                import pynvml
                util = pynvml.nvmlDeviceGetUtilizationRates(self._gpu_handle)
                gpu_utilization = float(util.gpu)

                mem_info = pynvml.nvmlDeviceGetMemoryInfo(self._gpu_handle)
                gpu_memory_mb = mem_info.used / (1024 * 1024)
            except Exception as e:
                # GPU may disappear or NVML may fail transiently; best-effort.
                logger.debug(f"GPU sampling failed: {e}")

        return ResourceMetrics(
            timestamp=datetime.now(),
            workflow_id=workflow_id,
            execution_id=execution_id,
            cpu_percent=cpu_percent,
            memory_mb=memory_mb,
            gpu_utilization=gpu_utilization,
            gpu_memory_mb=gpu_memory_mb,
            disk_io_mb=disk_io_mb
        )

    def _collect_loop(self) -> None:
        """Collection loop running in background thread."""
        while self._running:
            try:
                metrics = self.get_current_metrics()

                # Persist if callback is configured.
                if self.storage_callback:
                    self.storage_callback([metrics])

            except Exception as e:
                logger.error(f"Error collecting resource metrics: {e}")

            time.sleep(self.sample_interval)
|
||||
Reference in New Issue
Block a user