v1.0 - Stable version: multi-PC, UI-DETR-1 detection, 3 execution modes

- Frontend v4 reachable on the local network (192.168.1.40)
- Open ports: 3002 (frontend), 5001 (backend), 5004 (dashboard)
- Ollama GPU functional
- Interactive self-healing
- Confidence dashboard

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Dom
2026-01-29 11:23:51 +01:00
parent 21bfa3b337
commit a27b74cf22
1595 changed files with 412691 additions and 400 deletions

@@ -0,0 +1,12 @@
"""Data collection components for analytics."""
from .metrics_collector import MetricsCollector, ExecutionMetrics, StepMetrics
from .resource_collector import ResourceCollector, ResourceMetrics
__all__ = [
'MetricsCollector',
'ExecutionMetrics',
'StepMetrics',
'ResourceCollector',
'ResourceMetrics',
]
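
For orientation, a minimal sketch of consuming these exports; the import path analytics.collection is an assumption, since file paths are not shown in this view:

# Hypothetical package path: the diff does not display file names.
from analytics.collection import MetricsCollector, ResourceCollector

metrics = MetricsCollector()     # buffers execution/step metrics
resources = ResourceCollector()  # samples CPU/RAM/GPU in a background thread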

@@ -0,0 +1,348 @@
"""Metrics collection for workflow executions."""
import threading
import time
import logging
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Union, Callable
from datetime import datetime
from pathlib import Path
logger = logging.getLogger(__name__)
@dataclass
class ExecutionMetrics:
"""Metrics for a workflow execution."""
execution_id: str
workflow_id: str
started_at: datetime
completed_at: Optional[datetime] = None
duration_ms: Optional[float] = None
status: str = 'running' # 'running', 'completed', 'failed'
steps_total: int = 0
steps_completed: int = 0
steps_failed: int = 0
error_message: Optional[str] = None
context: Dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for storage."""
return {
'execution_id': self.execution_id,
'workflow_id': self.workflow_id,
'started_at': self.started_at.isoformat(),
'completed_at': self.completed_at.isoformat() if self.completed_at else None,
'duration_ms': self.duration_ms,
'status': self.status,
'steps_total': self.steps_total,
'steps_completed': self.steps_completed,
'steps_failed': self.steps_failed,
'error_message': self.error_message,
'context': self.context
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'ExecutionMetrics':
"""Create from dictionary."""
return cls(
execution_id=data['execution_id'],
workflow_id=data['workflow_id'],
started_at=datetime.fromisoformat(data['started_at']),
completed_at=datetime.fromisoformat(data['completed_at']) if data.get('completed_at') else None,
duration_ms=data.get('duration_ms'),
status=data.get('status', 'running'),
steps_total=data.get('steps_total', 0),
steps_completed=data.get('steps_completed', 0),
steps_failed=data.get('steps_failed', 0),
error_message=data.get('error_message'),
context=data.get('context', {})
)
@dataclass
class StepMetrics:
"""Metrics for a workflow step."""
step_id: str
execution_id: str
workflow_id: str
node_id: str
action_type: str
target_element: str
started_at: datetime
completed_at: datetime
duration_ms: float
status: str
confidence_score: float
retry_count: int = 0
error_details: Optional[str] = None
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for storage."""
return {
'step_id': self.step_id,
'execution_id': self.execution_id,
'workflow_id': self.workflow_id,
'node_id': self.node_id,
'action_type': self.action_type,
'target_element': self.target_element,
'started_at': self.started_at.isoformat(),
'completed_at': self.completed_at.isoformat(),
'duration_ms': self.duration_ms,
'status': self.status,
'confidence_score': self.confidence_score,
'retry_count': self.retry_count,
'error_details': self.error_details
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'StepMetrics':
"""Create from dictionary."""
return cls(
step_id=data['step_id'],
execution_id=data['execution_id'],
workflow_id=data['workflow_id'],
node_id=data['node_id'],
action_type=data['action_type'],
target_element=data['target_element'],
started_at=datetime.fromisoformat(data['started_at']),
completed_at=datetime.fromisoformat(data['completed_at']),
duration_ms=data['duration_ms'],
status=data['status'],
confidence_score=data['confidence_score'],
retry_count=data.get('retry_count', 0),
error_details=data.get('error_details')
)
class MetricsCollector:
"""Collects metrics from workflow executions."""
def __init__(
self,
        storage_callback: Optional[Callable] = None,
buffer_size: int = 1000,
flush_interval_sec: float = 5.0
):
"""
Initialize metrics collector.
Args:
storage_callback: Callback to persist metrics (receives list of metrics)
buffer_size: Maximum buffer size before forcing flush
flush_interval_sec: Interval between automatic flushes
"""
self.storage_callback = storage_callback
self.buffer_size = buffer_size
self.flush_interval = flush_interval_sec
        self._buffer: List[Union[ExecutionMetrics, StepMetrics, Dict[str, Any]]] = []  # dicts hold recovery attempts
self._lock = threading.Lock()
self._flush_thread: Optional[threading.Thread] = None
self._running = False
# Track active executions
self._active_executions: Dict[str, ExecutionMetrics] = {}
logger.info(f"MetricsCollector initialized (buffer_size={buffer_size}, flush_interval={flush_interval_sec}s)")
def start(self) -> None:
"""Start automatic flushing."""
if self._running:
return
self._running = True
self._flush_thread = threading.Thread(target=self._auto_flush, daemon=True)
self._flush_thread.start()
logger.info("MetricsCollector started")
def stop(self) -> None:
"""Stop automatic flushing and flush remaining metrics."""
self._running = False
if self._flush_thread:
self._flush_thread.join(timeout=5.0)
self.flush()
logger.info("MetricsCollector stopped")
def record_execution_start(
self,
execution_id: str,
workflow_id: str,
context: Optional[Dict[str, Any]] = None
) -> None:
"""
Record the start of a workflow execution.
Args:
execution_id: Unique execution identifier
workflow_id: Workflow identifier
context: Additional context information
"""
metrics = ExecutionMetrics(
execution_id=execution_id,
workflow_id=workflow_id,
started_at=datetime.now(),
status='running',
context=context or {}
)
with self._lock:
self._active_executions[execution_id] = metrics
logger.debug(f"Recorded execution start: {execution_id}")
def record_execution_complete(
self,
execution_id: str,
status: str,
steps_total: int = 0,
steps_completed: int = 0,
steps_failed: int = 0,
error_message: Optional[str] = None
) -> None:
"""
Record the completion of a workflow execution.
Args:
execution_id: Execution identifier
status: Final status ('completed' or 'failed')
steps_total: Total number of steps
steps_completed: Number of completed steps
steps_failed: Number of failed steps
error_message: Error message if failed
"""
with self._lock:
if execution_id not in self._active_executions:
logger.warning(f"Execution not found: {execution_id}")
return
metrics = self._active_executions[execution_id]
metrics.completed_at = datetime.now()
metrics.duration_ms = (metrics.completed_at - metrics.started_at).total_seconds() * 1000
metrics.status = status
metrics.steps_total = steps_total
metrics.steps_completed = steps_completed
metrics.steps_failed = steps_failed
metrics.error_message = error_message
# Move to buffer
self._buffer.append(metrics)
del self._active_executions[execution_id]
# Check if buffer is full
if len(self._buffer) >= self.buffer_size:
self._flush_unlocked()
logger.debug(f"Recorded execution complete: {execution_id} ({status})")
def record_step(self, step_metrics: StepMetrics) -> None:
"""
Record metrics for a completed step.
Args:
step_metrics: Step metrics to record
"""
with self._lock:
self._buffer.append(step_metrics)
# Check if buffer is full
if len(self._buffer) >= self.buffer_size:
self._flush_unlocked()
logger.debug(f"Recorded step: {step_metrics.step_id}")
def flush(self) -> int:
"""
Flush buffered metrics to storage.
Returns:
Number of metrics flushed
"""
with self._lock:
return self._flush_unlocked()
def _flush_unlocked(self) -> int:
"""Flush without acquiring lock (must be called with lock held)."""
if not self._buffer:
return 0
if not self.storage_callback:
logger.warning("No storage callback configured, discarding metrics")
count = len(self._buffer)
self._buffer.clear()
return count
        # Copy and clear the buffer up front so a failing callback can
        # restore the metrics without losing entries added in between.
        metrics_to_flush = self._buffer.copy()
        self._buffer.clear()
        try:
            # Persist (note: the lock is held here, so callbacks should be fast)
            self.storage_callback(metrics_to_flush)
            logger.debug(f"Flushed {len(metrics_to_flush)} metrics")
            return len(metrics_to_flush)
        except Exception as e:
            logger.error(f"Error flushing metrics: {e}")
            # Put metrics back in the buffer to retry on the next flush
            self._buffer.extend(metrics_to_flush)
            return 0
def _auto_flush(self) -> None:
"""Automatic flush thread."""
while self._running:
time.sleep(self.flush_interval)
if self._running:
self.flush()
def get_active_executions(self) -> Dict[str, ExecutionMetrics]:
"""Get currently active executions."""
with self._lock:
return self._active_executions.copy()
def get_buffer_size(self) -> int:
"""Get current buffer size."""
with self._lock:
return len(self._buffer)
def record_recovery_attempt(
self,
workflow_id: str,
node_id: str,
failure_reason: str,
recovery_success: bool,
strategy_used: Optional[str] = None,
confidence: float = 0.0
) -> None:
"""
Record a self-healing recovery attempt.
Args:
workflow_id: Workflow identifier
node_id: Node where failure occurred
failure_reason: Reason for the failure
recovery_success: Whether recovery was successful
strategy_used: Strategy used for recovery
confidence: Confidence score of recovery
"""
# Create a custom metrics entry for recovery
recovery_metrics = {
'type': 'recovery_attempt',
'timestamp': datetime.now().isoformat(),
'workflow_id': workflow_id,
'node_id': node_id,
'failure_reason': failure_reason,
'recovery_success': recovery_success,
'strategy_used': strategy_used,
'confidence': confidence
}
with self._lock:
self._buffer.append(recovery_metrics)
# Check if buffer is full
if len(self._buffer) >= self.buffer_size:
self._flush_unlocked()
logger.debug(f"Recorded recovery attempt: {workflow_id}/{node_id} - {'success' if recovery_success else 'failed'}")

@@ -0,0 +1,209 @@
"""Resource usage collection for analytics."""
import psutil
import threading
import time
import logging
from dataclasses import dataclass
from typing import Optional, Dict, Any, List, Callable
from datetime import datetime
logger = logging.getLogger(__name__)
@dataclass
class ResourceMetrics:
"""System resource usage metrics."""
timestamp: datetime
workflow_id: Optional[str] = None
execution_id: Optional[str] = None
cpu_percent: float = 0.0
memory_mb: float = 0.0
gpu_utilization: float = 0.0
gpu_memory_mb: float = 0.0
disk_io_mb: float = 0.0
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for storage."""
return {
'timestamp': self.timestamp.isoformat(),
'workflow_id': self.workflow_id,
'execution_id': self.execution_id,
'cpu_percent': self.cpu_percent,
'memory_mb': self.memory_mb,
'gpu_utilization': self.gpu_utilization,
'gpu_memory_mb': self.gpu_memory_mb,
'disk_io_mb': self.disk_io_mb
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'ResourceMetrics':
"""Create from dictionary."""
return cls(
timestamp=datetime.fromisoformat(data['timestamp']),
workflow_id=data.get('workflow_id'),
execution_id=data.get('execution_id'),
cpu_percent=data.get('cpu_percent', 0.0),
memory_mb=data.get('memory_mb', 0.0),
gpu_utilization=data.get('gpu_utilization', 0.0),
gpu_memory_mb=data.get('gpu_memory_mb', 0.0),
disk_io_mb=data.get('disk_io_mb', 0.0)
)
class ResourceCollector:
"""Collects system resource usage metrics."""
def __init__(
self,
        storage_callback: Optional[Callable] = None,
sample_interval_sec: float = 1.0
):
"""
Initialize resource collector.
Args:
storage_callback: Callback to persist metrics
sample_interval_sec: Interval between samples
"""
self.storage_callback = storage_callback
self.sample_interval = sample_interval_sec
self._running = False
self._thread: Optional[threading.Thread] = None
self._current_context: Dict[str, Optional[str]] = {
'workflow_id': None,
'execution_id': None
}
self._context_lock = threading.Lock()
        # Handle to the current process for per-process metrics
self._process = psutil.Process()
self._last_disk_io = None
# Try to import GPU monitoring
self._gpu_available = False
try:
import pynvml
pynvml.nvmlInit()
self._gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
self._gpu_available = True
logger.info("GPU monitoring enabled")
        except Exception:
logger.info("GPU monitoring not available")
logger.info(f"ResourceCollector initialized (sample_interval={sample_interval_sec}s)")
@property
def monitoring_active(self) -> bool:
"""Check if resource monitoring is active."""
return self._running
def start(self) -> None:
"""Start collecting resource metrics."""
if self._running:
return
self._running = True
self._thread = threading.Thread(target=self._collect_loop, daemon=True)
self._thread.start()
logger.info("ResourceCollector started")
def stop(self) -> None:
"""Stop collecting resource metrics."""
self._running = False
if self._thread:
self._thread.join(timeout=5.0)
logger.info("ResourceCollector stopped")
def set_context(
self,
workflow_id: Optional[str] = None,
execution_id: Optional[str] = None
) -> None:
"""
Set current execution context for resource tracking.
Args:
workflow_id: Current workflow ID
execution_id: Current execution ID
"""
with self._context_lock:
self._current_context['workflow_id'] = workflow_id
self._current_context['execution_id'] = execution_id
def clear_context(self) -> None:
"""Clear execution context."""
with self._context_lock:
self._current_context['workflow_id'] = None
self._current_context['execution_id'] = None
def get_current_metrics(self) -> ResourceMetrics:
"""
Get current resource usage.
Returns:
ResourceMetrics with current usage
"""
with self._context_lock:
workflow_id = self._current_context['workflow_id']
execution_id = self._current_context['execution_id']
# CPU usage
cpu_percent = self._process.cpu_percent(interval=0.1)
# Memory usage
memory_info = self._process.memory_info()
memory_mb = memory_info.rss / (1024 * 1024)
# Disk I/O
disk_io_mb = 0.0
try:
disk_io = self._process.io_counters()
if self._last_disk_io:
bytes_read = disk_io.read_bytes - self._last_disk_io.read_bytes
bytes_written = disk_io.write_bytes - self._last_disk_io.write_bytes
disk_io_mb = (bytes_read + bytes_written) / (1024 * 1024)
self._last_disk_io = disk_io
        except Exception:
            # io_counters() is unavailable on some platforms
            pass
# GPU usage
gpu_utilization = 0.0
gpu_memory_mb = 0.0
if self._gpu_available:
try:
import pynvml
util = pynvml.nvmlDeviceGetUtilizationRates(self._gpu_handle)
gpu_utilization = float(util.gpu)
mem_info = pynvml.nvmlDeviceGetMemoryInfo(self._gpu_handle)
gpu_memory_mb = mem_info.used / (1024 * 1024)
            except Exception:
                # NVML queries can fail transiently; fall back to zeros
                pass
return ResourceMetrics(
timestamp=datetime.now(),
workflow_id=workflow_id,
execution_id=execution_id,
cpu_percent=cpu_percent,
memory_mb=memory_mb,
gpu_utilization=gpu_utilization,
gpu_memory_mb=gpu_memory_mb,
disk_io_mb=disk_io_mb
)
def _collect_loop(self) -> None:
"""Collection loop running in background thread."""
while self._running:
try:
metrics = self.get_current_metrics()
# Persist if callback is configured
if self.storage_callback:
self.storage_callback([metrics])
except Exception as e:
logger.error(f"Error collecting resource metrics: {e}")
time.sleep(self.sample_interval)
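
And a corresponding sketch for wiring the resource collector into an execution, again with an illustrative list-based callback (requires psutil; GPU fields stay at 0.0 without pynvml):

import time

samples = []
collector = ResourceCollector(
    storage_callback=lambda batch: samples.extend(m.to_dict() for m in batch),
    sample_interval_sec=0.5,
)
collector.start()
collector.set_context(workflow_id='wf-demo', execution_id='exec-1')
time.sleep(2)  # let the background thread take a few samples
collector.clear_context()
collector.stop()
print(f"collected {len(samples)} samples")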