v1.0 - Version stable: multi-PC, détection UI-DETR-1, 3 modes exécution
- Frontend v4 accessible sur réseau local (192.168.1.40) - Ports ouverts: 3002 (frontend), 5001 (backend), 5004 (dashboard) - Ollama GPU fonctionnel - Self-healing interactif - Dashboard confiance Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
12
core/analytics/collection/__init__.py
Normal file
12
core/analytics/collection/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Data collection components for analytics."""
|
||||
|
||||
from .metrics_collector import MetricsCollector, ExecutionMetrics, StepMetrics
|
||||
from .resource_collector import ResourceCollector, ResourceMetrics
|
||||
|
||||
__all__ = [
|
||||
'MetricsCollector',
|
||||
'ExecutionMetrics',
|
||||
'StepMetrics',
|
||||
'ResourceCollector',
|
||||
'ResourceMetrics',
|
||||
]
|
||||
348
core/analytics/collection/metrics_collector.py
Normal file
348
core/analytics/collection/metrics_collector.py
Normal file
@@ -0,0 +1,348 @@
|
||||
"""Metrics collection for workflow executions."""
|
||||
|
||||
import threading
|
||||
import time
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Any, Optional, Union
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ExecutionMetrics:
    """Record of one workflow execution, from start to completion."""
    # Identity: which execution of which workflow this record belongs to.
    execution_id: str
    workflow_id: str
    # Timing: completed_at / duration_ms remain None while still running.
    started_at: datetime
    completed_at: Optional[datetime] = None
    duration_ms: Optional[float] = None
    status: str = 'running'  # 'running', 'completed', 'failed'
    # Step counters, filled in when the execution finishes.
    steps_total: int = 0
    steps_completed: int = 0
    steps_failed: int = 0
    error_message: Optional[str] = None
    # Free-form context supplied by the caller at execution start.
    context: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for storage."""
        finished = self.completed_at
        return {
            'execution_id': self.execution_id,
            'workflow_id': self.workflow_id,
            'started_at': self.started_at.isoformat(),
            'completed_at': None if finished is None else finished.isoformat(),
            'duration_ms': self.duration_ms,
            'status': self.status,
            'steps_total': self.steps_total,
            'steps_completed': self.steps_completed,
            'steps_failed': self.steps_failed,
            'error_message': self.error_message,
            'context': self.context,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ExecutionMetrics':
        """Create from dictionary (inverse of :meth:`to_dict`)."""
        raw_completed = data.get('completed_at')
        return cls(
            execution_id=data['execution_id'],
            workflow_id=data['workflow_id'],
            started_at=datetime.fromisoformat(data['started_at']),
            completed_at=datetime.fromisoformat(raw_completed) if raw_completed else None,
            duration_ms=data.get('duration_ms'),
            status=data.get('status', 'running'),
            steps_total=data.get('steps_total', 0),
            steps_completed=data.get('steps_completed', 0),
            steps_failed=data.get('steps_failed', 0),
            error_message=data.get('error_message'),
            context=data.get('context', {}),
        )
|
||||
|
||||
|
||||
@dataclass
class StepMetrics:
    """Record of a single executed workflow step."""
    # Identity: step within an execution within a workflow.
    step_id: str
    execution_id: str
    workflow_id: str
    node_id: str
    # What the step did and on which UI element.
    action_type: str
    target_element: str
    # Timing — both timestamps are required: a StepMetrics is only created
    # once the step has finished.
    started_at: datetime
    completed_at: datetime
    duration_ms: float
    status: str
    confidence_score: float
    retry_count: int = 0
    error_details: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for storage."""
        payload: Dict[str, Any] = {
            'step_id': self.step_id,
            'execution_id': self.execution_id,
            'workflow_id': self.workflow_id,
            'node_id': self.node_id,
            'action_type': self.action_type,
            'target_element': self.target_element,
            'started_at': self.started_at.isoformat(),
            'completed_at': self.completed_at.isoformat(),
            'duration_ms': self.duration_ms,
            'status': self.status,
            'confidence_score': self.confidence_score,
            'retry_count': self.retry_count,
            'error_details': self.error_details,
        }
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'StepMetrics':
        """Create from dictionary (inverse of :meth:`to_dict`)."""
        begun = datetime.fromisoformat(data['started_at'])
        ended = datetime.fromisoformat(data['completed_at'])
        return cls(
            step_id=data['step_id'],
            execution_id=data['execution_id'],
            workflow_id=data['workflow_id'],
            node_id=data['node_id'],
            action_type=data['action_type'],
            target_element=data['target_element'],
            started_at=begun,
            completed_at=ended,
            duration_ms=data['duration_ms'],
            status=data['status'],
            confidence_score=data['confidence_score'],
            retry_count=data.get('retry_count', 0),
            error_details=data.get('error_details'),
        )
|
||||
|
||||
|
||||
class MetricsCollector:
    """Collects metrics from workflow executions.

    Completed execution/step metrics accumulate in an in-memory buffer and
    are handed to ``storage_callback`` either when the buffer reaches
    ``buffer_size`` or periodically by a background flush thread.
    """

    def __init__(
        self,
        storage_callback: Optional[callable] = None,
        buffer_size: int = 1000,
        flush_interval_sec: float = 5.0
    ):
        """
        Initialize metrics collector.

        Args:
            storage_callback: Callback to persist metrics (receives list of metrics)
            buffer_size: Maximum buffer size before forcing flush
            flush_interval_sec: Interval between automatic flushes
        """
        self.storage_callback = storage_callback
        self.buffer_size = buffer_size
        self.flush_interval = flush_interval_sec

        # The buffer also receives plain dicts (see record_recovery_attempt),
        # so the annotation includes Dict.  Annotations referencing the metric
        # classes are quoted (forward references) so they are not evaluated
        # at definition time.
        self._buffer: "List[Union[ExecutionMetrics, StepMetrics, Dict[str, Any]]]" = []
        self._lock = threading.Lock()
        self._flush_thread: Optional[threading.Thread] = None
        self._running = False

        # Executions that have started but not yet completed, by execution_id.
        self._active_executions: "Dict[str, ExecutionMetrics]" = {}

        logger.info(f"MetricsCollector initialized (buffer_size={buffer_size}, flush_interval={flush_interval_sec}s)")

    def start(self) -> None:
        """Start automatic flushing (idempotent)."""
        if self._running:
            return

        self._running = True
        self._flush_thread = threading.Thread(target=self._auto_flush, daemon=True)
        self._flush_thread.start()
        logger.info("MetricsCollector started")

    def stop(self) -> None:
        """Stop automatic flushing and flush remaining metrics."""
        self._running = False
        if self._flush_thread:
            self._flush_thread.join(timeout=5.0)
        self.flush()
        logger.info("MetricsCollector stopped")

    def record_execution_start(
        self,
        execution_id: str,
        workflow_id: str,
        context: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Record the start of a workflow execution.

        Args:
            execution_id: Unique execution identifier
            workflow_id: Workflow identifier
            context: Additional context information
        """
        metrics = ExecutionMetrics(
            execution_id=execution_id,
            workflow_id=workflow_id,
            started_at=datetime.now(),
            status='running',
            context=context or {}
        )

        with self._lock:
            self._active_executions[execution_id] = metrics

        logger.debug(f"Recorded execution start: {execution_id}")

    def record_execution_complete(
        self,
        execution_id: str,
        status: str,
        steps_total: int = 0,
        steps_completed: int = 0,
        steps_failed: int = 0,
        error_message: Optional[str] = None
    ) -> None:
        """
        Record the completion of a workflow execution.

        Args:
            execution_id: Execution identifier
            status: Final status ('completed' or 'failed')
            steps_total: Total number of steps
            steps_completed: Number of completed steps
            steps_failed: Number of failed steps
            error_message: Error message if failed
        """
        with self._lock:
            if execution_id not in self._active_executions:
                # Unknown id: either never started or already completed.
                logger.warning(f"Execution not found: {execution_id}")
                return

            metrics = self._active_executions[execution_id]
            metrics.completed_at = datetime.now()
            metrics.duration_ms = (metrics.completed_at - metrics.started_at).total_seconds() * 1000
            metrics.status = status
            metrics.steps_total = steps_total
            metrics.steps_completed = steps_completed
            metrics.steps_failed = steps_failed
            metrics.error_message = error_message

            # Move from the active table into the flush buffer.
            self._buffer.append(metrics)
            del self._active_executions[execution_id]

            # Force a flush if the buffer is full.
            if len(self._buffer) >= self.buffer_size:
                self._flush_unlocked()

        logger.debug(f"Recorded execution complete: {execution_id} ({status})")

    def record_step(self, step_metrics: "StepMetrics") -> None:
        """
        Record metrics for a completed step.

        Args:
            step_metrics: Step metrics to record
        """
        with self._lock:
            self._buffer.append(step_metrics)

            # Force a flush if the buffer is full.
            if len(self._buffer) >= self.buffer_size:
                self._flush_unlocked()

        logger.debug(f"Recorded step: {step_metrics.step_id}")

    def flush(self) -> int:
        """
        Flush buffered metrics to storage.

        Returns:
            Number of metrics flushed
        """
        with self._lock:
            return self._flush_unlocked()

    def _flush_unlocked(self) -> int:
        """Flush without acquiring lock (must be called with lock held).

        Returns the number of metrics handed to storage (or discarded when
        no callback is configured); 0 when the callback fails, in which case
        the metrics are restored to the buffer for a later retry.
        """
        if not self._buffer:
            return 0

        if not self.storage_callback:
            logger.warning("No storage callback configured, discarding metrics")
            count = len(self._buffer)
            self._buffer.clear()
            return count

        # Swap the buffer out BEFORE the try block so the except clause can
        # never reference an unbound local if copying itself were to raise.
        metrics_to_flush = self._buffer.copy()
        self._buffer.clear()

        try:
            # NOTE(review): the callback runs while self._lock is held, so a
            # slow storage backend blocks recording threads for the duration.
            self.storage_callback(metrics_to_flush)

            logger.debug(f"Flushed {len(metrics_to_flush)} metrics")
            return len(metrics_to_flush)

        except Exception as e:
            logger.error(f"Error flushing metrics: {e}")
            # Put metrics back in buffer so the next flush retries them.
            self._buffer.extend(metrics_to_flush)
            return 0

    def _auto_flush(self) -> None:
        """Automatic flush loop, run in the background daemon thread."""
        while self._running:
            time.sleep(self.flush_interval)
            # Re-check after sleeping: stop() may have been called meanwhile
            # (stop() performs its own final flush).
            if self._running:
                self.flush()

    def get_active_executions(self) -> "Dict[str, ExecutionMetrics]":
        """Get a snapshot copy of currently active executions."""
        with self._lock:
            return self._active_executions.copy()

    def get_buffer_size(self) -> int:
        """Get current buffer size."""
        with self._lock:
            return len(self._buffer)

    def record_recovery_attempt(
        self,
        workflow_id: str,
        node_id: str,
        failure_reason: str,
        recovery_success: bool,
        strategy_used: Optional[str] = None,
        confidence: float = 0.0
    ) -> None:
        """
        Record a self-healing recovery attempt.

        Args:
            workflow_id: Workflow identifier
            node_id: Node where failure occurred
            failure_reason: Reason for the failure
            recovery_success: Whether recovery was successful
            strategy_used: Strategy used for recovery
            confidence: Confidence score of recovery
        """
        # Recovery attempts are stored as plain dicts (no dedicated dataclass).
        recovery_metrics = {
            'type': 'recovery_attempt',
            'timestamp': datetime.now().isoformat(),
            'workflow_id': workflow_id,
            'node_id': node_id,
            'failure_reason': failure_reason,
            'recovery_success': recovery_success,
            'strategy_used': strategy_used,
            'confidence': confidence
        }

        with self._lock:
            self._buffer.append(recovery_metrics)

            # Force a flush if the buffer is full.
            if len(self._buffer) >= self.buffer_size:
                self._flush_unlocked()

        logger.debug(f"Recorded recovery attempt: {workflow_id}/{node_id} - {'success' if recovery_success else 'failed'}")
|
||||
209
core/analytics/collection/resource_collector.py
Normal file
209
core/analytics/collection/resource_collector.py
Normal file
@@ -0,0 +1,209 @@
|
||||
"""Resource usage collection for analytics."""
|
||||
|
||||
import psutil
|
||||
import threading
|
||||
import time
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Dict, Any, List
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ResourceMetrics:
    """One sample of system resource usage, optionally tied to an execution."""
    timestamp: datetime
    # Execution context this sample belongs to, if any.
    workflow_id: Optional[str] = None
    execution_id: Optional[str] = None
    # Resource readings; all default to 0.0 when unavailable.
    cpu_percent: float = 0.0
    memory_mb: float = 0.0
    gpu_utilization: float = 0.0
    gpu_memory_mb: float = 0.0
    disk_io_mb: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for storage."""
        when = self.timestamp.isoformat()
        return {
            'timestamp': when,
            'workflow_id': self.workflow_id,
            'execution_id': self.execution_id,
            'cpu_percent': self.cpu_percent,
            'memory_mb': self.memory_mb,
            'gpu_utilization': self.gpu_utilization,
            'gpu_memory_mb': self.gpu_memory_mb,
            'disk_io_mb': self.disk_io_mb,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ResourceMetrics':
        """Create from dictionary (inverse of :meth:`to_dict`)."""
        # Numeric readings share the same 0.0 fallback.
        readings = {
            key: data.get(key, 0.0)
            for key in ('cpu_percent', 'memory_mb', 'gpu_utilization',
                        'gpu_memory_mb', 'disk_io_mb')
        }
        return cls(
            timestamp=datetime.fromisoformat(data['timestamp']),
            workflow_id=data.get('workflow_id'),
            execution_id=data.get('execution_id'),
            **readings,
        )
|
||||
|
||||
|
||||
class ResourceCollector:
    """Collects system resource usage metrics.

    A background thread samples CPU/memory/disk (and GPU when pynvml is
    available) for the current process at a fixed interval and forwards
    each sample to ``storage_callback``.
    """

    def __init__(
        self,
        storage_callback: Optional[callable] = None,
        sample_interval_sec: float = 1.0
    ):
        """
        Initialize resource collector.

        Args:
            storage_callback: Callback to persist metrics
            sample_interval_sec: Interval between samples
        """
        self.storage_callback = storage_callback
        self.sample_interval = sample_interval_sec

        self._running = False
        self._thread: Optional[threading.Thread] = None
        # Execution context attached to every sample; guarded by
        # _context_lock because the sampling thread reads it while callers
        # update it.
        self._current_context: Dict[str, Optional[str]] = {
            'workflow_id': None,
            'execution_id': None
        }
        self._context_lock = threading.Lock()

        # Handle on the current process for CPU/memory/disk sampling.
        self._process = psutil.Process()
        # Previous io_counters() snapshot, used to compute per-sample deltas.
        self._last_disk_io = None

        # GPU monitoring is optional: enabled only when pynvml imports and
        # device 0 is reachable.
        self._gpu_available = False
        try:
            import pynvml
            pynvml.nvmlInit()
            self._gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            self._gpu_available = True
            logger.info("GPU monitoring enabled")
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # are not swallowed during startup.
            logger.info("GPU monitoring not available")

        logger.info(f"ResourceCollector initialized (sample_interval={sample_interval_sec}s)")

    @property
    def monitoring_active(self) -> bool:
        """Check if resource monitoring is active."""
        return self._running

    def start(self) -> None:
        """Start collecting resource metrics (idempotent)."""
        if self._running:
            return

        self._running = True
        self._thread = threading.Thread(target=self._collect_loop, daemon=True)
        self._thread.start()
        logger.info("ResourceCollector started")

    def stop(self) -> None:
        """Stop collecting resource metrics."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5.0)
        logger.info("ResourceCollector stopped")

    def set_context(
        self,
        workflow_id: Optional[str] = None,
        execution_id: Optional[str] = None
    ) -> None:
        """
        Set current execution context for resource tracking.

        Args:
            workflow_id: Current workflow ID
            execution_id: Current execution ID
        """
        with self._context_lock:
            self._current_context['workflow_id'] = workflow_id
            self._current_context['execution_id'] = execution_id

    def clear_context(self) -> None:
        """Clear execution context."""
        with self._context_lock:
            self._current_context['workflow_id'] = None
            self._current_context['execution_id'] = None

    def get_current_metrics(self) -> "ResourceMetrics":
        """
        Get current resource usage.

        Returns:
            ResourceMetrics with current usage
        """
        with self._context_lock:
            workflow_id = self._current_context['workflow_id']
            execution_id = self._current_context['execution_id']

        # CPU usage (short blocking sample).
        cpu_percent = self._process.cpu_percent(interval=0.1)

        # Memory usage (resident set size, in MB).
        memory_info = self._process.memory_info()
        memory_mb = memory_info.rss / (1024 * 1024)

        # Disk I/O delta since the previous sample; first sample reports 0.
        disk_io_mb = 0.0
        try:
            disk_io = self._process.io_counters()
            if self._last_disk_io:
                bytes_read = disk_io.read_bytes - self._last_disk_io.read_bytes
                bytes_written = disk_io.write_bytes - self._last_disk_io.write_bytes
                disk_io_mb = (bytes_read + bytes_written) / (1024 * 1024)
            self._last_disk_io = disk_io
        except Exception as e:
            # io_counters() is unavailable on some platforms; best-effort.
            # Narrowed from a bare `except:` so interrupts still propagate.
            logger.debug(f"Disk I/O sampling unavailable: {e}")

        # GPU usage, only when NVML initialized successfully in __init__.
        gpu_utilization = 0.0
        gpu_memory_mb = 0.0
        if self._gpu_available:
            try:
                import pynvml
                util = pynvml.nvmlDeviceGetUtilizationRates(self._gpu_handle)
                gpu_utilization = float(util.gpu)

                mem_info = pynvml.nvmlDeviceGetMemoryInfo(self._gpu_handle)
                gpu_memory_mb = mem_info.used / (1024 * 1024)
            except Exception as e:
                # GPU may disappear or NVML may fail transiently; best-effort.
                logger.debug(f"GPU sampling failed: {e}")

        return ResourceMetrics(
            timestamp=datetime.now(),
            workflow_id=workflow_id,
            execution_id=execution_id,
            cpu_percent=cpu_percent,
            memory_mb=memory_mb,
            gpu_utilization=gpu_utilization,
            gpu_memory_mb=gpu_memory_mb,
            disk_io_mb=disk_io_mb
        )

    def _collect_loop(self) -> None:
        """Collection loop running in background thread."""
        while self._running:
            try:
                metrics = self.get_current_metrics()

                # Persist if callback is configured.
                if self.storage_callback:
                    self.storage_callback([metrics])

            except Exception as e:
                logger.error(f"Error collecting resource metrics: {e}")

            time.sleep(self.sample_interval)
|
||||
Reference in New Issue
Block a user