v1.0 - Version stable: multi-PC, détection UI-DETR-1, 3 modes exécution

- Frontend v4 accessible sur réseau local (192.168.1.40)
- Ports ouverts: 3002 (frontend), 5001 (backend), 5004 (dashboard)
- Ollama GPU fonctionnel
- Self-healing interactif
- Dashboard confiance

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Dom
2026-01-29 11:23:51 +01:00
parent 21bfa3b337
commit a27b74cf22
1595 changed files with 412691 additions and 400 deletions

View File

@@ -0,0 +1,52 @@
"""
RPA Analytics & Insights Module
This module provides comprehensive analytics and insights for RPA workflows,
including performance analysis, anomaly detection, and automated recommendations.
"""
from .collection.metrics_collector import MetricsCollector, ExecutionMetrics, StepMetrics
from .collection.resource_collector import ResourceCollector, ResourceMetrics
from .storage.timeseries_store import TimeSeriesStore
from .storage.archive_storage import ArchiveStorage, RetentionPolicyEngine, RetentionPolicy
from .engine.performance_analyzer import PerformanceAnalyzer, PerformanceStats
from .engine.anomaly_detector import AnomalyDetector, Anomaly
from .engine.insight_generator import InsightGenerator, Insight
from .engine.success_rate_calculator import SuccessRateCalculator, SuccessRateStats, ReliabilityRanking
from .query.query_engine import QueryEngine
from .realtime.realtime_analytics import RealtimeAnalytics, LiveExecution
from .reporting.report_generator import ReportGenerator, ReportConfig, ScheduledReport
from .dashboard.dashboard_manager import DashboardManager, Dashboard, DashboardWidget, DashboardTemplate
from .api.analytics_api import AnalyticsAPI
__all__ = [
'MetricsCollector',
'ExecutionMetrics',
'StepMetrics',
'ResourceCollector',
'ResourceMetrics',
'TimeSeriesStore',
'ArchiveStorage',
'RetentionPolicyEngine',
'RetentionPolicy',
'PerformanceAnalyzer',
'PerformanceStats',
'AnomalyDetector',
'Anomaly',
'InsightGenerator',
'Insight',
'SuccessRateCalculator',
'SuccessRateStats',
'ReliabilityRanking',
'QueryEngine',
'RealtimeAnalytics',
'LiveExecution',
'ReportGenerator',
'ReportConfig',
'ScheduledReport',
'DashboardManager',
'Dashboard',
'DashboardWidget',
'DashboardTemplate',
'AnalyticsAPI',
]

View File

@@ -0,0 +1,197 @@
"""Integrated analytics system."""
import logging
from typing import Optional
from pathlib import Path
from .collection.metrics_collector import MetricsCollector
from .collection.resource_collector import ResourceCollector
from .storage.timeseries_store import TimeSeriesStore
from .storage.archive_storage import ArchiveStorage, RetentionPolicyEngine
from .engine.performance_analyzer import PerformanceAnalyzer
from .engine.anomaly_detector import AnomalyDetector
from .engine.insight_generator import InsightGenerator
from .engine.success_rate_calculator import SuccessRateCalculator
from .query.query_engine import QueryEngine
from .realtime.realtime_analytics import RealtimeAnalytics
from .reporting.report_generator import ReportGenerator
from .dashboard.dashboard_manager import DashboardManager
from .api.analytics_api import AnalyticsAPI
logger = logging.getLogger(__name__)
class AnalyticsSystem:
"""Integrated analytics system."""
def __init__(
self,
db_path: str = "data/analytics/metrics.db",
archive_dir: str = "data/analytics/archive",
reports_dir: str = "data/analytics/reports",
dashboards_dir: str = "data/analytics/dashboards"
):
"""
Initialize analytics system.
Args:
db_path: Path to metrics database
archive_dir: Directory for archived data
reports_dir: Directory for reports
dashboards_dir: Directory for dashboards
"""
logger.info("Initializing AnalyticsSystem...")
# Storage layer
self.store = TimeSeriesStore(db_path)
self.archive = ArchiveStorage(archive_dir)
self.retention_engine = RetentionPolicyEngine(self.archive)
# Collection layer
self.metrics_collector = MetricsCollector(self.store)
self.resource_collector = ResourceCollector(self.store)
# Analysis layer
self.performance_analyzer = PerformanceAnalyzer(self.store)
self.anomaly_detector = AnomalyDetector(self.store)
self.insight_generator = InsightGenerator(
self.performance_analyzer,
self.anomaly_detector
)
self.success_rate_calculator = SuccessRateCalculator(self.store)
# Query layer
self.query_engine = QueryEngine(self.store)
self.realtime_analytics = RealtimeAnalytics(self.metrics_collector)
# Reporting layer
self.report_generator = ReportGenerator(
self.query_engine,
self.performance_analyzer,
self.insight_generator,
reports_dir
)
# Dashboard layer
self.dashboard_manager = DashboardManager(dashboards_dir)
# API layer
self.api = AnalyticsAPI(
self.query_engine,
self.performance_analyzer,
self.anomaly_detector,
self.insight_generator,
self.success_rate_calculator,
self.report_generator,
self.dashboard_manager
)
logger.info("AnalyticsSystem initialized successfully")
def start_resource_monitoring(
self,
interval_seconds: int = 60
) -> None:
"""
Start resource monitoring.
Args:
interval_seconds: Monitoring interval in seconds
"""
self.resource_collector.start_monitoring(interval_seconds)
logger.info(f"Resource monitoring started (interval: {interval_seconds}s)")
def stop_resource_monitoring(self) -> None:
"""Stop resource monitoring."""
self.resource_collector.stop_monitoring()
logger.info("Resource monitoring stopped")
def apply_retention_policies(self, dry_run: bool = False) -> dict:
"""
Apply retention policies.
Args:
dry_run: If True, don't actually delete data
Returns:
Dictionary with application results
"""
results = self.retention_engine.apply_policies(self.store, dry_run)
logger.info(f"Retention policies applied (dry_run={dry_run})")
return results
def get_system_stats(self) -> dict:
"""
Get system statistics.
Returns:
Dictionary with system stats
"""
return {
'storage': {
'metrics_count': self.store.get_metrics_count(),
'database_size': Path(self.store.db_path).stat().st_size if Path(self.store.db_path).exists() else 0
},
'archive': self.archive.get_archive_stats(),
'collectors': {
'metrics_buffer_size': len(self.metrics_collector.buffer),
'resource_monitoring_active': self.resource_collector.monitoring_active
},
'dashboards': {
'total': len(self.dashboard_manager.dashboards)
},
'reports': {
'scheduled': len(self.report_generator.scheduled_reports)
}
}
def shutdown(self) -> None:
"""Shutdown analytics system."""
logger.info("Shutting down AnalyticsSystem...")
# Stop monitoring
if self.resource_collector.monitoring_active:
self.stop_resource_monitoring()
# Flush any pending metrics
self.metrics_collector.flush()
# Close database connection
self.store.close()
logger.info("AnalyticsSystem shutdown complete")
# Global instance
_analytics_system: Optional[AnalyticsSystem] = None
def get_analytics_system(
db_path: str = "data/analytics/metrics.db",
archive_dir: str = "data/analytics/archive",
reports_dir: str = "data/analytics/reports",
dashboards_dir: str = "data/analytics/dashboards"
) -> AnalyticsSystem:
"""
Get or create global analytics system instance.
Args:
db_path: Path to metrics database
archive_dir: Directory for archived data
reports_dir: Directory for reports
dashboards_dir: Directory for dashboards
Returns:
AnalyticsSystem instance
"""
global _analytics_system
if _analytics_system is None:
_analytics_system = AnalyticsSystem(
db_path=db_path,
archive_dir=archive_dir,
reports_dir=reports_dir,
dashboards_dir=dashboards_dir
)
return _analytics_system

View File

@@ -0,0 +1,5 @@
"""Analytics API module."""
from .analytics_api import AnalyticsAPI
__all__ = ['AnalyticsAPI']

View File

@@ -0,0 +1,387 @@
"""REST API for analytics."""
import logging
from typing import Dict, List, Optional, Any
from datetime import datetime, timedelta
try:
from flask import Blueprint, request, jsonify, send_file
FLASK_AVAILABLE = True
except ImportError:
FLASK_AVAILABLE = False
Blueprint = None
logger = logging.getLogger(__name__)
class AnalyticsAPI:
"""REST API for analytics."""
def __init__(
self,
query_engine,
performance_analyzer,
anomaly_detector,
insight_generator,
success_rate_calculator,
report_generator,
dashboard_manager
):
"""
Initialize analytics API.
Args:
query_engine: Query engine instance
performance_analyzer: Performance analyzer instance
anomaly_detector: Anomaly detector instance
insight_generator: Insight generator instance
success_rate_calculator: Success rate calculator instance
report_generator: Report generator instance
dashboard_manager: Dashboard manager instance
"""
if not FLASK_AVAILABLE:
logger.warning("Flask not available - API endpoints will not be registered")
self.blueprint = None
return
self.query_engine = query_engine
self.performance_analyzer = performance_analyzer
self.anomaly_detector = anomaly_detector
self.insight_generator = insight_generator
self.success_rate_calculator = success_rate_calculator
self.report_generator = report_generator
self.dashboard_manager = dashboard_manager
self.blueprint = Blueprint('analytics', __name__, url_prefix='/api/analytics')
self._register_routes()
logger.info("AnalyticsAPI initialized")
def _register_routes(self) -> None:
"""Register API routes."""
if not FLASK_AVAILABLE or not self.blueprint:
return
@self.blueprint.route('/metrics', methods=['GET'])
def get_metrics():
"""Get metrics with filters."""
try:
metric_type = request.args.get('type', 'execution')
workflow_id = request.args.get('workflow_id')
hours = int(request.args.get('hours', 24))
end_time = datetime.now()
start_time = end_time - timedelta(hours=hours)
filters = {}
if workflow_id:
filters['workflow_id'] = workflow_id
metrics = self.query_engine.query(
metric_type=metric_type,
start_time=start_time,
end_time=end_time,
filters=filters
)
return jsonify({
'success': True,
'count': len(metrics),
'metrics': metrics
})
except Exception as e:
logger.error(f"Error getting metrics: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/performance', methods=['GET'])
def get_performance():
"""Get performance analysis."""
try:
workflow_id = request.args.get('workflow_id')
if not workflow_id:
return jsonify({'success': False, 'error': 'workflow_id required'}), 400
hours = int(request.args.get('hours', 24))
end_time = datetime.now()
start_time = end_time - timedelta(hours=hours)
stats = self.performance_analyzer.analyze_performance(
workflow_id=workflow_id,
start_time=start_time,
end_time=end_time
)
return jsonify({
'success': True,
'performance': stats.to_dict()
})
except Exception as e:
logger.error(f"Error getting performance: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/performance/bottlenecks', methods=['GET'])
def get_bottlenecks():
"""Get performance bottlenecks."""
try:
workflow_id = request.args.get('workflow_id')
if not workflow_id:
return jsonify({'success': False, 'error': 'workflow_id required'}), 400
hours = int(request.args.get('hours', 24))
end_time = datetime.now()
start_time = end_time - timedelta(hours=hours)
bottlenecks = self.performance_analyzer.identify_bottlenecks(
workflow_id=workflow_id,
start_time=start_time,
end_time=end_time
)
return jsonify({
'success': True,
'bottlenecks': [b.to_dict() for b in bottlenecks]
})
except Exception as e:
logger.error(f"Error getting bottlenecks: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/anomalies', methods=['GET'])
def get_anomalies():
"""Get detected anomalies."""
try:
workflow_id = request.args.get('workflow_id')
hours = int(request.args.get('hours', 24))
end_time = datetime.now()
start_time = end_time - timedelta(hours=hours)
anomalies = self.anomaly_detector.detect_anomalies(
workflow_id=workflow_id,
start_time=start_time,
end_time=end_time
)
return jsonify({
'success': True,
'count': len(anomalies),
'anomalies': [a.to_dict() for a in anomalies]
})
except Exception as e:
logger.error(f"Error getting anomalies: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/insights', methods=['GET'])
def get_insights():
"""Get generated insights."""
try:
hours = int(request.args.get('hours', 168)) # 1 week default
end_time = datetime.now()
start_time = end_time - timedelta(hours=hours)
insights = self.insight_generator.generate_insights(
start_time=start_time,
end_time=end_time
)
return jsonify({
'success': True,
'count': len(insights),
'insights': [i.to_dict() for i in insights]
})
except Exception as e:
logger.error(f"Error getting insights: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/success-rate', methods=['GET'])
def get_success_rate():
"""Get success rate statistics."""
try:
workflow_id = request.args.get('workflow_id')
if not workflow_id:
return jsonify({'success': False, 'error': 'workflow_id required'}), 400
hours = int(request.args.get('hours', 24))
stats = self.success_rate_calculator.calculate_success_rate(
workflow_id=workflow_id,
time_window_hours=hours
)
return jsonify({
'success': True,
'stats': stats.to_dict()
})
except Exception as e:
logger.error(f"Error getting success rate: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/reliability-ranking', methods=['GET'])
def get_reliability_ranking():
"""Get workflow reliability rankings."""
try:
hours = int(request.args.get('hours', 168)) # 1 week default
rankings = self.success_rate_calculator.rank_workflows_by_reliability(
time_window_hours=hours
)
return jsonify({
'success': True,
'rankings': [r.to_dict() for r in rankings]
})
except Exception as e:
logger.error(f"Error getting reliability ranking: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/reports', methods=['POST'])
def generate_report():
"""Generate a report."""
try:
data = request.json
from ..reporting.report_generator import ReportConfig
config = ReportConfig(
title=data.get('title', 'Analytics Report'),
metric_types=data.get('metric_types', ['execution']),
start_time=datetime.fromisoformat(data['start_time']),
end_time=datetime.fromisoformat(data['end_time']),
workflow_ids=data.get('workflow_ids'),
include_charts=data.get('include_charts', True),
include_insights=data.get('include_insights', True),
format=data.get('format', 'json')
)
report_data = self.report_generator.generate_report(config)
# Export based on format
if config.format == 'json':
filepath = self.report_generator.export_json(report_data)
elif config.format == 'csv':
filepath = self.report_generator.export_csv(report_data)
elif config.format == 'html':
filepath = self.report_generator.export_html(report_data)
elif config.format == 'pdf':
filepath = self.report_generator.export_pdf(report_data)
else:
filepath = self.report_generator.export_json(report_data)
return jsonify({
'success': True,
'filepath': filepath
})
except Exception as e:
logger.error(f"Error generating report: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/reports/<path:filename>', methods=['GET'])
def download_report(filename):
"""Download a generated report."""
try:
filepath = self.report_generator.output_dir / filename
if not filepath.exists():
return jsonify({'success': False, 'error': 'Report not found'}), 404
return send_file(str(filepath), as_attachment=True)
except Exception as e:
logger.error(f"Error downloading report: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/dashboards', methods=['GET'])
def list_dashboards():
"""List dashboards."""
try:
owner = request.args.get('owner')
dashboards = self.dashboard_manager.list_dashboards(owner=owner)
return jsonify({
'success': True,
'dashboards': [d.to_dict() for d in dashboards]
})
except Exception as e:
logger.error(f"Error listing dashboards: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/dashboards', methods=['POST'])
def create_dashboard():
"""Create a dashboard."""
try:
data = request.json
dashboard = self.dashboard_manager.create_dashboard(
name=data['name'],
description=data.get('description', ''),
owner=data['owner'],
template_id=data.get('template_id')
)
return jsonify({
'success': True,
'dashboard': dashboard.to_dict()
})
except Exception as e:
logger.error(f"Error creating dashboard: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/dashboards/<dashboard_id>', methods=['GET'])
def get_dashboard(dashboard_id):
"""Get dashboard by ID."""
try:
dashboard = self.dashboard_manager.get_dashboard(dashboard_id)
if not dashboard:
return jsonify({'success': False, 'error': 'Dashboard not found'}), 404
return jsonify({
'success': True,
'dashboard': dashboard.to_dict()
})
except Exception as e:
logger.error(f"Error getting dashboard: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/dashboards/<dashboard_id>', methods=['PUT'])
def update_dashboard(dashboard_id):
"""Update dashboard."""
try:
data = request.json
dashboard = self.dashboard_manager.update_dashboard(dashboard_id, data)
if not dashboard:
return jsonify({'success': False, 'error': 'Dashboard not found'}), 404
return jsonify({
'success': True,
'dashboard': dashboard.to_dict()
})
except Exception as e:
logger.error(f"Error updating dashboard: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/dashboards/<dashboard_id>', methods=['DELETE'])
def delete_dashboard(dashboard_id):
"""Delete dashboard."""
try:
success = self.dashboard_manager.delete_dashboard(dashboard_id)
if not success:
return jsonify({'success': False, 'error': 'Dashboard not found'}), 404
return jsonify({'success': True})
except Exception as e:
logger.error(f"Error deleting dashboard: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@self.blueprint.route('/dashboard-templates', methods=['GET'])
def get_dashboard_templates():
"""Get dashboard templates."""
try:
templates = self.dashboard_manager.get_templates()
return jsonify({
'success': True,
'templates': [t.to_dict() for t in templates]
})
except Exception as e:
logger.error(f"Error getting templates: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
def get_blueprint(self) -> Blueprint:
"""Get Flask blueprint."""
return self.blueprint

View File

@@ -0,0 +1,12 @@
"""Data collection components for analytics."""
from .metrics_collector import MetricsCollector, ExecutionMetrics, StepMetrics
from .resource_collector import ResourceCollector, ResourceMetrics
__all__ = [
'MetricsCollector',
'ExecutionMetrics',
'StepMetrics',
'ResourceCollector',
'ResourceMetrics',
]

View File

@@ -0,0 +1,348 @@
"""Metrics collection for workflow executions."""
import threading
import time
import logging
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Union
from datetime import datetime
from pathlib import Path
logger = logging.getLogger(__name__)
@dataclass
class ExecutionMetrics:
"""Metrics for a workflow execution."""
execution_id: str
workflow_id: str
started_at: datetime
completed_at: Optional[datetime] = None
duration_ms: Optional[float] = None
status: str = 'running' # 'running', 'completed', 'failed'
steps_total: int = 0
steps_completed: int = 0
steps_failed: int = 0
error_message: Optional[str] = None
context: Dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for storage."""
return {
'execution_id': self.execution_id,
'workflow_id': self.workflow_id,
'started_at': self.started_at.isoformat(),
'completed_at': self.completed_at.isoformat() if self.completed_at else None,
'duration_ms': self.duration_ms,
'status': self.status,
'steps_total': self.steps_total,
'steps_completed': self.steps_completed,
'steps_failed': self.steps_failed,
'error_message': self.error_message,
'context': self.context
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'ExecutionMetrics':
"""Create from dictionary."""
return cls(
execution_id=data['execution_id'],
workflow_id=data['workflow_id'],
started_at=datetime.fromisoformat(data['started_at']),
completed_at=datetime.fromisoformat(data['completed_at']) if data.get('completed_at') else None,
duration_ms=data.get('duration_ms'),
status=data.get('status', 'running'),
steps_total=data.get('steps_total', 0),
steps_completed=data.get('steps_completed', 0),
steps_failed=data.get('steps_failed', 0),
error_message=data.get('error_message'),
context=data.get('context', {})
)
@dataclass
class StepMetrics:
"""Metrics for a workflow step."""
step_id: str
execution_id: str
workflow_id: str
node_id: str
action_type: str
target_element: str
started_at: datetime
completed_at: datetime
duration_ms: float
status: str
confidence_score: float
retry_count: int = 0
error_details: Optional[str] = None
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for storage."""
return {
'step_id': self.step_id,
'execution_id': self.execution_id,
'workflow_id': self.workflow_id,
'node_id': self.node_id,
'action_type': self.action_type,
'target_element': self.target_element,
'started_at': self.started_at.isoformat(),
'completed_at': self.completed_at.isoformat(),
'duration_ms': self.duration_ms,
'status': self.status,
'confidence_score': self.confidence_score,
'retry_count': self.retry_count,
'error_details': self.error_details
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'StepMetrics':
"""Create from dictionary."""
return cls(
step_id=data['step_id'],
execution_id=data['execution_id'],
workflow_id=data['workflow_id'],
node_id=data['node_id'],
action_type=data['action_type'],
target_element=data['target_element'],
started_at=datetime.fromisoformat(data['started_at']),
completed_at=datetime.fromisoformat(data['completed_at']),
duration_ms=data['duration_ms'],
status=data['status'],
confidence_score=data['confidence_score'],
retry_count=data.get('retry_count', 0),
error_details=data.get('error_details')
)
class MetricsCollector:
"""Collects metrics from workflow executions."""
def __init__(
self,
storage_callback: Optional[callable] = None,
buffer_size: int = 1000,
flush_interval_sec: float = 5.0
):
"""
Initialize metrics collector.
Args:
storage_callback: Callback to persist metrics (receives list of metrics)
buffer_size: Maximum buffer size before forcing flush
flush_interval_sec: Interval between automatic flushes
"""
self.storage_callback = storage_callback
self.buffer_size = buffer_size
self.flush_interval = flush_interval_sec
self._buffer: List[Union[ExecutionMetrics, StepMetrics]] = []
self._lock = threading.Lock()
self._flush_thread: Optional[threading.Thread] = None
self._running = False
# Track active executions
self._active_executions: Dict[str, ExecutionMetrics] = {}
logger.info(f"MetricsCollector initialized (buffer_size={buffer_size}, flush_interval={flush_interval_sec}s)")
def start(self) -> None:
"""Start automatic flushing."""
if self._running:
return
self._running = True
self._flush_thread = threading.Thread(target=self._auto_flush, daemon=True)
self._flush_thread.start()
logger.info("MetricsCollector started")
def stop(self) -> None:
"""Stop automatic flushing and flush remaining metrics."""
self._running = False
if self._flush_thread:
self._flush_thread.join(timeout=5.0)
self.flush()
logger.info("MetricsCollector stopped")
def record_execution_start(
self,
execution_id: str,
workflow_id: str,
context: Optional[Dict[str, Any]] = None
) -> None:
"""
Record the start of a workflow execution.
Args:
execution_id: Unique execution identifier
workflow_id: Workflow identifier
context: Additional context information
"""
metrics = ExecutionMetrics(
execution_id=execution_id,
workflow_id=workflow_id,
started_at=datetime.now(),
status='running',
context=context or {}
)
with self._lock:
self._active_executions[execution_id] = metrics
logger.debug(f"Recorded execution start: {execution_id}")
def record_execution_complete(
self,
execution_id: str,
status: str,
steps_total: int = 0,
steps_completed: int = 0,
steps_failed: int = 0,
error_message: Optional[str] = None
) -> None:
"""
Record the completion of a workflow execution.
Args:
execution_id: Execution identifier
status: Final status ('completed' or 'failed')
steps_total: Total number of steps
steps_completed: Number of completed steps
steps_failed: Number of failed steps
error_message: Error message if failed
"""
with self._lock:
if execution_id not in self._active_executions:
logger.warning(f"Execution not found: {execution_id}")
return
metrics = self._active_executions[execution_id]
metrics.completed_at = datetime.now()
metrics.duration_ms = (metrics.completed_at - metrics.started_at).total_seconds() * 1000
metrics.status = status
metrics.steps_total = steps_total
metrics.steps_completed = steps_completed
metrics.steps_failed = steps_failed
metrics.error_message = error_message
# Move to buffer
self._buffer.append(metrics)
del self._active_executions[execution_id]
# Check if buffer is full
if len(self._buffer) >= self.buffer_size:
self._flush_unlocked()
logger.debug(f"Recorded execution complete: {execution_id} ({status})")
def record_step(self, step_metrics: StepMetrics) -> None:
"""
Record metrics for a completed step.
Args:
step_metrics: Step metrics to record
"""
with self._lock:
self._buffer.append(step_metrics)
# Check if buffer is full
if len(self._buffer) >= self.buffer_size:
self._flush_unlocked()
logger.debug(f"Recorded step: {step_metrics.step_id}")
def flush(self) -> int:
"""
Flush buffered metrics to storage.
Returns:
Number of metrics flushed
"""
with self._lock:
return self._flush_unlocked()
def _flush_unlocked(self) -> int:
"""Flush without acquiring lock (must be called with lock held)."""
if not self._buffer:
return 0
if not self.storage_callback:
logger.warning("No storage callback configured, discarding metrics")
count = len(self._buffer)
self._buffer.clear()
return count
try:
# Copy buffer
metrics_to_flush = self._buffer.copy()
self._buffer.clear()
# Persist (outside lock to avoid blocking)
self.storage_callback(metrics_to_flush)
logger.debug(f"Flushed {len(metrics_to_flush)} metrics")
return len(metrics_to_flush)
except Exception as e:
logger.error(f"Error flushing metrics: {e}")
# Put metrics back in buffer
self._buffer.extend(metrics_to_flush)
return 0
def _auto_flush(self) -> None:
"""Automatic flush thread."""
while self._running:
time.sleep(self.flush_interval)
if self._running:
self.flush()
def get_active_executions(self) -> Dict[str, ExecutionMetrics]:
"""Get currently active executions."""
with self._lock:
return self._active_executions.copy()
def get_buffer_size(self) -> int:
"""Get current buffer size."""
with self._lock:
return len(self._buffer)
def record_recovery_attempt(
self,
workflow_id: str,
node_id: str,
failure_reason: str,
recovery_success: bool,
strategy_used: Optional[str] = None,
confidence: float = 0.0
) -> None:
"""
Record a self-healing recovery attempt.
Args:
workflow_id: Workflow identifier
node_id: Node where failure occurred
failure_reason: Reason for the failure
recovery_success: Whether recovery was successful
strategy_used: Strategy used for recovery
confidence: Confidence score of recovery
"""
# Create a custom metrics entry for recovery
recovery_metrics = {
'type': 'recovery_attempt',
'timestamp': datetime.now().isoformat(),
'workflow_id': workflow_id,
'node_id': node_id,
'failure_reason': failure_reason,
'recovery_success': recovery_success,
'strategy_used': strategy_used,
'confidence': confidence
}
with self._lock:
self._buffer.append(recovery_metrics)
# Check if buffer is full
if len(self._buffer) >= self.buffer_size:
self._flush_unlocked()
logger.debug(f"Recorded recovery attempt: {workflow_id}/{node_id} - {'success' if recovery_success else 'failed'}")

View File

@@ -0,0 +1,209 @@
"""Resource usage collection for analytics."""
import psutil
import threading
import time
import logging
from dataclasses import dataclass
from typing import Optional, Dict, Any, List
from datetime import datetime
logger = logging.getLogger(__name__)
@dataclass
class ResourceMetrics:
"""System resource usage metrics."""
timestamp: datetime
workflow_id: Optional[str] = None
execution_id: Optional[str] = None
cpu_percent: float = 0.0
memory_mb: float = 0.0
gpu_utilization: float = 0.0
gpu_memory_mb: float = 0.0
disk_io_mb: float = 0.0
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for storage."""
return {
'timestamp': self.timestamp.isoformat(),
'workflow_id': self.workflow_id,
'execution_id': self.execution_id,
'cpu_percent': self.cpu_percent,
'memory_mb': self.memory_mb,
'gpu_utilization': self.gpu_utilization,
'gpu_memory_mb': self.gpu_memory_mb,
'disk_io_mb': self.disk_io_mb
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'ResourceMetrics':
"""Create from dictionary."""
return cls(
timestamp=datetime.fromisoformat(data['timestamp']),
workflow_id=data.get('workflow_id'),
execution_id=data.get('execution_id'),
cpu_percent=data.get('cpu_percent', 0.0),
memory_mb=data.get('memory_mb', 0.0),
gpu_utilization=data.get('gpu_utilization', 0.0),
gpu_memory_mb=data.get('gpu_memory_mb', 0.0),
disk_io_mb=data.get('disk_io_mb', 0.0)
)
class ResourceCollector:
"""Collects system resource usage metrics."""
def __init__(
self,
storage_callback: Optional[callable] = None,
sample_interval_sec: float = 1.0
):
"""
Initialize resource collector.
Args:
storage_callback: Callback to persist metrics
sample_interval_sec: Interval between samples
"""
self.storage_callback = storage_callback
self.sample_interval = sample_interval_sec
self._running = False
self._thread: Optional[threading.Thread] = None
self._current_context: Dict[str, Optional[str]] = {
'workflow_id': None,
'execution_id': None
}
self._context_lock = threading.Lock()
# Initialize psutil
self._process = psutil.Process()
self._last_disk_io = None
# Try to import GPU monitoring
self._gpu_available = False
try:
import pynvml
pynvml.nvmlInit()
self._gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
self._gpu_available = True
logger.info("GPU monitoring enabled")
except:
logger.info("GPU monitoring not available")
logger.info(f"ResourceCollector initialized (sample_interval={sample_interval_sec}s)")
@property
def monitoring_active(self) -> bool:
"""Check if resource monitoring is active."""
return self._running
def start(self) -> None:
"""Start collecting resource metrics."""
if self._running:
return
self._running = True
self._thread = threading.Thread(target=self._collect_loop, daemon=True)
self._thread.start()
logger.info("ResourceCollector started")
def stop(self) -> None:
"""Stop collecting resource metrics."""
self._running = False
if self._thread:
self._thread.join(timeout=5.0)
logger.info("ResourceCollector stopped")
def set_context(
self,
workflow_id: Optional[str] = None,
execution_id: Optional[str] = None
) -> None:
"""
Set current execution context for resource tracking.
Args:
workflow_id: Current workflow ID
execution_id: Current execution ID
"""
with self._context_lock:
self._current_context['workflow_id'] = workflow_id
self._current_context['execution_id'] = execution_id
def clear_context(self) -> None:
"""Clear execution context."""
with self._context_lock:
self._current_context['workflow_id'] = None
self._current_context['execution_id'] = None
def get_current_metrics(self) -> ResourceMetrics:
"""
Get current resource usage.
Returns:
ResourceMetrics with current usage
"""
with self._context_lock:
workflow_id = self._current_context['workflow_id']
execution_id = self._current_context['execution_id']
# CPU usage
cpu_percent = self._process.cpu_percent(interval=0.1)
# Memory usage
memory_info = self._process.memory_info()
memory_mb = memory_info.rss / (1024 * 1024)
# Disk I/O
disk_io_mb = 0.0
try:
disk_io = self._process.io_counters()
if self._last_disk_io:
bytes_read = disk_io.read_bytes - self._last_disk_io.read_bytes
bytes_written = disk_io.write_bytes - self._last_disk_io.write_bytes
disk_io_mb = (bytes_read + bytes_written) / (1024 * 1024)
self._last_disk_io = disk_io
except:
pass
# GPU usage
gpu_utilization = 0.0
gpu_memory_mb = 0.0
if self._gpu_available:
try:
import pynvml
util = pynvml.nvmlDeviceGetUtilizationRates(self._gpu_handle)
gpu_utilization = float(util.gpu)
mem_info = pynvml.nvmlDeviceGetMemoryInfo(self._gpu_handle)
gpu_memory_mb = mem_info.used / (1024 * 1024)
except:
pass
return ResourceMetrics(
timestamp=datetime.now(),
workflow_id=workflow_id,
execution_id=execution_id,
cpu_percent=cpu_percent,
memory_mb=memory_mb,
gpu_utilization=gpu_utilization,
gpu_memory_mb=gpu_memory_mb,
disk_io_mb=disk_io_mb
)
def _collect_loop(self) -> None:
"""Collection loop running in background thread."""
while self._running:
try:
metrics = self.get_current_metrics()
# Persist if callback is configured
if self.storage_callback:
self.storage_callback([metrics])
except Exception as e:
logger.error(f"Error collecting resource metrics: {e}")
time.sleep(self.sample_interval)

View File

@@ -0,0 +1,15 @@
"""Analytics dashboard module."""
from .dashboard_manager import (
DashboardManager,
Dashboard,
DashboardWidget,
DashboardTemplate
)
__all__ = [
'DashboardManager',
'Dashboard',
'DashboardWidget',
'DashboardTemplate'
]

View File

@@ -0,0 +1,468 @@
"""Dashboard management for analytics."""
import logging
import json
import uuid
from typing import Dict, List, Optional, Any
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
@dataclass
class DashboardWidget:
"""Dashboard widget configuration."""
widget_id: str
widget_type: str # chart, table, metric, insight
title: str
config: Dict[str, Any]
position: Dict[str, int] # x, y, width, height
def to_dict(self) -> Dict:
"""Convert to dictionary."""
return {
'widget_id': self.widget_id,
'widget_type': self.widget_type,
'title': self.title,
'config': self.config,
'position': self.position
}
@classmethod
def from_dict(cls, data: Dict) -> 'DashboardWidget':
"""Create from dictionary."""
return cls(**data)
@dataclass
class Dashboard:
"""Dashboard configuration."""
dashboard_id: str
name: str
description: str
owner: str
widgets: List[DashboardWidget] = field(default_factory=list)
layout: str = 'grid' # grid, flex
refresh_interval: int = 30 # seconds
is_public: bool = False
shared_with: List[str] = field(default_factory=list)
created_at: datetime = field(default_factory=datetime.now)
updated_at: datetime = field(default_factory=datetime.now)
def to_dict(self) -> Dict:
"""Convert to dictionary."""
return {
'dashboard_id': self.dashboard_id,
'name': self.name,
'description': self.description,
'owner': self.owner,
'widgets': [w.to_dict() for w in self.widgets],
'layout': self.layout,
'refresh_interval': self.refresh_interval,
'is_public': self.is_public,
'shared_with': self.shared_with,
'created_at': self.created_at.isoformat(),
'updated_at': self.updated_at.isoformat()
}
@classmethod
def from_dict(cls, data: Dict) -> 'Dashboard':
"""Create from dictionary."""
data = data.copy()
data['widgets'] = [DashboardWidget.from_dict(w) for w in data.get('widgets', [])]
data['created_at'] = datetime.fromisoformat(data['created_at'])
data['updated_at'] = datetime.fromisoformat(data['updated_at'])
return cls(**data)
@dataclass
class DashboardTemplate:
"""Pre-built dashboard template."""
template_id: str
name: str
description: str
category: str
widgets: List[DashboardWidget]
def to_dict(self) -> Dict:
"""Convert to dictionary."""
return {
'template_id': self.template_id,
'name': self.name,
'description': self.description,
'category': self.category,
'widgets': [w.to_dict() for w in self.widgets]
}
class DashboardManager:
"""Manage analytics dashboards."""
def __init__(self, storage_dir: str = "data/analytics/dashboards"):
"""
Initialize dashboard manager.
Args:
storage_dir: Directory for dashboard storage
"""
self.storage_dir = Path(storage_dir)
self.storage_dir.mkdir(parents=True, exist_ok=True)
self.dashboards: Dict[str, Dashboard] = {}
self.templates: Dict[str, DashboardTemplate] = {}
self._load_dashboards()
self._init_templates()
logger.info("DashboardManager initialized")
def create_dashboard(
self,
name: str,
description: str,
owner: str,
template_id: Optional[str] = None
) -> Dashboard:
"""
Create a new dashboard.
Args:
name: Dashboard name
description: Dashboard description
owner: Owner username
template_id: Optional template to use
Returns:
Created dashboard
"""
dashboard_id = str(uuid.uuid4())
# Create from template if specified
if template_id and template_id in self.templates:
template = self.templates[template_id]
widgets = [
DashboardWidget(
widget_id=str(uuid.uuid4()),
widget_type=w.widget_type,
title=w.title,
config=w.config.copy(),
position=w.position.copy()
)
for w in template.widgets
]
else:
widgets = []
dashboard = Dashboard(
dashboard_id=dashboard_id,
name=name,
description=description,
owner=owner,
widgets=widgets
)
self.dashboards[dashboard_id] = dashboard
self._save_dashboard(dashboard)
logger.info(f"Created dashboard: {dashboard_id}")
return dashboard
def get_dashboard(self, dashboard_id: str) -> Optional[Dashboard]:
"""Get dashboard by ID."""
return self.dashboards.get(dashboard_id)
def list_dashboards(
self,
owner: Optional[str] = None,
include_shared: bool = True
) -> List[Dashboard]:
"""
List dashboards.
Args:
owner: Filter by owner (None = all)
include_shared: Include dashboards shared with owner
Returns:
List of dashboards
"""
dashboards = list(self.dashboards.values())
if owner:
dashboards = [
d for d in dashboards
if d.owner == owner or
(include_shared and (d.is_public or owner in d.shared_with))
]
return dashboards
def update_dashboard(
self,
dashboard_id: str,
updates: Dict[str, Any]
) -> Optional[Dashboard]:
"""
Update dashboard configuration.
Args:
dashboard_id: Dashboard identifier
updates: Dictionary of updates
Returns:
Updated dashboard or None
"""
dashboard = self.dashboards.get(dashboard_id)
if not dashboard:
return None
# Apply updates
for key, value in updates.items():
if hasattr(dashboard, key):
setattr(dashboard, key, value)
dashboard.updated_at = datetime.now()
self._save_dashboard(dashboard)
logger.info(f"Updated dashboard: {dashboard_id}")
return dashboard
def delete_dashboard(self, dashboard_id: str) -> bool:
"""
Delete a dashboard.
Args:
dashboard_id: Dashboard identifier
Returns:
True if deleted, False if not found
"""
if dashboard_id not in self.dashboards:
return False
del self.dashboards[dashboard_id]
# Delete file
filepath = self.storage_dir / f"{dashboard_id}.json"
if filepath.exists():
filepath.unlink()
logger.info(f"Deleted dashboard: {dashboard_id}")
return True
def add_widget(
self,
dashboard_id: str,
widget_type: str,
title: str,
config: Dict[str, Any],
position: Dict[str, int]
) -> Optional[DashboardWidget]:
"""
Add widget to dashboard.
Args:
dashboard_id: Dashboard identifier
widget_type: Widget type
title: Widget title
config: Widget configuration
position: Widget position
Returns:
Created widget or None
"""
dashboard = self.dashboards.get(dashboard_id)
if not dashboard:
return None
widget = DashboardWidget(
widget_id=str(uuid.uuid4()),
widget_type=widget_type,
title=title,
config=config,
position=position
)
dashboard.widgets.append(widget)
dashboard.updated_at = datetime.now()
self._save_dashboard(dashboard)
logger.info(f"Added widget to dashboard {dashboard_id}")
return widget
def remove_widget(
self,
dashboard_id: str,
widget_id: str
) -> bool:
"""
Remove widget from dashboard.
Args:
dashboard_id: Dashboard identifier
widget_id: Widget identifier
Returns:
True if removed, False if not found
"""
dashboard = self.dashboards.get(dashboard_id)
if not dashboard:
return False
dashboard.widgets = [w for w in dashboard.widgets if w.widget_id != widget_id]
dashboard.updated_at = datetime.now()
self._save_dashboard(dashboard)
logger.info(f"Removed widget from dashboard {dashboard_id}")
return True
def share_dashboard(
self,
dashboard_id: str,
username: str
) -> bool:
"""
Share dashboard with a user.
Args:
dashboard_id: Dashboard identifier
username: Username to share with
Returns:
True if shared, False if not found
"""
dashboard = self.dashboards.get(dashboard_id)
if not dashboard:
return False
if username not in dashboard.shared_with:
dashboard.shared_with.append(username)
dashboard.updated_at = datetime.now()
self._save_dashboard(dashboard)
logger.info(f"Shared dashboard {dashboard_id} with {username}")
return True
def make_public(
self,
dashboard_id: str,
is_public: bool = True
) -> bool:
"""
Make dashboard public or private.
Args:
dashboard_id: Dashboard identifier
is_public: Whether dashboard should be public
Returns:
True if updated, False if not found
"""
dashboard = self.dashboards.get(dashboard_id)
if not dashboard:
return False
dashboard.is_public = is_public
dashboard.updated_at = datetime.now()
self._save_dashboard(dashboard)
logger.info(f"Dashboard {dashboard_id} public: {is_public}")
return True
def get_templates(self) -> List[DashboardTemplate]:
"""Get all dashboard templates."""
return list(self.templates.values())
def _load_dashboards(self) -> None:
"""Load dashboards from storage."""
for filepath in self.storage_dir.glob('*.json'):
try:
with open(filepath, 'r') as f:
data = json.load(f)
dashboard = Dashboard.from_dict(data)
self.dashboards[dashboard.dashboard_id] = dashboard
except Exception as e:
logger.error(f"Error loading dashboard {filepath}: {e}")
logger.info(f"Loaded {len(self.dashboards)} dashboards")
def _save_dashboard(self, dashboard: Dashboard) -> None:
"""Save dashboard to storage."""
filepath = self.storage_dir / f"{dashboard.dashboard_id}.json"
with open(filepath, 'w') as f:
json.dump(dashboard.to_dict(), f, indent=2)
def _init_templates(self) -> None:
"""Initialize default dashboard templates."""
# Performance Overview Template
self.templates['performance'] = DashboardTemplate(
template_id='performance',
name='Performance Overview',
description='Overview of workflow performance metrics',
category='performance',
widgets=[
DashboardWidget(
widget_id='perf_chart',
widget_type='chart',
title='Execution Duration Trend',
config={
'chart_type': 'line',
'metric': 'duration',
'time_range': '7d'
},
position={'x': 0, 'y': 0, 'width': 6, 'height': 4}
),
DashboardWidget(
widget_id='success_rate',
widget_type='metric',
title='Success Rate',
config={
'metric': 'success_rate',
'format': 'percentage'
},
position={'x': 6, 'y': 0, 'width': 3, 'height': 2}
),
DashboardWidget(
widget_id='bottlenecks',
widget_type='table',
title='Top Bottlenecks',
config={
'metric': 'bottlenecks',
'limit': 10
},
position={'x': 0, 'y': 4, 'width': 9, 'height': 4}
)
]
)
# Anomaly Detection Template
self.templates['anomalies'] = DashboardTemplate(
template_id='anomalies',
name='Anomaly Detection',
description='Real-time anomaly detection and alerts',
category='monitoring',
widgets=[
DashboardWidget(
widget_id='anomaly_chart',
widget_type='chart',
title='Anomalies Over Time',
config={
'chart_type': 'scatter',
'metric': 'anomalies',
'time_range': '24h'
},
position={'x': 0, 'y': 0, 'width': 8, 'height': 4}
),
DashboardWidget(
widget_id='anomaly_list',
widget_type='table',
title='Recent Anomalies',
config={
'metric': 'anomalies',
'limit': 20
},
position={'x': 0, 'y': 4, 'width': 12, 'height': 4}
)
]
)
logger.info(f"Initialized {len(self.templates)} dashboard templates")

View File

@@ -0,0 +1,14 @@
"""Analytics engine components."""
from .performance_analyzer import PerformanceAnalyzer, PerformanceStats
from .anomaly_detector import AnomalyDetector, Anomaly
from .insight_generator import InsightGenerator, Insight
__all__ = [
'PerformanceAnalyzer',
'PerformanceStats',
'AnomalyDetector',
'Anomaly',
'InsightGenerator',
'Insight',
]

View File

@@ -0,0 +1,311 @@
"""Anomaly detection for workflow execution."""
import logging
import statistics
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
import hashlib
from ..storage.timeseries_store import TimeSeriesStore
logger = logging.getLogger(__name__)
@dataclass
class Anomaly:
"""Detected anomaly."""
anomaly_id: str
workflow_id: str
metric_name: str
detected_at: datetime
severity: float # 0.0 to 1.0
deviation: float
baseline_value: float
actual_value: float
description: str
recommended_action: Optional[str] = None
metadata: Dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary."""
return {
'anomaly_id': self.anomaly_id,
'workflow_id': self.workflow_id,
'metric_name': self.metric_name,
'detected_at': self.detected_at.isoformat(),
'severity': self.severity,
'deviation': self.deviation,
'baseline_value': self.baseline_value,
'actual_value': self.actual_value,
'description': self.description,
'recommended_action': self.recommended_action,
'metadata': self.metadata
}
class AnomalyDetector:
"""Detects anomalies in workflow execution using statistical methods."""
def __init__(
self,
time_series_store: TimeSeriesStore,
sensitivity: float = 2.0 # Standard deviations
):
"""
Initialize anomaly detector.
Args:
time_series_store: Time series storage
sensitivity: Number of standard deviations for anomaly threshold
"""
self.store = time_series_store
self.sensitivity = sensitivity
self.baselines: Dict[str, Dict] = {}
logger.info(f"AnomalyDetector initialized (sensitivity={sensitivity})")
def detect_anomalies(
self,
workflow_id: str,
metrics: List[Dict],
metric_name: str = 'duration_ms'
) -> List[Anomaly]:
"""
Detect anomalies in metrics.
Args:
workflow_id: Workflow identifier
metrics: List of metric dictionaries
metric_name: Name of metric to analyze
Returns:
List of detected anomalies
"""
if not metrics:
return []
# Get or create baseline
baseline = self._get_baseline(workflow_id, metric_name)
if not baseline:
# Not enough data for baseline
return []
anomalies = []
for metric in metrics:
value = metric.get(metric_name)
if value is None:
continue
# Calculate deviation from baseline
deviation = abs(value - baseline['mean']) / baseline['std_dev'] if baseline['std_dev'] > 0 else 0
# Check if anomaly
if deviation > self.sensitivity:
severity = min(deviation / (self.sensitivity * 2), 1.0)
anomaly = Anomaly(
anomaly_id=self._generate_anomaly_id(workflow_id, metric_name, metric),
workflow_id=workflow_id,
metric_name=metric_name,
detected_at=datetime.now(),
severity=severity,
deviation=deviation,
baseline_value=baseline['mean'],
actual_value=value,
description=self._generate_description(metric_name, value, baseline['mean'], deviation),
recommended_action=self._generate_recommendation(metric_name, value, baseline['mean']),
metadata=metric
)
anomalies.append(anomaly)
logger.info(f"Anomaly detected: {anomaly.description}")
return anomalies
def update_baseline(
self,
workflow_id: str,
stable_period_days: int = 7,
metric_name: str = 'duration_ms'
) -> None:
"""
Update baseline from stable period.
Args:
workflow_id: Workflow identifier
stable_period_days: Number of days for baseline calculation
metric_name: Metric to calculate baseline for
"""
end_time = datetime.now()
start_time = end_time - timedelta(days=stable_period_days)
# Query metrics
metrics = self.store.query_range(
start_time=start_time,
end_time=end_time,
workflow_id=workflow_id,
metric_types=['execution']
)
executions = metrics.get('execution', [])
if not executions:
logger.warning(f"No data for baseline calculation: {workflow_id}")
return
# Extract values
values = [e.get(metric_name) for e in executions if e.get(metric_name) is not None]
if len(values) < 10: # Minimum sample size
logger.warning(f"Insufficient data for baseline: {workflow_id} ({len(values)} samples)")
return
# Calculate baseline statistics
mean = statistics.mean(values)
std_dev = statistics.stdev(values) if len(values) > 1 else 0.0
median = statistics.median(values)
baseline_key = f"{workflow_id}:{metric_name}"
self.baselines[baseline_key] = {
'mean': mean,
'std_dev': std_dev,
'median': median,
'sample_size': len(values),
'updated_at': datetime.now(),
'period_days': stable_period_days
}
logger.info(f"Baseline updated for {workflow_id}: mean={mean:.2f}, std_dev={std_dev:.2f}")
def correlate_anomalies(
self,
anomalies: List[Anomaly],
time_window_minutes: int = 30
) -> List[List[Anomaly]]:
"""
Correlate related anomalies within a time window.
Args:
anomalies: List of anomalies to correlate
time_window_minutes: Time window for correlation
Returns:
List of correlated anomaly groups
"""
if not anomalies:
return []
# Sort by detection time
sorted_anomalies = sorted(anomalies, key=lambda a: a.detected_at)
groups = []
current_group = [sorted_anomalies[0]]
for anomaly in sorted_anomalies[1:]:
# Check if within time window of last anomaly in current group
time_diff = (anomaly.detected_at - current_group[-1].detected_at).total_seconds() / 60
if time_diff <= time_window_minutes:
current_group.append(anomaly)
else:
# Start new group
if len(current_group) > 1: # Only keep groups with multiple anomalies
groups.append(current_group)
current_group = [anomaly]
# Add last group if it has multiple anomalies
if len(current_group) > 1:
groups.append(current_group)
return groups
def escalate_anomaly(
self,
anomaly: Anomaly,
duration_minutes: int,
impact_score: float
) -> Dict[str, Any]:
"""
Escalate an anomaly based on duration and impact.
Args:
anomaly: Anomaly to escalate
duration_minutes: How long the anomaly has persisted
impact_score: Impact score (0.0 to 1.0)
Returns:
Escalation information
"""
# Calculate escalation level
escalation_score = (anomaly.severity + impact_score) / 2
escalation_score *= min(duration_minutes / 60, 2.0) # Cap at 2x for duration
if escalation_score > 0.8:
level = 'critical'
elif escalation_score > 0.5:
level = 'high'
elif escalation_score > 0.3:
level = 'medium'
else:
level = 'low'
return {
'anomaly_id': anomaly.anomaly_id,
'escalation_level': level,
'escalation_score': min(escalation_score, 1.0),
'duration_minutes': duration_minutes,
'impact_score': impact_score,
'requires_immediate_action': escalation_score > 0.8
}
def _get_baseline(self, workflow_id: str, metric_name: str) -> Optional[Dict]:
"""Get baseline for workflow and metric."""
baseline_key = f"{workflow_id}:{metric_name}"
if baseline_key not in self.baselines:
# Try to calculate baseline
self.update_baseline(workflow_id, metric_name=metric_name)
return self.baselines.get(baseline_key)
def _generate_anomaly_id(self, workflow_id: str, metric_name: str, metric: Dict) -> str:
"""Generate unique anomaly ID."""
data = f"{workflow_id}:{metric_name}:{metric.get('execution_id', '')}:{datetime.now().isoformat()}"
return hashlib.md5(data.encode()).hexdigest()[:16]
def _generate_description(
self,
metric_name: str,
actual_value: float,
baseline_value: float,
deviation: float
) -> str:
"""Generate human-readable anomaly description."""
percent_diff = abs((actual_value - baseline_value) / baseline_value * 100) if baseline_value > 0 else 0
direction = "higher" if actual_value > baseline_value else "lower"
return (
f"{metric_name} is {percent_diff:.1f}% {direction} than baseline "
f"({actual_value:.2f} vs {baseline_value:.2f}, {deviation:.1f} std devs)"
)
def _generate_recommendation(
self,
metric_name: str,
actual_value: float,
baseline_value: float
) -> str:
"""Generate recommended action for anomaly."""
if actual_value > baseline_value:
if metric_name == 'duration_ms':
return "Investigate performance degradation. Check for resource constraints or code changes."
elif metric_name == 'error_rate':
return "Investigate error spike. Check logs and recent deployments."
elif metric_name in ['cpu_percent', 'memory_mb']:
return "Investigate resource usage spike. Check for memory leaks or inefficient operations."
else:
if metric_name == 'success_rate':
return "Investigate success rate drop. Check for system issues or data quality problems."
return "Monitor the situation and investigate if anomaly persists."

View File

@@ -0,0 +1,301 @@
"""Automated insight generation for workflows."""
import logging
import hashlib
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
from .performance_analyzer import PerformanceAnalyzer, PerformanceStats
from .anomaly_detector import AnomalyDetector, Anomaly
logger = logging.getLogger(__name__)
@dataclass
class Insight:
"""Generated insight with recommendation."""
insight_id: str
workflow_id: str
category: str # 'performance', 'reliability', 'resource', 'best_practice'
title: str
description: str
recommendation: str
expected_impact: str
ease_of_implementation: str # 'easy', 'medium', 'hard'
priority_score: float
supporting_data: Dict[str, Any]
created_at: datetime
implemented: bool = False
actual_impact: Optional[Dict] = None
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary."""
return {
'insight_id': self.insight_id,
'workflow_id': self.workflow_id,
'category': self.category,
'title': self.title,
'description': self.description,
'recommendation': self.recommendation,
'expected_impact': self.expected_impact,
'ease_of_implementation': self.ease_of_implementation,
'priority_score': self.priority_score,
'supporting_data': self.supporting_data,
'created_at': self.created_at.isoformat(),
'implemented': self.implemented,
'actual_impact': self.actual_impact
}
class InsightGenerator:
"""Generates automated insights and recommendations."""
def __init__(
self,
performance_analyzer: PerformanceAnalyzer,
anomaly_detector: AnomalyDetector
):
"""
Initialize insight generator.
Args:
performance_analyzer: Performance analyzer instance
anomaly_detector: Anomaly detector instance
"""
self.performance_analyzer = performance_analyzer
self.anomaly_detector = anomaly_detector
self._insight_implementations: Dict[str, Dict] = {}
logger.info("InsightGenerator initialized")
def generate_insights(
self,
workflow_id: str,
analysis_period_days: int = 30
) -> List[Insight]:
"""
Generate insights for a workflow.
Args:
workflow_id: Workflow identifier
analysis_period_days: Number of days to analyze
Returns:
List of generated insights
"""
insights = []
end_time = datetime.now()
start_time = end_time - timedelta(days=analysis_period_days)
# Analyze performance
perf_stats = self.performance_analyzer.analyze_workflow(
workflow_id,
start_time,
end_time
)
if perf_stats:
# Generate performance insights
insights.extend(self._generate_performance_insights(perf_stats))
# Generate bottleneck insights
insights.extend(self._generate_bottleneck_insights(perf_stats))
# Check for performance degradation
degradation = self.performance_analyzer.detect_performance_degradation(
workflow_id,
baseline_period=timedelta(days=7),
current_period=timedelta(days=1)
)
if degradation:
insights.append(self._generate_degradation_insight(degradation))
# Prioritize insights
insights = self.prioritize_insights(insights)
return insights
def prioritize_insights(self, insights: List[Insight]) -> List[Insight]:
"""
Prioritize insights by impact and ease.
Args:
insights: List of insights to prioritize
Returns:
Sorted list of insights
"""
# Calculate priority scores
for insight in insights:
impact_score = self._calculate_impact_score(insight.expected_impact)
ease_score = self._calculate_ease_score(insight.ease_of_implementation)
# Priority = Impact * Ease (higher is better)
insight.priority_score = impact_score * ease_score
# Sort by priority (descending)
return sorted(insights, key=lambda i: i.priority_score, reverse=True)
def track_insight_implementation(
self,
insight_id: str,
implemented: bool,
actual_impact: Optional[Dict] = None
) -> None:
"""
Track insight implementation and measure impact.
Args:
insight_id: Insight identifier
implemented: Whether insight was implemented
actual_impact: Measured impact after implementation
"""
self._insight_implementations[insight_id] = {
'implemented': implemented,
'actual_impact': actual_impact,
'tracked_at': datetime.now()
}
logger.info(f"Tracked implementation for insight {insight_id}")
def _generate_performance_insights(self, stats: PerformanceStats) -> List[Insight]:
"""Generate insights from performance statistics."""
insights = []
# High variability insight
if stats.std_dev_ms > stats.avg_duration_ms * 0.5:
insights.append(Insight(
insight_id=self._generate_id(stats.workflow_id, 'high_variability'),
workflow_id=stats.workflow_id,
category='performance',
title='High Performance Variability',
description=(
f"Execution time varies significantly (std dev: {stats.std_dev_ms:.0f}ms, "
f"avg: {stats.avg_duration_ms:.0f}ms). This indicates inconsistent performance."
),
recommendation=(
"Investigate causes of variability. Check for: "
"1) Resource contention, 2) Network latency, 3) Data size variations, "
"4) External service dependencies."
),
expected_impact="Reduce execution time variability by 30-50%",
ease_of_implementation='medium',
priority_score=0.0,
supporting_data={'stats': stats.to_dict()},
created_at=datetime.now()
))
# Slow p99 insight
if stats.p99_duration_ms > stats.median_duration_ms * 3:
insights.append(Insight(
insight_id=self._generate_id(stats.workflow_id, 'slow_p99'),
workflow_id=stats.workflow_id,
category='performance',
title='Slow 99th Percentile Performance',
description=(
f"99th percentile ({stats.p99_duration_ms:.0f}ms) is 3x slower than median "
f"({stats.median_duration_ms:.0f}ms). Some executions are significantly slower."
),
recommendation=(
"Analyze slowest executions to identify outliers. "
"Consider adding timeouts or optimizing worst-case scenarios."
),
expected_impact="Improve worst-case performance by 40-60%",
ease_of_implementation='medium',
priority_score=0.0,
supporting_data={'stats': stats.to_dict()},
created_at=datetime.now()
))
return insights
def _generate_bottleneck_insights(self, stats: PerformanceStats) -> List[Insight]:
"""Generate insights from bottleneck analysis."""
insights = []
if not stats.slowest_steps:
return insights
# Top bottleneck
top_bottleneck = stats.slowest_steps[0]
insights.append(Insight(
insight_id=self._generate_id(stats.workflow_id, 'top_bottleneck'),
workflow_id=stats.workflow_id,
category='performance',
title=f"Bottleneck: {top_bottleneck['action_type']} on {top_bottleneck['node_id']}",
description=(
f"Step '{top_bottleneck['action_type']}' takes {top_bottleneck['avg_duration_ms']:.0f}ms "
f"on average (p95: {top_bottleneck['p95_duration_ms']:.0f}ms). "
f"This is the slowest step in the workflow."
),
recommendation=(
f"Optimize the '{top_bottleneck['action_type']}' action. "
"Consider: 1) Caching results, 2) Parallel execution, "
"3) Reducing wait times, 4) Optimizing selectors."
),
expected_impact=f"Reduce overall workflow time by {(top_bottleneck['avg_duration_ms'] / stats.avg_duration_ms * 100 * 0.5):.0f}%",
ease_of_implementation='easy',
priority_score=0.0,
supporting_data={'bottleneck': top_bottleneck},
created_at=datetime.now()
))
return insights
def _generate_degradation_insight(self, degradation: Dict) -> Insight:
"""Generate insight from performance degradation."""
return Insight(
insight_id=self._generate_id(degradation['workflow_id'], 'degradation'),
workflow_id=degradation['workflow_id'],
category='performance',
title='Performance Degradation Detected',
description=(
f"Performance has degraded by {degradation['percent_change']:.1f}% "
f"(from {degradation['baseline_avg_ms']:.0f}ms to {degradation['current_avg_ms']:.0f}ms)."
),
recommendation=(
"Investigate recent changes: 1) Code deployments, 2) Data volume increases, "
"3) Infrastructure changes, 4) External service degradation."
),
expected_impact="Restore baseline performance",
ease_of_implementation='medium',
priority_score=0.0,
supporting_data=degradation,
created_at=datetime.now()
)
def _calculate_impact_score(self, expected_impact: str) -> float:
"""Calculate impact score from expected impact description."""
impact_lower = expected_impact.lower()
# Look for percentage improvements
if '50%' in impact_lower or '60%' in impact_lower:
return 1.0
elif '30%' in impact_lower or '40%' in impact_lower:
return 0.8
elif '20%' in impact_lower:
return 0.6
elif '10%' in impact_lower:
return 0.4
else:
return 0.5 # Default
def _calculate_ease_score(self, ease: str) -> float:
"""Calculate ease score from ease of implementation."""
if ease == 'easy':
return 1.0
elif ease == 'medium':
return 0.6
elif ease == 'hard':
return 0.3
else:
return 0.5
def _generate_id(self, workflow_id: str, insight_type: str) -> str:
"""Generate unique insight ID."""
data = f"{workflow_id}:{insight_type}:{datetime.now().date().isoformat()}"
return hashlib.md5(data.encode()).hexdigest()[:16]
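
# Illustrative usage sketch; a minimal flow assuming a populated
# TimeSeriesStore and the AnomalyDetector wiring implied by __init__
# ('invoice_sync' is a hypothetical workflow id, not part of the API):
#
#     analyzer = PerformanceAnalyzer(store)
#     generator = InsightGenerator(analyzer, anomaly_detector)
#     insights = generator.generate_insights('invoice_sync', analysis_period_days=30)
#     for insight in insights[:3]:
#         print(f"[{insight.priority_score:.2f}] {insight.title}: {insight.recommendation}")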

View File

@@ -0,0 +1,359 @@
"""Performance analysis for workflows."""
import logging
import statistics
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
from ..storage.timeseries_store import TimeSeriesStore
logger = logging.getLogger(__name__)
@dataclass
class PerformanceStats:
"""Performance statistics for a workflow."""
workflow_id: str
time_period: str
execution_count: int
avg_duration_ms: float
median_duration_ms: float
p95_duration_ms: float
p99_duration_ms: float
min_duration_ms: float
max_duration_ms: float
std_dev_ms: float
slowest_steps: List[Dict]
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary."""
return {
'workflow_id': self.workflow_id,
'time_period': self.time_period,
'execution_count': self.execution_count,
'avg_duration_ms': self.avg_duration_ms,
'median_duration_ms': self.median_duration_ms,
'p95_duration_ms': self.p95_duration_ms,
'p99_duration_ms': self.p99_duration_ms,
'min_duration_ms': self.min_duration_ms,
'max_duration_ms': self.max_duration_ms,
'std_dev_ms': self.std_dev_ms,
'slowest_steps': self.slowest_steps
}
class PerformanceAnalyzer:
"""Analyzes workflow performance metrics."""
def __init__(self, time_series_store: TimeSeriesStore):
"""
Initialize performance analyzer.
Args:
time_series_store: Time series storage for metrics
"""
self.store = time_series_store
logger.info("PerformanceAnalyzer initialized")
def analyze_workflow(
self,
workflow_id: str,
start_time: datetime,
end_time: datetime
) -> Optional[PerformanceStats]:
"""
Analyze performance for a workflow.
Args:
workflow_id: Workflow identifier
start_time: Start of analysis period
end_time: End of analysis period
Returns:
PerformanceStats or None if no data
"""
# Query execution metrics
metrics = self.store.query_range(
start_time=start_time,
end_time=end_time,
workflow_id=workflow_id,
metric_types=['execution']
)
executions = metrics.get('execution', [])
if not executions:
logger.warning(f"No execution data for workflow {workflow_id}")
return None
        # Filter completed executions that have a duration. The execution
        # integration records final status as 'success'; accept the legacy
        # 'completed' value as well so neither vocabulary is dropped.
        completed = [
            e for e in executions
            if e.get('status') in ('completed', 'success') and e.get('duration_ms') is not None
        ]
if not completed:
logger.warning(f"No completed executions for workflow {workflow_id}")
return None
# Extract durations
durations = [e['duration_ms'] for e in completed]
# Calculate statistics
avg_duration = statistics.mean(durations)
median_duration = statistics.median(durations)
min_duration = min(durations)
max_duration = max(durations)
std_dev = statistics.stdev(durations) if len(durations) > 1 else 0.0
# Calculate percentiles
sorted_durations = sorted(durations)
p95_duration = self._percentile(sorted_durations, 0.95)
p99_duration = self._percentile(sorted_durations, 0.99)
# Identify slowest steps
slowest_steps = self.identify_bottlenecks(
workflow_id,
start_time,
end_time,
threshold_percentile=0.95
)
time_period = f"{start_time.isoformat()} to {end_time.isoformat()}"
return PerformanceStats(
workflow_id=workflow_id,
time_period=time_period,
execution_count=len(completed),
avg_duration_ms=avg_duration,
median_duration_ms=median_duration,
p95_duration_ms=p95_duration,
p99_duration_ms=p99_duration,
min_duration_ms=min_duration,
max_duration_ms=max_duration,
std_dev_ms=std_dev,
slowest_steps=slowest_steps[:5] # Top 5 slowest
)
def identify_bottlenecks(
self,
workflow_id: str,
start_time: datetime,
end_time: datetime,
threshold_percentile: float = 0.95
) -> List[Dict]:
"""
Identify bottleneck steps in a workflow.
Args:
workflow_id: Workflow identifier
start_time: Start of analysis period
end_time: End of analysis period
threshold_percentile: Percentile threshold for bottlenecks
Returns:
List of bottleneck steps sorted by duration
"""
# Query step metrics
metrics = self.store.query_range(
start_time=start_time,
end_time=end_time,
workflow_id=workflow_id,
metric_types=['step']
)
steps = metrics.get('step', [])
if not steps:
return []
# Group by node_id and action_type
step_groups: Dict[tuple, List[float]] = {}
for step in steps:
key = (step['node_id'], step['action_type'])
if key not in step_groups:
step_groups[key] = []
step_groups[key].append(step['duration_ms'])
# Calculate statistics for each group
bottlenecks = []
for (node_id, action_type), durations in step_groups.items():
if not durations:
continue
avg_duration = statistics.mean(durations)
p95_duration = self._percentile(sorted(durations), threshold_percentile)
bottlenecks.append({
'node_id': node_id,
'action_type': action_type,
'avg_duration_ms': avg_duration,
'p95_duration_ms': p95_duration,
'execution_count': len(durations),
'max_duration_ms': max(durations)
})
# Sort by p95 duration (descending)
bottlenecks.sort(key=lambda x: x['p95_duration_ms'], reverse=True)
return bottlenecks
def detect_performance_degradation(
self,
workflow_id: str,
baseline_period: timedelta,
current_period: timedelta,
threshold_percent: float = 20.0
) -> Optional[Dict]:
"""
Detect performance degradation compared to baseline.
Args:
workflow_id: Workflow identifier
baseline_period: Duration of baseline period (e.g., last 7 days)
current_period: Duration of current period (e.g., last 24 hours)
threshold_percent: Threshold for degradation alert (%)
Returns:
Degradation info dict or None if no degradation
"""
now = datetime.now()
# Baseline period (older)
baseline_end = now - current_period
baseline_start = baseline_end - baseline_period
# Current period (recent)
current_start = now - current_period
current_end = now
# Analyze both periods
baseline_stats = self.analyze_workflow(
workflow_id,
baseline_start,
baseline_end
)
current_stats = self.analyze_workflow(
workflow_id,
current_start,
current_end
)
if not baseline_stats or not current_stats:
logger.warning(f"Insufficient data for degradation detection: {workflow_id}")
return None
# Calculate percentage change
baseline_avg = baseline_stats.avg_duration_ms
current_avg = current_stats.avg_duration_ms
if baseline_avg == 0:
return None
percent_change = ((current_avg - baseline_avg) / baseline_avg) * 100
# Check if degradation exceeds threshold
if percent_change > threshold_percent:
return {
'workflow_id': workflow_id,
'degradation_detected': True,
'baseline_avg_ms': baseline_avg,
'current_avg_ms': current_avg,
'percent_change': percent_change,
'threshold_percent': threshold_percent,
'baseline_period': str(baseline_period),
'current_period': str(current_period),
'severity': 'high' if percent_change > threshold_percent * 2 else 'medium'
}
return None
def compare_workflows(
self,
workflow_ids: List[str],
start_time: datetime,
end_time: datetime
) -> Dict[str, PerformanceStats]:
"""
Compare performance across multiple workflows.
Args:
workflow_ids: List of workflow identifiers
start_time: Start of analysis period
end_time: End of analysis period
Returns:
Dictionary mapping workflow_id to PerformanceStats
"""
results = {}
for workflow_id in workflow_ids:
stats = self.analyze_workflow(workflow_id, start_time, end_time)
if stats:
results[workflow_id] = stats
return results
def get_performance_trend(
self,
workflow_id: str,
start_time: datetime,
end_time: datetime,
bucket_size: timedelta = timedelta(hours=1)
) -> List[Dict]:
"""
Get performance trend over time with bucketing.
Args:
workflow_id: Workflow identifier
start_time: Start of analysis period
end_time: End of analysis period
bucket_size: Size of time buckets
Returns:
List of performance data points over time
"""
trend = []
current = start_time
while current < end_time:
bucket_end = min(current + bucket_size, end_time)
stats = self.analyze_workflow(workflow_id, current, bucket_end)
if stats:
trend.append({
'timestamp': current.isoformat(),
'avg_duration_ms': stats.avg_duration_ms,
'median_duration_ms': stats.median_duration_ms,
'execution_count': stats.execution_count
})
current = bucket_end
return trend
@staticmethod
def _percentile(sorted_data: List[float], percentile: float) -> float:
"""
Calculate percentile from sorted data.
Args:
sorted_data: Sorted list of values
percentile: Percentile to calculate (0.0 to 1.0)
Returns:
Percentile value
"""
if not sorted_data:
return 0.0
if len(sorted_data) == 1:
return sorted_data[0]
# Linear interpolation
index = percentile * (len(sorted_data) - 1)
lower = int(index)
upper = min(lower + 1, len(sorted_data) - 1)
weight = index - lower
return sorted_data[lower] * (1 - weight) + sorted_data[upper] * weight
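
if __name__ == "__main__":
    # Standalone sanity check of the percentile interpolation above.
    # index = p * (n - 1); p95 over 5 samples -> index 3.8, which blends
    # the 4th and 5th values: 400 * 0.2 + 1000 * 0.8 = 880.
    data = [100.0, 200.0, 300.0, 400.0, 1000.0]
    assert PerformanceAnalyzer._percentile(data, 0.50) == 300.0
    assert PerformanceAnalyzer._percentile(data, 0.95) == 880.0
    print("percentile interpolation OK")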

View File

@@ -0,0 +1,334 @@
"""Success rate analytics for workflows."""
import logging
import statistics
from typing import Dict, List, Optional, Tuple
from datetime import datetime, timedelta
from dataclasses import dataclass
from collections import defaultdict
from ..storage.timeseries_store import TimeSeriesStore
logger = logging.getLogger(__name__)
@dataclass
class SuccessRateStats:
"""Success rate statistics."""
workflow_id: str
total_executions: int
successful_executions: int
failed_executions: int
success_rate: float
failure_categories: Dict[str, int]
reliability_score: float
time_window_start: datetime
time_window_end: datetime
def to_dict(self) -> Dict:
"""Convert to dictionary."""
return {
'workflow_id': self.workflow_id,
'total_executions': self.total_executions,
'successful_executions': self.successful_executions,
'failed_executions': self.failed_executions,
'success_rate': self.success_rate,
'failure_categories': self.failure_categories,
'reliability_score': self.reliability_score,
'time_window_start': self.time_window_start.isoformat(),
'time_window_end': self.time_window_end.isoformat()
}
@dataclass
class ReliabilityRanking:
"""Workflow reliability ranking."""
workflow_id: str
reliability_score: float
success_rate: float
stability_score: float
total_executions: int
rank: int
def to_dict(self) -> Dict:
"""Convert to dictionary."""
return {
'workflow_id': self.workflow_id,
'reliability_score': self.reliability_score,
'success_rate': self.success_rate,
'stability_score': self.stability_score,
'total_executions': self.total_executions,
'rank': self.rank
}
class SuccessRateCalculator:
"""Calculate success rates and reliability metrics."""
def __init__(self, store: TimeSeriesStore):
"""
Initialize success rate calculator.
Args:
store: Time-series storage instance
"""
self.store = store
logger.info("SuccessRateCalculator initialized")
def calculate_success_rate(
self,
workflow_id: str,
time_window_hours: int = 24
) -> SuccessRateStats:
"""
Calculate success rate for a workflow.
Args:
workflow_id: Workflow identifier
time_window_hours: Time window in hours
Returns:
Success rate statistics
"""
end_time = datetime.now()
start_time = end_time - timedelta(hours=time_window_hours)
# Query execution metrics
        # query_range returns a dict keyed by metric type (same calling
        # convention as PerformanceAnalyzer / QueryEngine).
        metrics = self.store.query_range(
            start_time=start_time,
            end_time=end_time,
            workflow_id=workflow_id,
            metric_types=['execution']
        ).get('execution', [])
        total = len(metrics)
        successful = sum(1 for m in metrics if m.get('status') in ('success', 'completed'))
        failed = total - successful
        success_rate = (successful / total * 100) if total > 0 else 0.0
        # Categorize failures
        failure_categories = self._categorize_failures(
            [m for m in metrics if m.get('status') not in ('success', 'completed')]
        )
# Calculate reliability score
reliability_score = self._calculate_reliability_score(
success_rate=success_rate,
total_executions=total,
failure_categories=failure_categories
)
return SuccessRateStats(
workflow_id=workflow_id,
total_executions=total,
successful_executions=successful,
failed_executions=failed,
success_rate=success_rate,
failure_categories=failure_categories,
reliability_score=reliability_score,
time_window_start=start_time,
time_window_end=end_time
)
def categorize_failures(
self,
workflow_id: str,
time_window_hours: int = 24
) -> Dict[str, int]:
"""
Categorize failures by type.
Args:
workflow_id: Workflow identifier
time_window_hours: Time window in hours
Returns:
Dictionary of failure categories and counts
"""
end_time = datetime.now()
start_time = end_time - timedelta(hours=time_window_hours)
# Query failed executions
        metrics = self.store.query_range(
            start_time=start_time,
            end_time=end_time,
            workflow_id=workflow_id,
            metric_types=['execution']
        ).get('execution', [])
        failed_metrics = [m for m in metrics if m.get('status') not in ('success', 'completed')]
return self._categorize_failures(failed_metrics)
def _categorize_failures(self, failed_metrics: List[Dict]) -> Dict[str, int]:
"""
Categorize failures by error type.
Args:
failed_metrics: List of failed execution metrics
Returns:
Dictionary of categories and counts
"""
categories = defaultdict(int)
for metric in failed_metrics:
error_msg = metric.get('error_message', '').lower()
# Categorize by error type
if 'timeout' in error_msg:
categories['timeout'] += 1
elif 'not found' in error_msg or 'element' in error_msg:
categories['element_not_found'] += 1
elif 'permission' in error_msg or 'access' in error_msg:
categories['permission_error'] += 1
elif 'network' in error_msg or 'connection' in error_msg:
categories['network_error'] += 1
elif 'validation' in error_msg:
categories['validation_error'] += 1
else:
categories['other'] += 1
return dict(categories)
def rank_workflows_by_reliability(
self,
workflow_ids: Optional[List[str]] = None,
time_window_hours: int = 168 # 1 week
) -> List[ReliabilityRanking]:
"""
Rank workflows by reliability score.
Args:
workflow_ids: List of workflow IDs (None = all)
time_window_hours: Time window in hours
Returns:
List of reliability rankings sorted by score
"""
end_time = datetime.now()
start_time = end_time - timedelta(hours=time_window_hours)
# Get all workflows if not specified
if workflow_ids is None:
            metrics = self.store.query_range(
                start_time=start_time,
                end_time=end_time,
                metric_types=['execution']
            ).get('execution', [])
            workflow_ids = list({m.get('workflow_id') for m in metrics if m.get('workflow_id')})
# Calculate reliability for each workflow
rankings = []
for workflow_id in workflow_ids:
stats = self.calculate_success_rate(workflow_id, time_window_hours)
# Calculate stability score (consistency over time)
stability_score = self._calculate_stability_score(
workflow_id, start_time, end_time
)
rankings.append(ReliabilityRanking(
workflow_id=workflow_id,
reliability_score=stats.reliability_score,
success_rate=stats.success_rate,
stability_score=stability_score,
total_executions=stats.total_executions,
rank=0 # Will be set after sorting
))
# Sort by reliability score (descending)
rankings.sort(key=lambda r: r.reliability_score, reverse=True)
# Assign ranks
for i, ranking in enumerate(rankings, 1):
ranking.rank = i
return rankings
def _calculate_reliability_score(
self,
success_rate: float,
total_executions: int,
failure_categories: Dict[str, int]
) -> float:
"""
Calculate overall reliability score.
Args:
success_rate: Success rate percentage
total_executions: Total number of executions
failure_categories: Failure categories
Returns:
Reliability score (0-100)
"""
# Base score from success rate (70% weight)
base_score = success_rate * 0.7
# Execution volume bonus (up to 15% for 100+ executions)
volume_bonus = min(total_executions / 100 * 15, 15)
# Failure diversity penalty (up to -15% for many failure types)
num_failure_types = len(failure_categories)
diversity_penalty = min(num_failure_types * 3, 15)
# Calculate final score
reliability_score = base_score + volume_bonus - diversity_penalty
# Clamp to 0-100
return max(0.0, min(100.0, reliability_score))
def _calculate_stability_score(
self,
workflow_id: str,
start_time: datetime,
end_time: datetime
) -> float:
"""
Calculate stability score (consistency over time).
Args:
workflow_id: Workflow identifier
start_time: Start of time window
end_time: End of time window
Returns:
Stability score (0-100)
"""
# Split time window into buckets
num_buckets = 7 # Weekly buckets
bucket_duration = (end_time - start_time) / num_buckets
bucket_success_rates = []
for i in range(num_buckets):
bucket_start = start_time + (bucket_duration * i)
bucket_end = bucket_start + bucket_duration
            metrics = self.store.query_range(
                start_time=bucket_start,
                end_time=bucket_end,
                workflow_id=workflow_id,
                metric_types=['execution']
            ).get('execution', [])
            if metrics:
                successful = sum(1 for m in metrics if m.get('status') in ('success', 'completed'))
                success_rate = (successful / len(metrics)) * 100
                bucket_success_rates.append(success_rate)
if not bucket_success_rates:
return 0.0
        # Calculate coefficient of variation (lower = more stable)
        mean = statistics.mean(bucket_success_rates)
if mean == 0:
return 0.0
stdev = statistics.stdev(bucket_success_rates) if len(bucket_success_rates) > 1 else 0
cv = (stdev / mean) * 100
# Convert to stability score (lower CV = higher stability)
# CV of 0 = 100 stability, CV of 50+ = 0 stability
stability_score = max(0.0, 100.0 - (cv * 2))
return stability_score
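
# Illustrative usage sketch, assuming a TimeSeriesStore populated with
# execution metrics ('order_entry' is a hypothetical workflow id):
#
#     calc = SuccessRateCalculator(store)
#     stats = calc.calculate_success_rate('order_entry', time_window_hours=24)
#     print(f"{stats.success_rate:.1f}% success, reliability {stats.reliability_score:.0f}/100")
#     for ranking in calc.rank_workflows_by_reliability(time_window_hours=168):
#         print(ranking.rank, ranking.workflow_id, round(ranking.reliability_score))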

View File

@@ -0,0 +1,11 @@
"""Analytics integration module."""
from .execution_integration import (
AnalyticsExecutionIntegration,
get_analytics_integration
)
__all__ = [
'AnalyticsExecutionIntegration',
'get_analytics_integration'
]

View File

@@ -0,0 +1,370 @@
"""Integration of analytics with ExecutionLoop."""
import logging
from typing import Optional
from datetime import datetime
import uuid
from ..analytics_system import get_analytics_system
from ..collection.metrics_collector import ExecutionMetrics, StepMetrics
logger = logging.getLogger(__name__)
class AnalyticsExecutionIntegration:
"""Integrate analytics collection with workflow execution."""
def __init__(self, enabled: bool = True):
"""
Initialize analytics integration.
Args:
enabled: Whether analytics collection is enabled
"""
self.enabled = enabled
self.analytics = None
if enabled:
try:
self.analytics = get_analytics_system()
logger.info("Analytics integration enabled")
except Exception as e:
logger.error(f"Failed to initialize analytics: {e}")
self.enabled = False
def on_execution_start(
self,
workflow_id: str,
execution_id: Optional[str] = None,
total_steps: int = 0
) -> str:
"""
Called when workflow execution starts.
Args:
workflow_id: Workflow identifier
execution_id: Execution identifier (generated if None)
total_steps: Total number of steps
Returns:
Execution ID
"""
if not self.enabled or not self.analytics:
return execution_id or str(uuid.uuid4())
if execution_id is None:
execution_id = str(uuid.uuid4())
try:
# Start real-time tracking
self.analytics.realtime_analytics.track_execution(
execution_id=execution_id,
workflow_id=workflow_id,
total_steps=total_steps
)
logger.debug(f"Started tracking execution: {execution_id}")
except Exception as e:
logger.error(f"Error starting execution tracking: {e}")
return execution_id
def on_step_start(
self,
execution_id: str,
node_id: str,
step_number: int
) -> None:
"""
Called when a step starts.
Args:
execution_id: Execution identifier
node_id: Node identifier
step_number: Step number
"""
if not self.enabled or not self.analytics:
return
try:
# Update progress
self.analytics.realtime_analytics.update_progress(
execution_id=execution_id,
current_step=step_number,
current_node_id=node_id
)
except Exception as e:
logger.error(f"Error updating step progress: {e}")
def on_step_complete(
self,
execution_id: str,
workflow_id: str,
node_id: str,
action_type: str,
started_at: datetime,
completed_at: datetime,
duration: float,
success: bool,
error_message: Optional[str] = None
) -> None:
"""
Called when a step completes.
Args:
execution_id: Execution identifier
workflow_id: Workflow identifier
node_id: Node identifier
action_type: Type of action
started_at: Start timestamp
completed_at: Completion timestamp
duration: Duration in seconds
success: Whether step succeeded
error_message: Error message if failed
"""
if not self.enabled or not self.analytics:
return
try:
# Record step metrics
step_metrics = StepMetrics(
execution_id=execution_id,
workflow_id=workflow_id,
node_id=node_id,
action_type=action_type,
started_at=started_at,
completed_at=completed_at,
duration=duration,
success=success,
error_message=error_message
)
self.analytics.metrics_collector.record_step(step_metrics)
# Update real-time tracking
self.analytics.realtime_analytics.record_step_complete(
execution_id=execution_id,
success=success
)
logger.debug(f"Recorded step: {node_id} ({'success' if success else 'failed'})")
except Exception as e:
logger.error(f"Error recording step completion: {e}")
def on_execution_complete(
self,
execution_id: str,
workflow_id: str,
started_at: datetime,
completed_at: datetime,
duration: float,
status: str,
error_message: Optional[str] = None,
steps_completed: int = 0,
steps_failed: int = 0
) -> None:
"""
Called when workflow execution completes.
Args:
execution_id: Execution identifier
workflow_id: Workflow identifier
started_at: Start timestamp
completed_at: Completion timestamp
duration: Duration in seconds
status: Final status (success, failed, timeout)
error_message: Error message if failed
steps_completed: Number of steps completed
steps_failed: Number of steps failed
"""
if not self.enabled or not self.analytics:
return
try:
# Record execution metrics
execution_metrics = ExecutionMetrics(
execution_id=execution_id,
workflow_id=workflow_id,
started_at=started_at,
completed_at=completed_at,
duration=duration,
status=status,
error_message=error_message,
steps_completed=steps_completed,
steps_failed=steps_failed
)
self.analytics.metrics_collector.record_execution(execution_metrics)
# Flush to ensure persistence
self.analytics.metrics_collector.flush()
# Complete real-time tracking
self.analytics.realtime_analytics.complete_execution(
execution_id=execution_id,
status=status
)
logger.info(f"Recorded execution: {execution_id} ({status})")
except Exception as e:
logger.error(f"Error recording execution completion: {e}")
def on_recovery_attempt(
self,
execution_id: str,
workflow_id: str,
node_id: str,
strategy: str,
success: bool,
duration: float
) -> None:
"""
Called when self-healing attempts recovery.
Args:
execution_id: Execution identifier
workflow_id: Workflow identifier
node_id: Node identifier
strategy: Recovery strategy used
success: Whether recovery succeeded
duration: Recovery duration
"""
if not self.enabled or not self.analytics:
return
try:
# Record as a special step metric
recovery_metrics = StepMetrics(
execution_id=execution_id,
workflow_id=workflow_id,
node_id=f"{node_id}_recovery",
action_type=f"recovery_{strategy}",
started_at=datetime.now(),
completed_at=datetime.now(),
duration=duration,
success=success,
error_message=None if success else f"Recovery failed: {strategy}"
)
self.analytics.metrics_collector.record_step(recovery_metrics)
logger.debug(f"Recorded recovery: {strategy} ({'success' if success else 'failed'})")
except Exception as e:
logger.error(f"Error recording recovery attempt: {e}")
def get_live_metrics(self, execution_id: str) -> Optional[dict]:
"""
Get live metrics for an execution.
Args:
execution_id: Execution identifier
Returns:
Live metrics dictionary or None
"""
if not self.enabled or not self.analytics:
return None
try:
return self.analytics.realtime_analytics.get_live_metrics(execution_id)
except Exception as e:
logger.error(f"Error getting live metrics: {e}")
return None
def get_workflow_stats(self, workflow_id: str, hours: int = 24) -> Optional[dict]:
"""
Get statistics for a workflow.
Args:
workflow_id: Workflow identifier
hours: Time window in hours
Returns:
Statistics dictionary or None
"""
if not self.enabled or not self.analytics:
return None
try:
from datetime import timedelta
end_time = datetime.now()
start_time = end_time - timedelta(hours=hours)
# Get performance stats
            perf_stats = self.analytics.performance_analyzer.analyze_workflow(
workflow_id=workflow_id,
start_time=start_time,
end_time=end_time
)
# Get success rate
success_stats = self.analytics.success_rate_calculator.calculate_success_rate(
workflow_id=workflow_id,
time_window_hours=hours
)
            return {
                'performance': perf_stats.to_dict() if perf_stats else None,
                'success_rate': success_stats.to_dict()
            }
except Exception as e:
logger.error(f"Error getting workflow stats: {e}")
return None
def start_resource_monitoring(self, execution_id: str) -> None:
"""
Start monitoring resources for an execution.
Args:
execution_id: Execution identifier
"""
if not self.enabled or not self.analytics:
return
try:
            # Tag resource metrics with execution ID (attribute name follows
            # the AnalyticsSystem naming used elsewhere, e.g. metrics_collector)
            self.analytics.resource_collector.start_monitoring(
context={'execution_id': execution_id}
)
logger.debug(f"Started resource monitoring for: {execution_id}")
except Exception as e:
logger.warning(f"Error starting resource monitoring: {e}")
def stop_resource_monitoring(self, execution_id: str) -> None:
"""
Stop monitoring resources for an execution.
Args:
execution_id: Execution identifier
"""
if not self.enabled or not self.analytics:
return
try:
            self.analytics.resource_collector.stop_monitoring()
logger.debug(f"Stopped resource monitoring for: {execution_id}")
except Exception as e:
logger.warning(f"Error stopping resource monitoring: {e}")
# Global instance
_analytics_integration: Optional[AnalyticsExecutionIntegration] = None
def get_analytics_integration(enabled: bool = True) -> AnalyticsExecutionIntegration:
"""
Get or create global analytics integration instance.
Args:
enabled: Whether analytics is enabled
Returns:
AnalyticsExecutionIntegration instance
"""
global _analytics_integration
if _analytics_integration is None:
_analytics_integration = AnalyticsExecutionIntegration(enabled=enabled)
return _analytics_integration
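
# Illustrative wiring inside an execution loop (the surrounding loop code is
# hypothetical; only the hook order matches the methods defined above):
#
#     integration = get_analytics_integration()
#     exec_id = integration.on_execution_start('order_entry', total_steps=len(steps))
#     for i, step in enumerate(steps, 1):
#         integration.on_step_start(exec_id, step.node_id, i)
#         ...  # run the step, then call on_step_complete(...) with its timings
#     integration.on_execution_complete(
#         exec_id, 'order_entry', started_at, datetime.now(),
#         duration, status='success', steps_completed=len(steps)
#     )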

View File

@@ -0,0 +1,5 @@
"""Query engine for analytics data."""
from .query_engine import QueryEngine
__all__ = ['QueryEngine']

View File

@@ -0,0 +1,312 @@
"""Query engine for analytics data with caching."""
import logging
import hashlib
import json
import statistics
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
from collections import OrderedDict
from ..storage.timeseries_store import TimeSeriesStore
from ..storage.archive_storage import ArchiveStorage
logger = logging.getLogger(__name__)
class LRUCache:
"""Simple LRU cache implementation."""
def __init__(self, capacity: int = 100):
"""Initialize LRU cache."""
self.capacity = capacity
self.cache: OrderedDict = OrderedDict()
def get(self, key: str) -> Optional[Any]:
"""Get value from cache."""
if key not in self.cache:
return None
# Move to end (most recently used)
self.cache.move_to_end(key)
return self.cache[key]
def put(self, key: str, value: Any) -> None:
"""Put value in cache."""
if key in self.cache:
self.cache.move_to_end(key)
self.cache[key] = value
# Remove oldest if over capacity
if len(self.cache) > self.capacity:
self.cache.popitem(last=False)
def clear(self) -> None:
"""Clear cache."""
self.cache.clear()
def size(self) -> int:
"""Get cache size."""
return len(self.cache)
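
# Worked example of the eviction order (illustrative):
#
#     cache = LRUCache(capacity=2)
#     cache.put('a', 1); cache.put('b', 2)
#     cache.get('a')       # 'a' becomes most recently used
#     cache.put('c', 3)    # evicts 'b', the least recently used entry
#     cache.get('b')       # -> None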
class QueryEngine:
"""Query engine for analytics data with caching."""
def __init__(
self,
time_series_store: TimeSeriesStore,
archive_storage: Optional[ArchiveStorage] = None,
cache_size: int = 100
):
"""
Initialize query engine.
Args:
time_series_store: Time series storage
archive_storage: Optional archive storage
cache_size: Size of query cache
"""
self.ts_store = time_series_store
self.archive = archive_storage
self.cache = LRUCache(cache_size)
logger.info(f"QueryEngine initialized (cache_size={cache_size})")
def query(
self,
query: Dict[str, Any],
use_cache: bool = True
    ) -> Any:
"""
Execute a query against analytics data.
Args:
query: Query specification with filters, time range, etc.
use_cache: Whether to use cache
Returns:
            Dict mapping metric type to matching records, or a flat list when query['flatten'] is set
"""
# Generate cache key
cache_key = self._generate_cache_key(query)
# Check cache
if use_cache:
cached = self.cache.get(cache_key)
if cached is not None:
logger.debug(f"Cache hit for query: {cache_key[:8]}")
return cached
# Execute query
start_time = query.get('start_time')
end_time = query.get('end_time')
workflow_id = query.get('workflow_id')
metric_types = query.get('metric_types', ['execution', 'step', 'resource'])
if not start_time or not end_time:
raise ValueError("start_time and end_time are required")
# Convert to datetime if strings
if isinstance(start_time, str):
start_time = datetime.fromisoformat(start_time)
if isinstance(end_time, str):
end_time = datetime.fromisoformat(end_time)
# Query time series store
results = self.ts_store.query_range(
start_time=start_time,
end_time=end_time,
workflow_id=workflow_id,
metric_types=metric_types
)
# Apply additional filters
filters = query.get('filters', {})
if filters:
for metric_type, records in results.items():
results[metric_type] = self._apply_filters(records, filters)
# Flatten if requested
if query.get('flatten', False):
flattened = []
for records in results.values():
flattened.extend(records)
results = flattened
# Cache result
if use_cache:
self.cache.put(cache_key, results)
return results
def aggregate(
self,
metric: str,
aggregation: str,
group_by: List[str],
filters: Dict[str, Any],
time_range: Tuple[datetime, datetime],
use_cache: bool = True
) -> List[Dict]:
"""
Aggregate metrics with grouping.
Args:
metric: Metric field to aggregate
aggregation: Aggregation function (avg, sum, count, min, max)
group_by: Fields to group by
filters: Filter criteria
time_range: (start_time, end_time)
use_cache: Whether to use cache
Returns:
List of aggregated results
"""
# Generate cache key
cache_key = self._generate_cache_key({
'type': 'aggregate',
'metric': metric,
'aggregation': aggregation,
'group_by': group_by,
'filters': filters,
'time_range': [t.isoformat() for t in time_range]
})
# Check cache
if use_cache:
cached = self.cache.get(cache_key)
if cached is not None:
return cached
# Execute aggregation
start_time, end_time = time_range
results = self.ts_store.aggregate(
metric=metric,
aggregation=aggregation,
group_by=group_by,
start_time=start_time,
end_time=end_time,
filters=filters
)
# Cache result
if use_cache:
self.cache.put(cache_key, results)
return results
def compare(
self,
workflow_ids: List[str],
metrics: List[str],
time_range: Tuple[datetime, datetime]
) -> Dict[str, Dict]:
"""
Compare metrics across workflows.
Args:
workflow_ids: List of workflow IDs to compare
metrics: List of metrics to compare
time_range: (start_time, end_time)
Returns:
Dictionary mapping workflow_id to metrics
"""
results = {}
start_time, end_time = time_range
for workflow_id in workflow_ids:
workflow_metrics = {}
# Query metrics for this workflow
data = self.ts_store.query_range(
start_time=start_time,
end_time=end_time,
workflow_id=workflow_id
)
# Calculate requested metrics
executions = data.get('execution', [])
if executions:
for metric in metrics:
values = [e.get(metric) for e in executions if e.get(metric) is not None]
if values:
workflow_metrics[metric] = {
'avg': statistics.mean(values),
'min': min(values),
'max': max(values),
'count': len(values)
}
results[workflow_id] = workflow_metrics
# Calculate differences
if len(workflow_ids) == 2:
results['comparison'] = self._calculate_differences(
results[workflow_ids[0]],
results[workflow_ids[1]]
)
return results
def invalidate_cache(self, pattern: Optional[str] = None) -> int:
"""
Invalidate cache entries.
Args:
pattern: Optional pattern to match (None = clear all)
Returns:
Number of entries invalidated
"""
if pattern is None:
size = self.cache.size()
self.cache.clear()
logger.info(f"Cleared entire cache ({size} entries)")
return size
# Pattern-based invalidation not implemented yet
# For now, just clear all
return self.invalidate_cache(None)
def _apply_filters(self, records: List[Dict], filters: Dict[str, Any]) -> List[Dict]:
"""Apply filters to records."""
filtered = []
for record in records:
match = True
for key, value in filters.items():
if record.get(key) != value:
match = False
break
if match:
filtered.append(record)
return filtered
def _calculate_differences(
self,
metrics1: Dict[str, Dict],
metrics2: Dict[str, Dict]
) -> Dict[str, Dict]:
"""Calculate differences between two metric sets."""
differences = {}
for metric in metrics1.keys():
if metric in metrics2:
m1 = metrics1[metric]
m2 = metrics2[metric]
differences[metric] = {
'diff_avg': m2['avg'] - m1['avg'],
'diff_percent': ((m2['avg'] - m1['avg']) / m1['avg'] * 100) if m1['avg'] != 0 else 0,
'workflow1_avg': m1['avg'],
'workflow2_avg': m2['avg']
}
return differences
def _generate_cache_key(self, query: Dict[str, Any]) -> str:
"""Generate cache key from query."""
# Sort keys for consistent hashing
query_str = json.dumps(query, sort_keys=True, default=str)
return hashlib.md5(query_str.encode()).hexdigest()
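
# Illustrative query specs; the keys are exactly the ones read by query()
# above (store construction and the workflow id are assumptions):
#
#     engine = QueryEngine(ts_store)
#     executions = engine.query({
#         'start_time': '2026-01-01T00:00:00',
#         'end_time': '2026-01-08T00:00:00',
#         'workflow_id': 'order_entry',
#         'metric_types': ['execution'],
#         'flatten': True,
#     })
#     # An identical spec served again hits the LRU cache instead of the store.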

View File

@@ -0,0 +1,5 @@
"""Real-time analytics components."""
from .realtime_analytics import RealtimeAnalytics
__all__ = ['RealtimeAnalytics']

View File

@@ -0,0 +1,283 @@
"""Real-time analytics for active workflows."""
import logging
import threading
from typing import Dict, Any, Optional, List, Callable
from datetime import datetime
from dataclasses import dataclass, field
from ..collection.metrics_collector import MetricsCollector, ExecutionMetrics
logger = logging.getLogger(__name__)
@dataclass
class LiveExecution:
"""Live execution tracking."""
execution_id: str
workflow_id: str
started_at: datetime
current_step: int = 0
total_steps: int = 0
steps_completed: int = 0
steps_failed: int = 0
current_node_id: Optional[str] = None
last_update: datetime = field(default_factory=datetime.now)
@property
def progress_percent(self) -> float:
"""Calculate progress percentage."""
if self.total_steps == 0:
return 0.0
return (self.steps_completed / self.total_steps) * 100
@property
def estimated_completion(self) -> Optional[datetime]:
"""Estimate completion time."""
if self.steps_completed == 0 or self.total_steps == 0:
return None
elapsed = (datetime.now() - self.started_at).total_seconds()
avg_time_per_step = elapsed / self.steps_completed
remaining_steps = self.total_steps - self.steps_completed
estimated_remaining = avg_time_per_step * remaining_steps
from datetime import timedelta
return datetime.now() + timedelta(seconds=estimated_remaining)
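    # Worked example: 3 of 10 steps done 60s after start gives an average of
    # 20s per step, so the 7 remaining steps put the estimate at now + 140s.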
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary."""
return {
'execution_id': self.execution_id,
'workflow_id': self.workflow_id,
'started_at': self.started_at.isoformat(),
'current_step': self.current_step,
'total_steps': self.total_steps,
'steps_completed': self.steps_completed,
'steps_failed': self.steps_failed,
'current_node_id': self.current_node_id,
'progress_percent': self.progress_percent,
'estimated_completion': self.estimated_completion.isoformat() if self.estimated_completion else None,
'last_update': self.last_update.isoformat()
}
class RealtimeAnalytics:
"""Real-time analytics for active workflows."""
def __init__(self, metrics_collector: Optional[MetricsCollector] = None):
"""
Initialize real-time analytics.
Args:
metrics_collector: Metrics collector instance
"""
self.collector = metrics_collector
self.active_executions: Dict[str, LiveExecution] = {}
self.subscribers: Dict[str, List[Callable]] = {}
self._lock = threading.Lock()
logger.info("RealtimeAnalytics initialized")
def track_execution(
self,
execution_id: str,
workflow_id: str,
total_steps: int = 0
) -> None:
"""
Start tracking an execution in real-time.
Args:
execution_id: Execution identifier
workflow_id: Workflow identifier
total_steps: Total number of steps
"""
with self._lock:
self.active_executions[execution_id] = LiveExecution(
execution_id=execution_id,
workflow_id=workflow_id,
started_at=datetime.now(),
total_steps=total_steps
)
# Notify subscribers
self._notify_subscribers(execution_id, 'started')
logger.info(f"Tracking execution: {execution_id}")
def update_progress(
self,
execution_id: str,
current_step: int,
total_steps: Optional[int] = None,
current_node_id: Optional[str] = None
) -> None:
"""
Update execution progress.
Args:
execution_id: Execution identifier
current_step: Current step number
total_steps: Total steps (updates if provided)
current_node_id: Current node ID
"""
with self._lock:
if execution_id not in self.active_executions:
logger.warning(f"Execution not tracked: {execution_id}")
return
execution = self.active_executions[execution_id]
execution.current_step = current_step
if total_steps is not None:
execution.total_steps = total_steps
if current_node_id is not None:
execution.current_node_id = current_node_id
execution.last_update = datetime.now()
# Notify subscribers
self._notify_subscribers(execution_id, 'progress')
def record_step_complete(
self,
execution_id: str,
success: bool
) -> None:
"""
Record step completion.
Args:
execution_id: Execution identifier
success: Whether step succeeded
"""
with self._lock:
if execution_id not in self.active_executions:
return
execution = self.active_executions[execution_id]
if success:
execution.steps_completed += 1
else:
execution.steps_failed += 1
execution.last_update = datetime.now()
# Notify subscribers
self._notify_subscribers(execution_id, 'step_complete')
def complete_execution(
self,
execution_id: str,
status: str
) -> None:
"""
Mark execution as complete.
Args:
execution_id: Execution identifier
status: Final status
"""
with self._lock:
if execution_id in self.active_executions:
del self.active_executions[execution_id]
# Notify subscribers
self._notify_subscribers(execution_id, 'completed', {'status': status})
logger.info(f"Execution completed: {execution_id} ({status})")
def get_live_metrics(self, execution_id: str) -> Optional[Dict[str, Any]]:
"""
Get live metrics for an execution.
Args:
execution_id: Execution identifier
Returns:
Live metrics dictionary or None
"""
with self._lock:
execution = self.active_executions.get(execution_id)
if not execution:
return None
return execution.to_dict()
def get_all_active(self) -> List[Dict[str, Any]]:
"""
Get all active executions.
Returns:
List of active execution metrics
"""
with self._lock:
return [e.to_dict() for e in self.active_executions.values()]
def subscribe(
self,
execution_id: str,
callback: Callable[[str, Dict], None]
) -> None:
"""
Subscribe to real-time updates for an execution.
Args:
execution_id: Execution identifier
callback: Callback function (event_type, data)
"""
with self._lock:
if execution_id not in self.subscribers:
self.subscribers[execution_id] = []
self.subscribers[execution_id].append(callback)
logger.debug(f"Subscriber added for {execution_id}")
def unsubscribe(
self,
execution_id: str,
callback: Optional[Callable] = None
) -> None:
"""
Unsubscribe from updates.
Args:
execution_id: Execution identifier
callback: Specific callback to remove (None = remove all)
"""
with self._lock:
if execution_id not in self.subscribers:
return
if callback is None:
del self.subscribers[execution_id]
else:
self.subscribers[execution_id] = [
cb for cb in self.subscribers[execution_id] if cb != callback
]
def _notify_subscribers(
self,
execution_id: str,
event_type: str,
data: Optional[Dict] = None
) -> None:
"""Notify subscribers of an event."""
with self._lock:
callbacks = self.subscribers.get(execution_id, []).copy()
if not callbacks:
return
# Get current metrics
metrics = self.get_live_metrics(execution_id)
event_data = {
'event_type': event_type,
'execution_id': execution_id,
'metrics': metrics,
**(data or {})
}
# Call subscribers (outside lock)
for callback in callbacks:
try:
callback(event_type, event_data)
except Exception as e:
logger.error(f"Subscriber callback error: {e}")

View File

@@ -0,0 +1,13 @@
"""Analytics reporting module."""
from .report_generator import (
ReportGenerator,
ReportConfig,
ScheduledReport
)
__all__ = [
'ReportGenerator',
'ReportConfig',
'ScheduledReport'
]

View File

@@ -0,0 +1,443 @@
"""Report generation for analytics data."""
import logging
import json
import csv
from typing import Dict, List, Optional, Any
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class ReportConfig:
"""Report configuration."""
title: str
metric_types: List[str]
start_time: datetime
end_time: datetime
workflow_ids: Optional[List[str]] = None
include_charts: bool = True
include_insights: bool = True
format: str = 'json' # json, csv, html, pdf
def to_dict(self) -> Dict:
"""Convert to dictionary."""
return {
'title': self.title,
'metric_types': self.metric_types,
'start_time': self.start_time.isoformat(),
'end_time': self.end_time.isoformat(),
'workflow_ids': self.workflow_ids,
'include_charts': self.include_charts,
'include_insights': self.include_insights,
'format': self.format
}
@dataclass
class ScheduledReport:
"""Scheduled report configuration."""
report_id: str
config: ReportConfig
schedule_cron: str # Cron expression
delivery_method: str # email, webhook, file
delivery_config: Dict[str, Any]
enabled: bool = True
last_run: Optional[datetime] = None
next_run: Optional[datetime] = None
def to_dict(self) -> Dict:
"""Convert to dictionary."""
return {
'report_id': self.report_id,
'config': self.config.to_dict(),
'schedule_cron': self.schedule_cron,
'delivery_method': self.delivery_method,
'delivery_config': self.delivery_config,
'enabled': self.enabled,
'last_run': self.last_run.isoformat() if self.last_run else None,
'next_run': self.next_run.isoformat() if self.next_run else None
}
class ReportGenerator:
"""Generate analytics reports in various formats."""
def __init__(
self,
query_engine, # QueryEngine
performance_analyzer, # PerformanceAnalyzer
insight_generator, # InsightGenerator
output_dir: str = "data/analytics/reports"
):
"""
Initialize report generator.
Args:
query_engine: Query engine instance
performance_analyzer: Performance analyzer instance
insight_generator: Insight generator instance
output_dir: Output directory for reports
"""
self.query_engine = query_engine
self.performance_analyzer = performance_analyzer
self.insight_generator = insight_generator
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
self.scheduled_reports: Dict[str, ScheduledReport] = {}
logger.info("ReportGenerator initialized")
def generate_report(
self,
config: ReportConfig
) -> Dict[str, Any]:
"""
Generate a report based on configuration.
Args:
config: Report configuration
Returns:
Report data dictionary
"""
logger.info(f"Generating report: {config.title}")
# Collect data
report_data = {
'title': config.title,
'generated_at': datetime.now().isoformat(),
'time_range': {
'start': config.start_time.isoformat(),
'end': config.end_time.isoformat()
},
'metrics': {},
'performance': {},
'insights': []
}
        # Query metrics (QueryEngine.query expects a query-spec dict)
        for metric_type in config.metric_types:
            query_spec = {
                'start_time': config.start_time,
                'end_time': config.end_time,
                'metric_types': [metric_type],
                'flatten': True
            }
            if config.workflow_ids:
                query_spec['workflow_id'] = config.workflow_ids[0]  # Simplified: first workflow only
            report_data['metrics'][metric_type] = self.query_engine.query(query_spec)
        # Add performance analysis
        if config.workflow_ids:
            for workflow_id in config.workflow_ids:
                perf_stats = self.performance_analyzer.analyze_workflow(
                    workflow_id,
                    config.start_time,
                    config.end_time
                )
                if perf_stats:
                    report_data['performance'][workflow_id] = perf_stats.to_dict()
        # Add insights (InsightGenerator analyzes one workflow over a day-count window)
        if config.include_insights and config.workflow_ids:
            period_days = max(1, (config.end_time - config.start_time).days)
            for workflow_id in config.workflow_ids:
                insights = self.insight_generator.generate_insights(
                    workflow_id=workflow_id,
                    analysis_period_days=period_days
                )
                report_data['insights'].extend(i.to_dict() for i in insights)
return report_data
def export_json(
self,
report_data: Dict[str, Any],
filename: Optional[str] = None
) -> str:
"""
Export report as JSON.
Args:
report_data: Report data
filename: Output filename (auto-generated if None)
Returns:
Path to exported file
"""
if filename is None:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"report_{timestamp}.json"
filepath = self.output_dir / filename
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(report_data, f, indent=2)
logger.info(f"Exported JSON report: {filepath}")
return str(filepath)
def export_csv(
self,
report_data: Dict[str, Any],
filename: Optional[str] = None
) -> str:
"""
Export report as CSV.
Args:
report_data: Report data
filename: Output filename (auto-generated if None)
Returns:
Path to exported file
"""
if filename is None:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"report_{timestamp}.csv"
filepath = self.output_dir / filename
# Flatten metrics for CSV export
rows = []
for metric_type, metrics in report_data.get('metrics', {}).items():
for metric in metrics:
row = {
'metric_type': metric_type,
**metric
}
rows.append(row)
if rows:
# Get all unique keys
fieldnames = set()
for row in rows:
fieldnames.update(row.keys())
fieldnames = sorted(fieldnames)
with open(filepath, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
logger.info(f"Exported CSV report: {filepath}")
return str(filepath)
def export_html(
self,
report_data: Dict[str, Any],
filename: Optional[str] = None
) -> str:
"""
Export report as HTML.
Args:
report_data: Report data
filename: Output filename (auto-generated if None)
Returns:
Path to exported file
"""
if filename is None:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"report_{timestamp}.html"
filepath = self.output_dir / filename
# Generate HTML
html = self._generate_html(report_data)
with open(filepath, 'w', encoding='utf-8') as f:
f.write(html)
logger.info(f"Exported HTML report: {filepath}")
return str(filepath)
def _generate_html(self, report_data: Dict[str, Any]) -> str:
"""Generate HTML report."""
html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>{report_data['title']}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
h1 {{ color: #333; }}
h2 {{ color: #666; margin-top: 30px; }}
table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
th {{ background-color: #4CAF50; color: white; }}
.insight {{ background-color: #f9f9f9; padding: 15px; margin: 10px 0; border-left: 4px solid #4CAF50; }}
.metric-section {{ margin: 20px 0; }}
</style>
</head>
<body>
<h1>{report_data['title']}</h1>
<p><strong>Generated:</strong> {report_data['generated_at']}</p>
<p><strong>Time Range:</strong> {report_data['time_range']['start']} to {report_data['time_range']['end']}</p>
"""
# Add performance section
if report_data.get('performance'):
html += "<h2>Performance Analysis</h2>\n"
for workflow_id, perf in report_data['performance'].items():
html += f"<div class='metric-section'>\n"
html += f"<h3>Workflow: {workflow_id}</h3>\n"
html += f"<p>Average Duration: {perf.get('avg_duration', 0):.2f}s</p>\n"
html += f"<p>Success Rate: {perf.get('success_rate', 0):.1f}%</p>\n"
html += "</div>\n"
# Add insights section
if report_data.get('insights'):
html += "<h2>Insights</h2>\n"
for insight in report_data['insights']:
html += f"<div class='insight'>\n"
html += f"<strong>{insight.get('title', 'Insight')}</strong>\n"
html += f"<p>{insight.get('description', '')}</p>\n"
html += "</div>\n"
html += "</body>\n</html>"
return html
def export_pdf(
self,
report_data: Dict[str, Any],
filename: Optional[str] = None
) -> str:
"""
Export report as PDF.
Note: Requires reportlab library. Falls back to HTML if not available.
Args:
report_data: Report data
filename: Output filename (auto-generated if None)
Returns:
Path to exported file
"""
try:
from reportlab.lib.pagesizes import letter
            from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
if filename is None:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"report_{timestamp}.pdf"
filepath = self.output_dir / filename
# Create PDF
doc = SimpleDocTemplate(str(filepath), pagesize=letter)
styles = getSampleStyleSheet()
story = []
# Title
title = Paragraph(report_data['title'], styles['Title'])
story.append(title)
story.append(Spacer(1, 0.2*inch))
# Metadata
meta = Paragraph(f"Generated: {report_data['generated_at']}", styles['Normal'])
story.append(meta)
story.append(Spacer(1, 0.3*inch))
# Performance section
if report_data.get('performance'):
heading = Paragraph("Performance Analysis", styles['Heading2'])
story.append(heading)
story.append(Spacer(1, 0.1*inch))
for workflow_id, perf in report_data['performance'].items():
text = f"<b>Workflow:</b> {workflow_id}<br/>"
text += f"Average Duration: {perf.get('avg_duration', 0):.2f}s<br/>"
text += f"Success Rate: {perf.get('success_rate', 0):.1f}%"
para = Paragraph(text, styles['Normal'])
story.append(para)
story.append(Spacer(1, 0.2*inch))
# Build PDF
doc.build(story)
logger.info(f"Exported PDF report: {filepath}")
return str(filepath)
except ImportError:
logger.warning("reportlab not available, falling back to HTML")
return self.export_html(report_data, filename.replace('.pdf', '.html') if filename else None)
def schedule_report(
self,
report: ScheduledReport
) -> None:
"""
Schedule a report for automatic generation.
Args:
report: Scheduled report configuration
"""
self.scheduled_reports[report.report_id] = report
logger.info(f"Scheduled report: {report.report_id}")
def get_scheduled_reports(self) -> List[ScheduledReport]:
"""Get all scheduled reports."""
return list(self.scheduled_reports.values())
def run_scheduled_report(self, report_id: str) -> Optional[str]:
"""
Run a scheduled report.
Args:
report_id: Report identifier
Returns:
Path to generated report or None
"""
report = self.scheduled_reports.get(report_id)
if not report or not report.enabled:
return None
# Generate report
report_data = self.generate_report(report.config)
# Export based on format
if report.config.format == 'json':
filepath = self.export_json(report_data)
elif report.config.format == 'csv':
filepath = self.export_csv(report_data)
elif report.config.format == 'html':
filepath = self.export_html(report_data)
elif report.config.format == 'pdf':
filepath = self.export_pdf(report_data)
else:
filepath = self.export_json(report_data)
# Update last run
report.last_run = datetime.now()
# Deliver report
self._deliver_report(report, filepath)
return filepath
def _deliver_report(
self,
report: ScheduledReport,
filepath: str
) -> None:
"""Deliver report via configured method."""
if report.delivery_method == 'file':
# Already saved to file
logger.info(f"Report saved to: {filepath}")
elif report.delivery_method == 'email':
# TODO: Implement email delivery
logger.info(f"Email delivery not yet implemented: {filepath}")
elif report.delivery_method == 'webhook':
# TODO: Implement webhook delivery
logger.info(f"Webhook delivery not yet implemented: {filepath}")

View File

@@ -0,0 +1,9 @@
"""Storage components for analytics data."""
from .timeseries_store import TimeSeriesStore
from .archive_storage import ArchiveStorage
__all__ = [
'TimeSeriesStore',
'ArchiveStorage',
]

View File

@@ -0,0 +1,393 @@
"""Archive storage for old metrics with compression."""
import logging
import gzip
import json
import os
from typing import Dict, List, Optional, Any
from datetime import datetime, timedelta
from pathlib import Path
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class RetentionPolicy:
"""Retention policy configuration."""
metric_type: str
hot_retention_days: int # Keep in main storage
archive_retention_days: int # Keep in archive
compression_enabled: bool = True
def to_dict(self) -> Dict:
"""Convert to dictionary."""
return {
'metric_type': self.metric_type,
'hot_retention_days': self.hot_retention_days,
'archive_retention_days': self.archive_retention_days,
'compression_enabled': self.compression_enabled
}
@classmethod
def from_dict(cls, data: Dict) -> 'RetentionPolicy':
"""Create from dictionary."""
return cls(**data)
class ArchiveStorage:
"""Archive storage for old metrics."""
def __init__(self, archive_dir: str = "data/analytics/archive"):
"""
Initialize archive storage.
Args:
archive_dir: Directory for archived data
"""
self.archive_dir = Path(archive_dir)
self.archive_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"ArchiveStorage initialized: {archive_dir}")
def archive_metrics(
self,
metrics: List[Dict[str, Any]],
metric_type: str,
archive_date: datetime,
compress: bool = True
) -> str:
"""
Archive metrics to compressed storage.
Args:
metrics: List of metrics to archive
metric_type: Type of metrics
archive_date: Date for archive file
compress: Whether to compress
Returns:
Path to archive file
"""
# Create archive filename
date_str = archive_date.strftime('%Y%m%d')
filename = f"{metric_type}_{date_str}.json"
if compress:
filename += ".gz"
filepath = self.archive_dir / filename
# Serialize metrics
data = {
'metric_type': metric_type,
'archive_date': archive_date.isoformat(),
'count': len(metrics),
'metrics': metrics
}
json_data = json.dumps(data, indent=2)
# Write to file (compressed or not)
if compress:
with gzip.open(filepath, 'wt', encoding='utf-8') as f:
f.write(json_data)
else:
with open(filepath, 'w', encoding='utf-8') as f:
f.write(json_data)
logger.info(f"Archived {len(metrics)} {metric_type} metrics to {filepath}")
return str(filepath)
def query_archive(
self,
metric_type: str,
start_date: datetime,
end_date: datetime,
filters: Optional[Dict[str, Any]] = None
) -> List[Dict[str, Any]]:
"""
Query archived metrics.
Args:
metric_type: Type of metrics
start_date: Start date
end_date: End date
filters: Optional filters
Returns:
List of matching metrics
"""
results = []
# Iterate through date range
current_date = start_date
while current_date <= end_date:
date_str = current_date.strftime('%Y%m%d')
# Try both compressed and uncompressed
for ext in ['.json.gz', '.json']:
filename = f"{metric_type}_{date_str}{ext}"
filepath = self.archive_dir / filename
if filepath.exists():
metrics = self._read_archive_file(filepath)
# Apply filters
if filters:
metrics = self._apply_filters(metrics, filters)
results.extend(metrics)
break
current_date += timedelta(days=1)
logger.debug(f"Query returned {len(results)} archived metrics")
return results
def _read_archive_file(self, filepath: Path) -> List[Dict[str, Any]]:
"""Read archive file (compressed or not)."""
try:
if filepath.suffix == '.gz':
with gzip.open(filepath, 'rt', encoding='utf-8') as f:
data = json.load(f)
else:
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
return data.get('metrics', [])
except Exception as e:
logger.error(f"Error reading archive {filepath}: {e}")
return []
def _apply_filters(
self,
metrics: List[Dict[str, Any]],
filters: Dict[str, Any]
) -> List[Dict[str, Any]]:
"""Apply filters to metrics."""
filtered = []
for metric in metrics:
match = True
for key, value in filters.items():
if metric.get(key) != value:
match = False
break
if match:
filtered.append(metric)
return filtered
def delete_archive(
self,
metric_type: str,
before_date: datetime
) -> int:
"""
Delete archived data before a date.
Args:
metric_type: Type of metrics
before_date: Delete archives before this date
Returns:
Number of files deleted
"""
deleted = 0
# Find matching archive files
pattern = f"{metric_type}_*.json*"
for filepath in self.archive_dir.glob(pattern):
# Extract date from filename
try:
                # Filename format: <metric_type>_<YYYYMMDD>.json[.gz]
                date_str = filepath.name.split('_', 1)[1].split('.', 1)[0]
file_date = datetime.strptime(date_str, '%Y%m%d')
if file_date < before_date:
filepath.unlink()
deleted += 1
logger.info(f"Deleted archive: {filepath}")
except Exception as e:
logger.error(f"Error processing {filepath}: {e}")
return deleted
def get_archive_stats(self) -> Dict[str, Any]:
"""
Get archive storage statistics.
Returns:
Dictionary with archive stats
"""
stats = {
'total_files': 0,
'total_size_bytes': 0,
'by_metric_type': {},
'oldest_archive': None,
'newest_archive': None
}
for filepath in self.archive_dir.glob('*.json*'):
stats['total_files'] += 1
stats['total_size_bytes'] += filepath.stat().st_size
# Extract metric type
metric_type = filepath.stem.split('_')[0]
if metric_type not in stats['by_metric_type']:
stats['by_metric_type'][metric_type] = {
'count': 0,
'size_bytes': 0
}
stats['by_metric_type'][metric_type]['count'] += 1
stats['by_metric_type'][metric_type]['size_bytes'] += filepath.stat().st_size
# Track oldest/newest
mtime = datetime.fromtimestamp(filepath.stat().st_mtime)
if stats['oldest_archive'] is None or mtime < stats['oldest_archive']:
stats['oldest_archive'] = mtime
if stats['newest_archive'] is None or mtime > stats['newest_archive']:
stats['newest_archive'] = mtime
# Convert to ISO format
if stats['oldest_archive']:
stats['oldest_archive'] = stats['oldest_archive'].isoformat()
if stats['newest_archive']:
stats['newest_archive'] = stats['newest_archive'].isoformat()
return stats
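# A minimal usage sketch for the class above (paths and payloads are
# illustrative, not part of the API):
#
#     from datetime import datetime, timedelta
#
#     storage = ArchiveStorage()
#     yesterday = datetime.now() - timedelta(days=1)
#     storage.archive_metrics(
#         metrics=[{'execution_id': 'exec-1', 'status': 'success'}],
#         metric_type='execution',
#         archive_date=yesterday
#     )
#     rows = storage.query_archive('execution', yesterday, datetime.now(),
#                                  filters={'status': 'success'})
#     storage.delete_archive('execution',
#                            before_date=datetime.now() - timedelta(days=365))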
class RetentionPolicyEngine:
"""Engine for applying retention policies."""
def __init__(
self,
archive_storage: ArchiveStorage,
policies: Optional[List[RetentionPolicy]] = None
):
"""
Initialize retention policy engine.
Args:
archive_storage: Archive storage instance
policies: List of retention policies
"""
        self.archive = archive_storage
        self.policy_file = Path("data/analytics/retention_policies.json")
        if policies is not None:
            self.policies = policies
        else:
            # Policies saved on disk override the built-in defaults, but
            # never a list passed in explicitly.
            self.policies = self._default_policies()
            self._load_policies()
logger.info("RetentionPolicyEngine initialized")
def _default_policies(self) -> List[RetentionPolicy]:
"""Get default retention policies."""
return [
RetentionPolicy(
metric_type='execution',
hot_retention_days=30,
archive_retention_days=365
),
RetentionPolicy(
metric_type='step',
hot_retention_days=7,
archive_retention_days=90
),
RetentionPolicy(
metric_type='resource',
hot_retention_days=7,
archive_retention_days=30
)
]
def _load_policies(self) -> None:
"""Load policies from file."""
if self.policy_file.exists():
try:
with open(self.policy_file, 'r') as f:
data = json.load(f)
self.policies = [RetentionPolicy.from_dict(p) for p in data]
logger.info(f"Loaded {len(self.policies)} retention policies")
except Exception as e:
logger.error(f"Error loading policies: {e}")
def save_policies(self) -> None:
"""Save policies to file."""
self.policy_file.parent.mkdir(parents=True, exist_ok=True)
with open(self.policy_file, 'w') as f:
json.dump([p.to_dict() for p in self.policies], f, indent=2)
logger.info("Retention policies saved")
def add_policy(self, policy: RetentionPolicy) -> None:
"""Add or update a retention policy."""
# Remove existing policy for same metric type
self.policies = [p for p in self.policies if p.metric_type != policy.metric_type]
self.policies.append(policy)
self.save_policies()
logger.info(f"Added policy for {policy.metric_type}")
def get_policy(self, metric_type: str) -> Optional[RetentionPolicy]:
"""Get policy for a metric type."""
for policy in self.policies:
if policy.metric_type == metric_type:
return policy
return None
def apply_policies(
self,
store, # TimeSeriesStore
dry_run: bool = False
) -> Dict[str, Any]:
"""
Apply retention policies to storage.
Args:
store: TimeSeriesStore instance
dry_run: If True, don't actually delete data
Returns:
Dictionary with application results
"""
results = {
'archived': {},
'deleted': {},
'errors': []
}
now = datetime.now()
for policy in self.policies:
try:
# Archive old hot data
hot_cutoff = now - timedelta(days=policy.hot_retention_days)
                # query_range returns a dict keyed by metric type
                query_results = store.query_range(
                    start_time=datetime.min,
                    end_time=hot_cutoff,
                    metric_types=[policy.metric_type]
                )
                metrics_to_archive = query_results.get(policy.metric_type, [])
                if metrics_to_archive:
                    if dry_run:
                        # Report what would be archived without writing anything
                        results['archived'][policy.metric_type] = {
                            'count': len(metrics_to_archive),
                            'path': None
                        }
                    else:
                        archive_path = self.archive.archive_metrics(
                            metrics=metrics_to_archive,
                            metric_type=policy.metric_type,
                            archive_date=hot_cutoff,
                            compress=policy.compression_enabled
                        )
                        # NOTE: hot-storage rows are not purged here;
                        # TimeSeriesStore exposes no delete API yet.
                        results['archived'][policy.metric_type] = {
                            'count': len(metrics_to_archive),
                            'path': archive_path
                        }
# Delete old archived data
archive_cutoff = now - timedelta(days=policy.archive_retention_days)
if not dry_run:
deleted_count = self.archive.delete_archive(
metric_type=policy.metric_type,
before_date=archive_cutoff
)
results['deleted'][policy.metric_type] = deleted_count
except Exception as e:
error_msg = f"Error applying policy for {policy.metric_type}: {e}"
logger.error(error_msg)
results['errors'].append(error_msg)
return results
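
A hedged end-to-end sketch of driving the retention engine; `store` is assumed to be the TimeSeriesStore defined in the next file, wired up as in the integration module:

    archive = ArchiveStorage()
    engine = RetentionPolicyEngine(archive)

    # Tighten the step-metrics policy; add_policy replaces any existing
    # policy for the same metric type and persists the result.
    engine.add_policy(RetentionPolicy(
        metric_type='step',
        hot_retention_days=3,
        archive_retention_days=30
    ))

    # Preview what the policies would archive and delete without writing.
    report = engine.apply_policies(store, dry_run=True)
    print(report['archived'], report['deleted'], report['errors'])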

View File

@@ -0,0 +1,374 @@
"""Time-series storage for analytics metrics."""
import sqlite3
import json
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
from contextlib import contextmanager
from ..collection.metrics_collector import ExecutionMetrics, StepMetrics
from ..collection.resource_collector import ResourceMetrics
logger = logging.getLogger(__name__)
class TimeSeriesStore:
"""Store for time-series metrics data using SQLite."""
# Database schema
SCHEMA = """
-- Execution metrics table
CREATE TABLE IF NOT EXISTS execution_metrics (
execution_id TEXT PRIMARY KEY,
workflow_id TEXT NOT NULL,
started_at TIMESTAMP NOT NULL,
completed_at TIMESTAMP,
duration_ms REAL,
status TEXT NOT NULL,
steps_total INTEGER DEFAULT 0,
steps_completed INTEGER DEFAULT 0,
steps_failed INTEGER DEFAULT 0,
error_message TEXT,
context JSON
);
CREATE INDEX IF NOT EXISTS idx_workflow_time
ON execution_metrics(workflow_id, started_at);
CREATE INDEX IF NOT EXISTS idx_status
ON execution_metrics(status);
CREATE INDEX IF NOT EXISTS idx_started_at
ON execution_metrics(started_at);
-- Step metrics table
CREATE TABLE IF NOT EXISTS step_metrics (
step_id TEXT PRIMARY KEY,
execution_id TEXT NOT NULL,
workflow_id TEXT NOT NULL,
node_id TEXT NOT NULL,
action_type TEXT NOT NULL,
target_element TEXT,
started_at TIMESTAMP NOT NULL,
completed_at TIMESTAMP NOT NULL,
duration_ms REAL NOT NULL,
status TEXT NOT NULL,
confidence_score REAL,
retry_count INTEGER DEFAULT 0,
error_details TEXT,
FOREIGN KEY (execution_id) REFERENCES execution_metrics(execution_id)
);
CREATE INDEX IF NOT EXISTS idx_execution
ON step_metrics(execution_id);
CREATE INDEX IF NOT EXISTS idx_workflow_action
ON step_metrics(workflow_id, action_type);
CREATE INDEX IF NOT EXISTS idx_step_time
ON step_metrics(started_at);
-- Resource metrics table
CREATE TABLE IF NOT EXISTS resource_metrics (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TIMESTAMP NOT NULL,
workflow_id TEXT,
execution_id TEXT,
cpu_percent REAL NOT NULL,
memory_mb REAL NOT NULL,
gpu_utilization REAL DEFAULT 0.0,
gpu_memory_mb REAL DEFAULT 0.0,
disk_io_mb REAL DEFAULT 0.0
);
CREATE INDEX IF NOT EXISTS idx_resource_time
ON resource_metrics(timestamp);
CREATE INDEX IF NOT EXISTS idx_resource_workflow
ON resource_metrics(workflow_id, timestamp);
"""
def __init__(self, storage_path: Path):
"""
Initialize time-series store.
Args:
storage_path: Path to storage directory
"""
self.storage_path = Path(storage_path)
self.storage_path.mkdir(parents=True, exist_ok=True)
self.db_path = self.storage_path / 'timeseries.db'
# Initialize database
self._init_database()
logger.info(f"TimeSeriesStore initialized at {self.db_path}")
def _init_database(self) -> None:
"""Initialize database schema."""
with self._get_connection() as conn:
conn.executescript(self.SCHEMA)
conn.commit()
@contextmanager
def _get_connection(self):
"""Get database connection context manager."""
conn = sqlite3.connect(str(self.db_path))
conn.row_factory = sqlite3.Row
try:
yield conn
finally:
conn.close()
def write_metrics(
self,
metrics: List[Any] # Union[ExecutionMetrics, StepMetrics, ResourceMetrics]
) -> None:
"""
Write metrics to time-series storage.
Args:
metrics: List of metrics to write
"""
if not metrics:
return
with self._get_connection() as conn:
for metric in metrics:
if isinstance(metric, ExecutionMetrics):
self._write_execution_metric(conn, metric)
elif isinstance(metric, StepMetrics):
self._write_step_metric(conn, metric)
elif isinstance(metric, ResourceMetrics):
                    self._write_resource_metric(conn, metric)
                else:
                    logger.warning(
                        f"Skipping unknown metric type: {type(metric).__name__}"
                    )
conn.commit()
logger.debug(f"Wrote {len(metrics)} metrics to storage")
def _write_execution_metric(self, conn: sqlite3.Connection, metric: ExecutionMetrics) -> None:
"""Write execution metric."""
conn.execute("""
INSERT OR REPLACE INTO execution_metrics
(execution_id, workflow_id, started_at, completed_at, duration_ms,
status, steps_total, steps_completed, steps_failed, error_message, context)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
metric.execution_id,
metric.workflow_id,
metric.started_at.isoformat(),
metric.completed_at.isoformat() if metric.completed_at else None,
metric.duration_ms,
metric.status,
metric.steps_total,
metric.steps_completed,
metric.steps_failed,
metric.error_message,
json.dumps(metric.context)
))
def _write_step_metric(self, conn: sqlite3.Connection, metric: StepMetrics) -> None:
"""Write step metric."""
conn.execute("""
INSERT OR REPLACE INTO step_metrics
(step_id, execution_id, workflow_id, node_id, action_type, target_element,
started_at, completed_at, duration_ms, status, confidence_score,
retry_count, error_details)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
metric.step_id,
metric.execution_id,
metric.workflow_id,
metric.node_id,
metric.action_type,
metric.target_element,
metric.started_at.isoformat(),
metric.completed_at.isoformat(),
metric.duration_ms,
metric.status,
metric.confidence_score,
metric.retry_count,
metric.error_details
))
def _write_resource_metric(self, conn: sqlite3.Connection, metric: ResourceMetrics) -> None:
"""Write resource metric."""
conn.execute("""
INSERT INTO resource_metrics
(timestamp, workflow_id, execution_id, cpu_percent, memory_mb,
gpu_utilization, gpu_memory_mb, disk_io_mb)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", (
metric.timestamp.isoformat(),
metric.workflow_id,
metric.execution_id,
metric.cpu_percent,
metric.memory_mb,
metric.gpu_utilization,
metric.gpu_memory_mb,
metric.disk_io_mb
))
def query_range(
self,
start_time: datetime,
end_time: datetime,
workflow_id: Optional[str] = None,
metric_types: Optional[List[str]] = None
) -> Dict[str, List[Dict]]:
"""
Query metrics within a time range.
Args:
start_time: Start of time range
end_time: End of time range
workflow_id: Optional workflow ID filter
metric_types: Optional list of metric types ('execution', 'step', 'resource')
Returns:
Dictionary with metric type as key and list of metrics as value
"""
results = {}
metric_types = metric_types or ['execution', 'step', 'resource']
with self._get_connection() as conn:
if 'execution' in metric_types:
results['execution'] = self._query_execution_metrics(
conn, start_time, end_time, workflow_id
)
if 'step' in metric_types:
results['step'] = self._query_step_metrics(
conn, start_time, end_time, workflow_id
)
if 'resource' in metric_types:
results['resource'] = self._query_resource_metrics(
conn, start_time, end_time, workflow_id
)
return results
def _query_execution_metrics(
self,
conn: sqlite3.Connection,
start_time: datetime,
end_time: datetime,
workflow_id: Optional[str]
) -> List[Dict]:
"""Query execution metrics."""
query = """
SELECT * FROM execution_metrics
WHERE started_at >= ? AND started_at <= ?
"""
params = [start_time.isoformat(), end_time.isoformat()]
if workflow_id:
query += " AND workflow_id = ?"
params.append(workflow_id)
query += " ORDER BY started_at"
cursor = conn.execute(query, params)
return [dict(row) for row in cursor.fetchall()]
def _query_step_metrics(
self,
conn: sqlite3.Connection,
start_time: datetime,
end_time: datetime,
workflow_id: Optional[str]
) -> List[Dict]:
"""Query step metrics."""
query = """
SELECT * FROM step_metrics
WHERE started_at >= ? AND started_at <= ?
"""
params = [start_time.isoformat(), end_time.isoformat()]
if workflow_id:
query += " AND workflow_id = ?"
params.append(workflow_id)
query += " ORDER BY started_at"
cursor = conn.execute(query, params)
return [dict(row) for row in cursor.fetchall()]
def _query_resource_metrics(
self,
conn: sqlite3.Connection,
start_time: datetime,
end_time: datetime,
workflow_id: Optional[str]
) -> List[Dict]:
"""Query resource metrics."""
query = """
SELECT * FROM resource_metrics
WHERE timestamp >= ? AND timestamp <= ?
"""
params = [start_time.isoformat(), end_time.isoformat()]
if workflow_id:
query += " AND workflow_id = ?"
params.append(workflow_id)
query += " ORDER BY timestamp"
cursor = conn.execute(query, params)
return [dict(row) for row in cursor.fetchall()]
def aggregate(
self,
metric: str,
aggregation: str, # 'avg', 'sum', 'count', 'min', 'max'
group_by: List[str],
start_time: datetime,
end_time: datetime,
filters: Optional[Dict] = None
) -> List[Dict]:
"""
Aggregate metrics with grouping.
Args:
metric: Metric field to aggregate
aggregation: Aggregation function
group_by: Fields to group by
start_time: Start of time range
end_time: End of time range
filters: Optional filters
Returns:
List of aggregated results
"""
# Determine table based on metric
if metric in ['duration_ms', 'steps_total', 'steps_completed', 'steps_failed']:
table = 'execution_metrics'
time_field = 'started_at'
elif metric in ['confidence_score', 'retry_count']:
table = 'step_metrics'
time_field = 'started_at'
elif metric in ['cpu_percent', 'memory_mb', 'gpu_utilization']:
table = 'resource_metrics'
time_field = 'timestamp'
else:
raise ValueError(f"Unknown metric: {metric}")
        # Build query; whitelist the aggregation so the f-string below stays safe
        agg_func = aggregation.upper()
        if agg_func not in ('AVG', 'SUM', 'COUNT', 'MIN', 'MAX'):
            raise ValueError(f"Unsupported aggregation: {aggregation}")
        group_fields = ', '.join(group_by)
query = f"""
SELECT {group_fields}, {agg_func}({metric}) as value
FROM {table}
WHERE {time_field} >= ? AND {time_field} <= ?
"""
params = [start_time.isoformat(), end_time.isoformat()]
        # Add filters (keys are interpolated as column names and must come from trusted code)
if filters:
for key, value in filters.items():
query += f" AND {key} = ?"
params.append(value)
query += f" GROUP BY {group_fields}"
with self._get_connection() as conn:
cursor = conn.execute(query, params)
return [dict(row) for row in cursor.fetchall()]
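# A usage sketch for the store (the ExecutionMetrics kwargs below mirror
# the fields _write_execution_metric reads; the real constructor lives in
# collection/metrics_collector.py and may differ):
#
#     from datetime import datetime, timedelta
#     from pathlib import Path
#
#     store = TimeSeriesStore(Path('data/analytics'))
#     store.write_metrics([ExecutionMetrics(
#         execution_id='exec-1', workflow_id='wf-login',
#         started_at=datetime.now(), completed_at=datetime.now(),
#         duration_ms=1200.0, status='success', steps_total=3,
#         steps_completed=3, steps_failed=0, error_message=None, context={}
#     )])
#
#     # Average duration per workflow over the last 24 hours
#     rows = store.aggregate(
#         metric='duration_ms', aggregation='avg',
#         group_by=['workflow_id'],
#         start_time=datetime.now() - timedelta(days=1),
#         end_time=datetime.now()
#     )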