rpa_vision_v3/tests/property/test_analytics_properties.py

"""Property-based tests for analytics system."""

import pytest
from hypothesis import given, strategies as st, settings
from datetime import datetime, timedelta
import tempfile
import os

from core.analytics.collection.metrics_collector import (
    MetricsCollector, ExecutionMetrics, StepMetrics
)
from core.analytics.storage.timeseries_store import TimeSeriesStore
from core.analytics.storage.archive_storage import (
    ArchiveStorage, RetentionPolicyEngine, RetentionPolicy
)
from core.analytics.engine.performance_analyzer import PerformanceAnalyzer
from core.analytics.engine.anomaly_detector import AnomalyDetector
from core.analytics.engine.success_rate_calculator import SuccessRateCalculator
from core.analytics.reporting.report_generator import ReportGenerator, ReportConfig
from core.analytics.query.query_engine import QueryEngine


# Fixtures
@pytest.fixture
def temp_db():
    """Create temporary database."""
    fd, path = tempfile.mkstemp(suffix='.db')
    os.close(fd)
    yield path
    if os.path.exists(path):
        os.unlink(path)


@pytest.fixture
def temp_archive_dir():
    """Create temporary archive directory."""
    import tempfile
    import shutil
    dirpath = tempfile.mkdtemp()
    yield dirpath
    shutil.rmtree(dirpath, ignore_errors=True)


@pytest.fixture
def store(temp_db):
    """Create TimeSeriesStore instance."""
    return TimeSeriesStore(temp_db)


@pytest.fixture
def collector(store):
    """Create MetricsCollector instance."""
    return MetricsCollector(store)


# Property 1: Metrics completeness
# **Feature: rpa-analytics, Property 1: Metrics completeness**
# **Validates: Requirements 1.1, 1.4**
@given(
    execution_id=st.text(min_size=1, max_size=50),
    workflow_id=st.text(min_size=1, max_size=50),
    duration=st.floats(min_value=0.1, max_value=1000.0),
    status=st.sampled_from(['success', 'failed', 'timeout'])
)
@settings(max_examples=100, deadline=None)
def test_metrics_completeness(temp_db, execution_id, workflow_id, duration, status):
    """
    Property: For any execution metrics recorded, all required fields must be present
    when queried back from storage.
    """
    store = TimeSeriesStore(temp_db)
    collector = MetricsCollector(store)

    # Record execution
    now = datetime.now()
    execution = ExecutionMetrics(
        execution_id=execution_id,
        workflow_id=workflow_id,
        started_at=now,
        completed_at=now + timedelta(seconds=duration),
        duration=duration,
        status=status
    )

    collector.record_execution(execution)
    collector.flush()

    # Query back
    metrics = store.query_range(
        metric_type='execution',
        start_time=now - timedelta(seconds=1),
        end_time=now + timedelta(seconds=duration + 1)
    )

    # Verify completeness
    assert len(metrics) > 0
    metric = metrics[0]
    assert 'execution_id' in metric
    assert 'workflow_id' in metric
    assert 'duration' in metric
    assert 'status' in metric
    assert metric['workflow_id'] == workflow_id
    assert metric['status'] == status


# Property 3: Failure recording completeness
# **Feature: rpa-analytics, Property 3: Failure recording completeness**
# **Validates: Requirements 1.3**
@given(
    workflow_id=st.text(min_size=1, max_size=50),
    error_message=st.text(min_size=1, max_size=200)
)
@settings(max_examples=50, deadline=None)
def test_failure_recording_completeness(temp_db, workflow_id, error_message):
    """
    Property: For any failed execution, the error message must be recorded
    and retrievable.
    """
    store = TimeSeriesStore(temp_db)
    collector = MetricsCollector(store)

    now = datetime.now()
    execution = ExecutionMetrics(
        execution_id="failed_exec",
        workflow_id=workflow_id,
        started_at=now,
        completed_at=now + timedelta(seconds=10),
        duration=10.0,
        status="failed",
        error_message=error_message
    )

    collector.record_execution(execution)
    collector.flush()

    # Query failed executions
    metrics = store.query_range(
        metric_type='execution',
        start_time=now - timedelta(seconds=1),
        end_time=now + timedelta(seconds=11),
        filters={'status': 'failed'}
    )

    assert len(metrics) > 0
    assert metrics[0].get('error_message') is not None


# Property 5: Statistical accuracy
# **Feature: rpa-analytics, Property 5: Statistical accuracy**
# **Validates: Requirements 2.1**
@given(
    durations=st.lists(
        st.floats(min_value=1.0, max_value=100.0),
        min_size=10,
        max_size=50
    )
)
@settings(max_examples=50, deadline=None)
def test_statistical_accuracy(temp_db, durations):
    """
    Property: For any list of durations, calculated statistics (avg, median)
    must match expected values within tolerance.
    """
    store = TimeSeriesStore(temp_db)
    collector = MetricsCollector(store)
    analyzer = PerformanceAnalyzer(store)

    workflow_id = "test_workflow"
    now = datetime.now()

    # Record executions
    for i, duration in enumerate(durations):
        execution = ExecutionMetrics(
            execution_id=f"exec_{i}",
            workflow_id=workflow_id,
            started_at=now + timedelta(seconds=i*10),
            completed_at=now + timedelta(seconds=i*10 + duration),
            duration=duration,
            status="success"
        )
        collector.record_execution(execution)

    collector.flush()

    # Analyze
    stats = analyzer.analyze_performance(
        workflow_id=workflow_id,
        start_time=now - timedelta(seconds=1),
        end_time=now + timedelta(seconds=len(durations)*10 + 100)
    )

    # Verify statistics
    import statistics
    expected_avg = statistics.mean(durations)
    expected_median = statistics.median(durations)

    assert abs(stats.avg_duration - expected_avg) < 0.1
    assert abs(stats.median_duration - expected_median) < 0.1


# Property 8: Success rate calculation accuracy
# **Feature: rpa-analytics, Property 8: Success rate calculation accuracy**
# **Validates: Requirements 3.1**
@given(
    num_success=st.integers(min_value=0, max_value=50),
    num_failed=st.integers(min_value=0, max_value=50)
)
@settings(max_examples=50, deadline=None)
def test_success_rate_accuracy(temp_db, num_success, num_failed):
    """
    Property: For any combination of successful and failed executions,
    the calculated success rate must match the expected percentage.
    """
    if num_success + num_failed == 0:
        return  # Skip empty case

    store = TimeSeriesStore(temp_db)
    collector = MetricsCollector(store)
    calculator = SuccessRateCalculator(store)

    workflow_id = "test_workflow"
    now = datetime.now()

    # Record successful executions
    for i in range(num_success):
        execution = ExecutionMetrics(
            execution_id=f"success_{i}",
            workflow_id=workflow_id,
            started_at=now + timedelta(seconds=i),
            completed_at=now + timedelta(seconds=i+1),
            duration=1.0,
            status="success"
        )
        collector.record_execution(execution)

    # Record failed executions
    for i in range(num_failed):
        execution = ExecutionMetrics(
            execution_id=f"failed_{i}",
            workflow_id=workflow_id,
            started_at=now + timedelta(seconds=num_success+i),
            completed_at=now + timedelta(seconds=num_success+i+1),
            duration=1.0,
            status="failed"
        )
        collector.record_execution(execution)

    collector.flush()

    # Calculate success rate
    stats = calculator.calculate_success_rate(
        workflow_id=workflow_id,
        time_window_hours=1
    )

    # Verify
    total = num_success + num_failed
    expected_rate = (num_success / total) * 100

    assert abs(stats.success_rate - expected_rate) < 0.1
    assert stats.total_executions == total
    assert stats.successful_executions == num_success
    assert stats.failed_executions == num_failed


# Property 15: Filter application correctness
# **Feature: rpa-analytics, Property 15: Filter application correctness**
# **Validates: Requirements 7.1**
@given(
    workflow_ids=st.lists(
        st.text(min_size=1, max_size=20),
        min_size=2,
        max_size=5,
        unique=True
    ),
    target_workflow=st.integers(min_value=0, max_value=4)
)
@settings(max_examples=50, deadline=None)
def test_filter_correctness(temp_db, workflow_ids, target_workflow):
    """
    Property: For any set of workflows, filtering by a specific workflow_id
    must return only metrics for that workflow.
    """
    if target_workflow >= len(workflow_ids):
        target_workflow = 0

    store = TimeSeriesStore(temp_db)
    collector = MetricsCollector(store)
    engine = QueryEngine(store)

    target_id = workflow_ids[target_workflow]
    now = datetime.now()

    # Record executions for different workflows
    for i, wf_id in enumerate(workflow_ids):
        execution = ExecutionMetrics(
            execution_id=f"exec_{i}",
            workflow_id=wf_id,
            started_at=now + timedelta(seconds=i),
            completed_at=now + timedelta(seconds=i+1),
            duration=1.0,
            status="success"
        )
        collector.record_execution(execution)

    collector.flush()

    # Query with filter
    results = engine.query(
        metric_type='execution',
        start_time=now - timedelta(seconds=1),
        end_time=now + timedelta(seconds=len(workflow_ids)+1),
        filters={'workflow_id': target_id}
    )

    # Verify all results match filter
    assert len(results) > 0
    for result in results:
        assert result['workflow_id'] == target_id


# Property 16: Export format validity
# **Feature: rpa-analytics, Property 16: Export format validity**
# **Validates: Requirements 7.3**
@given(
    title=st.text(min_size=1, max_size=100),
    format_type=st.sampled_from(['json', 'csv', 'html'])
)
@settings(max_examples=30, deadline=None)
def test_export_format_validity(temp_db, temp_archive_dir, title, format_type):
    """
    Property: For any report configuration, the exported file must be
    valid and readable in the specified format.
    """
    store = TimeSeriesStore(temp_db)
    collector = MetricsCollector(store)

    # Create some test data
    now = datetime.now()
    execution = ExecutionMetrics(
        execution_id="test_exec",
        workflow_id="test_workflow",
        started_at=now,
        completed_at=now + timedelta(seconds=10),
        duration=10.0,
        status="success"
    )
    collector.record_execution(execution)
    collector.flush()

    # Generate report
    from core.analytics.engine.performance_analyzer import PerformanceAnalyzer
    from core.analytics.engine.insight_generator import InsightGenerator
    from core.analytics.engine.anomaly_detector import AnomalyDetector

    analyzer = PerformanceAnalyzer(store)
    detector = AnomalyDetector(store)
    insight_gen = InsightGenerator(analyzer, detector)
    engine = QueryEngine(store)

    generator = ReportGenerator(
        engine, analyzer, insight_gen, temp_archive_dir
    )

    config = ReportConfig(
        title=title,
        metric_types=['execution'],
        start_time=now - timedelta(hours=1),
        end_time=now + timedelta(hours=1),
        format=format_type
    )

    report_data = generator.generate_report(config)

    # Export and verify
    if format_type == 'json':
        filepath = generator.export_json(report_data)
        assert os.path.exists(filepath)
        import json
        with open(filepath, 'r') as f:
            data = json.load(f)
            assert 'title' in data

    elif format_type == 'csv':
        filepath = generator.export_csv(report_data)
        assert os.path.exists(filepath)
        import csv
        with open(filepath, 'r') as f:
            reader = csv.reader(f)
            rows = list(reader)
            assert len(rows) > 0  # At least header

    elif format_type == 'html':
        filepath = generator.export_html(report_data)
        assert os.path.exists(filepath)
        with open(filepath, 'r') as f:
            content = f.read()
            assert '<html>' in content.lower()
            assert title in content


# Property 19: Retention policy enforcement
# **Feature: rpa-analytics, Property 19: Retention policy enforcement**
# **Validates: Requirements 10.2**
@given(
    hot_days=st.integers(min_value=1, max_value=30),
    archive_days=st.integers(min_value=31, max_value=365)
)
@settings(max_examples=30, deadline=None)
def test_retention_policy_enforcement(temp_db, temp_archive_dir, hot_days, archive_days):
    """
    Property: For any retention policy, data older than hot_retention_days
    must be archived, and data older than archive_retention_days must be deleted.
    """
    store = TimeSeriesStore(temp_db)
    archive = ArchiveStorage(temp_archive_dir)
    engine = RetentionPolicyEngine(archive)

    # Create policy
    policy = RetentionPolicy(
        metric_type='execution',
        hot_retention_days=hot_days,
        archive_retention_days=archive_days,
        compression_enabled=True
    )

    engine.add_policy(policy)

    # Verify policy is stored
    retrieved_policy = engine.get_policy('execution')
    assert retrieved_policy is not None
    assert retrieved_policy.hot_retention_days == hot_days
    assert retrieved_policy.archive_retention_days == archive_days


# Property 20: Archive data integrity
# **Feature: rpa-analytics, Property 20: Archive data integrity**
# **Validates: Requirements 10.3**
@given(
    num_metrics=st.integers(min_value=1, max_value=50)
)
@settings(max_examples=30, deadline=None)
def test_archive_data_integrity(temp_archive_dir, num_metrics):
    """
    Property: For any metrics archived, querying the archive must return
    the same data that was archived.
    """
    archive = ArchiveStorage(temp_archive_dir)

    # Create test metrics
    now = datetime.now()
    metrics = []
    for i in range(num_metrics):
        metrics.append({
            'execution_id': f'exec_{i}',
            'workflow_id': 'test_workflow',
            'duration': float(i + 1),
            'status': 'success',
            'timestamp': (now + timedelta(seconds=i)).isoformat()
        })

    # Archive metrics
    archive.archive_metrics(
        metrics=metrics,
        metric_type='execution',
        archive_date=now,
        compress=True
    )

    # Query back
    retrieved = archive.query_archive(
        metric_type='execution',
        start_date=now - timedelta(days=1),
        end_date=now + timedelta(days=1)
    )

    # Verify integrity
    assert len(retrieved) == num_metrics
    for original, retrieved_metric in zip(metrics, retrieved):
        assert original['execution_id'] == retrieved_metric['execution_id']
        assert original['workflow_id'] == retrieved_metric['workflow_id']


if __name__ == '__main__':
    pytest.main([__file__, '-v', '--tb=short'])