feat(coaching): Implement complete COACHING mode infrastructure
Add a comprehensive COACHING mode system.

Backend:
- core/coaching module with session persistence and metrics
- CoachingSessionPersistence for pause/resume of sessions
- CoachingMetricsCollector with learning-progress tracking
- REST API blueprint for coaching session management
- Execution integration with COACHING mode support

Frontend:
- CoachingPanel component with keyboard shortcuts
- Decision buttons (accept/reject/correct/manual/skip)
- Real-time stats display and correction editor
- CorrectionPacksDashboard for pack visualization
- WebSocket hooks for real-time COACHING events

Metrics & Monitoring:
- WorkflowLearningMetrics with confidence scoring
- GlobalCoachingMetrics for system-wide analytics
- AUTO mode readiness detection (85% acceptance threshold)
- Learning progress levels (OBSERVATION → COACHING → AUTO)

Tests:
- E2E tests for the complete OBSERVATION → AUTO journey
- Session persistence and recovery tests
- Metrics threshold validation tests

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
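The AUTO readiness rule called out above can be summarised in a few lines. A minimal sketch, assuming a simple rate check: the real logic lives in WorkflowLearningMetrics / CoachingMetricsCollector, the 0.85 threshold comes from this message, and the five-session minimum is inferred from test_metrics_threshold_for_auto_mode in the diff below.

    # Hypothetical helper, for illustration only.
    MIN_SESSIONS = 5             # inferred from the threshold test below
    ACCEPTANCE_THRESHOLD = 0.85  # per the commit message

    def ready_for_auto(total_sessions: int, accepted: int, total_decisions: int) -> bool:
        """Return True when a workflow qualifies for AUTO mode."""
        if total_sessions < MIN_SESSIONS or total_decisions == 0:
            return False
        return accepted / total_decisions >= ACCEPTANCE_THRESHOLD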
tests/test_coaching_e2e.py | 486 insertions (new file)
@@ -0,0 +1,486 @@
"""
End-to-End Tests for COACHING Mode

Tests the complete OBSERVATION -> COACHING -> AUTO workflow:
1. Start in OBSERVATION mode (record user actions)
2. Transition to COACHING mode (suggest actions, get user feedback)
3. Accumulate corrections in Correction Packs
4. Track metrics and determine readiness for AUTO mode
5. Transition to AUTO mode when confidence threshold is met

This test simulates the complete learning journey of a workflow.
"""

import pytest
import tempfile
import shutil
from pathlib import Path

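# For orientation, a minimal sketch of the learning-progress ladder these
# tests exercise. The real enum lives in core.coaching; only the 'ready' and
# 'autonomous' values are confirmed by assertions in this file -- the other
# names are assumptions based on the OBSERVATION -> COACHING -> AUTO design
# described in the commit message. This local copy is illustrative and unused.
from enum import Enum


class _LearningProgressSketch(Enum):
    OBSERVATION = 'observation'  # record user actions, no suggestions yet
    COACHING = 'coaching'        # suggest actions, collect accept/correct/reject
    READY = 'ready'              # thresholds met, AUTO can be enabled
    AUTONOMOUS = 'autonomous'    # executing without supervision

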
@pytest.fixture
def temp_storage():
    """Create temporary storage directories."""
    temp_dir = tempfile.mkdtemp()
    yield Path(temp_dir)
    shutil.rmtree(temp_dir, ignore_errors=True)


@pytest.fixture
def coaching_persistence(temp_storage):
    """Create coaching persistence with temp storage."""
    from core.coaching import CoachingSessionPersistence
    return CoachingSessionPersistence(temp_storage / 'coaching_sessions')


@pytest.fixture
def correction_service(temp_storage):
    """Create correction pack service with temp storage."""
    from core.corrections import CorrectionPackService
    return CorrectionPackService(storage_path=temp_storage / 'correction_packs')


@pytest.fixture
def metrics_collector(coaching_persistence):
    """Create metrics collector."""
    from core.coaching import CoachingMetricsCollector
    return CoachingMetricsCollector(coaching_persistence)


class TestCoachingE2E:
    """End-to-end tests for the complete COACHING workflow."""

    def test_complete_learning_journey(
        self,
        coaching_persistence,
        correction_service,
        metrics_collector
    ):
        """
        Test the complete learning journey from OBSERVATION to AUTO.

        Scenario:
        1. Create workflow and start first COACHING session
        2. Make decisions (mix of accept, correct, reject)
        3. Corrections are captured in Correction Packs
        4. Run multiple sessions to build confidence
        5. Check metrics and readiness for AUTO
        """
        workflow_id = "wf_e2e_test_001"

        # =====================================================================
        # Phase 1: First COACHING session - Learning phase
        # =====================================================================
        print("\n=== Phase 1: First COACHING Session ===")

        session1 = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_001",
            total_steps=5,
            metadata={'phase': 'learning'}
        )

        # Simulate decisions with some corrections
        from core.coaching.session_persistence import CoachingDecisionRecord

        decisions_p1 = [
            ('accept', None),
            ('correct', {'target': {'id': 'new_btn'}}),
            ('accept', None),
            ('reject', None),
            ('accept', None),
        ]

        for i, (decision, correction) in enumerate(decisions_p1):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type='click',
                decision=decision,
                correction=correction,
                feedback=f"Decision {i+1}"
            )
            session1.add_decision(record)

        coaching_persistence.complete_session(session1.session_id, success=True)

        # Verify session stats
        session1_reloaded = coaching_persistence.load_session(session1.session_id)
        assert session1_reloaded.stats['accepted'] == 3
        assert session1_reloaded.stats['corrected'] == 1
        assert session1_reloaded.stats['rejected'] == 1

        print(f"Session 1 completed: {session1_reloaded.stats}")

        # =====================================================================
        # Phase 2: Multiple sessions to improve acceptance rate
        # =====================================================================
        print("\n=== Phase 2: Multiple Training Sessions ===")

        # Session 2: Better acceptance after learning
        session2 = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_002",
            total_steps=5
        )

        # Most actions accepted now (corrections are working)
        decisions_p2 = [
            ('accept', None),
            ('accept', None),
            ('accept', None),
            ('accept', None),
            ('correct', {'target': {'text': 'Submit'}}),
        ]

        for i, (decision, correction) in enumerate(decisions_p2):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type='click',
                decision=decision,
                correction=correction
            )
            session2.add_decision(record)

        coaching_persistence.complete_session(session2.session_id, success=True)
        print(f"Session 2 completed: {session2.stats}")

        # Sessions 3-5: High acceptance rate
        for sess_num in range(3, 6):
            session = coaching_persistence.create_session(
                workflow_id=workflow_id,
                execution_id=f"exec_{sess_num:03d}",
                total_steps=5
            )

            # All accepted
            for i in range(5):
                record = CoachingDecisionRecord(
                    step_index=i,
                    node_id=f"node_{i+1}",
                    action_type='click',
                    decision='accept'
                )
                session.add_decision(record)

            coaching_persistence.complete_session(session.session_id, success=True)
            print(f"Session {sess_num} completed: all accepted")

        # =====================================================================
        # Phase 3: Check Metrics and Learning Progress
        # =====================================================================
        print("\n=== Phase 3: Checking Metrics ===")

        metrics = metrics_collector.get_workflow_metrics(workflow_id)

        print(f"Total sessions: {metrics.total_sessions}")
        print(f"Total decisions: {metrics.total_decisions}")
        print(f"Acceptance rate: {metrics.acceptance_rate:.2%}")
        print(f"Correction rate: {metrics.correction_rate:.2%}")
        print(f"Confidence score: {metrics.confidence_score:.2f}")
        print(f"Learning progress: {metrics.learning_progress.value}")
        print(f"Ready for AUTO: {metrics.ready_for_auto}")
        print(f"Recommendations: {metrics.recommendations}")

        # Assertions
        assert metrics.total_sessions == 5
        assert metrics.total_decisions == 25
        assert metrics.acceptance_rate > 0.8  # Should be high after training
        assert metrics.correction_rate < 0.15  # Should be low

        # =====================================================================
        # Phase 4: Verify Readiness for AUTO
        # =====================================================================
        print("\n=== Phase 4: AUTO Mode Readiness ===")

        # The workflow should be ready for AUTO after successful training
        assert metrics.ready_for_auto, "Workflow should be ready for AUTO mode"
        assert metrics.learning_progress.value in ['ready', 'autonomous']

        print("SUCCESS: Workflow is ready for autonomous execution!")

    def test_session_persistence_and_recovery(self, coaching_persistence):
        """Test that COACHING sessions can be paused and resumed."""
        print("\n=== Testing Session Persistence ===")

        workflow_id = "wf_persistence_test"

        # Create and partially complete a session
        session = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_persist",
            total_steps=10
        )

        from core.coaching.session_persistence import CoachingDecisionRecord

        # Add 3 decisions
        for i in range(3):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type='click',
                decision='accept'
            )
            session.add_decision(record)

        coaching_persistence.save_session(session)

        # Pause the session
        coaching_persistence.pause_session(session.session_id)

        # Verify paused
        loaded = coaching_persistence.load_session(session.session_id)
        assert loaded.status.value == 'paused'
        assert len(loaded.decisions) == 3
        assert loaded.current_step_index == 3

        # Resume the session
        resumed = coaching_persistence.resume_session(session.session_id)
        assert resumed.status.value == 'active'
        assert resumed.can_resume() is True

        # Continue adding decisions
        for i in range(3, 6):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type='click',
                decision='accept'
            )
            resumed.add_decision(record)

        coaching_persistence.save_session(resumed)

        # Verify continuation
        final = coaching_persistence.load_session(session.session_id)
        assert len(final.decisions) == 6
        assert final.current_step_index == 6

        print("SUCCESS: Session persistence and recovery works correctly!")

    def test_correction_integration_with_coaching(
        self,
        coaching_persistence,
        correction_service
    ):
        """Test that COACHING corrections integrate with Correction Packs."""
        print("\n=== Testing Correction Integration ===")

        from core.corrections import CorrectionPackIntegration

        # Create integration
        integration = CorrectionPackIntegration(
            service=correction_service,
            auto_create_pack=True
        )

        workflow_id = "wf_correction_test"

        # Create COACHING session
        session = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_correction",
            total_steps=5
        )

        from core.coaching.session_persistence import CoachingDecisionRecord

        # Simulate corrections
        corrections_made = [
            {
                'action_type': 'click',
                'element_type': 'button',
                'failure_reason': 'element_not_found',
                'correction_type': 'target_change',
                'original_target': {'text': 'OK'},
                'corrected_target': {'text': 'Valider'}
            },
            {
                'action_type': 'type',
                'element_type': 'input',
                'failure_reason': 'wrong_field',
                'correction_type': 'target_change',
                'original_target': {'id': 'email'},
                'corrected_target': {'name': 'user_email'}
            }
        ]

        # Add decisions with corrections
        for i, correction_data in enumerate(corrections_made):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type=correction_data['action_type'],
                decision='correct',
                correction=correction_data
            )
            session.add_decision(record)

            # Capture correction in Correction Pack
            integration.capture_correction(
                correction_data=correction_data,
                session_id=session.session_id,
                workflow_id=workflow_id
            )

        coaching_persistence.complete_session(session.session_id, success=True)

        # Verify corrections captured in pack
        pack = correction_service.get_pack(integration._default_pack_id)
        corrections_list = pack.get('corrections') if isinstance(pack, dict) else pack.corrections
        assert len(corrections_list) == 2

        print(f"Captured {len(corrections_list)} corrections in Correction Pack")
        print("SUCCESS: Corrections integrated correctly!")

    def test_metrics_threshold_for_auto_mode(self, coaching_persistence, metrics_collector):
        """Test that metrics correctly determine AUTO mode readiness."""
        print("\n=== Testing AUTO Mode Threshold ===")

        from core.coaching.session_persistence import CoachingDecisionRecord

        workflow_id = "wf_threshold_test"

        # Test case 1: Below threshold (too few sessions)
        session = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_001",
            total_steps=5
        )

        for i in range(5):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type='click',
                decision='accept'
            )
            session.add_decision(record)

        coaching_persistence.complete_session(session.session_id, success=True)

        metrics = metrics_collector.get_workflow_metrics(workflow_id)
        assert not metrics.ready_for_auto, "Should not be ready with only 1 session"

        # Test case 2: Meet minimum sessions
        for sess_num in range(2, 6):
            session = coaching_persistence.create_session(
                workflow_id=workflow_id,
                execution_id=f"exec_{sess_num:03d}",
                total_steps=5
            )

            for i in range(5):
                record = CoachingDecisionRecord(
                    step_index=i,
                    node_id=f"node_{i+1}",
                    action_type='click',
                    decision='accept'
                )
                session.add_decision(record)

            coaching_persistence.complete_session(session.session_id, success=True)

        metrics = metrics_collector.get_workflow_metrics(workflow_id)
        print(f"After 5 sessions - Acceptance: {metrics.acceptance_rate:.2%}, Ready: {metrics.ready_for_auto}")
        assert metrics.ready_for_auto, "Should be ready after 5 sessions with high acceptance"

        print("SUCCESS: Threshold calculation works correctly!")

    def test_global_metrics_aggregation(self, coaching_persistence, metrics_collector):
        """Test global metrics aggregation across multiple workflows."""
        print("\n=== Testing Global Metrics ===")

        from core.coaching.session_persistence import CoachingDecisionRecord

        # Create sessions for multiple workflows
        workflows = ["wf_global_1", "wf_global_2", "wf_global_3"]

        for wf_id in workflows:
            for sess_num in range(3):
                session = coaching_persistence.create_session(
                    workflow_id=wf_id,
                    execution_id=f"exec_{wf_id}_{sess_num}",
                    total_steps=3
                )

                for i in range(3):
                    decision = 'accept' if i != 1 else 'correct'
                    record = CoachingDecisionRecord(
                        step_index=i,
                        node_id=f"node_{i+1}",
                        action_type='click',
                        decision=decision
                    )
                    session.add_decision(record)

                coaching_persistence.complete_session(session.session_id, success=True)

        # Get global metrics
        global_metrics = metrics_collector.get_global_metrics()

        print(f"Total workflows: {global_metrics.total_workflows}")
        print(f"Total sessions: {global_metrics.total_sessions}")
        print(f"Total decisions: {global_metrics.total_decisions}")
        print(f"Acceptance rate: {global_metrics.overall_acceptance_rate:.2%}")

        assert global_metrics.total_workflows == 3
        assert global_metrics.total_sessions == 9  # 3 workflows x 3 sessions
        assert global_metrics.total_decisions == 27  # 9 sessions x 3 decisions

        print("SUCCESS: Global metrics aggregation works correctly!")


class TestCoachingAPIIntegration:
    """Tests for COACHING API integration."""

    def test_api_session_lifecycle(self, coaching_persistence):
        """Test session lifecycle through persistence layer (API simulation)."""
        print("\n=== Testing API Session Lifecycle ===")

        from core.coaching.session_persistence import CoachingDecisionRecord

        # Create session (simulating POST /api/coaching-sessions)
        session = coaching_persistence.create_session(
            workflow_id="wf_api_test",
            execution_id="exec_api",
            total_steps=3
        )
        assert session.session_id is not None

        # Add decision (simulating POST /api/coaching-sessions/{id}/decisions)
        record = CoachingDecisionRecord(
            step_index=0,
            node_id="node_1",
            action_type="click",
            decision="accept"
        )
        session.add_decision(record)
        coaching_persistence.save_session(session)

        # Get session (simulating GET /api/coaching-sessions/{id})
        loaded = coaching_persistence.load_session(session.session_id)
        assert loaded is not None
        assert len(loaded.decisions) == 1

        # Complete session (simulating POST /api/coaching-sessions/{id}/complete)
        completed = coaching_persistence.complete_session(session.session_id, success=True)
        assert completed.status.value == 'completed'

        print("SUCCESS: API session lifecycle works correctly!")


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])
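
To run just this suite with the progress prints visible (mirrors the pytest.main call above):

    pytest tests/test_coaching_e2e.py -v -s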