"""
End-to-End Tests for COACHING Mode

Tests the complete OBSERVATION -> COACHING -> AUTO workflow:
1. Start in OBSERVATION mode (record user actions)
2. Transition to COACHING mode (suggest actions, get user feedback)
3. Accumulate corrections in Correction Packs
4. Track metrics and determine readiness for AUTO mode
5. Transition to AUTO mode when confidence threshold is met

This test simulates the complete learning journey of a workflow.
"""

import pytest
import tempfile
import shutil
import time
from pathlib import Path
from datetime import datetime
from unittest.mock import MagicMock, patch


@pytest.fixture
def temp_storage():
    """Yield a fresh temporary directory and remove it after the test."""
    temp_dir = tempfile.mkdtemp()
    yield Path(temp_dir)
    # ignore_errors: cleanup is best-effort and must not fail the test itself.
    shutil.rmtree(temp_dir, ignore_errors=True)


@pytest.fixture
def coaching_persistence(temp_storage):
    """Create coaching persistence rooted at ``<temp_storage>/coaching_sessions``."""
    # Imported lazily so the project package is only needed when a test runs.
    from core.coaching import CoachingSessionPersistence
    return CoachingSessionPersistence(temp_storage / 'coaching_sessions')


@pytest.fixture
def correction_service(temp_storage):
    """Create a correction pack service rooted at ``<temp_storage>/correction_packs``."""
    from core.corrections import CorrectionPackService
    return CorrectionPackService(storage_path=temp_storage / 'correction_packs')


@pytest.fixture
def metrics_collector(coaching_persistence):
    """Create a metrics collector reading from the ``coaching_persistence`` fixture."""
    from core.coaching import CoachingMetricsCollector
    return CoachingMetricsCollector(coaching_persistence)


def _click_decision(step, decision, **extra):
    """Build the 'click' decision record used throughout these tests.

    Args:
        step: Zero-based step index; the node id is derived as ``node_<step + 1>``.
        decision: Decision string (the tests use 'accept', 'correct', 'reject').
        **extra: Additional keyword arguments forwarded unchanged to
            ``CoachingDecisionRecord`` (the tests pass ``correction`` and
            ``feedback``).

    Returns:
        A new ``CoachingDecisionRecord``.
    """
    # Imported lazily, like the fixtures, so module import stays project-free.
    from core.coaching.session_persistence import CoachingDecisionRecord
    return CoachingDecisionRecord(
        step_index=step,
        node_id=f"node_{step + 1}",
        action_type='click',
        decision=decision,
        **extra
    )


def _run_session(persistence, workflow_id, execution_id, decisions):
    """Create a session, record one click decision per entry, save and complete it.

    Args:
        persistence: Object providing ``create_session``, ``save_session`` and
            ``complete_session`` (the ``coaching_persistence`` fixture).
        workflow_id: Workflow the session belongs to.
        execution_id: Execution identifier for the session.
        decisions: Sequence of decision strings; its length is used as
            ``total_steps``.

    Returns:
        The in-memory session object returned by ``create_session``.
    """
    session = persistence.create_session(
        workflow_id=workflow_id,
        execution_id=execution_id,
        total_steps=len(decisions)
    )
    for step, decision in enumerate(decisions):
        session.add_decision(_click_decision(step, decision))
    persistence.save_session(session)
    persistence.complete_session(session.session_id, success=True)
    return session


class TestCoachingE2E:
    """End-to-end tests for the complete COACHING workflow."""

    def test_complete_learning_journey(
        self,
        coaching_persistence,
        correction_service,
        metrics_collector
    ):
        """
        Test the complete learning journey from OBSERVATION to AUTO.

        Scenario:
        1. Create workflow and start first COACHING session
        2. Make decisions (mix of accept, correct, reject)
        3. Corrections are recorded on the session decisions
        4. Run multiple sessions to build confidence
        5. Check metrics and readiness for AUTO

        The ``correction_service`` fixture is requested but not used by this
        test body; the decisions here only carry correction payloads.
        """
        workflow_id = "wf_e2e_test_001"

        # =====================================================================
        # Phase 1: First COACHING session - Learning phase
        # =====================================================================
        print("\n=== Phase 1: First COACHING Session ===")

        session1 = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_001",
            total_steps=5,
            metadata={'phase': 'learning'}
        )

        # Simulate decisions with some corrections
        decisions_p1 = [
            ('accept', None),
            ('correct', {'target': {'id': 'new_btn'}}),
            ('accept', None),
            ('reject', None),
            ('accept', None),
        ]

        for i, (decision, correction) in enumerate(decisions_p1):
            session1.add_decision(_click_decision(
                i,
                decision,
                correction=correction,
                feedback=f"Decision {i+1}"
            ))

        coaching_persistence.save_session(session1)
        coaching_persistence.complete_session(session1.session_id, success=True)

        # Verify session stats on the copy reloaded through the persistence layer
        session1_reloaded = coaching_persistence.load_session(session1.session_id)
        assert session1_reloaded.stats['accepted'] == 3
        assert session1_reloaded.stats['corrected'] == 1
        assert session1_reloaded.stats['rejected'] == 1

        print(f"Session 1 completed: {session1_reloaded.stats}")

        # =====================================================================
        # Phase 2: Multiple sessions to improve acceptance rate
        # =====================================================================
        print("\n=== Phase 2: Multiple Training Sessions ===")

        # Session 2: Better acceptance after learning
        session2 = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_002",
            total_steps=5
        )

        # Most actions accepted now (corrections are working)
        decisions_p2 = [
            ('accept', None),
            ('accept', None),
            ('accept', None),
            ('accept', None),
            ('correct', {'target': {'text': 'Submit'}}),
        ]

        for i, (decision, correction) in enumerate(decisions_p2):
            session2.add_decision(_click_decision(i, decision, correction=correction))

        coaching_persistence.save_session(session2)
        coaching_persistence.complete_session(session2.session_id, success=True)

        print(f"Session 2 completed: {session2.stats}")

        # Sessions 3-5: High acceptance rate (all five steps accepted)
        for sess_num in range(3, 6):
            _run_session(
                coaching_persistence,
                workflow_id,
                f"exec_{sess_num:03d}",
                ['accept'] * 5
            )
            print(f"Session {sess_num} completed: all accepted")

        # =====================================================================
        # Phase 3: Check Metrics and Learning Progress
        # =====================================================================
        print("\n=== Phase 3: Checking Metrics ===")

        metrics = metrics_collector.get_workflow_metrics(workflow_id)

        print(f"Total sessions: {metrics.total_sessions}")
        print(f"Total decisions: {metrics.total_decisions}")
        print(f"Acceptance rate: {metrics.acceptance_rate:.2%}")
        print(f"Correction rate: {metrics.correction_rate:.2%}")
        print(f"Confidence score: {metrics.confidence_score:.2f}")
        print(f"Learning progress: {metrics.learning_progress.value}")
        print(f"Ready for AUTO: {metrics.ready_for_auto}")
        print(f"Recommendations: {metrics.recommendations}")

        # Assertions: 5 sessions x 5 decisions, of which 22 are accepts and
        # 2 are corrections.
        assert metrics.total_sessions == 5
        assert metrics.total_decisions == 25
        assert metrics.acceptance_rate > 0.8  # Should be high after training
        assert metrics.correction_rate < 0.15  # Should be low

        # =====================================================================
        # Phase 4: Verify Readiness for AUTO
        # =====================================================================
        print("\n=== Phase 4: AUTO Mode Readiness ===")

        # The workflow should be ready for AUTO after successful training
        assert metrics.ready_for_auto, "Workflow should be ready for AUTO mode"
        assert metrics.learning_progress.value in ['ready', 'autonomous']

        print("SUCCESS: Workflow is ready for autonomous execution!")

    def test_session_persistence_and_recovery(self, coaching_persistence):
        """
        Test that COACHING sessions can be paused and resumed.
        """
        print("\n=== Testing Session Persistence ===")

        workflow_id = "wf_persistence_test"

        # Create and partially complete a session
        session = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_persist",
            total_steps=10
        )

        # Add 3 decisions
        for i in range(3):
            session.add_decision(_click_decision(i, 'accept'))

        coaching_persistence.save_session(session)

        # Pause the session
        coaching_persistence.pause_session(session.session_id)

        # Verify paused
        loaded = coaching_persistence.load_session(session.session_id)
        assert loaded.status.value == 'paused'
        assert len(loaded.decisions) == 3
        assert loaded.current_step_index == 3

        # Resume the session
        resumed = coaching_persistence.resume_session(session.session_id)
        assert resumed.status.value == 'active'
        assert resumed.can_resume() is True

        # Continue adding decisions on the resumed object
        for i in range(3, 6):
            resumed.add_decision(_click_decision(i, 'accept'))

        coaching_persistence.save_session(resumed)

        # Verify continuation
        final = coaching_persistence.load_session(session.session_id)
        assert len(final.decisions) == 6
        assert final.current_step_index == 6

        print("SUCCESS: Session persistence and recovery works correctly!")

    def test_correction_integration_with_coaching(
        self,
        coaching_persistence,
        correction_service
    ):
        """
        Test that COACHING corrections integrate with Correction Packs.
        """
        print("\n=== Testing Correction Integration ===")

        from core.corrections import CorrectionPackIntegration

        # Create integration
        integration = CorrectionPackIntegration(
            service=correction_service,
            auto_create_pack=True
        )

        workflow_id = "wf_correction_test"

        # Create COACHING session
        session = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_correction",
            total_steps=5
        )

        from core.coaching.session_persistence import CoachingDecisionRecord

        # Simulate corrections
        corrections_made = [
            {
                'action_type': 'click',
                'element_type': 'button',
                'failure_reason': 'element_not_found',
                'correction_type': 'target_change',
                'original_target': {'text': 'OK'},
                'corrected_target': {'text': 'Valider'}
            },
            {
                'action_type': 'type',
                'element_type': 'input',
                'failure_reason': 'wrong_field',
                'correction_type': 'target_change',
                'original_target': {'id': 'email'},
                'corrected_target': {'name': 'user_email'}
            }
        ]

        # Add decisions with corrections. Built inline rather than through
        # _click_decision because the action type varies per correction.
        for i, correction_data in enumerate(corrections_made):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type=correction_data['action_type'],
                decision='correct',
                correction=correction_data
            )
            session.add_decision(record)

            # Capture correction in Correction Pack
            integration.capture_correction(
                correction_data=correction_data,
                session_id=session.session_id,
                workflow_id=workflow_id
            )

        coaching_persistence.save_session(session)
        coaching_persistence.complete_session(session.session_id, success=True)

        # Verify corrections captured in pack
        # NOTE(review): reads the private attribute `_default_pack_id`; prefer a
        # public accessor if CorrectionPackIntegration offers one - TODO confirm.
        pack = correction_service.get_pack(integration._default_pack_id)
        # get_pack's return is handled both as a dict and as an object.
        corrections_list = pack.get('corrections') if isinstance(pack, dict) else pack.corrections
        assert len(corrections_list) == 2

        print(f"Captured {len(corrections_list)} corrections in Correction Pack")
        print("SUCCESS: Corrections integrated correctly!")

    def test_metrics_threshold_for_auto_mode(self, coaching_persistence, metrics_collector):
        """
        Test that metrics correctly determine AUTO mode readiness.
        """
        print("\n=== Testing AUTO Mode Threshold ===")

        workflow_id = "wf_threshold_test"
        all_accepted = ['accept'] * 5

        # Test case 1: Below threshold (too few sessions)
        _run_session(coaching_persistence, workflow_id, "exec_001", all_accepted)

        metrics = metrics_collector.get_workflow_metrics(workflow_id)
        assert not metrics.ready_for_auto, "Should not be ready with only 1 session"

        # Test case 2: Meet minimum sessions (four more, five in total)
        for sess_num in range(2, 6):
            _run_session(
                coaching_persistence,
                workflow_id,
                f"exec_{sess_num:03d}",
                all_accepted
            )

        metrics = metrics_collector.get_workflow_metrics(workflow_id)
        print(f"After 5 sessions - Acceptance: {metrics.acceptance_rate:.2%}, Ready: {metrics.ready_for_auto}")
        assert metrics.ready_for_auto, "Should be ready after 5 sessions with high acceptance"

        print("SUCCESS: Threshold calculation works correctly!")

    def test_global_metrics_aggregation(self, coaching_persistence, metrics_collector):
        """
        Test global metrics aggregation across multiple workflows.
        """
        print("\n=== Testing Global Metrics ===")

        # Create sessions for multiple workflows; in every session the middle
        # step is a correction and the other two are accepts.
        workflows = ["wf_global_1", "wf_global_2", "wf_global_3"]
        session_decisions = ['accept', 'correct', 'accept']

        for wf_id in workflows:
            for sess_num in range(3):
                _run_session(
                    coaching_persistence,
                    wf_id,
                    f"exec_{wf_id}_{sess_num}",
                    session_decisions
                )

        # Get global metrics
        global_metrics = metrics_collector.get_global_metrics()

        print(f"Total workflows: {global_metrics.total_workflows}")
        print(f"Total sessions: {global_metrics.total_sessions}")
        print(f"Total decisions: {global_metrics.total_decisions}")
        print(f"Acceptance rate: {global_metrics.overall_acceptance_rate:.2%}")

        assert global_metrics.total_workflows == 3
        assert global_metrics.total_sessions == 9  # 3 workflows x 3 sessions
        assert global_metrics.total_decisions == 27  # 9 sessions x 3 decisions

        print("SUCCESS: Global metrics aggregation works correctly!")


class TestCoachingAPIIntegration:
    """Tests for COACHING API integration."""

    def test_api_session_lifecycle(self, coaching_persistence):
        """Test session lifecycle through persistence layer (API simulation)."""
        print("\n=== Testing API Session Lifecycle ===")

        # Create session (simulating POST /api/coaching-sessions)
        session = coaching_persistence.create_session(
            workflow_id="wf_api_test",
            execution_id="exec_api",
            total_steps=3
        )
        assert session.session_id is not None

        # Add decision (simulating POST /api/coaching-sessions/{id}/decisions)
        session.add_decision(_click_decision(0, "accept"))
        coaching_persistence.save_session(session)

        # Get session (simulating GET /api/coaching-sessions/{id})
        loaded = coaching_persistence.load_session(session.session_id)
        assert loaded is not None
        assert len(loaded.decisions) == 1

        # Complete session (simulating POST /api/coaching-sessions/{id}/complete)
        completed = coaching_persistence.complete_session(session.session_id, success=True)
        assert completed.status.value == 'completed'

        print("SUCCESS: API session lifecycle works correctly!")


if __name__ == '__main__':
    # Propagate pytest's exit code so a failing run is visible to the
    # calling shell / CI instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, '-v', '-s']))