feat(coaching): Implement complete COACHING mode infrastructure
Add a comprehensive COACHING mode system.

Backend:
- core/coaching module with session persistence and metrics
- CoachingSessionPersistence for pause/resume of sessions
- CoachingMetricsCollector with learning-progress tracking
- REST API blueprint for coaching session management
- Execution integration with COACHING mode support

Frontend:
- CoachingPanel component with keyboard shortcuts
- Decision buttons (accept/reject/correct/manual/skip)
- Real-time stats display and correction editor
- CorrectionPacksDashboard for pack visualization
- WebSocket hooks for real-time COACHING events

Metrics & Monitoring:
- WorkflowLearningMetrics with confidence scoring
- GlobalCoachingMetrics for system-wide analytics
- AUTO mode readiness detection (85% acceptance threshold)
- Learning progress levels (OBSERVATION → COACHING → AUTO)

Tests:
- E2E tests for the complete OBSERVATION → AUTO journey
- Session persistence and recovery tests
- Metrics threshold validation tests

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
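The AUTO readiness rule called out above can be summarised in a few lines. A minimal sketch, assuming a simple rate check: the real logic lives in WorkflowLearningMetrics / CoachingMetricsCollector, the 0.85 threshold comes from this message, and the five-session minimum is inferred from test_metrics_threshold_for_auto_mode in the diff below.

    # Hypothetical helper, for illustration only.
    MIN_SESSIONS = 5             # inferred from the threshold test below
    ACCEPTANCE_THRESHOLD = 0.85  # per the commit message

    def ready_for_auto(total_sessions: int, accepted: int, total_decisions: int) -> bool:
        """Return True when a workflow qualifies for AUTO mode."""
        if total_sessions < MIN_SESSIONS or total_decisions == 0:
            return False
        return accepted / total_decisions >= ACCEPTANCE_THRESHOLD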
tests/test_coaching_e2e.py | 486 insertions (new file)
@@ -0,0 +1,486 @@
"""
End-to-End Tests for COACHING Mode

Tests the complete OBSERVATION -> COACHING -> AUTO workflow:
1. Start in OBSERVATION mode (record user actions)
2. Transition to COACHING mode (suggest actions, get user feedback)
3. Accumulate corrections in Correction Packs
4. Track metrics and determine readiness for AUTO mode
5. Transition to AUTO mode when confidence threshold is met

This test simulates the complete learning journey of a workflow.
"""

import pytest
import tempfile
import shutil
from pathlib import Path

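# For orientation, a minimal sketch of the learning-progress ladder these
# tests exercise. The real enum lives in core.coaching; only the 'ready' and
# 'autonomous' values are confirmed by assertions in this file -- the other
# names are assumptions based on the OBSERVATION -> COACHING -> AUTO design
# described in the commit message. This local copy is illustrative and unused.
from enum import Enum


class _LearningProgressSketch(Enum):
    OBSERVATION = 'observation'  # record user actions, no suggestions yet
    COACHING = 'coaching'        # suggest actions, collect accept/correct/reject
    READY = 'ready'              # thresholds met, AUTO can be enabled
    AUTONOMOUS = 'autonomous'    # executing without supervision

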
@pytest.fixture
def temp_storage():
    """Create temporary storage directories."""
    temp_dir = tempfile.mkdtemp()
    yield Path(temp_dir)
    shutil.rmtree(temp_dir, ignore_errors=True)


@pytest.fixture
def coaching_persistence(temp_storage):
    """Create coaching persistence with temp storage."""
    from core.coaching import CoachingSessionPersistence
    return CoachingSessionPersistence(temp_storage / 'coaching_sessions')


@pytest.fixture
def correction_service(temp_storage):
    """Create correction pack service with temp storage."""
    from core.corrections import CorrectionPackService
    return CorrectionPackService(storage_path=temp_storage / 'correction_packs')


@pytest.fixture
def metrics_collector(coaching_persistence):
    """Create metrics collector."""
    from core.coaching import CoachingMetricsCollector
    return CoachingMetricsCollector(coaching_persistence)


class TestCoachingE2E:
    """End-to-end tests for the complete COACHING workflow."""

    def test_complete_learning_journey(
        self,
        coaching_persistence,
        correction_service,
        metrics_collector
    ):
        """
        Test the complete learning journey from OBSERVATION to AUTO.

        Scenario:
        1. Create workflow and start first COACHING session
        2. Make decisions (mix of accept, correct, reject)
        3. Corrections are captured in Correction Packs
        4. Run multiple sessions to build confidence
        5. Check metrics and readiness for AUTO
        """
        workflow_id = "wf_e2e_test_001"

        # =====================================================================
        # Phase 1: First COACHING session - Learning phase
        # =====================================================================
        print("\n=== Phase 1: First COACHING Session ===")

        session1 = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_001",
            total_steps=5,
            metadata={'phase': 'learning'}
        )

        # Simulate decisions with some corrections
        from core.coaching.session_persistence import CoachingDecisionRecord

        decisions_p1 = [
            ('accept', None),
            ('correct', {'target': {'id': 'new_btn'}}),
            ('accept', None),
            ('reject', None),
            ('accept', None),
        ]

        for i, (decision, correction) in enumerate(decisions_p1):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type='click',
                decision=decision,
                correction=correction,
                feedback=f"Decision {i+1}"
            )
            session1.add_decision(record)

        coaching_persistence.complete_session(session1.session_id, success=True)

        # Verify session stats
        session1_reloaded = coaching_persistence.load_session(session1.session_id)
        assert session1_reloaded.stats['accepted'] == 3
        assert session1_reloaded.stats['corrected'] == 1
        assert session1_reloaded.stats['rejected'] == 1

        print(f"Session 1 completed: {session1_reloaded.stats}")

        # =====================================================================
        # Phase 2: Multiple sessions to improve acceptance rate
        # =====================================================================
        print("\n=== Phase 2: Multiple Training Sessions ===")

        # Session 2: Better acceptance after learning
        session2 = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_002",
            total_steps=5
        )

        # Most actions accepted now (corrections are working)
        decisions_p2 = [
            ('accept', None),
            ('accept', None),
            ('accept', None),
            ('accept', None),
            ('correct', {'target': {'text': 'Submit'}}),
        ]

        for i, (decision, correction) in enumerate(decisions_p2):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type='click',
                decision=decision,
                correction=correction
            )
            session2.add_decision(record)

        coaching_persistence.complete_session(session2.session_id, success=True)
        print(f"Session 2 completed: {session2.stats}")

        # Sessions 3-5: High acceptance rate
        for sess_num in range(3, 6):
            session = coaching_persistence.create_session(
                workflow_id=workflow_id,
                execution_id=f"exec_{sess_num:03d}",
                total_steps=5
            )

            # All accepted
            for i in range(5):
                record = CoachingDecisionRecord(
                    step_index=i,
                    node_id=f"node_{i+1}",
                    action_type='click',
                    decision='accept'
                )
                session.add_decision(record)

            coaching_persistence.complete_session(session.session_id, success=True)
            print(f"Session {sess_num} completed: all accepted")

        # =====================================================================
        # Phase 3: Check Metrics and Learning Progress
        # =====================================================================
        print("\n=== Phase 3: Checking Metrics ===")

        metrics = metrics_collector.get_workflow_metrics(workflow_id)

        print(f"Total sessions: {metrics.total_sessions}")
        print(f"Total decisions: {metrics.total_decisions}")
        print(f"Acceptance rate: {metrics.acceptance_rate:.2%}")
        print(f"Correction rate: {metrics.correction_rate:.2%}")
        print(f"Confidence score: {metrics.confidence_score:.2f}")
        print(f"Learning progress: {metrics.learning_progress.value}")
        print(f"Ready for AUTO: {metrics.ready_for_auto}")
        print(f"Recommendations: {metrics.recommendations}")

        # Assertions
        assert metrics.total_sessions == 5
        assert metrics.total_decisions == 25
        assert metrics.acceptance_rate > 0.8  # Should be high after training
        assert metrics.correction_rate < 0.15  # Should be low

        # =====================================================================
        # Phase 4: Verify Readiness for AUTO
        # =====================================================================
        print("\n=== Phase 4: AUTO Mode Readiness ===")

        # The workflow should be ready for AUTO after successful training
        assert metrics.ready_for_auto, "Workflow should be ready for AUTO mode"
        assert metrics.learning_progress.value in ['ready', 'autonomous']

        print("SUCCESS: Workflow is ready for autonomous execution!")

    def test_session_persistence_and_recovery(self, coaching_persistence):
        """Test that COACHING sessions can be paused and resumed."""
        print("\n=== Testing Session Persistence ===")

        workflow_id = "wf_persistence_test"

        # Create and partially complete a session
        session = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_persist",
            total_steps=10
        )

        from core.coaching.session_persistence import CoachingDecisionRecord

        # Add 3 decisions
        for i in range(3):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type='click',
                decision='accept'
            )
            session.add_decision(record)

        coaching_persistence.save_session(session)

        # Pause the session
        coaching_persistence.pause_session(session.session_id)

        # Verify paused
        loaded = coaching_persistence.load_session(session.session_id)
        assert loaded.status.value == 'paused'
        assert len(loaded.decisions) == 3
        assert loaded.current_step_index == 3

        # Resume the session
        resumed = coaching_persistence.resume_session(session.session_id)
        assert resumed.status.value == 'active'
        assert resumed.can_resume() is True

        # Continue adding decisions
        for i in range(3, 6):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type='click',
                decision='accept'
            )
            resumed.add_decision(record)

        coaching_persistence.save_session(resumed)

        # Verify continuation
        final = coaching_persistence.load_session(session.session_id)
        assert len(final.decisions) == 6
        assert final.current_step_index == 6

        print("SUCCESS: Session persistence and recovery works correctly!")

    def test_correction_integration_with_coaching(
        self,
        coaching_persistence,
        correction_service
    ):
        """Test that COACHING corrections integrate with Correction Packs."""
        print("\n=== Testing Correction Integration ===")

        from core.corrections import CorrectionPackIntegration

        # Create integration
        integration = CorrectionPackIntegration(
            service=correction_service,
            auto_create_pack=True
        )

        workflow_id = "wf_correction_test"

        # Create COACHING session
        session = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_correction",
            total_steps=5
        )

        from core.coaching.session_persistence import CoachingDecisionRecord

        # Simulate corrections
        corrections_made = [
            {
                'action_type': 'click',
                'element_type': 'button',
                'failure_reason': 'element_not_found',
                'correction_type': 'target_change',
                'original_target': {'text': 'OK'},
                'corrected_target': {'text': 'Valider'}
            },
            {
                'action_type': 'type',
                'element_type': 'input',
                'failure_reason': 'wrong_field',
                'correction_type': 'target_change',
                'original_target': {'id': 'email'},
                'corrected_target': {'name': 'user_email'}
            }
        ]

        # Add decisions with corrections
        for i, correction_data in enumerate(corrections_made):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type=correction_data['action_type'],
                decision='correct',
                correction=correction_data
            )
            session.add_decision(record)

            # Capture correction in Correction Pack
            integration.capture_correction(
                correction_data=correction_data,
                session_id=session.session_id,
                workflow_id=workflow_id
            )

        coaching_persistence.complete_session(session.session_id, success=True)

        # Verify corrections captured in pack
        pack = correction_service.get_pack(integration._default_pack_id)
        corrections_list = pack.get('corrections') if isinstance(pack, dict) else pack.corrections
        assert len(corrections_list) == 2

        print(f"Captured {len(corrections_list)} corrections in Correction Pack")
        print("SUCCESS: Corrections integrated correctly!")

    def test_metrics_threshold_for_auto_mode(self, coaching_persistence, metrics_collector):
        """Test that metrics correctly determine AUTO mode readiness."""
        print("\n=== Testing AUTO Mode Threshold ===")

        from core.coaching.session_persistence import CoachingDecisionRecord

        workflow_id = "wf_threshold_test"

        # Test case 1: Below threshold (too few sessions)
        session = coaching_persistence.create_session(
            workflow_id=workflow_id,
            execution_id="exec_001",
            total_steps=5
        )

        for i in range(5):
            record = CoachingDecisionRecord(
                step_index=i,
                node_id=f"node_{i+1}",
                action_type='click',
                decision='accept'
            )
            session.add_decision(record)

        coaching_persistence.complete_session(session.session_id, success=True)

        metrics = metrics_collector.get_workflow_metrics(workflow_id)
        assert not metrics.ready_for_auto, "Should not be ready with only 1 session"

        # Test case 2: Meet minimum sessions
        for sess_num in range(2, 6):
            session = coaching_persistence.create_session(
                workflow_id=workflow_id,
                execution_id=f"exec_{sess_num:03d}",
                total_steps=5
            )

            for i in range(5):
                record = CoachingDecisionRecord(
                    step_index=i,
                    node_id=f"node_{i+1}",
                    action_type='click',
                    decision='accept'
                )
                session.add_decision(record)

            coaching_persistence.complete_session(session.session_id, success=True)

        metrics = metrics_collector.get_workflow_metrics(workflow_id)
        print(f"After 5 sessions - Acceptance: {metrics.acceptance_rate:.2%}, Ready: {metrics.ready_for_auto}")
        assert metrics.ready_for_auto, "Should be ready after 5 sessions with high acceptance"

        print("SUCCESS: Threshold calculation works correctly!")

    def test_global_metrics_aggregation(self, coaching_persistence, metrics_collector):
        """Test global metrics aggregation across multiple workflows."""
        print("\n=== Testing Global Metrics ===")

        from core.coaching.session_persistence import CoachingDecisionRecord

        # Create sessions for multiple workflows
        workflows = ["wf_global_1", "wf_global_2", "wf_global_3"]

        for wf_id in workflows:
            for sess_num in range(3):
                session = coaching_persistence.create_session(
                    workflow_id=wf_id,
                    execution_id=f"exec_{wf_id}_{sess_num}",
                    total_steps=3
                )

                for i in range(3):
                    decision = 'accept' if i != 1 else 'correct'
                    record = CoachingDecisionRecord(
                        step_index=i,
                        node_id=f"node_{i+1}",
                        action_type='click',
                        decision=decision
                    )
                    session.add_decision(record)

                coaching_persistence.complete_session(session.session_id, success=True)

        # Get global metrics
        global_metrics = metrics_collector.get_global_metrics()

        print(f"Total workflows: {global_metrics.total_workflows}")
        print(f"Total sessions: {global_metrics.total_sessions}")
        print(f"Total decisions: {global_metrics.total_decisions}")
        print(f"Acceptance rate: {global_metrics.overall_acceptance_rate:.2%}")

        assert global_metrics.total_workflows == 3
        assert global_metrics.total_sessions == 9  # 3 workflows x 3 sessions
        assert global_metrics.total_decisions == 27  # 9 sessions x 3 decisions

        print("SUCCESS: Global metrics aggregation works correctly!")


class TestCoachingAPIIntegration:
    """Tests for COACHING API integration."""

    def test_api_session_lifecycle(self, coaching_persistence):
        """Test session lifecycle through persistence layer (API simulation)."""
        print("\n=== Testing API Session Lifecycle ===")

        from core.coaching.session_persistence import CoachingDecisionRecord

        # Create session (simulating POST /api/coaching-sessions)
        session = coaching_persistence.create_session(
            workflow_id="wf_api_test",
            execution_id="exec_api",
            total_steps=3
        )
        assert session.session_id is not None

        # Add decision (simulating POST /api/coaching-sessions/{id}/decisions)
        record = CoachingDecisionRecord(
            step_index=0,
            node_id="node_1",
            action_type="click",
            decision="accept"
        )
        session.add_decision(record)
        coaching_persistence.save_session(session)

        # Get session (simulating GET /api/coaching-sessions/{id})
        loaded = coaching_persistence.load_session(session.session_id)
        assert loaded is not None
        assert len(loaded.decisions) == 1

        # Complete session (simulating POST /api/coaching-sessions/{id}/complete)
        completed = coaching_persistence.complete_session(session.session_id, success=True)
        assert completed.status.value == 'completed'

        print("SUCCESS: API session lifecycle works correctly!")


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])
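
To run just this suite with the progress prints visible (mirrors the pytest.main call above):

    pytest tests/test_coaching_e2e.py -v -s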