feat(cognition): timing + écran attendu + auto-apprentissage Shadow + VLM qwen2.5vl
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped
Mémoire de travail enrichie : - Timing par étape (durée, moyenne, alerte si lent) - Écran attendu vs observation réelle - Contexte VLM étendu VLM reasoning : default qwen2.5vl:3b (gemma4 ne supporte pas vision) Auto-apprentissage Shadow : - stream_processor apprend les dialogues automatiquement - Clic utilisateur après dialogue → pattern mémorisé - Sauvegardé dans data/learned_patterns.json GUI-R1 : 10 patterns additionnels extraits du dataset Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -77,10 +77,15 @@ class CognitiveContext:
|
||||
needs_help: bool = False
|
||||
help_reason: str = ""
|
||||
|
||||
# Métadonnées
|
||||
# Timing
|
||||
session_id: str = ""
|
||||
machine_id: str = ""
|
||||
started_at: Optional[datetime] = None
|
||||
step_started_at: Optional[datetime] = None
|
||||
step_durations: Dict[str, List[float]] = field(default_factory=dict)
|
||||
|
||||
# Ce que Léa devrait voir à l'écran (comparaison attendu vs réel)
|
||||
expected_screen: str = ""
|
||||
|
||||
def record_action(self, action_type: str, target: str = "",
|
||||
result: str = "", success: bool = True,
|
||||
@@ -117,10 +122,47 @@ class CognitiveContext:
|
||||
|
||||
def advance_step(self):
    """Move on to the next step of the plan.

    Before advancing, the duration of the step that just finished is
    appended to ``step_durations`` under the step's description (or a
    synthetic ``step_<n>`` key when no description is set).
    """
    # Close out timing for the step we are leaving, if one was running.
    if self.step_started_at:
        elapsed = (datetime.now() - self.step_started_at).total_seconds()
        key = self.current_step_description or f"step_{self.current_step}"
        self.step_durations.setdefault(key, []).append(elapsed)

    # Start the clock on the new step.
    self.current_step += 1
    self.step_started_at = datetime.now()

    # Pull the next description off the plan, when any remains.
    if self.remaining_steps:
        self.current_step_description = self.remaining_steps.pop(0)
|
||||
|
||||
def get_step_timing(self) -> Optional[Dict[str, Any]]:
    """Return timing info for the step currently in progress.

    Returns:
        None when no step is running (``step_started_at`` unset).
        Otherwise a dict with:
          - "elapsed_seconds": seconds since the step started;
          - "avg_previous": mean of past durations for this step key,
            present only when history exists;
          - "is_slow": True when elapsed exceeds twice the average,
            present only when history exists.
    """
    if not self.step_started_at:
        return None

    elapsed = (datetime.now() - self.step_started_at).total_seconds()
    step_key = self.current_step_description or f"step_{self.current_step}"
    history = self.step_durations.get(step_key, [])
    avg = sum(history) / len(history) if history else None

    result = {"elapsed_seconds": elapsed}
    # BUG FIX: the original tested `if avg:`, so an average of exactly
    # 0.0 (all previous runs instantaneous) silently dropped both
    # "avg_previous" and "is_slow". Test for None instead.
    if avg is not None:
        result["avg_previous"] = avg
        result["is_slow"] = elapsed > avg * 2
    return result
|
||||
|
||||
def set_expected_screen(self, description: str):
    """Record what the screen is expected to show for the current step.

    The stored text is later compared word-by-word against the live
    observation by ``check_screen_matches_expected``.
    """
    self.expected_screen = description
|
||||
|
||||
def check_screen_matches_expected(self) -> Optional[bool]:
    """Compare the live observation against the expected screen text.

    Returns:
        None when there is nothing to compare (no expectation set or no
        observation available). Otherwise True when strictly more than
        30% of the expected words appear as substrings of the combined
        window title + OCR text, False otherwise.
    """
    if not self.expected_screen or not self.current_observation:
        return None

    # Flatten title and OCR text into one lowercase haystack.
    haystack = " ".join([
        self.current_observation.window_title,
        self.current_observation.ocr_text,
    ]).lower()

    words = self.expected_screen.lower().split()
    hit_count = sum(w in haystack for w in words)
    return hit_count / max(len(words), 1) > 0.3
|
||||
|
||||
def learn(self, fact: str):
|
||||
"""Enregistre un fait appris pendant l'exécution."""
|
||||
if fact not in self.learned_facts:
|
||||
@@ -175,6 +217,21 @@ class CognitiveContext:
|
||||
for step in self.remaining_steps[:3]:
|
||||
lines.append(f" - {step}")
|
||||
|
||||
timing = self.get_step_timing()
|
||||
if timing:
|
||||
lines.append(f"TEMPS ÉTAPE : {timing['elapsed_seconds']:.1f}s")
|
||||
if timing.get('avg_previous'):
|
||||
lines.append(f"MOYENNE PRÉCÉDENTE : {timing['avg_previous']:.1f}s")
|
||||
if timing.get('is_slow'):
|
||||
lines.append("⚠ ÉTAPE ANORMALEMENT LENTE")
|
||||
|
||||
if self.expected_screen:
|
||||
match = self.check_screen_matches_expected()
|
||||
if match is False:
|
||||
lines.append(f"⚠ ÉCRAN INATTENDU (attendu: {self.expected_screen})")
|
||||
elif match is True:
|
||||
lines.append(f"ÉCRAN CONFORME : {self.expected_screen}")
|
||||
|
||||
lines.append(f"CONFIANCE : {self.confidence:.0%}")
|
||||
|
||||
if self.needs_help:
|
||||
|
||||
@@ -287,7 +287,7 @@ Si l'écran est normal sans action nécessaire, réponds action="nothing".
|
||||
Réponds UNIQUEMENT le JSON, pas d'explication."""
|
||||
|
||||
ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
|
||||
model = os.environ.get("RPA_VLM_MODEL", os.environ.get("VLM_MODEL", "gemma4:e4b"))
|
||||
model = os.environ.get("RPA_REASONING_MODEL", os.environ.get("RPA_VLM_MODEL", "qwen2.5vl:3b"))
|
||||
|
||||
response = requests.post(
|
||||
f"{ollama_url}/api/generate",
|
||||
|
||||
Reference in New Issue
Block a user