feat(cognition): timing + écran attendu + auto-apprentissage Shadow + VLM qwen2.5vl

Mémoire de travail enrichie : - Timing par étape (durée, moyenne, alerte si lent) - Écran attendu vs observation réelle - Contexte VLM étendu VLM reasoning : default qwen2.5vl:3b (gemma4 ne supporte pas vision) Auto-apprentissage Shadow : - stream_processor apprend les dialogues automatiquement - Clic utilisateur après dialogue → pattern mémorisé - Sauvegardé dans data/learned_patterns.json GUI-R1 : 10 patterns additionnels extraits du dataset Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-20 21:52:45 +02:00
parent 04a14a56b2
commit cbe8dc95d2
3 changed files with 189 additions and 2 deletions
--- a/agent_v0/server_v1/stream_processor.py
+++ b/agent_v0/server_v1/stream_processor.py
@@ -1791,6 +1791,10 @@ class StreamProcessor:
        # Workflows construits (pour le matching)
        self._workflows: Dict[str, Any] = {}
        # Shadow learning : dernier pattern UI détecté par session
        # Stocke {session_id: {"pattern_name": str, "ocr_text": str, "screen_state": obj, "shot_id": str}}
        self._pending_ui_patterns: Dict[str, Dict[str, Any]] = {}
        # Charger les workflows existants depuis le disque
        self._load_persisted_workflows()
@@ -1975,6 +1979,9 @@ class StreamProcessor:
        - key_combo/key_press avec uniquement des modificateurs seuls (ctrl, alt, shift, etc.)
        - key_combo/key_press avec liste de touches vide
        - text_input avec texte vide
        Shadow learning : quand un clic suit un pattern UI détecté,
        on apprend l'association dialogue→bouton.
        """
        if _is_parasitic_event(event_data):
            logger.debug(
@@ -1982,9 +1989,119 @@ class StreamProcessor:
                f"type={event_data.get('type')}, data={event_data.get('keys', event_data.get('text', ''))}"
            )
            return {"status": "event_filtered", "session_id": session_id, "reason": "parasitic"}
        # Shadow learning : si un pattern UI est en attente et qu'on reçoit un clic
        if event_data.get("type") == "mouse_click":
            self._try_shadow_learn(session_id, event_data)
        self.session_manager.add_event(session_id, event_data)
        return {"status": "event_recorded", "session_id": session_id}
    def _try_shadow_learn(self, session_id: str, click_event: Dict[str, Any]):
        """Tente d'apprendre un pattern UI depuis un clic observé en Shadow.
        Quand un screenshot contenait un pattern UI détecté (dialogue) et que
        l'utilisateur clique ensuite, on extrait le texte OCR au point de clic
        pour apprendre l'association : "quand je vois ce texte → cliquer sur ce bouton".
        """
        with self._data_lock:
            pending = self._pending_ui_patterns.pop(session_id, None)
        if not pending:
            return
        screen_state = pending.get("screen_state")
        if screen_state is None:
            return
        # Extraire la position du clic (pixels absolus)
        pos = click_event.get("pos", [])
        if not pos or len(pos) != 2:
            return
        click_x, click_y = pos[0], pos[1]
        # Trouver le texte OCR le plus proche du point de clic
        # via les ui_elements du ScreenState (ils ont bbox + label)
        clicked_label = self._find_label_at_position(screen_state, click_x, click_y)
        if not clicked_label:
            return
        # Extraire le trigger principal du texte OCR du dialogue
        ocr_text = pending.get("ocr_text", "")
        # Utiliser un extrait court comme trigger (max 80 chars, premier segment pertinent)
        trigger_text = ocr_text[:80].strip().lower()
        if not trigger_text:
            return
        logger.info(
            f"Shadow learning: pattern '{pending['pattern_name']}' "
            f"→ utilisateur a cliqué '{clicked_label}' | trigger='{trigger_text[:40]}...'"
        )
        # Sauvegarder le pattern appris
        try:
            from core.knowledge.ui_patterns import UIPatternLibrary
            lib = UIPatternLibrary()
            lib.save_learned_pattern({
                "category": "dialog",
                "triggers": [trigger_text],
                "action": "click",
                "target": clicked_label,
                "os": "windows",
                "confidence": 0.8,
            })
        except Exception as e:
            logger.warning(f"Shadow learning: échec sauvegarde pattern: {e}")
    @staticmethod
    def _find_label_at_position(screen_state, click_x: int, click_y: int) -> Optional[str]:
        """Trouve le label de l'élément UI le plus proche du point de clic.
        Parcourt les ui_elements du ScreenState et retourne le label de
        l'élément dont la bbox contient le point, ou le plus proche si aucun
        ne contient exactement le point.
        """
        ui_elements = getattr(screen_state, "ui_elements", [])
        if not ui_elements:
            return None
        best_label = None
        best_dist = float("inf")
        for elem in ui_elements:
            bbox = getattr(elem, "bbox", None)
            label = getattr(elem, "label", "")
            if not bbox or not label:
                continue
            # BBox = (x, y, width, height) — extraire les coordonnées
            try:
                bx, by = bbox.x, bbox.y
                bw, bh = bbox.width, bbox.height
            except AttributeError:
                # Fallback si bbox est une liste/tuple
                if hasattr(bbox, '__len__') and len(bbox) >= 4:
                    bx, by, bw, bh = bbox[0], bbox[1], bbox[2], bbox[3]
                else:
                    continue
            # Vérifier si le clic est dans la bbox
            if bx <= click_x <= bx + bw and by <= click_y <= by + bh:
                return label.strip()
            # Sinon calculer la distance au centre
            cx = bx + bw / 2
            cy = by + bh / 2
            dist = ((click_x - cx) ** 2 + (click_y - cy) ** 2) ** 0.5
            if dist < best_dist:
                best_dist = dist
                best_label = label.strip()
        # Ne retourner le plus proche que s'il est raisonnablement proche (< 100px)
        if best_label and best_dist < 100:
            return best_label
        return None
    # =========================================================================
    # Screenshots
    # =========================================================================
@@ -2055,6 +2172,19 @@ class StreamProcessor:
                        result["ui_pattern_action"] = pattern["action"]
                        result["ui_pattern_target"] = pattern["target"]
                        logger.info(f"Pattern UI détecté: {pattern['pattern']} → {pattern['target']}")
                        # Shadow learning : mémoriser le pattern en attente du clic utilisateur
                        with self._data_lock:
                            self._pending_ui_patterns[session_id] = {
                                "pattern_name": pattern["pattern"],
                                "ocr_text": ocr_text,
                                "screen_state": screen_state,
                                "shot_id": shot_id,
                            }
                    else:
                        # Pas de pattern connu → effacer le pending (l'écran a changé)
                        with self._data_lock:
                            self._pending_ui_patterns.pop(session_id, None)
            except ImportError:
                pass
            except Exception as e:
--- a/core/cognition/working_memory.py
+++ b/core/cognition/working_memory.py
@@ -77,10 +77,15 @@ class CognitiveContext:
    needs_help: bool = False
    help_reason: str = ""
-    # Métadonnées
+    # Timing
    session_id: str = ""
    machine_id: str = ""
    started_at: Optional[datetime] = None
    step_started_at: Optional[datetime] = None
    step_durations: Dict[str, List[float]] = field(default_factory=dict)
    # Ce que Léa devrait voir à l'écran (comparaison attendu vs réel)
    expected_screen: str = ""
    def record_action(self, action_type: str, target: str = "",
                      result: str = "", success: bool = True,
@@ -117,10 +122,47 @@ class CognitiveContext:
    def advance_step(self):
        """Passe à l'étape suivante du plan."""
        # Enregistrer la durée de l'étape précédente
        if self.step_started_at:
            duration = (datetime.now() - self.step_started_at).total_seconds()
            step_key = self.current_step_description or f"step_{self.current_step}"
            self.step_durations.setdefault(step_key, []).append(duration)
        self.current_step += 1
        self.step_started_at = datetime.now()
        if self.remaining_steps:
            self.current_step_description = self.remaining_steps.pop(0)
    def get_step_timing(self) -> Optional[Dict[str, Any]]:
        """Retourne les infos de timing de l'étape en cours."""
        if not self.step_started_at:
            return None
        elapsed = (datetime.now() - self.step_started_at).total_seconds()
        step_key = self.current_step_description or f"step_{self.current_step}"
        history = self.step_durations.get(step_key, [])
        avg = sum(history) / len(history) if history else None
        result = {"elapsed_seconds": elapsed}
        if avg:
            result["avg_previous"] = avg
            result["is_slow"] = elapsed > avg * 2
        return result
    def set_expected_screen(self, description: str) -> None:
        """Set the description of what Léa should see on screen for this step."""
        self.expected_screen = description
    def check_screen_matches_expected(self) -> Optional[bool]:
        """Compare l'observation actuelle avec l'écran attendu."""
        if not self.expected_screen or not self.current_observation:
            return None
        obs_text = (self.current_observation.window_title + " " +
                    self.current_observation.ocr_text).lower()
        expected_words = self.expected_screen.lower().split()
        matches = sum(1 for w in expected_words if w in obs_text)
        return matches / max(len(expected_words), 1) > 0.3
    def learn(self, fact: str):
        """Enregistre un fait appris pendant l'exécution."""
        if fact not in self.learned_facts:
@@ -175,6 +217,21 @@ class CognitiveContext:
            for step in self.remaining_steps[:3]:
                lines.append(f"  - {step}")
        timing = self.get_step_timing()
        if timing:
            lines.append(f"TEMPS ÉTAPE : {timing['elapsed_seconds']:.1f}s")
            if timing.get('avg_previous'):
                lines.append(f"MOYENNE PRÉCÉDENTE : {timing['avg_previous']:.1f}s")
                if timing.get('is_slow'):
                    lines.append("⚠ ÉTAPE ANORMALEMENT LENTE")
        if self.expected_screen:
            match = self.check_screen_matches_expected()
            if match is False:
                lines.append(f"⚠ ÉCRAN INATTENDU (attendu: {self.expected_screen})")
            elif match is True:
                lines.append(f"ÉCRAN CONFORME : {self.expected_screen}")
        lines.append(f"CONFIANCE : {self.confidence:.0%}")
        if self.needs_help:
--- a/core/execution/input_handler.py
+++ b/core/execution/input_handler.py
@@ -287,7 +287,7 @@ Si l'écran est normal sans action nécessaire, réponds action="nothing".
 Réponds UNIQUEMENT le JSON, pas d'explication."""
        ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
-        model = os.environ.get("RPA_VLM_MODEL", os.environ.get("VLM_MODEL", "gemma4:e4b"))
+        model = os.environ.get("RPA_REASONING_MODEL", os.environ.get("RPA_VLM_MODEL", "qwen2.5vl:3b"))
        response = requests.post(
            f"{ollama_url}/api/generate",