feat: Léa chat + IRBuilder enrichi (stratégies V4 complètes)

Aspect 2/4 Léa : interface conversationnelle
- chat_interface.py : ChatSession thread-safe, états idle/planning/awaiting/executing/done
- 5 endpoints REST : /api/v1/chat/* (session, message, history, confirm, sessions)
- web_dashboard/chat.html + chat.js : UI minimaliste, polling 2s, pas de framework
- Proxy Flask /api/chat/* → serveur streaming
- 34 tests (happy path, abandon, refus, erreurs, gemma4 down)

IRBuilder enrichi pour plans V4 complets
- _event_to_action() appelle enrich_click_from_screenshot() quand session_dir dispo
- Chaque clic porte _enrichment (by_text OCR, anchor_image_base64, vlm_description)
- ExecutionCompiler consomme l'enrichissement pour produire 3 stratégies par clic

Avant : [ocr] uniquement, target="unknown_window"
Après : [ocr, template, vlm] avec vrai texte OCR ("Rechercher", "Ouvrir")

Validé sur session réelle : 10/10 clics enrichis (by_text + anchor + vlm_description)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 09:01:13 +02:00
parent a6eb4c168f
commit f541bb8ce4
8 changed files with 2241 additions and 26 deletions
--- a/core/workflow/execution_compiler.py
+++ b/core/workflow/execution_compiler.py
@@ -208,18 +208,32 @@ class ExecutionCompiler:
    ) -> tuple:
        """Compiler les stratégies de résolution pour un clic.

+        Utilise les données d'enrichissement visuel (action._enrichment) si
+        disponibles (crop anchor, description VLM, window_capture).
+
        Ordre de priorité :
-        1. OCR exact (si texte connu) — 100ms, pixel-perfect
-        2. Template matching (si crop disponible) — 10ms, même interface
-        3. Position relative (si hint disponible) — instantané, fragile
-        4. VLM (dernier recours) — 2-5s, exception handler
+        1. OCR exact (si by_text disponible) — 100ms, pixel-perfect
+        2. Template matching (si anchor_image_base64) — 10ms
+        3. VLM (vlm_description) — 2-5s, exception handler

        Le learning peut réordonner si une stratégie a mieux marché avant.
        """
        primary = None
        fallbacks = []

-        target_text = action.anchor_hint or action.target
+        # Lire l'enrichissement visuel si dispo
+        enrichment = getattr(action, "_enrichment", None) or {}
+        by_text_from_enrich = enrichment.get("by_text", "")
+        anchor_b64 = enrichment.get("anchor_image_base64", "")
+        vlm_desc_from_enrich = enrichment.get("vlm_description", "")
+        window_title = enrichment.get("window_title", "")
+
+        # Source de texte : enrichissement > anchor_hint > target
+        target_text = by_text_from_enrich or action.anchor_hint or action.target
+        # Ne pas utiliser "unknown_window" comme texte OCR
+        if target_text == "unknown_window":
+            target_text = ""
+
        learned_method = learned.get(target_text, "")

        # Stratégie OCR — le texte visible est la meilleure ancre
@@ -227,41 +241,49 @@ class ExecutionCompiler:
            ocr_strategy = ResolutionStrategy(
                method="ocr",
                target_text=target_text,
-                threshold=0.8,
+                threshold=0.7,
            )
-            # Si le learning dit que l'OCR marche pour cette cible, c'est la primaire
-            if not learned_method or learned_method in ("ocr", "som_text_match", "hybrid_text_direct"):
+            if not learned_method or learned_method in ("ocr", "som_text_match", "hybrid_text_direct", "v4_ocr"):
                primary = ocr_strategy
            else:
                fallbacks.append(ocr_strategy)

        # Stratégie template — le crop visuel de l'enregistrement
-        if action.anchor_hint:
+        if anchor_b64:
            template_strategy = ResolutionStrategy(
                method="template",
-                target_text=action.anchor_hint,
+                target_text=target_text,
+                anchor_b64=anchor_b64,
                threshold=0.85,
            )
-            if learned_method in ("anchor_template", "template_matching"):
+            if learned_method in ("anchor_template", "template_matching", "v4_template"):
+                if primary:
+                    fallbacks.insert(0, primary)
                primary = template_strategy
            else:
                fallbacks.append(template_strategy)

        # Stratégie VLM — exception handler (dernier recours)
-        vlm_description = action.target or step.intent
-        vlm_strategy = ResolutionStrategy(
-            method="vlm",
-            vlm_description=vlm_description,
-            threshold=0.6,
-        )
-        fallbacks.append(vlm_strategy)
+        vlm_description = vlm_desc_from_enrich or action.target or step.intent
+        if vlm_description and vlm_description != "unknown_window":
+            vlm_strategy = ResolutionStrategy(
+                method="vlm",
+                vlm_description=vlm_description,
+                threshold=0.6,
+            )
+            fallbacks.append(vlm_strategy)

-        # Si aucune primaire trouvée, utiliser le VLM
+        # Si aucune primaire trouvée, prendre le premier fallback
        if primary is None:
            if fallbacks:
                primary = fallbacks.pop(0)
            else:
-                primary = vlm_strategy
+                # Dernier recours : VLM avec l'intention métier
+                primary = ResolutionStrategy(
+                    method="vlm",
+                    vlm_description=step.intent or "élément UI",
+                    threshold=0.5,
+                )

        return primary, fallbacks

--- a/core/workflow/ir_builder.py
+++ b/core/workflow/ir_builder.py
@@ -63,6 +63,15 @@ class IRBuilder:
        """
        t_start = time.time()

+        # Résoudre le session_dir_path pour l'enrichissement visuel
+        session_dir_path = Path(session_dir) if session_dir else None
+        if session_dir_path and not session_dir_path.is_dir():
+            logger.warning(
+                f"IRBuilder: session_dir '{session_dir}' introuvable — "
+                f"enrichissement visuel désactivé"
+            )
+            session_dir_path = None
+
        # Créer le WorkflowIR vide
        ir = WorkflowIR.new(
            name=name or f"Workflow du {time.strftime('%d/%m/%Y %H:%M')}",
@@ -90,6 +99,7 @@ class IRBuilder:
                total_steps=len(segments),
                workflow_name=ir.name,
                domain=domain,
+                session_dir_path=session_dir_path,
            )
            ir.steps.append(step)

@@ -189,6 +199,7 @@ class IRBuilder:
        total_steps: int,
        workflow_name: str,
        domain: str,
+        session_dir_path: Optional[Path] = None,
    ) -> Step:
        """Construire une Step depuis un segment d'événements.

@@ -197,7 +208,7 @@ class IRBuilder:
        # Construire la description du segment pour gemma4
        actions = []
        for evt in segment:
-            action = self._event_to_action(evt)
+            action = self._event_to_action(evt, session_dir_path=session_dir_path)
            if action:
                actions.append(action)

@@ -217,17 +228,49 @@ class IRBuilder:
            actions=actions,
        )

-    def _event_to_action(self, evt: Dict) -> Optional[Action]:
-        """Convertir un événement brut en Action."""
+    def _event_to_action(self, evt: Dict, session_dir_path: Optional[Path] = None) -> Optional[Action]:
+        """Convertir un événement brut en Action enrichie.
+
+        Pour les clics : appelle enrich_click_from_screenshot() si le session_dir
+        est disponible pour obtenir :
+        - by_text (texte OCR exact de l'élément cliqué)
+        - anchor_image_base64 (crop 80x80 pour template matching)
+        - vlm_description (description positionnelle)
+        - window_capture (rect pour le grounding ciblé)
+
+        Cet enrichissement est LA clé pour que l'ExecutionCompiler produise
+        des plans V4 complets avec toutes les stratégies (OCR + template + VLM).
+        """
        evt_type = evt.get("type", "")

        if evt_type == "mouse_click":
            window = evt.get("window", {}).get("title", "")
-            return Action(
+            pos = evt.get("pos", [0, 0])
+
+            # Action de base (fallback sans enrichissement)
+            action = Action(
                type="click",
                target=window,
-                anchor_hint=evt.get("vision_info", {}).get("text", ""),
+                anchor_hint=evt.get("vision_info", {}).get("text", "") if isinstance(evt.get("vision_info"), dict) else "",
            )
+
+            # Enrichissement visuel via enrich_click_from_screenshot
+            # Accès direct au crop OCR + anchor pour l'ExecutionCompiler
+            if session_dir_path and isinstance(pos, list) and len(pos) == 2:
+                enrichment = self._enrich_click(
+                    evt, session_dir_path, window, int(pos[0]), int(pos[1]),
+                )
+                if enrichment:
+                    # Le texte OCR devient l'anchor_hint pour l'OCR primaire
+                    by_text = enrichment.get("by_text", "")
+                    if by_text:
+                        action.anchor_hint = by_text
+                    # Stocker les métadonnées d'enrichissement dans l'action
+                    # (utilisé par l'ExecutionCompiler pour construire les stratégies)
+                    action._enrichment = enrichment
+
+            return action
+
        elif evt_type == "text_input":
            text = evt.get("text", "")
            if text:
@@ -241,6 +284,55 @@ class IRBuilder:

        return None

+    def _enrich_click(
+        self,
+        evt: Dict,
+        session_dir_path: Path,
+        window_title: str,
+        click_x: int,
+        click_y: int,
+    ) -> Optional[Dict[str, Any]]:
+        """Enrichir un clic avec OCR + crop + description.
+
+        Réutilise enrich_click_from_screenshot du stream_processor (éprouvé).
+        Retourne un dict avec by_text, anchor_image_base64, vlm_description, etc.
+        """
+        try:
+            from agent_v0.server_v1.stream_processor import enrich_click_from_screenshot
+
+            # Trouver le screenshot full
+            screenshot_id = evt.get("screenshot_id", "")
+            if not screenshot_id:
+                return None
+
+            full_path = session_dir_path / "shots" / f"{screenshot_id}_full.png"
+            if not full_path.is_file():
+                return None
+
+            # Résolution écran
+            screen_w = 1280
+            screen_h = 800
+            window_capture = evt.get("window_capture", {})
+            if window_capture.get("window_rect"):
+                rect = window_capture["window_rect"]
+                screen_w = max(screen_w, rect[2])
+                screen_h = max(screen_h, rect[3])
+
+            return enrich_click_from_screenshot(
+                screenshot_path=full_path,
+                click_x=click_x,
+                click_y=click_y,
+                screen_w=screen_w,
+                screen_h=screen_h,
+                window_title=window_title,
+                vision_info=evt.get("vision_info") if isinstance(evt.get("vision_info"), dict) else None,
+                session_dir=session_dir_path,
+                screenshot_id=screenshot_id,
+            )
+        except Exception as e:
+            logger.debug(f"IRBuilder._enrich_click: {e}")
+            return None
+
    def _describe_segment(self, segment: List[Dict]) -> str:
        """Décrire un segment en langage naturel (pour gemma4)."""
        parts = []