feat: résolution serveur pour replay Windows + VLM multi-image + métriques

Feature 4 — Résolution serveur :
- Nouvelle méthode _server_resolve_target() dans executor.py
- Cascade : template local → serveur /resolve_target → VLM local (fallback)
- Popup handling via serveur aussi
- L'agent Windows peut maintenant résoudre les clics via SomEngine+VLM

Feature 5 — VLM multi-image :
- _resolve_by_som() envoie l'anchor crop en 2ème image au VLM
- Le VLM voit les marks numérotés + le crop de l'élément recherché

Feature 6 — Métriques de résolution :
- resolution_method, resolution_score, resolution_elapsed_ms
- Propagés agent → serveur via /replay/result
- Résumé en fin de replay (méthodes, score moyen, temps moyen)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-03-31 11:37:35 +02:00
parent 1e8e2dd9f3
commit 18792fd7b4
2 changed files with 178 additions and 17 deletions

View File

@@ -199,9 +199,15 @@ class ActionExecutorV1:
x_pct = resolved["x_pct"]
y_pct = resolved["y_pct"]
result["visual_resolved"] = resolved.get("resolved", False)
# Métriques de résolution
result["resolution_method"] = resolved.get("resolution_method", "")
result["resolution_score"] = resolved.get("resolution_score", 0.0)
result["resolution_elapsed_ms"] = resolved.get("resolution_elapsed_ms", 0.0)
if resolved.get("resolved"):
logger.info(
f"Visual resolve OK: {resolved.get('matched_element', {}).get('label', '?')} "
f"Visual resolve OK [{result['resolution_method']}] "
f"{result['resolution_elapsed_ms']:.0f}ms : "
f"{resolved.get('matched_element', {}).get('label', '?')} "
f"-> ({x_pct:.4f}, {y_pct:.4f})"
)
@@ -391,22 +397,44 @@ class ActionExecutorV1:
Stratégie hybride en cascade :
1. Template matching avec le crop anchor (rapide, fiable si l'UI n'a pas changé)
2. VLM identifie l'élément + template matching texte (approche hybride)
3. VLM direct coordonnées (legacy, peu fiable avec qwen3-vl:8b)
2. Serveur resolve_target (SomEngine + VLM, si serveur accessible)
3. VLM local (fallback pour dev/test Linux)
"""
import time as _time
t_start = _time.time()
screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=75)
if not screenshot_b64:
logger.warning("Capture screenshot echouee pour visual resolve")
return None
def _with_metrics(result, method_override=None):
"""Enrichir le résultat avec les métriques de résolution."""
if result is None:
return None
elapsed_ms = (_time.time() - t_start) * 1000
result["resolution_method"] = method_override or result.get("method", "unknown")
result["resolution_score"] = result.get("score", 0.0)
result["resolution_elapsed_ms"] = round(elapsed_ms, 1)
return result
# ---- ÉTAPE 1 : Template matching avec le crop anchor ----
anchor_b64 = target_spec.get("anchor_image_base64", "")
if anchor_b64:
tm_result = self._template_match_anchor(screenshot_b64, anchor_b64, screen_width, screen_height)
if tm_result and tm_result.get("resolved"):
return tm_result
return _with_metrics(tm_result)
# ---- ÉTAPE 2 : Approche hybride VLM identifie + template matching texte ----
# ---- ÉTAPE 2 : Résolution serveur (SomEngine + VLM) ----
if server_url:
server_result = self._server_resolve_target(
server_url, screenshot_b64, target_spec,
fallback_x, fallback_y, screen_width, screen_height,
)
if server_result and server_result.get("resolved"):
return _with_metrics(server_result)
# ---- ÉTAPE 3 : VLM local (fallback dev/test, si Ollama accessible) ----
by_text = target_spec.get("by_text", "")
vlm_description = target_spec.get("vlm_description", "")
if vlm_description or by_text:
@@ -414,16 +442,78 @@ class ActionExecutorV1:
screenshot_b64, target_spec, screen_width, screen_height
)
if hybrid_result and hybrid_result.get("resolved"):
return hybrid_result
return _with_metrics(hybrid_result)
# ---- ÉTAPE 3 : VLM direct coordonnées (legacy, peu fiable) ----
vlm_result = self._vlm_direct_resolve(screenshot_b64, target_spec)
if vlm_result and vlm_result.get("resolved"):
return vlm_result
return _with_metrics(vlm_result)
print(" [VISUAL] Toutes les méthodes ont échoué")
return None
def _server_resolve_target(
    self, server_url: str, screenshot_b64: str, target_spec: dict,
    fallback_x: float, fallback_y: float,
    screen_width: int, screen_height: int,
    timeout: float = 30.0,
) -> "dict | None":
    """Resolve a visual target through the server (SomEngine + VLM on GPU).

    The server hosts SomEngine (YOLO + docTR) and the VLM (qwen3-vl).
    The agent sends the screenshot plus the target spec; the server
    resolves the click target and returns coordinates.

    Args:
        server_url: Base URL of the replay server (no trailing slash).
        screenshot_b64: Current screen capture, base64-encoded.
        target_spec: Target description (anchor crop, text, VLM hints).
        fallback_x: Recorded click x position (0..1), spatial prior for the server.
        fallback_y: Recorded click y position (0..1), spatial prior for the server.
        screen_width: Agent screen width in pixels.
        screen_height: Agent screen height in pixels.
        timeout: HTTP timeout in seconds (default 30, matching previous
            hard-coded behavior).

    Returns:
        The server's JSON payload as a dict (it may still carry
        ``resolved=False`` with a ``reason``), or ``None`` on HTTP error,
        timeout, or any transport failure.
    """
    # Local imports: keep module import cheap and avoid a hard dependency
    # at import time on stripped-down agents.
    import requests as _requests
    from .config import API_TOKEN
    url = f"{server_url}/traces/stream/replay/resolve_target"
    payload = {
        "session_id": "",
        "screenshot_b64": screenshot_b64,
        "target_spec": target_spec,
        "fallback_x_pct": fallback_x,
        "fallback_y_pct": fallback_y,
        "screen_width": screen_width,
        "screen_height": screen_height,
        "strict_mode": True,
    }
    headers = {"Content-Type": "application/json"}
    if API_TOKEN:
        headers["Authorization"] = f"Bearer {API_TOKEN}"
    try:
        print(f" [SERVER-RESOLVE] Appel serveur {server_url}...")
        resp = _requests.post(url, json=payload, headers=headers, timeout=timeout)
        if not resp.ok:
            logger.warning(f"Server resolve HTTP {resp.status_code}")
            return None
        data = resp.json()
        resolved = data.get("resolved", False)
        method = data.get("method", "server_unknown")
        if resolved:
            print(
                f" [SERVER-RESOLVE] OK [{method}] "
                f"→ ({data.get('x_pct', 0):.3f}, {data.get('y_pct', 0):.3f}) "
                f"score={data.get('score', 0):.2f}"
            )
            logger.info(f"Server resolve OK [{method}] score={data.get('score', 0):.2f}")
        else:
            reason = data.get("reason", "unknown")
            print(f" [SERVER-RESOLVE] Échec ({reason})")
            logger.info(f"Server resolve échoué : {reason}")
        return data
    except _requests.Timeout:
        # Message now reflects the actual timeout instead of a hard-coded "30s".
        print(f" [SERVER-RESOLVE] Timeout ({timeout:.0f}s)")
        logger.warning("Server resolve timeout")
        return None
    except Exception as e:
        # Best-effort: any transport/parse failure falls back to the caller's
        # next resolution strategy rather than aborting the replay.
        print(f" [SERVER-RESOLVE] Erreur : {e}")
        logger.warning(f"Server resolve erreur : {e}")
        return None
def _template_match_anchor(
self, screenshot_b64: str, anchor_b64: str,
screen_width: int, screen_height: int,
@@ -832,6 +922,9 @@ Example: x_pct=0.50, y_pct=0.30"""
"error": result.get("error"),
"warning": result.get("warning"),
"screenshot": result.get("screenshot"),
"resolution_method": result.get("resolution_method"),
"resolution_score": result.get("resolution_score"),
"resolution_elapsed_ms": result.get("resolution_elapsed_ms"),
}
try:
resp2 = requests.post(
@@ -887,7 +980,29 @@ Example: x_pct=0.50, y_pct=0.30"""
logger.warning("[POPUP-VLM] Capture screenshot échouée")
return False
# Étape 1 : Le VLM identifie le bouton à cliquer
# Essayer la détection popup via le serveur d'abord
from .config import SERVER_URL, API_TOKEN
if SERVER_URL:
monitor = self.sct.monitors[1]
sw, sh = monitor["width"], monitor["height"]
server_result = self._server_resolve_target(
SERVER_URL, screenshot_b64,
{"vlm_description": "popup, dialog box, confirmation, or error message button (Oui, OK, Yes, Non, Enregistrer, Annuler)"},
0.5, 0.5, sw, sh,
)
if server_result and server_result.get("resolved"):
x_pct = server_result["x_pct"]
y_pct = server_result["y_pct"]
real_x = int(x_pct * sw)
real_y = int(y_pct * sh)
label = server_result.get("matched_element", {}).get("label", "popup")
print(f" [POPUP-SERVER] Popup détectée ! Clic sur '{label}' → ({real_x}, {real_y})")
logger.info(f"[POPUP-SERVER] Clic popup '{label}' à ({real_x}, {real_y})")
self._click((real_x, real_y), "left")
time.sleep(1.0)
return True
# Fallback : VLM local identifie le bouton à cliquer
button_text = self._vlm_identify_popup_button(screenshot_b64)
if not button_text:
return False # Pas de popup ou VLM en échec
@@ -952,7 +1067,7 @@ Example: x_pct=0.50, y_pct=0.30"""
ollama_url = f"http://{ollama_host}:11434/api/chat"
prompt = (
"Look at this screenshot. Is there a popup dialog, confirmation dialog, "
"Regarde cette capture d'écran. Y a-t-il une popup, une boîte de dialogue, "
"error message, or modal window visible?\n"
"If yes, what button should I click to proceed?\n"
"Answer ONLY the button text (like: Oui, OK, Yes, Enregistrer, Non, "
@@ -1083,7 +1198,7 @@ Example: x_pct=0.50, y_pct=0.30"""
best_match = None
best_val = 0.0
threshold = 0.55 # Seuil assez permissif pour le texte de bouton
threshold = 0.50 # Seuil équilibré
# Essayer plusieurs tailles de police pour couvrir différentes résolutions
for font_size in [14, 16, 18, 20, 22, 24, 12, 26, 28, 10]:

View File

@@ -408,6 +408,10 @@ class ReplayResultReport(BaseModel):
screenshot: Optional[str] = None # Chemin ou base64 du screenshot post-action
screenshot_after: Optional[str] = None # Chemin ou base64 du screenshot APRES l'action
actual_position: Optional[Dict[str, float]] = None # {"x": px, "y": py} position réelle du clic
# Métriques de résolution visuelle
resolution_method: Optional[str] = None # som_text_match, som_vlm, vlm_quick_find, etc.
resolution_score: Optional[float] = None
resolution_elapsed_ms: Optional[float] = None
class ErrorCallbackConfig(BaseModel):
@@ -2286,6 +2290,9 @@ async def report_action_result(report: ReplayResultReport):
"actual_position": report.actual_position,
"retry_count": retry_count,
"verification": verification.to_dict() if verification else None,
"resolution_method": report.resolution_method,
"resolution_score": report.resolution_score,
"resolution_elapsed_ms": report.resolution_elapsed_ms,
}
replay_state["results"].append(result_entry)
@@ -2384,6 +2391,30 @@ async def report_action_result(report: ReplayResultReport):
f" ({replay_state['retried_actions']} retries, "
f"{replay_state['unverified_actions']} non vérifiées)"
)
# Résumé des métriques de résolution visuelle
results_with_method = [
r for r in replay_state["results"]
if r.get("resolution_method")
]
if results_with_method:
methods_count = {}
total_elapsed = 0.0
total_score = 0.0
for r in results_with_method:
m = r["resolution_method"]
methods_count[m] = methods_count.get(m, 0) + 1
total_elapsed += r.get("resolution_elapsed_ms") or 0
total_score += r.get("resolution_score") or 0
avg_elapsed = total_elapsed / len(results_with_method)
avg_score = total_score / len(results_with_method)
methods_str = ", ".join(
f"{m}={c}" for m, c in sorted(methods_count.items())
)
logger.info(
f"Replay {replay_state['replay_id']} métriques résolution : "
f"{len(results_with_method)} resolves [{methods_str}] "
f"score_moy={avg_score:.2f} temps_moy={avg_elapsed:.0f}ms"
)
# Libérer le GPU pour le worker VLM si le replay est terminé ou en erreur
if replay_state["status"] in ("completed", "error"):
@@ -3506,12 +3537,26 @@ def _resolve_by_som(
for e in labeled_elements
)
prompt = (
f"I'm looking for: {target_desc}\n\n"
f"Here are the numbered elements detected on screen:\n{elements_list}\n\n"
"Which number is the correct element?\n"
'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
)
# Multi-image : SoM annotée + anchor crop (si disponible)
anchor_b64 = target_spec.get("anchor_image_base64", "")
extra_images = [anchor_b64] if anchor_b64 else None
if extra_images:
prompt = (
"Image 1 shows the screen with numbered marks on each UI element.\n"
"Image 2 shows the element I'm looking for.\n\n"
f"Target: {target_desc}\n\n"
f"Detected elements:\n{elements_list}\n\n"
"Which mark number matches the target element in Image 2?\n"
'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
)
else:
prompt = (
f"I'm looking for: {target_desc}\n\n"
f"Detected elements:\n{elements_list}\n\n"
"Which number is the correct element?\n"
'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
)
system_prompt = "You identify UI elements by number. Output JSON only, no explanation."
@@ -3523,6 +3568,7 @@ def _resolve_by_som(
temperature=0.1,
max_tokens=50,
force_json=False,
extra_images_b64=extra_images,
)
except Exception as e:
logger.warning("SoM resolve : erreur VLM — %s", e)