feat: VLM grounding direct (Qwen2.5-VL) — nouvelle stratégie de résolution
Nouvelle approche basée sur les recherches état de l'art : - _resolve_by_grounding() : le VLM retourne directement les coordonnées (pas de SomEngine + numérotation intermédiaire) - Utilise Qwen2.5-VL (entraîné pour le GUI grounding) au lieu de qwen3-vl - Parse les formats natifs : bbox_2d, JSON x/y, arrays bruts - Fallback multi-image : screenshot + crop → grounding sans description - Identification des icônes via Qwen2.5-VL (meilleur que qwen3-vl) Résultats sur session réelle (validation locale) : - Éléments avec texte (Word, Document, Fichier) : 100% corrects - Icônes sans texte (Windows logo, disquette) : en cours d'amélioration Cascade strict mode : 0. Grounding VLM direct (Qwen2.5-VL) — NOUVEAU 0.5. Template matching pour icônes 1. VLM Quick Find (fallback) 1.5. SoM + VLM 2. Template matching strict Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3366,6 +3366,206 @@ def _vlm_quick_find(
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Résolution par VLM Grounding Direct (Qwen2.5-VL)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_grounding_coords(content, img_w, img_h):
    """Extract a normalized ``(x_pct, y_pct)`` center from a VLM grounding answer.

    Handles the output formats Qwen2.5-VL is known to emit, tried in order:

    1. ``"bbox_2d": [x1, y1, x2, y2]`` (or ``[x, y]``) — pixels of the image
       that was sent to the model (``img_w`` x ``img_h``).
    2. ``{"x": ..., "y": ...}`` — fractions when values are <= 1, pixels otherwise.
    3. ``{"x_pct": ..., "y_pct": ...}`` — always fractions.
    4. A bare array ``[x, y]`` or ``[x1, y1, x2, y2]`` — pixels.

    Returns ``(None, None)`` when no format matches or the numbers are malformed.
    """
    import re

    # Format 1: explicit bbox_2d key, pixel coordinates.
    m = re.search(r'"bbox_2d"\s*:\s*\[([^\]]+)\]', content)
    if m:
        try:
            coords = [float(v.strip()) for v in m.group(1).split(",")]
        except ValueError:
            # Malformed number inside the bracket (e.g. "1.2.3") — try the
            # other formats instead of crashing the resolver.
            coords = []
        if len(coords) == 2:
            return coords[0] / img_w, coords[1] / img_h
        if len(coords) >= 4:
            return (
                (coords[0] + coords[2]) / 2 / img_w,
                (coords[1] + coords[3]) / 2 / img_h,
            )

    # Format 2: {"x": ..., "y": ...}.  Values > 1 are pixels, otherwise the
    # model already answered in fractions.
    m = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content)
    if m:
        x_val, y_val = float(m.group(1)), float(m.group(2))
        if x_val > 1:
            return x_val / img_w, y_val / img_h
        return x_val, y_val

    # Format 3: {"x_pct": ..., "y_pct": ...} — always fractions.
    m = re.search(r'"x_pct"\s*:\s*([\d.]+).*?"y_pct"\s*:\s*([\d.]+)', content)
    if m:
        return float(m.group(1)), float(m.group(2))

    # Format 4: bare [x, y] or [x1, y1, x2, y2] pixel array.
    m = re.search(
        r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]',
        content,
    )
    if m:
        vals = [float(v) for v in m.groups() if v is not None]
        if len(vals) >= 4:
            return (
                (vals[0] + vals[2]) / 2 / img_w,
                (vals[1] + vals[3]) / 2 / img_h,
            )
        if len(vals) == 2:
            return vals[0] / img_w, vals[1] / img_h

    return None, None


def _resolve_by_grounding(
    screenshot_path: str,
    target_spec: Dict[str, Any],
    screen_width: int,
    screen_height: int,
) -> Optional[Dict[str, Any]]:
    """Resolve a target via direct VLM grounding (Qwen2.5-VL).

    The VLM receives the screenshot plus a textual description and returns
    the element coordinates directly (bbox_2d / JSON).  No SomEngine, no
    intermediate numbering — Qwen2.5-VL is trained for GUI grounding, which
    makes this more reliable than SomEngine+VLM for icons and text-less
    visual elements (Windows logo, floppy disk, close button).

    Args:
        screenshot_path: path of the full-screen capture to ground against.
        target_spec: strict-mode target spec; ``by_text``/``vlm_description``
            feed the prompt, ``anchor_image_base64`` (optional crop of the
            element) enables the multi-image fallback.
        screen_width: physical screen width — currently unused, kept for
            signature parity with the other resolver steps.
        screen_height: physical screen height — currently unused.

    Returns:
        A resolution dict (``resolved``, ``method``, ``x_pct``, ``y_pct``,
        ``matched_element``, ``score``) on success, or ``None`` when there is
        no usable description, the VLM call fails, the answer cannot be
        parsed, or the coordinates fall outside [0, 1].
    """
    import base64
    import io

    t0 = time.time()

    # Build the textual description of the target; without any text there is
    # nothing to ground on.
    by_text = target_spec.get("by_text", "").strip()
    vlm_desc = target_spec.get("vlm_description", "").strip()
    description = by_text or vlm_desc
    if not description:
        return None

    # Downscale the screenshot to 800 px wide: smaller payload and a
    # resolution the 7B VLM handles well.  The context manager releases the
    # underlying file handle (Image.open is lazy and keeps the file open).
    try:
        from PIL import Image as PILImage
        with PILImage.open(screenshot_path) as img:
            orig_w, orig_h = img.size
            target_w = 800
            ratio = target_w / orig_w
            img_small = img.resize((target_w, int(orig_h * ratio)))
        small_w, small_h = img_small.size

        buf = io.BytesIO()
        img_small.save(buf, format="JPEG", quality=75)
        shot_b64 = base64.b64encode(buf.getvalue()).decode()
    except Exception as e:
        logger.warning("Grounding : erreur redimensionnement — %s", e)
        return None

    # Build the prompt — Qwen2.5-VL naturally answers with bbox_2d / JSON.
    prompt = (
        f"Look at this screenshot. Find: {description}\n"
        "Where is it? Give the center position as percentage of the image.\n"
        'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
    )

    # VLM call (Qwen2.5-VL for grounding) via the local Ollama chat API.
    try:
        import requests as _requests
        resp = _requests.post("http://localhost:11434/api/chat", json={
            "model": "qwen2.5vl:7b",
            "messages": [
                {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
                {"role": "user", "content": prompt, "images": [shot_b64]},
            ],
            "stream": False,
            "options": {"temperature": 0.1, "num_predict": 80},
        }, timeout=60)
        content = resp.json().get("message", {}).get("content", "")
    except Exception as e:
        logger.info("Grounding VLM timeout/erreur : %s", e)
        return None

    elapsed = time.time() - t0

    # Parse the answer — all known Qwen2.5-VL output formats.
    x_pct, y_pct = _parse_grounding_coords(content, small_w, small_h)

    if x_pct is None or y_pct is None:
        # Multi-image fallback: screenshot + crop of the element, grounding
        # without a textual description.
        anchor_b64 = target_spec.get("anchor_image_base64", "")
        if anchor_b64:
            try:
                prompt_mi = (
                    "Image 1 is a screenshot. Image 2 shows a UI element.\n"
                    "Find where Image 2 appears on Image 1.\n"
                    'Return position: {"x": NNN, "y": NNN} in pixels of Image 1.'
                )
                resp2 = _requests.post("http://localhost:11434/api/chat", json={
                    "model": "qwen2.5vl:7b",
                    "messages": [
                        {"role": "user", "content": prompt_mi, "images": [shot_b64, anchor_b64]},
                    ],
                    "stream": False,
                    "options": {"temperature": 0.1, "num_predict": 50},
                }, timeout=60)
                content2 = resp2.json().get("message", {}).get("content", "")
                elapsed = time.time() - t0

                # Reuse the shared parser.  Unlike the previous ad-hoc copy it
                # applies the pixels-vs-fraction heuristic here too, so a model
                # answering in fractions is no longer divided by small_w twice.
                x_pct, y_pct = _parse_grounding_coords(content2, small_w, small_h)
                if x_pct is not None:
                    logger.info("Grounding multi-image OK (%.1fs)", elapsed)
            except Exception as e:
                logger.debug("Grounding multi-image erreur: %s", e)

    if x_pct is None or y_pct is None:
        logger.info(
            "Grounding : réponse non parsable (%.1fs) — %s",
            elapsed, content[:120],
        )
        return None

    # Bounds check: a center outside [0, 1] means the parser picked up
    # unrelated numbers from the answer.
    if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
        logger.info("Grounding : coordonnées hors bornes (%.3f, %.3f)", x_pct, y_pct)
        return None

    logger.info(
        "Grounding OK [qwen2.5vl] : '%s' → (%.4f, %.4f) en %.1fs",
        description[:50], x_pct, y_pct, elapsed,
    )

    return {
        "resolved": True,
        "method": "grounding_vlm",
        "x_pct": round(x_pct, 6),
        "y_pct": round(y_pct, 6),
        "matched_element": {
            "label": description[:60],
            "type": "grounding",
            "role": "grounding_vlm",
            "confidence": 0.85,
        },
        "score": 0.85,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Résolution Set-of-Mark : SomEngine (détection) + VLM (identification)
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -3770,9 +3970,29 @@ def _resolve_target_sync(
|
||||
vlm_description = _build_target_description(target_spec)
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Étape 0 : Template matching PRIORITAIRE pour les icônes sans texte
|
||||
# Les crops 80x80 sont très discriminants pour les icônes (logo Windows,
|
||||
# disquette, croix). Le VLM se trompe souvent sur ces éléments.
|
||||
# Étape 0 : Grounding VLM Direct (Qwen2.5-VL)
|
||||
# Le VLM reçoit le screenshot + description textuelle et retourne
|
||||
# directement les coordonnées. Plus fiable que SomEngine + numérotation.
|
||||
# ---------------------------------------------------------------
|
||||
grounding_desc = by_text_strict or vlm_description
|
||||
if grounding_desc:
|
||||
grounding_result = _resolve_by_grounding(
|
||||
screenshot_path=screenshot_path,
|
||||
target_spec=target_spec,
|
||||
screen_width=screen_width,
|
||||
screen_height=screen_height,
|
||||
)
|
||||
if grounding_result and grounding_result.get("resolved"):
|
||||
logger.info(
|
||||
"Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
|
||||
grounding_result.get("x_pct", 0),
|
||||
grounding_result.get("y_pct", 0),
|
||||
grounding_desc[:50],
|
||||
)
|
||||
return grounding_result
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Étape 0.5 : Template matching pour icônes sans texte (crop 80x80)
|
||||
# ---------------------------------------------------------------
|
||||
if not by_text_strict:
|
||||
result = _resolve_by_template_matching(
|
||||
@@ -3784,13 +4004,13 @@ def _resolve_target_sync(
|
||||
)
|
||||
if result and result.get("score", 0) >= 0.70:
|
||||
logger.info(
|
||||
"Strict resolve icon : template matching OK (score=%.3f) pour icône sans texte",
|
||||
"Strict resolve TEMPLATE : icon match (score=%.3f)",
|
||||
result.get("score", 0),
|
||||
)
|
||||
return result
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Étape 1 : VLM Quick Find (compréhension sémantique)
|
||||
# Étape 1 : VLM Quick Find (fallback, multi-image)
|
||||
# ---------------------------------------------------------------
|
||||
if vlm_description or anchor_image_b64:
|
||||
vlm_result = _vlm_quick_find(
|
||||
|
||||
Reference in New Issue
Block a user