fix: prompt natif bbox_2d pour le grounding Qwen2.5-VL

Le prompt JSON ("Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}") ne fonctionne plus : il retourne systématiquement [0.0, 0.0]. Le prompt natif "Detect '{description}' in this image with a bounding box." retourne des coordonnées bbox_2d précises. C'est le format pour lequel Qwen2.5-VL est entraîné.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-04 22:43:46 +02:00
parent c1ce6a3964
commit 91614fbff0
1 changed file with 4 additions and 10 deletions
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -4565,12 +4565,8 @@ def _resolve_by_grounding(
        logger.warning("Grounding : erreur redimensionnement — %s", e)
        return None

-    # Construire le prompt — format JSON universel (fonctionne avec gemma4, qwen2.5vl, qwen3)
-    prompt = (
-        f"Look at this screenshot. Find: {description}\n"
-        "Where is it? Give the center position as percentage of the image.\n"
-        'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
-    )
+    # Prompt natif Qwen2.5-VL — format bbox_2d (le seul fiable)
+    prompt = f"Detect '{description}' in this image with a bounding box."

    # Le grounding nécessite un modèle entraîné pour les coordonnées (bbox_2d).
    # Qwen2.5-VL est le seul qui retourne des positions précises.
@@ -4610,18 +4606,16 @@ def _resolve_by_grounding(
    except Exception as e:
        logger.debug("vLLM non disponible (%s), fallback Ollama", e)

-    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding)
+    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif)
    if not content:
        try:
            resp = _requests.post("http://localhost:11434/api/chat", json={
                "model": _grounding_model,
                "messages": [
-                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates as JSON."},
                    {"role": "user", "content": prompt, "images": [shot_b64]},
                ],
                "stream": False,
-                "think": False,
-                "options": {"temperature": 0.1, "num_predict": 200},
+                "options": {"temperature": 0.1, "num_predict": 100},
            }, timeout=60)
            content = resp.json().get("message", {}).get("content", "")
        except Exception as e: