From 91614fbff06a46d2d40e220e4096606365e16778 Mon Sep 17 00:00:00 2001
From: Dom
Date: Sat, 4 Apr 2026 22:43:46 +0200
Subject: [PATCH] fix: prompt natif bbox_2d pour le grounding Qwen2.5-VL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Le prompt JSON ("Answer ONLY: {x, y}") ne fonctionne plus — retourne
[0.0, 0.0] systématiquement. Le prompt natif "Detect X with a bounding
box" retourne des bbox_2d précis. C'est le format pour lequel Qwen2.5-VL
est entraîné.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 agent_v0/server_v1/api_stream.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py
index 359a75ea7..811a30a43 100644
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -4565,12 +4565,8 @@ def _resolve_by_grounding(
             logger.warning("Grounding : erreur redimensionnement — %s", e)
             return None
 
-    # Construire le prompt — format JSON universel (fonctionne avec gemma4, qwen2.5vl, qwen3)
-    prompt = (
-        f"Look at this screenshot. Find: {description}\n"
-        "Where is it? Give the center position as percentage of the image.\n"
-        'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
-    )
+    # Prompt natif Qwen2.5-VL — format bbox_2d (le seul fiable)
+    prompt = f"Detect '{description}' in this image with a bounding box."
 
     # Le grounding nécessite un modèle entraîné pour les coordonnées (bbox_2d).
     # Qwen2.5-VL est le seul qui retourne des positions précises.
@@ -4610,18 +4606,16 @@ def _resolve_by_grounding(
     except Exception as e:
         logger.debug("vLLM non disponible (%s), fallback Ollama", e)
 
-    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding)
+    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif)
     if not content:
         try:
             resp = _requests.post("http://localhost:11434/api/chat", json={
                 "model": _grounding_model,
                 "messages": [
-                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates as JSON."},
                     {"role": "user", "content": prompt, "images": [shot_b64]},
                 ],
                 "stream": False,
-                "think": False,
-                "options": {"temperature": 0.1, "num_predict": 200},
+                "options": {"temperature": 0.1, "num_predict": 100},
             }, timeout=60)
             content = resp.json().get("message", {}).get("content", "")
         except Exception as e: