From 91614fbff06a46d2d40e220e4096606365e16778 Mon Sep 17 00:00:00 2001
From: Dom
Date: Sat, 4 Apr 2026 22:43:46 +0200
Subject: [PATCH] fix: prompt natif bbox_2d pour le grounding Qwen2.5-VL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Le prompt JSON ("Answer ONLY: {x, y}") ne fonctionne plus — retourne
[0.0, 0.0] systématiquement. Le prompt natif "Detect X with a bounding
box" retourne des bbox_2d précis. C'est le format pour lequel Qwen2.5-VL
est entraîné.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 agent_v0/server_v1/api_stream.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py
index 359a75ea7..811a30a43 100644
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -4565,12 +4565,8 @@ def _resolve_by_grounding(
             logger.warning("Grounding : erreur redimensionnement — %s", e)
             return None
 
-    # Construire le prompt — format JSON universel (fonctionne avec gemma4, qwen2.5vl, qwen3)
-    prompt = (
-        f"Look at this screenshot. Find: {description}\n"
-        "Where is it? Give the center position as percentage of the image.\n"
-        'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
-    )
+    # Prompt natif Qwen2.5-VL — format bbox_2d (le seul fiable)
+    prompt = f"Detect '{description}' in this image with a bounding box."
 
     # Le grounding nécessite un modèle entraîné pour les coordonnées (bbox_2d).
     # Qwen2.5-VL est le seul qui retourne des positions précises.
@@ -4610,18 +4606,16 @@ def _resolve_by_grounding(
     except Exception as e:
         logger.debug("vLLM non disponible (%s), fallback Ollama", e)
 
-    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding)
+    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif)
     if not content:
         try:
             resp = _requests.post("http://localhost:11434/api/chat", json={
                 "model": _grounding_model,
                 "messages": [
-                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates as JSON."},
                     {"role": "user", "content": prompt, "images": [shot_b64]},
                 ],
                 "stream": False,
-                "think": False,
-                "options": {"temperature": 0.1, "num_predict": 200},
+                "options": {"temperature": 0.1, "num_predict": 100},
             }, timeout=60)
             content = resp.json().get("message", {}).get("content", "")
         except Exception as e: