From 1c5ff4200670b6a47b1eb2534205782627264232 Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Sun, 5 Apr 2026 12:04:46 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20ajouter=20position=20relative=20au=20pro?=
 =?UTF-8?q?mpt=20grounding=20(d=C3=A9sambigu=C3=AFsation)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Quand plusieurs éléments ont le même texte ("Rechercher" dans la taskbar
ET dans l'explorateur), la position relative (en bas, en haut, à gauche)
aide le VLM à choisir le bon.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 agent_v0/server_v1/api_stream.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py
index 76fc36776..ade8acc97 100644
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -4580,7 +4580,14 @@ def _resolve_by_grounding(
         return None
 
     # Prompt natif Qwen2.5-VL — format bbox_2d (le seul fiable)
-    prompt = f"Detect '{description}' in this image with a bounding box."
+    # Add the relative position to disambiguate (e.g. two "Rechercher" on screen).
+    # `or {}` tolerates an explicit null; the join skips whichever axis is missing.
+    original_pos = target_spec.get("original_position") or {}
+    y_rel = original_pos.get("y_relative", "")
+    x_rel = original_pos.get("x_relative", "")
+    pos_words = " ".join(str(p) for p in (y_rel, x_rel) if p)
+    pos_hint = f" located {pos_words} of the screen" if pos_words else ""
+    prompt = f"Detect '{description}'{pos_hint} in this image with a bounding box."
 
     # Le grounding nécessite un modèle entraîné pour les coordonnées (bbox_2d).
     # Qwen2.5-VL est le seul qui retourne des positions précises.