From 729cd67743f5405810b46e2b6527619b9dc02dcf Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Tue, 21 Apr 2026 09:44:19 +0200
Subject: [PATCH] feat(grounding): description VLM de l'ancre quand le label
 est vide
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Quand le target_text est vide ou identique au type d'action
(click_anchor, double_click_anchor...) et qu'une image d'ancre est fournie,
le VLM décrit cette image en 5 mots maximum ("folder icon named Demo").

Cette description est ensuite passée à UI-TARS pour le grounding
("click on folder icon named Demo") et à l'OCR pour la recherche.

Chaîne complète : VLM décrit → OCR cherche → UI-TARS grounding → VLM raisonne.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 core/execution/input_handler.py | 55 +++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/core/execution/input_handler.py b/core/execution/input_handler.py
index 58107e817..4d212c30b 100644
--- a/core/execution/input_handler.py
+++ b/core/execution/input_handler.py
@@ -347,6 +347,20 @@ def find_element_on_screen(
     Returns:
         {'x': int, 'y': int, 'method': str, 'confidence': float} ou None
     """
+    # Si le target_text est vide ou c'est juste le type d'action,
+    # utiliser le VLM pour décrire l'image de l'ancre
+    action_types = {'click_anchor', 'double_click_anchor', 'right_click_anchor',
+                    'hover_anchor', 'focus_anchor', 'scroll_to_anchor'}
+    has_useful_text = target_text and target_text not in action_types
+
+    if not has_useful_text and anchor_image_base64:
+        desc = _describe_anchor_image(anchor_image_base64)
+        if desc:
+            logger.info(f"[Grounding] Ancre décrite par VLM: '{desc}'")
+            target_description = desc
+            if not has_useful_text:
+                target_text = desc
+
     if not target_text and not target_description:
         logger.debug("find_element_on_screen: ni target_text ni target_description fournis")
         return None
@@ -373,6 +387,47 @@ def find_element_on_screen(
     return None
 
 
+def _describe_anchor_image(anchor_image_base64: str) -> Optional[str]:
+    """Ask the VLM to describe the anchor image in a few words.
+
+    Used when the label is empty: POSTs the base64 image with a fixed prompt to
+    {OLLAMA_URL}/api/generate (model taken from RPA_REASONING_MODEL). Returns the
+    reply stripped of whitespace/quotes, or None on failure or a reply <= 2 chars.
+    """
+    try:
+        import requests
+        import os
+
+        if ',' in anchor_image_base64:  # presumably a "data:...;base64," header — TODO confirm
+            anchor_image_base64 = anchor_image_base64.split(',', 1)[1]  # keep only the text after the first comma
+
+        ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")  # env override, local default
+        model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")  # env override, default model tag
+
+        response = requests.post(
+            f"{ollama_url}/api/generate",
+            json={
+                "model": model,
+                "prompt": "Describe this UI element in 5 words maximum. Just the element name, nothing else. Example: 'folder icon named Demo' or 'Save button' or 'Chrome browser icon'",
+                "images": [anchor_image_base64],
+                "stream": False,
+                "options": {"temperature": 0.1, "num_predict": 20}
+            },
+            timeout=15  # seconds, passed to requests
+        )
+
+        if response.status_code == 200:
+            desc = response.json().get('response', '').strip().strip('"').strip("'")  # trim whitespace, then surrounding quotes
+            if desc and len(desc) > 2:  # replies of 2 characters or fewer are rejected
+                return desc
+
+        return None
+
+    except Exception as e:  # best-effort: any failure (network, JSON, missing field) yields None
+        logger.debug(f"Description ancre échouée: {e}")
+        return None
+
+
 def _capture_screen():
     """Capture l'écran principal et retourne (PIL.Image, width, height)."""
     try: