feat(grounding): description VLM de l'ancre quand le label est vide
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 13s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 13s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
Quand le target_text est vide ou identique au type d'action
(click_anchor, double_click_anchor...), le VLM décrit l'image
de l'ancre en 5 mots maximum ("folder icon named Demo").
Cette description est ensuite passée à UI-TARS pour le grounding
("click on folder icon named Demo") et à l'OCR pour la recherche.
Chaîne complète : VLM décrit → OCR cherche → UI-TARS grounding → VLM raisonne.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -347,6 +347,20 @@ def find_element_on_screen(
|
|||||||
Returns:
|
Returns:
|
||||||
{'x': int, 'y': int, 'method': str, 'confidence': float} ou None
|
{'x': int, 'y': int, 'method': str, 'confidence': float} ou None
|
||||||
"""
|
"""
|
||||||
|
# Si le target_text est vide ou c'est juste le type d'action,
|
||||||
|
# utiliser le VLM pour décrire l'image de l'ancre
|
||||||
|
action_types = {'click_anchor', 'double_click_anchor', 'right_click_anchor',
|
||||||
|
'hover_anchor', 'focus_anchor', 'scroll_to_anchor'}
|
||||||
|
has_useful_text = target_text and target_text not in action_types
|
||||||
|
|
||||||
|
if not has_useful_text and anchor_image_base64:
|
||||||
|
desc = _describe_anchor_image(anchor_image_base64)
|
||||||
|
if desc:
|
||||||
|
logger.info(f"[Grounding] Ancre décrite par VLM: '{desc}'")
|
||||||
|
target_description = desc
|
||||||
|
if not has_useful_text:
|
||||||
|
target_text = desc
|
||||||
|
|
||||||
if not target_text and not target_description:
|
if not target_text and not target_description:
|
||||||
logger.debug("find_element_on_screen: ni target_text ni target_description fournis")
|
logger.debug("find_element_on_screen: ni target_text ni target_description fournis")
|
||||||
return None
|
return None
|
||||||
@@ -373,6 +387,47 @@ def find_element_on_screen(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _describe_anchor_image(anchor_image_base64: str) -> Optional[str]:
    """Ask the VLM to describe the anchor image in a few words.

    Used when the label is empty: the VLM looks at the anchor crop and
    describes what it sees ("folder icon named Demo", "Save button", etc.)
    so that the description can be used to search for the element on the
    full screen.

    The endpoint and model are read from the ``OLLAMA_URL`` and
    ``RPA_REASONING_MODEL`` environment variables (defaults:
    ``http://localhost:11434`` and ``qwen2.5vl:7b``).

    Args:
        anchor_image_base64: Base64-encoded anchor image. If the string
            contains a comma (e.g. a ``data:image/png;base64,`` header),
            only the part after the first comma is sent.

    Returns:
        The description with surrounding whitespace and quotes removed, if
        it is longer than 2 characters; ``None`` on any failure (non-200
        status, missing or empty answer, network/JSON error).
    """
    try:
        import os

        import requests

        # Keep only the payload that follows a data-URI style header.
        if ',' in anchor_image_base64:
            anchor_image_base64 = anchor_image_base64.split(',', 1)[1]

        # A trailing slash in OLLAMA_URL would otherwise yield "//api/generate".
        ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434").rstrip('/')
        model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")

        response = requests.post(
            f"{ollama_url}/api/generate",
            json={
                "model": model,
                "prompt": "Describe this UI element in 5 words maximum. Just the element name, nothing else. Example: 'folder icon named Demo' or 'Save button' or 'Chrome browser icon'",
                "images": [anchor_image_base64],
                "stream": False,
                # Low temperature and a small token budget keep the answer short.
                "options": {"temperature": 0.1, "num_predict": 20}
            },
            timeout=15
        )

        if response.status_code != 200:
            logger.debug(f"Description ancre: statut HTTP {response.status_code}")
            return None

        raw = response.json().get('response')
        # The field may be absent or null; do not rely on the except clause
        # below to absorb an AttributeError from calling .strip() on None.
        if not isinstance(raw, str):
            return None

        # Strip any mix of surrounding quotes, then whitespace left inside them.
        desc = raw.strip().strip('"\'').strip()
        if len(desc) > 2:
            return desc

        return None

    except Exception as e:
        # Deliberately best-effort: a failed description must never break
        # the caller, which simply continues without one.
        logger.debug(f"Description ancre échouée: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
def _capture_screen():
|
def _capture_screen():
|
||||||
"""Capture l'écran principal et retourne (PIL.Image, width, height)."""
|
"""Capture l'écran principal et retourne (PIL.Image, width, height)."""
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user