feat: VLM grounding direct (Qwen2.5-VL) — nouvelle stratégie de résolution
Nouvelle approche basée sur les recherches état de l'art :
- _resolve_by_grounding() : le VLM retourne directement les coordonnées (pas de SomEngine + numérotation intermédiaire)
- Utilise Qwen2.5-VL (entraîné pour le GUI grounding) au lieu de qwen3-vl
- Parse les formats natifs : bbox_2d, JSON x/y, arrays bruts
- Fallback multi-image : screenshot + crop → grounding sans description
- Identification des icônes via Qwen2.5-VL (meilleur que qwen3-vl)

Résultats sur session réelle (validation locale) :
- Éléments avec texte (Word, Document, Fichier) : 100% corrects
- Icônes sans texte (Windows logo, disquette) : en cours d'amélioration

Cascade strict mode :
0. Grounding VLM direct (Qwen2.5-VL) — NOUVEAU
0.5. Template matching pour icônes
1. VLM Quick Find (fallback)
1.5. SoM + VLM
2. Template matching strict

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3366,6 +3366,206 @@ def _vlm_quick_find(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Résolution par VLM Grounding Direct (Qwen2.5-VL)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_by_grounding(
|
||||||
|
screenshot_path: str,
|
||||||
|
target_spec: Dict[str, Any],
|
||||||
|
screen_width: int,
|
||||||
|
screen_height: int,
|
||||||
|
) -> Optional[Dict[str, Any]]:
|
||||||
|
"""Résoudre une cible via grounding VLM direct (Qwen2.5-VL).
|
||||||
|
|
||||||
|
Le VLM reçoit le screenshot + une description textuelle et retourne
|
||||||
|
directement les coordonnées (bbox_2d) de l'élément. Pas de SomEngine,
|
||||||
|
pas de numérotation — le VLM est entraîné pour le grounding UI.
|
||||||
|
|
||||||
|
Approche plus fiable que SomEngine+VLM pour les icônes et éléments
|
||||||
|
visuels sans texte (logo Windows, disquette, bouton fermer).
|
||||||
|
"""
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
import re
|
||||||
|
|
||||||
|
t0 = time.time()
|
||||||
|
|
||||||
|
# Construire la description de la cible
|
||||||
|
by_text = target_spec.get("by_text", "").strip()
|
||||||
|
vlm_desc = target_spec.get("vlm_description", "").strip()
|
||||||
|
window_title = target_spec.get("window_title", "").strip()
|
||||||
|
|
||||||
|
if by_text:
|
||||||
|
description = by_text
|
||||||
|
elif vlm_desc:
|
||||||
|
description = vlm_desc
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Redimensionner le screenshot (800px de large pour le VLM)
|
||||||
|
try:
|
||||||
|
from PIL import Image as PILImage
|
||||||
|
img = PILImage.open(screenshot_path)
|
||||||
|
orig_w, orig_h = img.size
|
||||||
|
target_w = 800
|
||||||
|
ratio = target_w / orig_w
|
||||||
|
img_small = img.resize((target_w, int(orig_h * ratio)))
|
||||||
|
small_w, small_h = img_small.size
|
||||||
|
|
||||||
|
buf = io.BytesIO()
|
||||||
|
img_small.save(buf, format="JPEG", quality=75)
|
||||||
|
shot_b64 = base64.b64encode(buf.getvalue()).decode()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Grounding : erreur redimensionnement — %s", e)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Construire le prompt — Qwen2.5-VL retourne naturellement des bbox_2d
|
||||||
|
prompt = (
|
||||||
|
f"Look at this screenshot. Find: {description}\n"
|
||||||
|
"Where is it? Give the center position as percentage of the image.\n"
|
||||||
|
'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Appel VLM (Qwen2.5-VL pour le grounding)
|
||||||
|
try:
|
||||||
|
import requests as _requests
|
||||||
|
resp = _requests.post("http://localhost:11434/api/chat", json={
|
||||||
|
"model": "qwen2.5vl:7b",
|
||||||
|
"messages": [
|
||||||
|
{"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
|
||||||
|
{"role": "user", "content": prompt, "images": [shot_b64]},
|
||||||
|
],
|
||||||
|
"stream": False,
|
||||||
|
"options": {"temperature": 0.1, "num_predict": 80},
|
||||||
|
}, timeout=60)
|
||||||
|
content = resp.json().get("message", {}).get("content", "")
|
||||||
|
except Exception as e:
|
||||||
|
logger.info("Grounding VLM timeout/erreur : %s", e)
|
||||||
|
return None
|
||||||
|
|
||||||
|
elapsed = time.time() - t0
|
||||||
|
|
||||||
|
# Parser la réponse — Qwen2.5-VL retourne soit bbox_2d en pixels, soit JSON %
|
||||||
|
x_pct, y_pct = None, None
|
||||||
|
|
||||||
|
# Format 1 : bbox_2d en pixels [x, y] ou [x1, y1, x2, y2]
|
||||||
|
bbox_match = re.search(r'"bbox_2d"\s*:\s*\[([^\]]+)\]', content)
|
||||||
|
if bbox_match:
|
||||||
|
coords = [float(v.strip()) for v in bbox_match.group(1).split(",")]
|
||||||
|
if len(coords) == 2:
|
||||||
|
x_pct = coords[0] / small_w
|
||||||
|
y_pct = coords[1] / small_h
|
||||||
|
elif len(coords) >= 4:
|
||||||
|
x_pct = (coords[0] + coords[2]) / 2 / small_w
|
||||||
|
y_pct = (coords[1] + coords[3]) / 2 / small_h
|
||||||
|
|
||||||
|
# Format 2 : JSON {"x": 0.XX, "y": 0.YY}
|
||||||
|
if x_pct is None:
|
||||||
|
json_match = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content)
|
||||||
|
if json_match:
|
||||||
|
x_val, y_val = float(json_match.group(1)), float(json_match.group(2))
|
||||||
|
# Si > 1, c'est en pixels
|
||||||
|
if x_val > 1:
|
||||||
|
x_pct = x_val / small_w
|
||||||
|
y_pct = y_val / small_h
|
||||||
|
else:
|
||||||
|
x_pct = x_val
|
||||||
|
y_pct = y_val
|
||||||
|
|
||||||
|
# Format 3 : {"x_pct": 0.XX, "y_pct": 0.YY}
|
||||||
|
if x_pct is None:
|
||||||
|
pct_match = re.search(r'"x_pct"\s*:\s*([\d.]+).*?"y_pct"\s*:\s*([\d.]+)', content)
|
||||||
|
if pct_match:
|
||||||
|
x_pct = float(pct_match.group(1))
|
||||||
|
y_pct = float(pct_match.group(2))
|
||||||
|
|
||||||
|
# Format 4 : array brut [x1, y1, x2, y2] ou [x, y]
|
||||||
|
if x_pct is None:
|
||||||
|
arr_match = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content)
|
||||||
|
if arr_match:
|
||||||
|
vals = [float(v) for v in arr_match.groups() if v is not None]
|
||||||
|
if len(vals) >= 4:
|
||||||
|
x_pct = (vals[0] + vals[2]) / 2 / small_w
|
||||||
|
y_pct = (vals[1] + vals[3]) / 2 / small_h
|
||||||
|
elif len(vals) == 2:
|
||||||
|
x_pct = vals[0] / small_w
|
||||||
|
y_pct = vals[1] / small_h
|
||||||
|
|
||||||
|
if x_pct is None or y_pct is None:
|
||||||
|
# Fallback multi-image : screenshot + crop → grounding sans description
|
||||||
|
anchor_b64 = target_spec.get("anchor_image_base64", "")
|
||||||
|
if anchor_b64:
|
||||||
|
try:
|
||||||
|
prompt_mi = (
|
||||||
|
"Image 1 is a screenshot. Image 2 shows a UI element.\n"
|
||||||
|
"Find where Image 2 appears on Image 1.\n"
|
||||||
|
'Return position: {"x": NNN, "y": NNN} in pixels of Image 1.'
|
||||||
|
)
|
||||||
|
resp2 = _requests.post("http://localhost:11434/api/chat", json={
|
||||||
|
"model": "qwen2.5vl:7b",
|
||||||
|
"messages": [
|
||||||
|
{"role": "user", "content": prompt_mi, "images": [shot_b64, anchor_b64]},
|
||||||
|
],
|
||||||
|
"stream": False,
|
||||||
|
"options": {"temperature": 0.1, "num_predict": 50},
|
||||||
|
}, timeout=60)
|
||||||
|
content2 = resp2.json().get("message", {}).get("content", "")
|
||||||
|
elapsed = time.time() - t0
|
||||||
|
|
||||||
|
# Parser tous les formats
|
||||||
|
arr2 = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content2)
|
||||||
|
if arr2:
|
||||||
|
vals = [float(v) for v in arr2.groups() if v is not None]
|
||||||
|
if len(vals) >= 4:
|
||||||
|
x_pct = (vals[0] + vals[2]) / 2 / small_w
|
||||||
|
y_pct = (vals[1] + vals[3]) / 2 / small_h
|
||||||
|
elif len(vals) == 2:
|
||||||
|
x_pct = vals[0] / small_w
|
||||||
|
y_pct = vals[1] / small_h
|
||||||
|
if x_pct is None:
|
||||||
|
json2 = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content2)
|
||||||
|
if json2:
|
||||||
|
x_pct = float(json2.group(1)) / small_w
|
||||||
|
y_pct = float(json2.group(2)) / small_h
|
||||||
|
if x_pct is not None:
|
||||||
|
logger.info("Grounding multi-image OK (%.1fs)", elapsed)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("Grounding multi-image erreur: %s", e)
|
||||||
|
|
||||||
|
if x_pct is None or y_pct is None:
|
||||||
|
logger.info(
|
||||||
|
"Grounding : réponse non parsable (%.1fs) — %s",
|
||||||
|
elapsed, content[:120],
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Valider les bornes
|
||||||
|
if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
|
||||||
|
logger.info("Grounding : coordonnées hors bornes (%.3f, %.3f)", x_pct, y_pct)
|
||||||
|
return None
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Grounding OK [qwen2.5vl] : '%s' → (%.4f, %.4f) en %.1fs",
|
||||||
|
description[:50], x_pct, y_pct, elapsed,
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"resolved": True,
|
||||||
|
"method": "grounding_vlm",
|
||||||
|
"x_pct": round(x_pct, 6),
|
||||||
|
"y_pct": round(y_pct, 6),
|
||||||
|
"matched_element": {
|
||||||
|
"label": description[:60],
|
||||||
|
"type": "grounding",
|
||||||
|
"role": "grounding_vlm",
|
||||||
|
"confidence": 0.85,
|
||||||
|
},
|
||||||
|
"score": 0.85,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Résolution Set-of-Mark : SomEngine (détection) + VLM (identification)
|
# Résolution Set-of-Mark : SomEngine (détection) + VLM (identification)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -3770,9 +3970,29 @@ def _resolve_target_sync(
|
|||||||
vlm_description = _build_target_description(target_spec)
|
vlm_description = _build_target_description(target_spec)
|
||||||
|
|
||||||
# ---------------------------------------------------------------
|
# ---------------------------------------------------------------
|
||||||
# Étape 0 : Template matching PRIORITAIRE pour les icônes sans texte
|
# Étape 0 : Grounding VLM Direct (Qwen2.5-VL)
|
||||||
# Les crops 80x80 sont très discriminants pour les icônes (logo Windows,
|
# Le VLM reçoit le screenshot + description textuelle et retourne
|
||||||
# disquette, croix). Le VLM se trompe souvent sur ces éléments.
|
# directement les coordonnées. Plus fiable que SomEngine + numérotation.
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
grounding_desc = by_text_strict or vlm_description
|
||||||
|
if grounding_desc:
|
||||||
|
grounding_result = _resolve_by_grounding(
|
||||||
|
screenshot_path=screenshot_path,
|
||||||
|
target_spec=target_spec,
|
||||||
|
screen_width=screen_width,
|
||||||
|
screen_height=screen_height,
|
||||||
|
)
|
||||||
|
if grounding_result and grounding_result.get("resolved"):
|
||||||
|
logger.info(
|
||||||
|
"Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
|
||||||
|
grounding_result.get("x_pct", 0),
|
||||||
|
grounding_result.get("y_pct", 0),
|
||||||
|
grounding_desc[:50],
|
||||||
|
)
|
||||||
|
return grounding_result
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
# Étape 0.5 : Template matching pour icônes sans texte (crop 80x80)
|
||||||
# ---------------------------------------------------------------
|
# ---------------------------------------------------------------
|
||||||
if not by_text_strict:
|
if not by_text_strict:
|
||||||
result = _resolve_by_template_matching(
|
result = _resolve_by_template_matching(
|
||||||
@@ -3784,13 +4004,13 @@ def _resolve_target_sync(
|
|||||||
)
|
)
|
||||||
if result and result.get("score", 0) >= 0.70:
|
if result and result.get("score", 0) >= 0.70:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Strict resolve icon : template matching OK (score=%.3f) pour icône sans texte",
|
"Strict resolve TEMPLATE : icon match (score=%.3f)",
|
||||||
result.get("score", 0),
|
result.get("score", 0),
|
||||||
)
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# ---------------------------------------------------------------
|
# ---------------------------------------------------------------
|
||||||
# Étape 1 : VLM Quick Find (compréhension sémantique)
|
# Étape 1 : VLM Quick Find (fallback, multi-image)
|
||||||
# ---------------------------------------------------------------
|
# ---------------------------------------------------------------
|
||||||
if vlm_description or anchor_image_b64:
|
if vlm_description or anchor_image_b64:
|
||||||
vlm_result = _vlm_quick_find(
|
vlm_result = _vlm_quick_find(
|
||||||
|
|||||||
@@ -458,26 +458,24 @@ def _vlm_identify_element(anchor_b64: str, window_title: str = "") -> str:
|
|||||||
img.save(tmp, format="PNG")
|
img.save(tmp, format="PNG")
|
||||||
tmp_path = tmp.name
|
tmp_path = tmp.name
|
||||||
|
|
||||||
from core.detection.ollama_client import OllamaClient
|
import requests as _requests
|
||||||
client = OllamaClient(
|
context = f" from the window '{window_title}'" if window_title else ""
|
||||||
endpoint="http://localhost:11434",
|
# Utiliser Qwen2.5-VL (meilleur pour l'identification UI que qwen3-vl)
|
||||||
model="qwen3-vl:8b",
|
crop_b64 = base64.b64encode(open(tmp_path, "rb").read()).decode()
|
||||||
timeout=15,
|
resp = _requests.post("http://localhost:11434/api/chat", json={
|
||||||
)
|
"model": "qwen2.5vl:7b",
|
||||||
context = f" in the window '{window_title}'" if window_title else ""
|
"messages": [
|
||||||
result = client.generate(
|
{"role": "system", "content": "You name UI elements in 2-5 words. No explanation."},
|
||||||
prompt=(
|
{"role": "user", "content": (
|
||||||
f"This is a cropped UI element{context}. "
|
f"This is a UI element{context}. "
|
||||||
"What is it? Answer with a short label (2-5 words max). "
|
"Name it in 2-5 words. Examples: 'save icon in title bar', "
|
||||||
"Examples: 'search bar icon', 'Word application icon', 'close button', "
|
"'Windows search icon', 'close button', 'file menu'."
|
||||||
"'file menu', 'save button'.\n"
|
), "images": [crop_b64]},
|
||||||
"Answer ONLY the label, nothing else."
|
],
|
||||||
),
|
"stream": False,
|
||||||
image_path=tmp_path,
|
"options": {"temperature": 0.1, "num_predict": 20},
|
||||||
system_prompt="You identify UI elements. Answer with a short label only.",
|
}, timeout=30)
|
||||||
temperature=0.1,
|
result = {"success": resp.ok, "response": resp.json().get("message", {}).get("content", "")}
|
||||||
max_tokens=20,
|
|
||||||
)
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
os.unlink(tmp_path)
|
os.unlink(tmp_path)
|
||||||
|
|||||||
Reference in New Issue
Block a user