feat: grounding sur image fenêtre au lieu du full screen

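Utilise shot_XXXX_window.png (capture de la fenêtre active) au lieu du
full screen pour le grounding VLM. Image plus petite, ciblée,
sans bruit (taskbar, autres fenêtres). Fallback sur le full screen si
la capture fenêtre ou window_rect est absent. L'image n'est plus
redimensionnée à 800 px ; elle est encodée en JPEG qualité 80.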

Coordonnées fenêtre converties en coordonnées écran via window_rect.
window_capture (rect, window_size, click_relative) ajouté au target_spec.

Résultat : 50% → 80% de précision sur la session VM (16/20 clics).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-04-04 23:12:30 +02:00
parent 91614fbff0
commit 84a91630e9
2 changed files with 766 additions and 128 deletions

View File

@@ -4548,21 +4548,35 @@ def _resolve_by_grounding(
else:
return None
# Redimensionner le screenshot (800px de large pour le VLM)
# Utiliser la capture fenêtre si disponible (plus ciblée, moins de bruit)
# Sinon fallback sur le full screen
window_capture = target_spec.get("window_capture", {})
window_rect = window_capture.get("rect") # [x1, y1, x2, y2] écran
try:
from PIL import Image as PILImage
img = PILImage.open(screenshot_path)
from pathlib import Path
# Chercher le screenshot fenêtre (_window.png)
full_path = Path(screenshot_path)
win_path = full_path.parent / full_path.name.replace("_full.png", "_window.png")
if win_path.is_file() and window_rect:
img = PILImage.open(str(win_path))
using_window = True
logger.debug("Grounding : image fenêtre %s (%dx%d)", win_path.name, *img.size)
else:
img = PILImage.open(screenshot_path)
using_window = False
orig_w, orig_h = img.size
target_w = 800
ratio = target_w / orig_w
img_small = img.resize((target_w, int(orig_h * ratio)))
small_w, small_h = img_small.size
small_w, small_h = orig_w, orig_h # pas de redimensionnement
buf = io.BytesIO()
img_small.save(buf, format="JPEG", quality=75)
img.save(buf, format="JPEG", quality=80)
shot_b64 = base64.b64encode(buf.getvalue()).decode()
except Exception as e:
logger.warning("Grounding : erreur redimensionnement%s", e)
logger.warning("Grounding : erreur chargement image%s", e)
return None
# Prompt natif Qwen2.5-VL — format bbox_2d (le seul fiable)
@@ -4723,10 +4737,25 @@ def _resolve_by_grounding(
logger.info("Grounding : coordonnées hors bornes (%.3f, %.3f)", x_pct, y_pct)
return None
logger.info(
"Grounding OK [%s] : '%s' → (%.4f, %.4f) en %.1fs",
_grounding_model, description[:50], x_pct, y_pct, elapsed,
)
# Convertir coordonnées fenêtre → coordonnées écran
if using_window and window_rect:
win_x1, win_y1, win_x2, win_y2 = window_rect
win_w = win_x2 - win_x1
win_h = win_y2 - win_y1
# x_pct/y_pct sont relatifs à la fenêtre, convertir en relatif à l'écran
abs_x = win_x1 + x_pct * win_w
abs_y = win_y1 + y_pct * win_h
x_pct = abs_x / screen_width
y_pct = abs_y / screen_height
logger.info(
"Grounding OK [%s/window] : '%s' → (%.4f, %.4f) en %.1fs",
_grounding_model, description[:50], x_pct, y_pct, elapsed,
)
else:
logger.info(
"Grounding OK [%s/full] : '%s' → (%.4f, %.4f) en %.1fs",
_grounding_model, description[:50], x_pct, y_pct, elapsed,
)
return {
"resolved": True,