feat: résolution serveur pour replay Windows + VLM multi-image + métriques

Feature 4 — Résolution serveur :
- Nouvelle méthode _server_resolve_target() dans executor.py
- Cascade : template local → serveur /resolve_target → VLM local (fallback)
- Gestion des popups également via le serveur
- L'agent Windows peut maintenant résoudre les clics via SomEngine+VLM

Feature 5 — VLM multi-image :
- _resolve_by_som() envoie le crop de l'ancre (anchor crop) comme 2e image au VLM
- Le VLM voit les marks numérotés + le crop de l'élément recherché

Feature 6 — Métriques de résolution :
- resolution_method, resolution_score, resolution_elapsed_ms
- Propagés agent → serveur via /replay/result
- Résumé en fin de replay (méthodes, score moyen, temps moyen)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-03-31 11:37:35 +02:00
parent 1e8e2dd9f3
commit 18792fd7b4
2 changed files with 178 additions and 17 deletions

View File

@@ -199,9 +199,15 @@ class ActionExecutorV1:
x_pct = resolved["x_pct"]
y_pct = resolved["y_pct"]
result["visual_resolved"] = resolved.get("resolved", False)
# Métriques de résolution
result["resolution_method"] = resolved.get("resolution_method", "")
result["resolution_score"] = resolved.get("resolution_score", 0.0)
result["resolution_elapsed_ms"] = resolved.get("resolution_elapsed_ms", 0.0)
if resolved.get("resolved"):
logger.info(
f"Visual resolve OK: {resolved.get('matched_element', {}).get('label', '?')} "
f"Visual resolve OK [{result['resolution_method']}] "
f"{result['resolution_elapsed_ms']:.0f}ms : "
f"{resolved.get('matched_element', {}).get('label', '?')} "
f"-> ({x_pct:.4f}, {y_pct:.4f})"
)
@@ -391,22 +397,44 @@ class ActionExecutorV1:
Stratégie hybride en cascade :
1. Template matching avec le crop anchor (rapide, fiable si l'UI n'a pas changé)
2. VLM identifie l'élément + template matching texte (approche hybride)
3. VLM direct coordonnées (legacy, peu fiable avec qwen3-vl:8b)
2. Serveur resolve_target (SomEngine + VLM, si serveur accessible)
3. VLM local (fallback pour dev/test Linux)
"""
import time as _time
t_start = _time.time()
screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=75)
if not screenshot_b64:
logger.warning("Capture screenshot echouee pour visual resolve")
return None
def _with_metrics(result, method_override=None):
    """Attach resolution metrics to ``result`` and return it.

    The dict is updated in place with three keys:
    ``resolution_method`` (``method_override`` when truthy, otherwise the
    result's own ``"method"`` entry, defaulting to ``"unknown"``),
    ``resolution_score`` (the result's ``"score"`` entry, defaulting to
    ``0.0``) and ``resolution_elapsed_ms`` (time elapsed since ``t_start``
    was captured in the enclosing scope, rounded to 0.1 ms).

    A ``None`` result is passed through unchanged.
    """
    if result is not None:
        # Elapsed time is measured from the start of the enclosing resolve call.
        duration_ms = (_time.time() - t_start) * 1000
        result.update(
            resolution_method=method_override or result.get("method", "unknown"),
            resolution_score=result.get("score", 0.0),
            resolution_elapsed_ms=round(duration_ms, 1),
        )
    return result
# ---- ÉTAPE 1 : Template matching avec le crop anchor ----
anchor_b64 = target_spec.get("anchor_image_base64", "")
if anchor_b64:
tm_result = self._template_match_anchor(screenshot_b64, anchor_b64, screen_width, screen_height)
if tm_result and tm_result.get("resolved"):
return tm_result
return _with_metrics(tm_result)
# ---- ÉTAPE 2 : Approche hybride VLM identifie + template matching texte ----
# ---- ÉTAPE 2 : Résolution serveur (SomEngine + VLM) ----
if server_url:
server_result = self._server_resolve_target(
server_url, screenshot_b64, target_spec,
fallback_x, fallback_y, screen_width, screen_height,
)
if server_result and server_result.get("resolved"):
return _with_metrics(server_result)
# ---- ÉTAPE 3 : VLM local (fallback dev/test, si Ollama accessible) ----
by_text = target_spec.get("by_text", "")
vlm_description = target_spec.get("vlm_description", "")
if vlm_description or by_text:
@@ -414,16 +442,78 @@ class ActionExecutorV1:
screenshot_b64, target_spec, screen_width, screen_height
)
if hybrid_result and hybrid_result.get("resolved"):
return hybrid_result
return _with_metrics(hybrid_result)
# ---- ÉTAPE 3 : VLM direct coordonnées (legacy, peu fiable) ----
vlm_result = self._vlm_direct_resolve(screenshot_b64, target_spec)
if vlm_result and vlm_result.get("resolved"):
return vlm_result
return _with_metrics(vlm_result)
print(" [VISUAL] Toutes les méthodes ont échoué")
return None
def _server_resolve_target(
    self, server_url: str, screenshot_b64: str, target_spec: dict,
    fallback_x: float, fallback_y: float,
    screen_width: int, screen_height: int,
) -> "dict | None":
    """Resolve a visual target through the server (SomEngine + VLM on GPU).

    The agent POSTs the screenshot and ``target_spec`` to
    ``<server_url>/traces/stream/replay/resolve_target`` and hands the
    decoded JSON answer back to the caller. Per the original author's
    note, the server hosts SomEngine (YOLO + docTR) and the VLM (qwen3-vl).

    Args:
        server_url: Base URL of the server; a trailing slash is tolerated.
        screenshot_b64: Base64-encoded screenshot sent as ``screenshot_b64``.
        target_spec: Target description forwarded as-is to the server.
        fallback_x: Sent as ``fallback_x_pct``.
        fallback_y: Sent as ``fallback_y_pct``.
        screen_width: Sent as ``screen_width``.
        screen_height: Sent as ``screen_height``.

    Returns:
        The JSON object returned by the server (callers inspect its
        ``"resolved"`` key), or ``None`` on a non-OK HTTP status, a
        timeout, an unexpected payload, or any other error. This method
        never raises: resolution is best-effort and the caller falls back
        to other strategies.
    """
    import requests as _requests
    from .config import API_TOKEN

    def _as_float(value) -> float:
        # Log formatting must never invalidate an otherwise valid answer:
        # a null or non-numeric field is displayed as 0.0 instead of raising.
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    # Strip a trailing slash so the endpoint path never contains "//".
    url = f"{server_url.rstrip('/')}/traces/stream/replay/resolve_target"
    payload = {
        "session_id": "",
        "screenshot_b64": screenshot_b64,
        "target_spec": target_spec,
        "fallback_x_pct": fallback_x,
        "fallback_y_pct": fallback_y,
        "screen_width": screen_width,
        "screen_height": screen_height,
        "strict_mode": True,
    }
    headers = {"Content-Type": "application/json"}
    if API_TOKEN:
        headers["Authorization"] = f"Bearer {API_TOKEN}"

    try:
        print(f" [SERVER-RESOLVE] Appel serveur {server_url}...")
        resp = _requests.post(url, json=payload, headers=headers, timeout=30)
        if not resp.ok:
            logger.warning(f"Server resolve HTTP {resp.status_code}")
            return None

        data = resp.json()
        if not isinstance(data, dict):
            # Callers use data.get(...); anything but a JSON object is unusable.
            print(" [SERVER-RESOLVE] Erreur : réponse JSON inattendue")
            logger.warning("Server resolve erreur : réponse JSON inattendue")
            return None

        method = data.get("method", "server_unknown")
        if data.get("resolved", False):
            score = _as_float(data.get("score"))
            print(
                f" [SERVER-RESOLVE] OK [{method}] "
                f"→ ({_as_float(data.get('x_pct')):.3f}, {_as_float(data.get('y_pct')):.3f}) "
                f"score={score:.2f}"
            )
            logger.info(f"Server resolve OK [{method}] score={score:.2f}")
        else:
            reason = data.get("reason", "unknown")
            print(f" [SERVER-RESOLVE] Échec ({reason})")
            logger.info(f"Server resolve échoué : {reason}")
        return data

    except _requests.Timeout:
        print(" [SERVER-RESOLVE] Timeout (30s)")
        logger.warning("Server resolve timeout")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: any network, decoding or
        # serialization failure simply disables the server strategy.
        print(f" [SERVER-RESOLVE] Erreur : {e}")
        logger.warning(f"Server resolve erreur : {e}")
        return None
def _template_match_anchor(
self, screenshot_b64: str, anchor_b64: str,
screen_width: int, screen_height: int,
@@ -832,6 +922,9 @@ Example: x_pct=0.50, y_pct=0.30"""
"error": result.get("error"),
"warning": result.get("warning"),
"screenshot": result.get("screenshot"),
"resolution_method": result.get("resolution_method"),
"resolution_score": result.get("resolution_score"),
"resolution_elapsed_ms": result.get("resolution_elapsed_ms"),
}
try:
resp2 = requests.post(
@@ -887,7 +980,29 @@ Example: x_pct=0.50, y_pct=0.30"""
logger.warning("[POPUP-VLM] Capture screenshot échouée")
return False
# Étape 1 : Le VLM identifie le bouton à cliquer
# Essayer la détection popup via le serveur d'abord
from .config import SERVER_URL, API_TOKEN
if SERVER_URL:
monitor = self.sct.monitors[1]
sw, sh = monitor["width"], monitor["height"]
server_result = self._server_resolve_target(
SERVER_URL, screenshot_b64,
{"vlm_description": "popup, dialog box, confirmation, or error message button (Oui, OK, Yes, Non, Enregistrer, Annuler)"},
0.5, 0.5, sw, sh,
)
if server_result and server_result.get("resolved"):
x_pct = server_result["x_pct"]
y_pct = server_result["y_pct"]
real_x = int(x_pct * sw)
real_y = int(y_pct * sh)
label = server_result.get("matched_element", {}).get("label", "popup")
print(f" [POPUP-SERVER] Popup détectée ! Clic sur '{label}' → ({real_x}, {real_y})")
logger.info(f"[POPUP-SERVER] Clic popup '{label}' à ({real_x}, {real_y})")
self._click((real_x, real_y), "left")
time.sleep(1.0)
return True
# Fallback : VLM local identifie le bouton à cliquer
button_text = self._vlm_identify_popup_button(screenshot_b64)
if not button_text:
return False # Pas de popup ou VLM en échec
@@ -952,7 +1067,7 @@ Example: x_pct=0.50, y_pct=0.30"""
ollama_url = f"http://{ollama_host}:11434/api/chat"
prompt = (
"Look at this screenshot. Is there a popup dialog, confirmation dialog, "
"Regarde cette capture d'écran. Y a-t-il une popup, une boîte de dialogue, "
"error message, or modal window visible?\n"
"If yes, what button should I click to proceed?\n"
"Answer ONLY the button text (like: Oui, OK, Yes, Enregistrer, Non, "
@@ -1083,7 +1198,7 @@ Example: x_pct=0.50, y_pct=0.30"""
best_match = None
best_val = 0.0
threshold = 0.55 # Seuil assez permissif pour le texte de bouton
threshold = 0.50 # Seuil équilibré
# Essayer plusieurs tailles de police pour couvrir différentes résolutions
for font_size in [14, 16, 18, 20, 22, 24, 12, 26, 28, 10]: