feat: résolution serveur pour replay Windows + VLM multi-image + métriques

Feature 4 — Résolution serveur :
- Nouvelle méthode _server_resolve_target() dans executor.py
- Cascade : template local → serveur /resolve_target → VLM local (fallback)
- Gestion des popups également via le serveur
- L'agent Windows peut maintenant résoudre les clics via SomEngine+VLM

Feature 5 — VLM multi-image :
- _resolve_by_som() envoie l'anchor crop en 2ème image au VLM
- Le VLM voit les marks numérotés + le crop de l'élément recherché

Feature 6 — Métriques de résolution :
- resolution_method, resolution_score, resolution_elapsed_ms
- Propagés agent → serveur via /replay/result
- Résumé en fin de replay (méthodes, score moyen, temps moyen)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-03-31 11:37:35 +02:00
parent 1e8e2dd9f3
commit 18792fd7b4
2 changed files with 178 additions and 17 deletions

View File

@@ -199,9 +199,15 @@ class ActionExecutorV1:
x_pct = resolved["x_pct"] x_pct = resolved["x_pct"]
y_pct = resolved["y_pct"] y_pct = resolved["y_pct"]
result["visual_resolved"] = resolved.get("resolved", False) result["visual_resolved"] = resolved.get("resolved", False)
# Métriques de résolution
result["resolution_method"] = resolved.get("resolution_method", "")
result["resolution_score"] = resolved.get("resolution_score", 0.0)
result["resolution_elapsed_ms"] = resolved.get("resolution_elapsed_ms", 0.0)
if resolved.get("resolved"): if resolved.get("resolved"):
logger.info( logger.info(
f"Visual resolve OK: {resolved.get('matched_element', {}).get('label', '?')} " f"Visual resolve OK [{result['resolution_method']}] "
f"{result['resolution_elapsed_ms']:.0f}ms : "
f"{resolved.get('matched_element', {}).get('label', '?')} "
f"-> ({x_pct:.4f}, {y_pct:.4f})" f"-> ({x_pct:.4f}, {y_pct:.4f})"
) )
@@ -391,22 +397,44 @@ class ActionExecutorV1:
Stratégie hybride en cascade : Stratégie hybride en cascade :
1. Template matching avec le crop anchor (rapide, fiable si l'UI n'a pas changé) 1. Template matching avec le crop anchor (rapide, fiable si l'UI n'a pas changé)
2. VLM identifie l'élément + template matching texte (approche hybride) 2. Serveur resolve_target (SomEngine + VLM, si serveur accessible)
3. VLM direct coordonnées (legacy, peu fiable avec qwen3-vl:8b) 3. VLM local (fallback pour dev/test Linux)
""" """
import time as _time
t_start = _time.time()
screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=75) screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=75)
if not screenshot_b64: if not screenshot_b64:
logger.warning("Capture screenshot echouee pour visual resolve") logger.warning("Capture screenshot echouee pour visual resolve")
return None return None
def _with_metrics(result, method_override=None):
    """Attach resolution metrics to a resolver result and hand it back.

    A ``None`` result is passed through untouched. Otherwise the mapping is
    updated in place with three keys:

    * ``resolution_method`` -- ``method_override`` when truthy, else the
      result's own ``method`` entry, else ``"unknown"``;
    * ``resolution_score`` -- the result's ``score`` entry, else ``0.0``;
    * ``resolution_elapsed_ms`` -- milliseconds elapsed since ``t_start``
      (captured in the enclosing scope), rounded to one decimal.
    """
    if result is not None:
        # Take the timestamp first so the bookkeeping below is not measured.
        spent_ms = (_time.time() - t_start) * 1000
        if method_override:
            method_label = method_override
        else:
            method_label = result.get("method", "unknown")
        result.update(
            resolution_method=method_label,
            resolution_score=result.get("score", 0.0),
            resolution_elapsed_ms=round(spent_ms, 1),
        )
    return result
# ---- ÉTAPE 1 : Template matching avec le crop anchor ---- # ---- ÉTAPE 1 : Template matching avec le crop anchor ----
anchor_b64 = target_spec.get("anchor_image_base64", "") anchor_b64 = target_spec.get("anchor_image_base64", "")
if anchor_b64: if anchor_b64:
tm_result = self._template_match_anchor(screenshot_b64, anchor_b64, screen_width, screen_height) tm_result = self._template_match_anchor(screenshot_b64, anchor_b64, screen_width, screen_height)
if tm_result and tm_result.get("resolved"): if tm_result and tm_result.get("resolved"):
return tm_result return _with_metrics(tm_result)
# ---- ÉTAPE 2 : Approche hybride VLM identifie + template matching texte ---- # ---- ÉTAPE 2 : Résolution serveur (SomEngine + VLM) ----
if server_url:
server_result = self._server_resolve_target(
server_url, screenshot_b64, target_spec,
fallback_x, fallback_y, screen_width, screen_height,
)
if server_result and server_result.get("resolved"):
return _with_metrics(server_result)
# ---- ÉTAPE 3 : VLM local (fallback dev/test, si Ollama accessible) ----
by_text = target_spec.get("by_text", "") by_text = target_spec.get("by_text", "")
vlm_description = target_spec.get("vlm_description", "") vlm_description = target_spec.get("vlm_description", "")
if vlm_description or by_text: if vlm_description or by_text:
@@ -414,16 +442,78 @@ class ActionExecutorV1:
screenshot_b64, target_spec, screen_width, screen_height screenshot_b64, target_spec, screen_width, screen_height
) )
if hybrid_result and hybrid_result.get("resolved"): if hybrid_result and hybrid_result.get("resolved"):
return hybrid_result return _with_metrics(hybrid_result)
# ---- ÉTAPE 3 : VLM direct coordonnées (legacy, peu fiable) ----
vlm_result = self._vlm_direct_resolve(screenshot_b64, target_spec) vlm_result = self._vlm_direct_resolve(screenshot_b64, target_spec)
if vlm_result and vlm_result.get("resolved"): if vlm_result and vlm_result.get("resolved"):
return vlm_result return _with_metrics(vlm_result)
print(" [VISUAL] Toutes les méthodes ont échoué") print(" [VISUAL] Toutes les méthodes ont échoué")
return None return None
def _server_resolve_target(
    self, server_url: str, screenshot_b64: str, target_spec: dict,
    fallback_x: float, fallback_y: float,
    screen_width: int, screen_height: int,
    *, timeout: float = 30,
) -> "dict | None":
    """Resolve a visual target through the server (SomEngine + VLM on GPU).

    The agent posts the screenshot and ``target_spec`` to the server's
    ``/traces/stream/replay/resolve_target`` endpoint and returns the decoded
    JSON answer. This is a best-effort step of a fallback cascade, so it
    never raises: every failure is logged and reported as ``None``.

    Args:
        server_url: Base URL of the server; a trailing slash is tolerated.
        screenshot_b64: Base64-encoded screenshot to analyse.
        target_spec: Description of the element to find, sent as-is.
        fallback_x: Sent to the server as ``fallback_x_pct``.
        fallback_y: Sent to the server as ``fallback_y_pct``.
        screen_width: Screen width, sent as-is.
        screen_height: Screen height, sent as-is.
        timeout: HTTP timeout in seconds (default 30).

    Returns:
        The server's JSON object (callers check its ``resolved`` key), or
        ``None`` on HTTP error, timeout, malformed/non-object JSON, or any
        other exception.
    """
    import requests as _requests
    from .config import API_TOKEN

    # Strip a trailing "/" so the path is not built with a double slash.
    url = f"{server_url.rstrip('/')}/traces/stream/replay/resolve_target"
    payload = {
        "session_id": "",
        "screenshot_b64": screenshot_b64,
        "target_spec": target_spec,
        "fallback_x_pct": fallback_x,
        "fallback_y_pct": fallback_y,
        "screen_width": screen_width,
        "screen_height": screen_height,
        "strict_mode": True,
    }

    headers = {"Content-Type": "application/json"}
    if API_TOKEN:
        headers["Authorization"] = f"Bearer {API_TOKEN}"

    try:
        print(f" [SERVER-RESOLVE] Appel serveur {server_url}...")
        resp = _requests.post(url, json=payload, headers=headers, timeout=timeout)
        if not resp.ok:
            logger.warning("Server resolve HTTP %s", resp.status_code)
            return None

        data = resp.json()
        if not isinstance(data, dict):
            # A JSON list/scalar has no "resolved" key; treat it as a failure
            # explicitly instead of relying on an AttributeError below.
            logger.warning(
                "Server resolve : réponse JSON inattendue (%s)",
                type(data).__name__,
            )
            return None

        def _num(key: str) -> float:
            """Return ``data[key]`` as a float for display; 0.0 if unusable."""
            try:
                return float(data.get(key) or 0.0)
            except (TypeError, ValueError):
                return 0.0

        method = data.get("method", "server_unknown")
        if data.get("resolved", False):
            # Values are sanitised for logging only: a null or non-numeric
            # score must not turn a successful resolution into a failure.
            score = _num("score")
            print(
                f" [SERVER-RESOLVE] OK [{method}] "
                f"→ ({_num('x_pct'):.3f}, {_num('y_pct'):.3f}) "
                f"score={score:.2f}"
            )
            logger.info("Server resolve OK [%s] score=%.2f", method, score)
        else:
            reason = data.get("reason", "unknown")
            print(f" [SERVER-RESOLVE] Échec ({reason})")
            logger.info("Server resolve échoué : %s", reason)

        return data

    except _requests.Timeout:
        print(f" [SERVER-RESOLVE] Timeout ({timeout:g}s)")
        logger.warning("Server resolve timeout")
        return None
    except Exception as e:
        # Deliberately broad: the caller falls back to the next resolution
        # strategy, so connection errors and bad JSON must not propagate.
        print(f" [SERVER-RESOLVE] Erreur : {e}")
        logger.warning("Server resolve erreur : %s", e)
        return None
def _template_match_anchor( def _template_match_anchor(
self, screenshot_b64: str, anchor_b64: str, self, screenshot_b64: str, anchor_b64: str,
screen_width: int, screen_height: int, screen_width: int, screen_height: int,
@@ -832,6 +922,9 @@ Example: x_pct=0.50, y_pct=0.30"""
"error": result.get("error"), "error": result.get("error"),
"warning": result.get("warning"), "warning": result.get("warning"),
"screenshot": result.get("screenshot"), "screenshot": result.get("screenshot"),
"resolution_method": result.get("resolution_method"),
"resolution_score": result.get("resolution_score"),
"resolution_elapsed_ms": result.get("resolution_elapsed_ms"),
} }
try: try:
resp2 = requests.post( resp2 = requests.post(
@@ -887,7 +980,29 @@ Example: x_pct=0.50, y_pct=0.30"""
logger.warning("[POPUP-VLM] Capture screenshot échouée") logger.warning("[POPUP-VLM] Capture screenshot échouée")
return False return False
# Étape 1 : Le VLM identifie le bouton à cliquer # Essayer la détection popup via le serveur d'abord
from .config import SERVER_URL, API_TOKEN
if SERVER_URL:
monitor = self.sct.monitors[1]
sw, sh = monitor["width"], monitor["height"]
server_result = self._server_resolve_target(
SERVER_URL, screenshot_b64,
{"vlm_description": "popup, dialog box, confirmation, or error message button (Oui, OK, Yes, Non, Enregistrer, Annuler)"},
0.5, 0.5, sw, sh,
)
if server_result and server_result.get("resolved"):
x_pct = server_result["x_pct"]
y_pct = server_result["y_pct"]
real_x = int(x_pct * sw)
real_y = int(y_pct * sh)
label = server_result.get("matched_element", {}).get("label", "popup")
print(f" [POPUP-SERVER] Popup détectée ! Clic sur '{label}' → ({real_x}, {real_y})")
logger.info(f"[POPUP-SERVER] Clic popup '{label}' à ({real_x}, {real_y})")
self._click((real_x, real_y), "left")
time.sleep(1.0)
return True
# Fallback : VLM local identifie le bouton à cliquer
button_text = self._vlm_identify_popup_button(screenshot_b64) button_text = self._vlm_identify_popup_button(screenshot_b64)
if not button_text: if not button_text:
return False # Pas de popup ou VLM en échec return False # Pas de popup ou VLM en échec
@@ -952,7 +1067,7 @@ Example: x_pct=0.50, y_pct=0.30"""
ollama_url = f"http://{ollama_host}:11434/api/chat" ollama_url = f"http://{ollama_host}:11434/api/chat"
prompt = ( prompt = (
"Look at this screenshot. Is there a popup dialog, confirmation dialog, " "Regarde cette capture d'écran. Y a-t-il une popup, une boîte de dialogue, "
"error message, or modal window visible?\n" "error message, or modal window visible?\n"
"If yes, what button should I click to proceed?\n" "If yes, what button should I click to proceed?\n"
"Answer ONLY the button text (like: Oui, OK, Yes, Enregistrer, Non, " "Answer ONLY the button text (like: Oui, OK, Yes, Enregistrer, Non, "
@@ -1083,7 +1198,7 @@ Example: x_pct=0.50, y_pct=0.30"""
best_match = None best_match = None
best_val = 0.0 best_val = 0.0
threshold = 0.55 # Seuil assez permissif pour le texte de bouton threshold = 0.50 # Seuil équilibré
# Essayer plusieurs tailles de police pour couvrir différentes résolutions # Essayer plusieurs tailles de police pour couvrir différentes résolutions
for font_size in [14, 16, 18, 20, 22, 24, 12, 26, 28, 10]: for font_size in [14, 16, 18, 20, 22, 24, 12, 26, 28, 10]:

View File

@@ -408,6 +408,10 @@ class ReplayResultReport(BaseModel):
screenshot: Optional[str] = None # Chemin ou base64 du screenshot post-action screenshot: Optional[str] = None # Chemin ou base64 du screenshot post-action
screenshot_after: Optional[str] = None # Chemin ou base64 du screenshot APRES l'action screenshot_after: Optional[str] = None # Chemin ou base64 du screenshot APRES l'action
actual_position: Optional[Dict[str, float]] = None # {"x": px, "y": py} position réelle du clic actual_position: Optional[Dict[str, float]] = None # {"x": px, "y": py} position réelle du clic
# Métriques de résolution visuelle
resolution_method: Optional[str] = None # som_text_match, som_vlm, vlm_quick_find, etc.
resolution_score: Optional[float] = None
resolution_elapsed_ms: Optional[float] = None
class ErrorCallbackConfig(BaseModel): class ErrorCallbackConfig(BaseModel):
@@ -2286,6 +2290,9 @@ async def report_action_result(report: ReplayResultReport):
"actual_position": report.actual_position, "actual_position": report.actual_position,
"retry_count": retry_count, "retry_count": retry_count,
"verification": verification.to_dict() if verification else None, "verification": verification.to_dict() if verification else None,
"resolution_method": report.resolution_method,
"resolution_score": report.resolution_score,
"resolution_elapsed_ms": report.resolution_elapsed_ms,
} }
replay_state["results"].append(result_entry) replay_state["results"].append(result_entry)
@@ -2384,6 +2391,30 @@ async def report_action_result(report: ReplayResultReport):
f" ({replay_state['retried_actions']} retries, " f" ({replay_state['retried_actions']} retries, "
f"{replay_state['unverified_actions']} non vérifiées)" f"{replay_state['unverified_actions']} non vérifiées)"
) )
# Résumé des métriques de résolution visuelle
results_with_method = [
r for r in replay_state["results"]
if r.get("resolution_method")
]
if results_with_method:
methods_count = {}
total_elapsed = 0.0
total_score = 0.0
for r in results_with_method:
m = r["resolution_method"]
methods_count[m] = methods_count.get(m, 0) + 1
total_elapsed += r.get("resolution_elapsed_ms") or 0
total_score += r.get("resolution_score") or 0
avg_elapsed = total_elapsed / len(results_with_method)
avg_score = total_score / len(results_with_method)
methods_str = ", ".join(
f"{m}={c}" for m, c in sorted(methods_count.items())
)
logger.info(
f"Replay {replay_state['replay_id']} métriques résolution : "
f"{len(results_with_method)} resolves [{methods_str}] "
f"score_moy={avg_score:.2f} temps_moy={avg_elapsed:.0f}ms"
)
# Libérer le GPU pour le worker VLM si le replay est terminé ou en erreur # Libérer le GPU pour le worker VLM si le replay est terminé ou en erreur
if replay_state["status"] in ("completed", "error"): if replay_state["status"] in ("completed", "error"):
@@ -3506,9 +3537,23 @@ def _resolve_by_som(
for e in labeled_elements for e in labeled_elements
) )
# Multi-image : SoM annotée + anchor crop (si disponible)
anchor_b64 = target_spec.get("anchor_image_base64", "")
extra_images = [anchor_b64] if anchor_b64 else None
if extra_images:
prompt = (
"Image 1 shows the screen with numbered marks on each UI element.\n"
"Image 2 shows the element I'm looking for.\n\n"
f"Target: {target_desc}\n\n"
f"Detected elements:\n{elements_list}\n\n"
"Which mark number matches the target element in Image 2?\n"
'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
)
else:
prompt = ( prompt = (
f"I'm looking for: {target_desc}\n\n" f"I'm looking for: {target_desc}\n\n"
f"Here are the numbered elements detected on screen:\n{elements_list}\n\n" f"Detected elements:\n{elements_list}\n\n"
"Which number is the correct element?\n" "Which number is the correct element?\n"
'Answer with JSON only: {"mark_id": N, "confidence": 0.9}' 'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
) )
@@ -3523,6 +3568,7 @@ def _resolve_by_som(
temperature=0.1, temperature=0.1,
max_tokens=50, max_tokens=50,
force_json=False, force_json=False,
extra_images_b64=extra_images,
) )
except Exception as e: except Exception as e:
logger.warning("SoM resolve : erreur VLM — %s", e) logger.warning("SoM resolve : erreur VLM — %s", e)