feat: résolution serveur pour replay Windows + VLM multi-image + métriques
Feature 4 — Résolution serveur :
- Nouvelle méthode _server_resolve_target() dans executor.py
- Cascade : template local → serveur /resolve_target → VLM local (fallback)
- Popup handling via serveur aussi
- L'agent Windows peut maintenant résoudre les clics via SomEngine+VLM

Feature 5 — VLM multi-image :
- _resolve_by_som() envoie l'anchor crop en 2ème image au VLM
- Le VLM voit les marks numérotés + le crop de l'élément recherché

Feature 6 — Métriques de résolution :
- resolution_method, resolution_score, resolution_elapsed_ms
- Propagés agent → serveur via /replay/result
- Résumé en fin de replay (méthodes, score moyen, temps moyen)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -408,6 +408,10 @@ class ReplayResultReport(BaseModel):
|
||||
screenshot: Optional[str] = None # Chemin ou base64 du screenshot post-action
|
||||
screenshot_after: Optional[str] = None # Chemin ou base64 du screenshot APRES l'action
|
||||
actual_position: Optional[Dict[str, float]] = None # {"x": px, "y": py} position réelle du clic
|
||||
# Métriques de résolution visuelle
|
||||
resolution_method: Optional[str] = None # som_text_match, som_vlm, vlm_quick_find, etc.
|
||||
resolution_score: Optional[float] = None
|
||||
resolution_elapsed_ms: Optional[float] = None
|
||||
|
||||
|
||||
class ErrorCallbackConfig(BaseModel):
|
||||
@@ -2286,6 +2290,9 @@ async def report_action_result(report: ReplayResultReport):
|
||||
"actual_position": report.actual_position,
|
||||
"retry_count": retry_count,
|
||||
"verification": verification.to_dict() if verification else None,
|
||||
"resolution_method": report.resolution_method,
|
||||
"resolution_score": report.resolution_score,
|
||||
"resolution_elapsed_ms": report.resolution_elapsed_ms,
|
||||
}
|
||||
replay_state["results"].append(result_entry)
|
||||
|
||||
@@ -2384,6 +2391,30 @@ async def report_action_result(report: ReplayResultReport):
|
||||
f" ({replay_state['retried_actions']} retries, "
|
||||
f"{replay_state['unverified_actions']} non vérifiées)"
|
||||
)
|
||||
# Résumé des métriques de résolution visuelle
|
||||
results_with_method = [
|
||||
r for r in replay_state["results"]
|
||||
if r.get("resolution_method")
|
||||
]
|
||||
if results_with_method:
|
||||
methods_count = {}
|
||||
total_elapsed = 0.0
|
||||
total_score = 0.0
|
||||
for r in results_with_method:
|
||||
m = r["resolution_method"]
|
||||
methods_count[m] = methods_count.get(m, 0) + 1
|
||||
total_elapsed += r.get("resolution_elapsed_ms") or 0
|
||||
total_score += r.get("resolution_score") or 0
|
||||
avg_elapsed = total_elapsed / len(results_with_method)
|
||||
avg_score = total_score / len(results_with_method)
|
||||
methods_str = ", ".join(
|
||||
f"{m}={c}" for m, c in sorted(methods_count.items())
|
||||
)
|
||||
logger.info(
|
||||
f"Replay {replay_state['replay_id']} métriques résolution : "
|
||||
f"{len(results_with_method)} resolves [{methods_str}] "
|
||||
f"score_moy={avg_score:.2f} temps_moy={avg_elapsed:.0f}ms"
|
||||
)
|
||||
|
||||
# Libérer le GPU pour le worker VLM si le replay est terminé ou en erreur
|
||||
if replay_state["status"] in ("completed", "error"):
|
||||
@@ -3506,12 +3537,26 @@ def _resolve_by_som(
|
||||
for e in labeled_elements
|
||||
)
|
||||
|
||||
prompt = (
|
||||
f"I'm looking for: {target_desc}\n\n"
|
||||
f"Here are the numbered elements detected on screen:\n{elements_list}\n\n"
|
||||
"Which number is the correct element?\n"
|
||||
'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
|
||||
)
|
||||
# Multi-image : SoM annotée + anchor crop (si disponible)
|
||||
anchor_b64 = target_spec.get("anchor_image_base64", "")
|
||||
extra_images = [anchor_b64] if anchor_b64 else None
|
||||
|
||||
if extra_images:
|
||||
prompt = (
|
||||
"Image 1 shows the screen with numbered marks on each UI element.\n"
|
||||
"Image 2 shows the element I'm looking for.\n\n"
|
||||
f"Target: {target_desc}\n\n"
|
||||
f"Detected elements:\n{elements_list}\n\n"
|
||||
"Which mark number matches the target element in Image 2?\n"
|
||||
'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
|
||||
)
|
||||
else:
|
||||
prompt = (
|
||||
f"I'm looking for: {target_desc}\n\n"
|
||||
f"Detected elements:\n{elements_list}\n\n"
|
||||
"Which number is the correct element?\n"
|
||||
'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
|
||||
)
|
||||
|
||||
system_prompt = "You identify UI elements by number. Output JSON only, no explanation."
|
||||
|
||||
@@ -3523,6 +3568,7 @@ def _resolve_by_som(
|
||||
temperature=0.1,
|
||||
max_tokens=50,
|
||||
force_json=False,
|
||||
extra_images_b64=extra_images,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("SoM resolve : erreur VLM — %s", e)
|
||||
|
||||
Reference in New Issue
Block a user