diff --git a/agent_v0/agent_v1/core/executor.py b/agent_v0/agent_v1/core/executor.py index a1622c420..7deca00a8 100644 --- a/agent_v0/agent_v1/core/executor.py +++ b/agent_v0/agent_v1/core/executor.py @@ -199,9 +199,15 @@ class ActionExecutorV1: x_pct = resolved["x_pct"] y_pct = resolved["y_pct"] result["visual_resolved"] = resolved.get("resolved", False) + # Métriques de résolution + result["resolution_method"] = resolved.get("resolution_method", "") + result["resolution_score"] = resolved.get("resolution_score", 0.0) + result["resolution_elapsed_ms"] = resolved.get("resolution_elapsed_ms", 0.0) if resolved.get("resolved"): logger.info( - f"Visual resolve OK: {resolved.get('matched_element', {}).get('label', '?')} " + f"Visual resolve OK [{result['resolution_method']}] " + f"{result['resolution_elapsed_ms']:.0f}ms : " + f"{resolved.get('matched_element', {}).get('label', '?')} " f"-> ({x_pct:.4f}, {y_pct:.4f})" ) @@ -391,22 +397,44 @@ class ActionExecutorV1: Stratégie hybride en cascade : 1. Template matching avec le crop anchor (rapide, fiable si l'UI n'a pas changé) - 2. VLM identifie l'élément + template matching texte (approche hybride) - 3. VLM direct coordonnées (legacy, peu fiable avec qwen3-vl:8b) + 2. Serveur resolve_target (SomEngine + VLM, si serveur accessible) + 3. VLM local (fallback pour dev/test Linux) """ + import time as _time + t_start = _time.time() + screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=75) if not screenshot_b64: logger.warning("Capture screenshot echouee pour visual resolve") return None + def _with_metrics(result, method_override=None): + """Enrichir le résultat avec les métriques de résolution.""" + if result is None: + return None + elapsed_ms = (_time.time() - t_start) * 1000 + result["resolution_method"] = method_override or result.get("method", "unknown") + result["resolution_score"] = result.get("score", 0.0) + result["resolution_elapsed_ms"] = round(elapsed_ms, 1) + return result + # ---- ÉTAPE 1 : Template matching avec le crop anchor ---- anchor_b64 = target_spec.get("anchor_image_base64", "") if anchor_b64: tm_result = self._template_match_anchor(screenshot_b64, anchor_b64, screen_width, screen_height) if tm_result and tm_result.get("resolved"): - return tm_result + return _with_metrics(tm_result) - # ---- ÉTAPE 2 : Approche hybride VLM identifie + template matching texte ---- + # ---- ÉTAPE 2 : Résolution serveur (SomEngine + VLM) ---- + if server_url: + server_result = self._server_resolve_target( + server_url, screenshot_b64, target_spec, + fallback_x, fallback_y, screen_width, screen_height, + ) + if server_result and server_result.get("resolved"): + return _with_metrics(server_result) + + # ---- ÉTAPE 3 : VLM local (fallback dev/test, si Ollama accessible) ---- by_text = target_spec.get("by_text", "") vlm_description = target_spec.get("vlm_description", "") if vlm_description or by_text: @@ -414,16 +442,78 @@ class ActionExecutorV1: screenshot_b64, target_spec, screen_width, screen_height ) if hybrid_result and hybrid_result.get("resolved"): - return hybrid_result + return _with_metrics(hybrid_result) - # ---- ÉTAPE 3 : VLM direct coordonnées (legacy, peu fiable) ---- vlm_result = self._vlm_direct_resolve(screenshot_b64, target_spec) if vlm_result and vlm_result.get("resolved"): - return vlm_result + return _with_metrics(vlm_result) print(" [VISUAL] Toutes les méthodes ont échoué") return None + def _server_resolve_target( + self, server_url: str, screenshot_b64: str, target_spec: dict, + fallback_x: float, fallback_y: float, + screen_width: int, screen_height: int, + ) -> dict: + """Résolution visuelle via le serveur (SomEngine + VLM sur GPU). + + Le serveur dispose de SomEngine (YOLO + docTR) et du VLM (qwen3-vl). + L'agent envoie le screenshot + target_spec, le serveur résout et + retourne les coordonnées. + """ + import requests as _requests + from .config import API_TOKEN + + url = f"{server_url}/traces/stream/replay/resolve_target" + payload = { + "session_id": "", + "screenshot_b64": screenshot_b64, + "target_spec": target_spec, + "fallback_x_pct": fallback_x, + "fallback_y_pct": fallback_y, + "screen_width": screen_width, + "screen_height": screen_height, + "strict_mode": True, + } + headers = {"Content-Type": "application/json"} + if API_TOKEN: + headers["Authorization"] = f"Bearer {API_TOKEN}" + + try: + print(f" [SERVER-RESOLVE] Appel serveur {server_url}...") + resp = _requests.post(url, json=payload, headers=headers, timeout=30) + if not resp.ok: + logger.warning(f"Server resolve HTTP {resp.status_code}") + return None + + data = resp.json() + resolved = data.get("resolved", False) + method = data.get("method", "server_unknown") + + if resolved: + print( + f" [SERVER-RESOLVE] OK [{method}] " + f"→ ({data.get('x_pct', 0):.3f}, {data.get('y_pct', 0):.3f}) " + f"score={data.get('score', 0):.2f}" + ) + logger.info(f"Server resolve OK [{method}] score={data.get('score', 0):.2f}") + else: + reason = data.get("reason", "unknown") + print(f" [SERVER-RESOLVE] Échec ({reason})") + logger.info(f"Server resolve échoué : {reason}") + + return data + + except _requests.Timeout: + print(" [SERVER-RESOLVE] Timeout (30s)") + logger.warning("Server resolve timeout") + return None + except Exception as e: + print(f" [SERVER-RESOLVE] Erreur : {e}") + logger.warning(f"Server resolve erreur : {e}") + return None + def _template_match_anchor( self, screenshot_b64: str, anchor_b64: str, screen_width: int, screen_height: int, @@ -832,6 +922,9 @@ Example: x_pct=0.50, y_pct=0.30""" "error": result.get("error"), "warning": result.get("warning"), "screenshot": result.get("screenshot"), + "resolution_method": result.get("resolution_method"), + "resolution_score": result.get("resolution_score"), + "resolution_elapsed_ms": result.get("resolution_elapsed_ms"), } try: resp2 = requests.post( @@ -887,7 +980,29 @@ Example: x_pct=0.50, y_pct=0.30""" logger.warning("[POPUP-VLM] Capture screenshot échouée") return False - # Étape 1 : Le VLM identifie le bouton à cliquer + # Essayer la détection popup via le serveur d'abord + from .config import SERVER_URL, API_TOKEN + if SERVER_URL: + monitor = self.sct.monitors[1] + sw, sh = monitor["width"], monitor["height"] + server_result = self._server_resolve_target( + SERVER_URL, screenshot_b64, + {"vlm_description": "popup, dialog box, confirmation, or error message button (Oui, OK, Yes, Non, Enregistrer, Annuler)"}, + 0.5, 0.5, sw, sh, + ) + if server_result and server_result.get("resolved"): + x_pct = server_result["x_pct"] + y_pct = server_result["y_pct"] + real_x = int(x_pct * sw) + real_y = int(y_pct * sh) + label = server_result.get("matched_element", {}).get("label", "popup") + print(f" [POPUP-SERVER] Popup détectée ! Clic sur '{label}' → ({real_x}, {real_y})") + logger.info(f"[POPUP-SERVER] Clic popup '{label}' à ({real_x}, {real_y})") + self._click((real_x, real_y), "left") + time.sleep(1.0) + return True + + # Fallback : VLM local identifie le bouton à cliquer button_text = self._vlm_identify_popup_button(screenshot_b64) if not button_text: return False # Pas de popup ou VLM en échec @@ -952,7 +1067,7 @@ Example: x_pct=0.50, y_pct=0.30""" ollama_url = f"http://{ollama_host}:11434/api/chat" prompt = ( - "Look at this screenshot. Is there a popup dialog, confirmation dialog, " + "Regarde cette capture d'écran. Y a-t-il une popup, une boîte de dialogue, " "error message, or modal window visible?\n" "If yes, what button should I click to proceed?\n" "Answer ONLY the button text (like: Oui, OK, Yes, Enregistrer, Non, " @@ -1083,7 +1198,7 @@ Example: x_pct=0.50, y_pct=0.30""" best_match = None best_val = 0.0 - threshold = 0.55 # Seuil assez permissif pour le texte de bouton + threshold = 0.50 # Seuil équilibré # Essayer plusieurs tailles de police pour couvrir différentes résolutions for font_size in [14, 16, 18, 20, 22, 24, 12, 26, 28, 10]: diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py index c6dd773f2..ef41f5be5 100644 --- a/agent_v0/server_v1/api_stream.py +++ b/agent_v0/server_v1/api_stream.py @@ -408,6 +408,10 @@ class ReplayResultReport(BaseModel): screenshot: Optional[str] = None # Chemin ou base64 du screenshot post-action screenshot_after: Optional[str] = None # Chemin ou base64 du screenshot APRES l'action actual_position: Optional[Dict[str, float]] = None # {"x": px, "y": py} position réelle du clic + # Métriques de résolution visuelle + resolution_method: Optional[str] = None # som_text_match, som_vlm, vlm_quick_find, etc. + resolution_score: Optional[float] = None + resolution_elapsed_ms: Optional[float] = None class ErrorCallbackConfig(BaseModel): @@ -2286,6 +2290,9 @@ async def report_action_result(report: ReplayResultReport): "actual_position": report.actual_position, "retry_count": retry_count, "verification": verification.to_dict() if verification else None, + "resolution_method": report.resolution_method, + "resolution_score": report.resolution_score, + "resolution_elapsed_ms": report.resolution_elapsed_ms, } replay_state["results"].append(result_entry) @@ -2384,6 +2391,30 @@ async def report_action_result(report: ReplayResultReport): f" ({replay_state['retried_actions']} retries, " f"{replay_state['unverified_actions']} non vérifiées)" ) + # Résumé des métriques de résolution visuelle + results_with_method = [ + r for r in replay_state["results"] + if r.get("resolution_method") + ] + if results_with_method: + methods_count = {} + total_elapsed = 0.0 + total_score = 0.0 + for r in results_with_method: + m = r["resolution_method"] + methods_count[m] = methods_count.get(m, 0) + 1 + total_elapsed += r.get("resolution_elapsed_ms") or 0 + total_score += r.get("resolution_score") or 0 + avg_elapsed = total_elapsed / len(results_with_method) + avg_score = total_score / len(results_with_method) + methods_str = ", ".join( + f"{m}={c}" for m, c in sorted(methods_count.items()) + ) + logger.info( + f"Replay {replay_state['replay_id']} métriques résolution : " + f"{len(results_with_method)} resolves [{methods_str}] " + f"score_moy={avg_score:.2f} temps_moy={avg_elapsed:.0f}ms" + ) # Libérer le GPU pour le worker VLM si le replay est terminé ou en erreur if replay_state["status"] in ("completed", "error"): @@ -3506,12 +3537,26 @@ def _resolve_by_som( for e in labeled_elements ) - prompt = ( - f"I'm looking for: {target_desc}\n\n" - f"Here are the numbered elements detected on screen:\n{elements_list}\n\n" - "Which number is the correct element?\n" - 'Answer with JSON only: {"mark_id": N, "confidence": 0.9}' - ) + # Multi-image : SoM annotée + anchor crop (si disponible) + anchor_b64 = target_spec.get("anchor_image_base64", "") + extra_images = [anchor_b64] if anchor_b64 else None + + if extra_images: + prompt = ( + "Image 1 shows the screen with numbered marks on each UI element.\n" + "Image 2 shows the element I'm looking for.\n\n" + f"Target: {target_desc}\n\n" + f"Detected elements:\n{elements_list}\n\n" + "Which mark number matches the target element in Image 2?\n" + 'Answer with JSON only: {"mark_id": N, "confidence": 0.9}' + ) + else: + prompt = ( + f"I'm looking for: {target_desc}\n\n" + f"Detected elements:\n{elements_list}\n\n" + "Which number is the correct element?\n" + 'Answer with JSON only: {"mark_id": N, "confidence": 0.9}' + ) system_prompt = "You identify UI elements by number. Output JSON only, no explanation." @@ -3523,6 +3568,7 @@ def _resolve_by_som( temperature=0.1, max_tokens=50, force_json=False, + extra_images_b64=extra_images, ) except Exception as e: logger.warning("SoM resolve : erreur VLM — %s", e)