From a1c97504ab56089c7ac59ddcbda7417270ae3378 Mon Sep 17 00:00:00 2001 From: Dom Date: Sun, 5 Apr 2026 00:09:08 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20Phase=201=20acteur=20=E2=80=94=20pr?= =?UTF-8?q?=C3=A9/post=20v=C3=A9rification=20titre=20fen=C3=AAtre?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pré-vérification : avant chaque clic, vérifie que le titre de la fenêtre active correspond à celui de l'enregistrement. Stop si mismatch. Post-vérification : après chaque clic, vérifie que le titre a changé vers expected_window_title (titre du prochain clic). Warning si mismatch. expected_window_title enrichi dans build_replay depuis la séquence des clics. Co-Authored-By: Claude Opus 4.6 (1M context) --- agent_v0/agent_v1/core/executor.py | 190 ++++++++++++++++++++----- agent_v0/server_v1/stream_processor.py | 11 ++ 2 files changed, 163 insertions(+), 38 deletions(-) diff --git a/agent_v0/agent_v1/core/executor.py b/agent_v0/agent_v1/core/executor.py index cb3b35978..884a4b68f 100644 --- a/agent_v0/agent_v1/core/executor.py +++ b/agent_v0/agent_v1/core/executor.py @@ -79,6 +79,8 @@ class ActionExecutorV1: self._poll_backoff_factor = 1.5 # Multiplicateur en cas d'echec # Token d'authentification API self._api_token = os.environ.get("RPA_API_TOKEN", "") + # Gestionnaire de notifications toast (pour les messages utilisateur) + self._notification_manager = None # Log de la resolution physique pour le diagnostic DPI self._log_screen_info() @@ -94,6 +96,22 @@ class ActionExecutorV1: except Exception as e: logger.debug(f"Impossible de lire la resolution ecran : {e}") + @property + def notifier(self): + """Instance NotificationManager paresseuse.""" + if self._notification_manager is None: + try: + from ..ui.notifications import NotificationManager + self._notification_manager = NotificationManager() + except Exception as e: + logger.debug(f"NotificationManager indisponible : {e}") + # Retourner un objet factice qui ne fait rien + class _Noop: + def replay_target_not_found(self, *a, **kw): + return False + self._notification_manager = _Noop() + return self._notification_manager + def _auth_headers(self) -> dict: """Headers d'authentification Bearer pour les requetes au serveur.""" if self._api_token: @@ -107,6 +125,25 @@ class ActionExecutorV1: self._sct = mss.mss() return self._sct + @staticmethod + def _describe_target(target_spec: dict) -> str: + """Construire une description humaine de la cible depuis target_spec. + + Utilisé pour les notifications et le logging quand la cible n'est pas trouvée. + """ + parts = [] + if target_spec.get("by_role"): + parts.append(target_spec["by_role"]) + if target_spec.get("by_text"): + parts.append(f"'{target_spec['by_text']}'") + if target_spec.get("vlm_description"): + parts.append(target_spec["vlm_description"][:80]) + if target_spec.get("window_title"): + parts.append(f"dans {target_spec['window_title']}") + if parts: + return " ".join(parts) + return "élément inconnu" + # ========================================================================= # Execution legacy (watchdog command.json) # ========================================================================= @@ -191,6 +228,27 @@ class ActionExecutorV1: x_pct = action.get("x_pct", 0.0) y_pct = action.get("y_pct", 0.0) + # ── Pré-vérification : titre fenêtre ── + # Vérifier que l'écran est dans l'état attendu AVANT de cliquer. + if visual_mode and target_spec: + expected_title = target_spec.get("window_title", "") + if expected_title and expected_title != "unknown_window": + from ..window_info_crossplatform import get_active_window_info + current_info = get_active_window_info() + current_title = current_info.get("title", "") + # Comparer les titres (partiel — le titre peut varier légèrement) + if expected_title.lower() not in current_title.lower() and current_title.lower() not in expected_title.lower(): + logger.warning( + f"PRÉ-VÉRIF ÉCHOUÉE : attendu '{expected_title}', " + f"actuel '{current_title}' — STOP" + ) + print(f" [PRÉ-VÉRIF] STOP — fenêtre '{current_title}' ≠ attendu '{expected_title}'") + result["success"] = False + result["error"] = f"Fenêtre incorrecte: '{current_title}' (attendu: '{expected_title}')" + return result + else: + logger.info(f"PRÉ-VÉRIF OK : '{current_title}'") + if visual_mode and target_spec and server_url: resolved = self._resolve_target_visual( server_url, target_spec, x_pct, y_pct, width, height @@ -248,16 +306,38 @@ class ActionExecutorV1: f"({x_pct:.3f}, {y_pct:.3f})" ) else: + # Cible toujours invisible après gestion popup — PAUSE supervisée + target_desc = self._describe_target(target_spec) result["success"] = False - result["error"] = "Élément non trouvé même après gestion popup" - print(f" [ERREUR] Élément toujours non trouvé après gestion popup — STOP") - logger.error(f"Action {action_id} : élément non trouvé après popup, replay stoppé") + result["error"] = "target_not_found" + result["target_description"] = target_desc + result["target_spec"] = target_spec + result["screenshot"] = self._capture_screenshot_b64() + result["warning"] = "visual_resolve_failed" + print(f" [ERREUR] Élément toujours non trouvé après gestion popup — PAUSE") + logger.error( + f"Action {action_id} : cible '{target_desc}' non trouvée " + f"après popup, replay en pause supervisée" + ) + # Notifier l'utilisateur via toast + self.notifier.replay_target_not_found(target_desc) return result else: + # Cible invisible, pas de popup — PAUSE supervisée + target_desc = self._describe_target(target_spec) result["success"] = False - result["error"] = "Visual resolve échoué — cible non trouvée à l'écran" - print(f" [ERREUR] Visual resolve échoué, pas de popup détectée — STOP") - logger.error(f"Action {action_id} : visual resolve échoué, pas de popup, replay stoppé") + result["error"] = "target_not_found" + result["target_description"] = target_desc + result["target_spec"] = target_spec + result["screenshot"] = self._capture_screenshot_b64() + result["warning"] = "visual_resolve_failed" + print(f" [ERREUR] Visual resolve échoué, pas de popup — PAUSE") + logger.error( + f"Action {action_id} : cible '{target_desc}' non trouvée, " + f"replay en pause supervisée" + ) + # Notifier l'utilisateur via toast + self.notifier.replay_target_not_found(target_desc) return result real_x = int(x_pct * width) @@ -269,12 +349,28 @@ class ActionExecutorV1: f"({real_x}, {real_y}) sur ({width}x{height}), bouton={button}" ) self._click((real_x, real_y), button) - print(f" [CLICK] Termine.") logger.info( f"Replay click [{mode}] : ({x_pct:.3f}, {y_pct:.3f}) -> " f"({real_x}, {real_y}) sur ({width}x{height})" ) + # ── Post-vérification : titre fenêtre après le clic ── + expected_after = action.get("expected_window_title", "") + if expected_after: + time.sleep(0.5) # Laisser le temps à la fenêtre de changer + from ..window_info_crossplatform import get_active_window_info + post_info = get_active_window_info() + post_title = post_info.get("title", "") + if expected_after.lower() in post_title.lower() or post_title.lower() in expected_after.lower(): + print(f" [POST-VÉRIF] OK — '{post_title}'") + logger.info(f"POST-VÉRIF OK : '{post_title}'") + else: + print(f" [POST-VÉRIF] ATTENTION — '{post_title}' ≠ attendu '{expected_after}'") + logger.warning(f"POST-VÉRIF : '{post_title}' ≠ attendu '{expected_after}'") + result["warning"] = f"post_verif_mismatch:{post_title}" + else: + print(f" [CLICK] Terminé.") + elif action_type == "type": text = action.get("text", "") raw_keys = action.get("raw_keys") @@ -636,18 +732,23 @@ class ActionExecutorV1: "What is the exact text label of this element? " "Answer ONLY the text visible on the element (button text, label, menu item)." ) - prefill = "The text is: " + # Prefill pour les modèles thinking (qwen3) — skip la phase de réflexion + _vlm_model_ident = os.environ.get("RPA_VLM_MODEL", "gemma4:e4b") + _is_thinking_ident = "qwen3" in _vlm_model_ident.lower() + + messages_ident = [ + { + "role": "system", + "content": "You read text from UI screenshots. Answer briefly with just the text.", + }, + {"role": "user", "content": prompt, "images": [screenshot_b64]}, + ] + if _is_thinking_ident: + messages_ident.append({"role": "assistant", "content": "The text is: "}) payload = { - "model": os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b"), - "messages": [ - { - "role": "system", - "content": "You read text from UI screenshots. Answer briefly with just the text.", - }, - {"role": "user", "content": prompt, "images": [screenshot_b64]}, - {"role": "assistant", "content": prefill}, - ], + "model": _vlm_model_ident, + "messages": messages_ident, "stream": False, "think": False, "options": {"temperature": 0.1, "num_predict": 30, "num_ctx": 8192}, @@ -762,16 +863,21 @@ Example: x_pct=0.50, y_pct=0.30""" ollama_host = os.environ.get("RPA_SERVER_HOST", "localhost") ollama_url = f"http://{ollama_host}:11434/api/chat" - # Prefill plus explicite pour guider la réponse - prefill = '{"x_pct": 0.' + # Prefill pour les modèles thinking (qwen3) — évite le mode réflexion >180s + _vlm_model = os.environ.get("RPA_VLM_MODEL", "gemma4:e4b") + _is_thinking = "qwen3" in _vlm_model.lower() + prefill = '{"x_pct": 0.' if _is_thinking else "" + + messages = [ + {"role": "system", "content": "You locate UI elements on screenshots. Reply with JSON only: {\"x_pct\": 0.XX, \"y_pct\": 0.XX, \"confidence\": 0.XX}"}, + {"role": "user", "content": prompt, "images": images}, + ] + if prefill: + messages.append({"role": "assistant", "content": prefill}) payload = { - "model": os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b"), - "messages": [ - {"role": "system", "content": "You locate UI elements on screenshots. Reply with JSON only: {\"x_pct\": 0.XX, \"y_pct\": 0.XX, \"confidence\": 0.XX}"}, - {"role": "user", "content": prompt, "images": images}, - {"role": "assistant", "content": prefill}, - ], + "model": _vlm_model, + "messages": messages, "stream": False, "think": False, "options": {"temperature": 0.1, "num_predict": 60, "num_ctx": 8192}, @@ -927,6 +1033,9 @@ Example: x_pct=0.50, y_pct=0.30""" "resolution_method": result.get("resolution_method"), "resolution_score": result.get("resolution_score"), "resolution_elapsed_ms": result.get("resolution_elapsed_ms"), + # Champs enrichis pour target_not_found (pause supervisée) + "target_description": result.get("target_description"), + "target_spec": result.get("target_spec"), } try: resp2 = requests.post( @@ -1077,21 +1186,26 @@ Example: x_pct=0.50, y_pct=0.30""" "If no popup: answer NO_POPUP" ) - prefill = "The button to click is: " + # Prefill pour les modèles thinking (qwen3) — skip la phase de réflexion + _vlm_model_popup = os.environ.get("RPA_VLM_MODEL", "gemma4:e4b") + _is_thinking_popup = "qwen3" in _vlm_model_popup.lower() + + messages_popup = [ + { + "role": "system", + "content": ( + "You analyze screenshots to detect popup dialogs. " + "Answer briefly with just the button text. No JSON, no coordinates." + ), + }, + {"role": "user", "content": prompt, "images": [screenshot_b64]}, + ] + if _is_thinking_popup: + messages_popup.append({"role": "assistant", "content": "The button to click is: "}) payload = { - "model": os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b"), - "messages": [ - { - "role": "system", - "content": ( - "You analyze screenshots to detect popup dialogs. " - "Answer briefly with just the button text. No JSON, no coordinates." - ), - }, - {"role": "user", "content": prompt, "images": [screenshot_b64]}, - {"role": "assistant", "content": prefill}, - ], + "model": _vlm_model_popup, + "messages": messages_popup, "stream": False, "think": False, "options": {"temperature": 0.1, "num_predict": 30, "num_ctx": 8192}, diff --git a/agent_v0/server_v1/stream_processor.py b/agent_v0/server_v1/stream_processor.py index 462e8d2a2..6595409d1 100644 --- a/agent_v0/server_v1/stream_processor.py +++ b/agent_v0/server_v1/stream_processor.py @@ -1408,6 +1408,17 @@ def build_replay_from_raw_events( if session_dir_path: _attach_expected_screenshots(result, events, session_dir_path) + # ── 9. Enrichir avec expected_window_title (titre fenêtre attendu après le clic) ── + # Pour la vérification post-action : le titre de la fenêtre APRÈS le clic + # est le window_title du PROCHAIN clic dans la séquence. + click_indices = [i for i, a in enumerate(result) if a.get("type") == "click"] + for j, ci in enumerate(click_indices): + if j + 1 < len(click_indices): + next_ci = click_indices[j + 1] + next_title = result[next_ci].get("target_spec", {}).get("window_title", "") + if next_title: + result[ci]["expected_window_title"] = next_title + # Stats visual replay visual_clicks = sum( 1 for a in result