diff --git a/agent_v0/agent_v1/core/executor.py b/agent_v0/agent_v1/core/executor.py index 44d94c4ed..d273743ea 100644 --- a/agent_v0/agent_v1/core/executor.py +++ b/agent_v0/agent_v1/core/executor.py @@ -33,6 +33,40 @@ from pynput.keyboard import Controller as KeyboardController, Key, KeyCode logger = logging.getLogger(__name__) +# Dialogues runtime connus qui peuvent apparaitre au replay alors qu'ils +# n'etaient pas presents pendant l'enregistrement source. Pour ces cas, +# l'agent applique un reflexe cible plutot qu'une pause immediate. +_KNOWN_RUNTIME_DIALOGS = ( + { + "id": "confirm_save_overwrite", + "title_patterns": ( + "confirmer l'enregistrement", + "confirm save as", + ), + "button_texts": ("Oui", "Yes", "Remplacer", "Replace"), + "skip_current_action_after_handle": True, + }, +) + +# Dialogues dont l'identification exige plus que le titre seul. +# Exemple : "Bloc-notes" est ambigu ; en revanche, "Bloc-notes" + +# bouton "Ne pas enregistrer" au premier plan décrit sans ambiguïté +# le modal "Voulez-vous enregistrer les modifications ?". +_CONTEXTUAL_RUNTIME_DIALOGS = ( + { + "id": "notepad_unsaved_changes", + "title_patterns": ( + "bloc-notes", + "notepad", + ), + "evidence_texts": ( + "Ne pas enregistrer", + "Don't Save", + ), + "button_texts": ("Enregistrer", "Save"), + }, +) + # Mapping des noms de touches spéciales vers pynput.Key _SPECIAL_KEYS = { "enter": Key.enter, @@ -153,6 +187,565 @@ class ActionExecutorV1: return lambda *a, **kw: None return _Noop() + @staticmethod + def _anchor_match_within_drift( + matched_x_pct: float, + matched_y_pct: float, + fallback_x_pct: float, + fallback_y_pct: float, + max_drift: float = 0.25, + ) -> bool: + """Garde drift pour le template matching local (``ANCHOR-TM``). + + Patch 2026-05-23 — brief 0756 : le match cv2.matchTemplate de + l'anchor enregistré peut trouver un crop visuellement similaire + n'importe où à l'écran (ex: bandeau OBS Studio en arrière-plan + ressemble vaguement à un onglet Notepad). 
Sans garde, l'agent + clique au mauvais endroit et l'action suivante part en pause. + + On rejette tout match dont la position diverge de plus de + ``max_drift`` (fraction d'écran, dans chaque axe) par rapport + aux coordonnées fallback enregistrées de l'action. + + Si aucune coordonnée fallback significative n'est connue + (0,0 = placeholder), on n'applique pas la garde — le caller + n'a aucune référence à laquelle se comparer. + """ + if fallback_x_pct == 0.0 and fallback_y_pct == 0.0: + return True + return ( + abs(matched_x_pct - fallback_x_pct) <= max_drift + and abs(matched_y_pct - fallback_y_pct) <= max_drift + ) + + @staticmethod + def _window_title_matches_any(current_title: str, patterns: list) -> bool: + """Retourne True si ``current_title`` contient au moins un des + ``patterns`` (substring match, case-insensitive). Utilisé par + la garde ``verify_screen.expected_window_title_contains`` du + setup auto Windows pour bloquer la chaîne quand le menu + Démarrer / la barre Rechercher n'est pas réellement actif. + + Une liste vide retourne True (pas de garde demandée). + """ + if not patterns: + return True + current_lower = (current_title or "").lower() + return any( + str(p).lower() in current_lower + for p in patterns + if p + ) + + @staticmethod + def _normalize_loose_text(value: str) -> str: + """Normaliser légèrement un libellé UI pour matching robuste. + + Windows mélange parfois apostrophes ASCII et typographiques dans + les titres (`l'enregistrement` vs `l’enregistrement`). On unifie + aussi les tirets et espaces pour éviter les faux négatifs sur des + dialogues connus pourtant évidents. 
+ """ + if not value: + return "" + normalized = str(value).casefold().translate( + str.maketrans({ + "’": "'", + "‘": "'", + "`": "'", + "´": "'", + "–": "-", + "—": "-", + "−": "-", + "\xa0": " ", + }) + ) + return " ".join(normalized.split()) + + @staticmethod + def _match_known_runtime_dialog(current_title: str) -> Optional[Dict[str, Any]]: + """Identifier un dialogue runtime connu par son titre. + + Ces dialogues ne viennent pas toujours de la trace source. + Exemple : une popup d'ecrasement de fichier apparait au replay + parce que le fichier existe deja sur la machine cible. + """ + current_normalized = ActionExecutorV1._normalize_loose_text(current_title) + if not current_normalized: + return None + for spec in _KNOWN_RUNTIME_DIALOGS: + for pattern in spec.get("title_patterns", ()): + pattern_normalized = ActionExecutorV1._normalize_loose_text(pattern) + if pattern_normalized and pattern_normalized in current_normalized: + return dict(spec) + return None + + def _match_contextual_runtime_dialog( + self, + current_title: str, + screenshot_b64: str, + ) -> Optional[Dict[str, Any]]: + """Identifier un dialogue runtime via titre + evidence visuelle. + + Sert pour les modaux dont le titre seul n'est pas assez discriminant + (`Bloc-notes`, `Notepad`, etc.). On demande une evidence textuelle + simple et locale, sans VLM. 
+ """ + current_normalized = self._normalize_loose_text(current_title) + if not current_normalized or not screenshot_b64: + return None + + for spec in _CONTEXTUAL_RUNTIME_DIALOGS: + title_match = False + for pattern in spec.get("title_patterns", ()): + pattern_normalized = self._normalize_loose_text(pattern) + if pattern_normalized and pattern_normalized in current_normalized: + title_match = True + break + if not title_match: + continue + + for evidence_text in spec.get("evidence_texts", ()): + try: + if self._find_text_on_screen(screenshot_b64, evidence_text): + return dict(spec) + except Exception as e: + logger.debug( + "Contexte dialogue runtime: probe '%s' échouée: %s", + evidence_text, + e, + ) + return None + + @staticmethod + def _action_targets_runtime_dialog_button( + action: Optional[Dict[str, Any]], + target_spec: Optional[Dict[str, Any]], + dialog_spec: Optional[Dict[str, Any]], + ) -> bool: + """Dire si l'action courante vise déjà un bouton du dialogue au focus.""" + if not isinstance(target_spec, dict) or not isinstance(dialog_spec, dict): + return False + + action_type = str((action or {}).get("type", "")).strip().lower() + if action_type != "click": + return False + + button_candidates = [ + str(target_spec.get("by_text", "") or ""), + str(target_spec.get("vlm_description", "") or ""), + ] + normalized_candidates = [ + ActionExecutorV1._normalize_loose_text(text) + for text in button_candidates + if text + ] + if not normalized_candidates: + return False + + for button_text in dialog_spec.get("button_texts", ()): + normalized_button = ActionExecutorV1._normalize_loose_text(button_text) + if not normalized_button: + continue + for candidate in normalized_candidates: + if ( + candidate == normalized_button + or normalized_button in candidate + or candidate in normalized_button + ): + return True + return False + + def _maybe_contextualize_action_to_foreground_dialog( + self, + action: Dict[str, Any], + target_spec: Dict[str, Any], + ) -> 
Optional[Dict[str, Any]]: + """Recontextualiser l'action si un modal connu a pris le focus. + + Principe : si le premier plan est un dialogue connu ET que l'action + courante vise justement un de ses boutons, on adapte le contexte + d'exécution à cette fenêtre réelle. On ne "rejoue plus le parent" : + on agit là où l'utilisateur regarderait, c'est-à-dire dans le modal. + """ + try: + from ..window_info_crossplatform import ( + get_active_window_info, + get_active_window_rect, + ) + except Exception: + return None + + current_info = get_active_window_info() + current_title = str(current_info.get("title", "") or "") + if not current_title: + return None + + screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=75) + if not screenshot_b64: + return None + + dialog_spec = self._match_known_runtime_dialog(current_title) + if dialog_spec is None: + dialog_spec = self._match_contextual_runtime_dialog( + current_title, + screenshot_b64, + ) + if dialog_spec is None: + return None + + if not self._action_targets_runtime_dialog_button( + action, + target_spec, + dialog_spec, + ): + return None + + adapted_action = dict(action) + adapted_target_spec = dict(target_spec) + previous_window_title = str(adapted_target_spec.get("window_title", "") or "") + + adapted_action["expected_window_before"] = current_title + adapted_target_spec["window_title"] = current_title + + context_hints = dict(adapted_target_spec.get("context_hints") or {}) + context_hints["foreground_dialog_id"] = dialog_spec.get("id", "") + context_hints["foreground_dialog_title"] = current_title + if previous_window_title: + context_hints["source_window_title"] = previous_window_title + adapted_target_spec["context_hints"] = context_hints + + rect_info = get_active_window_rect() + if rect_info and rect_info.get("rect"): + rect = rect_info["rect"] + if isinstance(rect, (list, tuple)) and len(rect) == 4: + adapted_target_spec["window_capture"] = { + "title": current_title, + "app_name": 
rect_info.get("app_name", ""), + "rect": list(rect), + "window_size": [ + max(0, int(rect[2]) - int(rect[0])), + max(0, int(rect[3]) - int(rect[1])), + ], + } + + logger.info( + "[REPLAY] Action %s recontextualisée sur dialogue au focus '%s' " + "(dialog_id=%s, by_text='%s')", + adapted_action.get("action_id", "unknown"), + current_title, + dialog_spec.get("id", ""), + adapted_target_spec.get("by_text", ""), + ) + return { + "action": adapted_action, + "target_spec": adapted_target_spec, + "dialog_spec": dialog_spec, + "dialog_title": current_title, + } + + @staticmethod + def _normalize_window_hint(value: str) -> str: + """Normaliser un titre de fenêtre pour comparaisons souples. + + Les titres replay peuvent contenir un `*` de document modifié ou + de petites variations typographiques. On le retire ici pour + décider si une transition de fenêtre était réellement attendue. + """ + normalized = ActionExecutorV1._normalize_loose_text(value) + return normalized.strip(" *") + + @staticmethod + def _requires_post_verify_window_transition( + action: Optional[Dict[str, Any]], + target_spec: Optional[Dict[str, Any]], + expected_after: str, + ) -> bool: + """Dire si l'action exige vraiment l'ouverture d'une autre fenêtre. + + Exemple clé : un clic `Enregistrer` dans Bloc-notes qui doit + ouvrir `Enregistrer sous`. Si la fenêtre attendue n'apparait pas, + un simple changement global de pixels ne doit pas être validé + comme succès. 
+ """ + if not expected_after: + return False + + before = "" + if isinstance(action, dict): + before = str(action.get("expected_window_before", "") or "") + if not before and isinstance(target_spec, dict): + before = str(target_spec.get("window_title", "") or "") + if not before: + return False + + before_normalized = ActionExecutorV1._normalize_window_hint(before) + after_normalized = ActionExecutorV1._normalize_window_hint(expected_after) + if not before_normalized or not after_normalized: + return False + + return not ( + before_normalized == after_normalized + or before_normalized in after_normalized + or after_normalized in before_normalized + ) + + @staticmethod + def _is_start_button_target(target_spec: Optional[Dict[str, Any]]) -> bool: + """Détecter une action replay sémantisée comme bouton Démarrer.""" + if not isinstance(target_spec, dict): + return False + role = str((target_spec.get("by_role") or "")).strip().lower() + return role == "start_button" + + @staticmethod + def _is_close_tab_target(target_spec: Optional[Dict[str, Any]]) -> bool: + """Détecter une action replay sémantisée comme fermeture d'onglet. + + Le compilateur replay peut produire un click `tab_close_button` + avec `context_hints.interaction=close_tab`. Quand le `x` de + l'onglet n'est visible qu'au survol, le grounding visuel peut + échouer proprement alors que l'intention sémantique reste claire. 
+ """ + if not isinstance(target_spec, dict): + return False + context_hints = target_spec.get("context_hints") or {} + interaction = str((context_hints.get("interaction") or "")).strip().lower() + role = str((target_spec.get("by_role") or "")).strip().lower() + return interaction == "close_tab" or role == "tab_close_button" + + def _maybe_execute_start_button_hotkey_fallback( + self, + action: Dict[str, Any], + target_spec: Optional[Dict[str, Any]], + *, + visual_resolved: bool, + resolution_method: str, + ) -> Optional[Dict[str, Any]]: + """Fallback sémantique pour `start_button` pendant le setup Windows. + + Le clic Démarrer issu de l'enregistrement peut finir en + `position_fallback` si le grounding ne retrouve pas proprement le + bouton sur la machine cible. Ce clic aveugle s'est révélé fragile en + live : parfois aucun effet, parfois un clic voisin sur la taskbar. + + Quand l'intention sémantique est claire (`by_role=start_button`) et + qu'on est dans le setup auto, on préfère presser la touche Windows : + elle exprime exactement "ouvrir Démarrer" sans dépendre de la géométrie + de la barre des tâches. 
+ """ + if not action.get("_setup_phase"): + return None + if not self._is_start_button_target(target_spec): + return None + if visual_resolved and resolution_method != "position_fallback": + return None + + logger.warning( + "[REPLAY] start_button fragile -> fallback touche Windows " + "(action_id=%s, visual_resolved=%s, method=%s)", + action.get("action_id", "unknown"), + visual_resolved, + resolution_method or "?", + ) + print(" [START_BUTTON] Fallback sémantique -> touche Windows") + self._execute_key_combo(["win"]) + time.sleep(0.4) + return { + "warning": "start_button_hotkey_fallback", + "resolution_method": "semantic_start_button_hotkey", + "resolution_score": 1.0, + } + + def _maybe_execute_close_tab_hotkey_fallback( + self, + action: Dict[str, Any], + target_spec: Optional[Dict[str, Any]], + ) -> Optional[Dict[str, Any]]: + """Fallback sémantique pour `close_tab` quand le `x` n'est pas visible. + + On utilise `Ctrl+W`, qui correspond à l'intention "fermer + l'onglet actif" sur Bloc-notes moderne et la plupart des apps à + onglets. Le fallback reste strictement borné aux actions déjà + sémantisées `close_tab` pour éviter tout raccourci hasardeux. + """ + if action.get("_setup_phase"): + return None + if not self._is_close_tab_target(target_spec): + return None + + logger.warning( + "[REPLAY] close_tab non résolu visuellement -> fallback Ctrl+W " + "(action_id=%s)", + action.get("action_id", "unknown"), + ) + print(" [CLOSE_TAB] Cible cachée/hover-only -> fallback Ctrl+W") + self._execute_key_combo(["ctrl", "w"]) + time.sleep(0.4) + return { + "warning": "close_tab_hotkey_fallback", + "resolution_method": "semantic_close_tab_hotkey", + "resolution_score": 1.0, + } + + def _handle_known_runtime_dialog( + self, + dialog_spec: Dict[str, Any], + current_title: str, + screen_width: int, + screen_height: int, + ) -> Optional[Dict[str, Any]]: + """Cliquer le bouton attendu d'un dialogue runtime connu. + + Strategie : + 1. 
Resolution serveur par texte du bouton (vision stricte) + 2. Fallback local par template matching sur le texte + 3. Pas de fallback Enter ici : sur "Confirmer l'enregistrement", + le focus peut etre sur "Non", donc Enter serait ambigu. + """ + from ..config import SERVER_URL + + screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=75) + if not screenshot_b64: + return None + + for button_text in dialog_spec.get("button_texts", ()): + if SERVER_URL: + target_spec = { + "by_text": button_text, + "by_role": "dialog_button", + "window_title": current_title, + "vlm_description": ( + f"Dans la fenêtre '{current_title}', " + f"le bouton '{button_text}'" + ), + } + resolved = self._server_resolve_target( + SERVER_URL, + screenshot_b64, + target_spec, + 0.5, + 0.5, + screen_width, + screen_height, + ) + if resolved and resolved.get("resolved"): + x_pct = float(resolved.get("x_pct", 0.5)) + y_pct = float(resolved.get("y_pct", 0.5)) + self._click( + (int(x_pct * screen_width), int(y_pct * screen_height)), + "left", + ) + time.sleep(0.8) + logger.info( + f"[RUNTIME-DIALOG] '{current_title}' gere via serveur " + f"-> bouton '{button_text}' [{resolved.get('method', 'server')}]" + ) + return { + "handled": True, + "button_text": button_text, + "x_pct": x_pct, + "y_pct": y_pct, + "resolution_method": resolved.get( + "method", "runtime_dialog_server" + ), + "resolution_score": resolved.get("score", 0.0), + } + + local_pos = self._find_text_on_screen(screenshot_b64, button_text) + if local_pos: + real_x, real_y = local_pos + self._click((real_x, real_y), "left") + time.sleep(0.8) + logger.info( + f"[RUNTIME-DIALOG] '{current_title}' gere localement " + f"-> bouton '{button_text}' [dialog_text_template]" + ) + return { + "handled": True, + "button_text": button_text, + "x_pct": real_x / max(screen_width, 1), + "y_pct": real_y / max(screen_height, 1), + "resolution_method": "dialog_text_template", + "resolution_score": 0.8, + } + + logger.info( + f"[RUNTIME-DIALOG] Aucun 
bouton resolu pour '{current_title}'" + ) + return None + + def _maybe_handle_runtime_dialog_before_pause( + self, + action: Dict[str, Any], + target_spec: Dict[str, Any], + expected_title: str, + current_title: str, + screen_width: int, + screen_height: int, + ) -> Optional[Dict[str, Any]]: + """Tenter de gerer un dialogue runtime avant la pause supervisee. + + Quand la fenetre active ne correspond pas au contrat de l'action, + on verifie si un dialogue runtime connu explique l'ecart. Si oui, + on applique le reflexe associe. + """ + dialog_spec = self._match_known_runtime_dialog(current_title) + if not dialog_spec: + return None + + if self._check_and_pause_on_system_dialog(context="runtime_dialog_known"): + pause_info = self._system_dialog_pause or {} + return { + "action_id": action.get("action_id", "unknown"), + "success": False, + "error": ( + f"system_dialog:{pause_info.get('category', 'unknown')}" + ), + "screenshot": self._capture_screenshot_b64(), + "visual_resolved": False, + "system_dialog": pause_info, + "needs_human": True, + } + + handled = self._handle_known_runtime_dialog( + dialog_spec, + current_title, + screen_width, + screen_height, + ) + if not handled: + return None + + if dialog_spec.get("skip_current_action_after_handle", False): + logger.info( + f"[RUNTIME-DIALOG] Dialogue '{current_title}' gere -> " + f"action {action.get('action_id', 'unknown')} skippée" + ) + return { + "action_id": action.get("action_id", "unknown"), + "success": True, + "error": None, + "warning": "runtime_dialog_handled_skip", + "screenshot": self._capture_screenshot_b64(), + "visual_resolved": False, + "resolution_method": f"runtime_dialog:{dialog_spec['id']}", + "resolution_score": handled.get("resolution_score", 0.0), + "actual_position": { + "x_pct": handled.get("x_pct", 0.5), + "y_pct": handled.get("y_pct", 0.5), + }, + "correction": { + "trigger": "runtime_dialog", + "dialog_id": dialog_spec["id"], + "dialog_title": current_title, + "button_text": 
handled.get("button_text", ""), + "expected_window": expected_title, + }, + } + + return None + def _auth_headers(self) -> dict: """Headers d'authentification Bearer pour les requetes au serveur.""" if self._api_token: @@ -568,21 +1161,26 @@ class ActionExecutorV1: if cond_window: try: from ..window_info_crossplatform import get_active_window_info - current_info = get_active_window_info() - current_title = current_info.get("title", "") + + # Polling de 4 secondes pour laisser le temps au dialogue d'apparaître + # (race condition Windows classique). + found_cond = False + for _ in range(8): # 8 x 0.5s = 4s + current_info = get_active_window_info() + current_title = current_info.get("title", "") + + # Comparaison souple (sous-chaîne) + cond_lower = cond_window.lower() + current_lower = current_title.lower() if current_title else "" + if cond_lower in current_lower or current_lower in cond_lower: + found_cond = True + break + time.sleep(0.5) - # Comparaison souple (sous-chaîne) - cond_lower = cond_window.lower() - current_lower = current_title.lower() if current_title else "" - match = ( - cond_lower in current_lower - or current_lower in cond_lower - ) - if not match: + if not found_cond: logger.info( f"[CONDITIONNEL] Skip action {action_id} — " - f"dialogue '{cond_window}' absent " - f"(fenêtre actuelle: '{current_title}')" + f"dialogue '{cond_window}' absent après 4s" ) print( f" [SKIP] Dialogue '{cond_window}' absent → action skippée" @@ -592,7 +1190,7 @@ class ActionExecutorV1: return result else: logger.info( - f"[CONDITIONNEL] Dialogue '{cond_window}' présent → exécution" + f"[CONDITIONNEL] Dialogue '{cond_window}' détecté → exécution" ) except Exception as e: logger.debug(f"Vérif conditionnelle échouée : {e}") @@ -637,6 +1235,20 @@ class ActionExecutorV1: x_pct = action.get("x_pct", 0.0) y_pct = action.get("y_pct", 0.0) + # ── Réflexe contexte : modal au focus > workflow parent ── + # Si une boîte connue a pris le premier plan et que l'action en + # cours vise 
justement un de ses boutons, on adapte le contexte + # AVANT toute pré-vérification/résolution. Cela évite de chercher + # "dans le parent" alors que l'utilisateur regarderait le modal. + if visual_mode and target_spec and action_type == "click": + adapted = self._maybe_contextualize_action_to_foreground_dialog( + action, + target_spec, + ) + if adapted is not None: + action = adapted["action"] + target_spec = adapted["target_spec"] + # Extraire le nom de l'application depuis un titre de fenêtre def _app_name(title): for sep in [" – ", " - ", " — "]: @@ -709,6 +1321,16 @@ class ActionExecutorV1: f"[LEA] Fenêtre incorrecte : attendu '{expected_title}', " f"actuel '{current_title}'" ) + auto_result = self._maybe_handle_runtime_dialog_before_pause( + action=action, + target_spec=target_spec, + expected_title=expected_title, + current_title=current_title, + screen_width=width, + screen_height=height, + ) + if auto_result is not None: + return auto_result print( f" [PRÉ-VÉRIF] Fenêtre '{current_title}' ≠ " f"attendu '{expected_title}' → mode apprentissage" @@ -752,14 +1374,22 @@ class ActionExecutorV1: "actual_window": current_title, } else: - # Timeout ou pas d'action → skipper cette action - # L'état est peut-être déjà correct (ex: Ctrl+S - # a sauvé sans dialogue → action de dialogue inutile) - result["success"] = True - result["warning"] = "wrong_window_skipped" - logger.info( - f"[LEA] Wrong window sans correction → skip " - f"(l'état est peut-être déjà atteint)" + # Timeout ou pas d'action : rester honnête et + # remonter une vraie pause supervisée. Skipper + # silencieusement laisse le replay dériver sur + # des coordonnées devenues invalides. 
+ result["success"] = False + result["error"] = ( + f"Fenêtre incorrecte : attendu '{expected_title}', " + f"actuel '{current_title}'" + ) + result["warning"] = "wrong_window" + result["target_description"] = expected_title + result["target_spec"] = target_spec + result["screenshot"] = self._capture_screenshot_b64() + logger.warning( + f"[LEA] Wrong window sans correction → pause " + f"(attendu '{expected_title}', actuel '{current_title}')" ) return result else: @@ -845,22 +1475,25 @@ class ActionExecutorV1: # Court-circuite le grounding serveur pour les clicks sur Windows natif. # 10-20ms au lieu de 2-5s pour un clic — c'est le cœur du V4. uia_resolved = False + resolve_order = [] + uia_target = None if visual_mode and target_spec and action_type == "click": resolve_order = target_spec.get("resolve_order", []) uia_target = target_spec.get("uia_target") - if resolve_order and resolve_order[0] == "uia" and uia_target: - uia_coords = self._resolve_via_uia_local(uia_target, width, height) - if uia_coords: - x_pct, y_pct = uia_coords - result["visual_resolved"] = True - result["resolution_method"] = "v4_uia_local" - result["resolution_score"] = 0.95 - uia_resolved = True - print(f" [UIA] résolu en local: ({x_pct:.4f}, {y_pct:.4f})") - logger.info( - f"V4 UIA local OK : {uia_target.get('name', '?')} " - f"→ ({x_pct:.4f}, {y_pct:.4f})" - ) + + if resolve_order and resolve_order[0] == "uia" and uia_target: + uia_coords = self._resolve_via_uia_local(uia_target, width, height) + if uia_coords: + x_pct, y_pct = uia_coords + result["visual_resolved"] = True + result["resolution_method"] = "v4_uia_local" + result["resolution_score"] = 0.95 + uia_resolved = True + print(f" [UIA] résolu en local: ({x_pct:.4f}, {y_pct:.4f})") + logger.info( + f"V4 UIA local OK : {uia_target.get('name', '?')} " + f"→ ({x_pct:.4f}, {y_pct:.4f})" + ) if not uia_resolved and visual_mode and target_spec and server_url: # ── GROUNDING : localisation pure via GroundingEngine (fallback) ── @@ -897,7 
+1530,27 @@ class ActionExecutorV1: # Si visual_mode est activé, le resolve DOIT réussir. # Pas de fallback blind — Léa VOIT l'écran et CHERCHE # l'élément. Si toute la cascade échoue → pause supervisée. - if visual_mode and not result.get("visual_resolved"): + hotkey_fallback = None + if visual_mode and target_spec and not hotkey_fallback: + hotkey_fallback = self._maybe_execute_start_button_hotkey_fallback( + action, + target_spec, + visual_resolved=bool(result.get("visual_resolved")), + resolution_method=str(result.get("resolution_method", "") or ""), + ) + if hotkey_fallback: + result.update(hotkey_fallback) + result["visual_resolved"] = False + if visual_mode and not result.get("visual_resolved") and not hotkey_fallback: + hotkey_fallback = self._maybe_execute_close_tab_hotkey_fallback( + action, + target_spec, + ) + if hotkey_fallback: + result.update(hotkey_fallback) + result["visual_resolved"] = False + + if visual_mode and not result.get("visual_resolved") and not hotkey_fallback: # ── Policy : décider quoi faire quand grounding échoue ── from .policy import PolicyEngine, Decision policy = PolicyEngine(self) @@ -1055,28 +1708,35 @@ class ActionExecutorV1: result["screenshot"] = self._capture_screenshot_b64() result["warning"] = "visual_resolve_failed" - real_x = int(x_pct * width) - real_y = int(y_pct * height) - button = action.get("button", "left") - mode = "VISUAL" if result.get("visual_resolved") else "COORD" - print( - f" [CLICK] [{mode}] ({x_pct:.3f}, {y_pct:.3f}) -> " - f"({real_x}, {real_y}) sur ({width}x{height}), bouton={button}" - ) - self._click((real_x, real_y), button) - # Phase 1 apprentissage : exposer les coordonnées RÉSOLUES - # utilisées pour le clic. Le serveur (/replay/result) les lit - # directement comme source de vérité pour la mémoire. - # On donne des percentages car la mémoire est indépendante - # de la résolution écran du client. 
- result["actual_position"] = { - "x_pct": float(x_pct), - "y_pct": float(y_pct), - } - logger.info( - f"Replay click [{mode}] : ({x_pct:.3f}, {y_pct:.3f}) -> " - f"({real_x}, {real_y}) sur ({width}x{height})" - ) + if hotkey_fallback: + logger.info( + "Replay click [%s] : fallback hotkey sémantique pour action %s", + result.get("resolution_method", "semantic_hotkey"), + action_id, + ) + else: + real_x = int(x_pct * width) + real_y = int(y_pct * height) + button = action.get("button", "left") + mode = "VISUAL" if result.get("visual_resolved") else "COORD" + print( + f" [CLICK] [{mode}] ({x_pct:.3f}, {y_pct:.3f}) -> " + f"({real_x}, {real_y}) sur ({width}x{height}), bouton={button}" + ) + self._click((real_x, real_y), button) + # Phase 1 apprentissage : exposer les coordonnées RÉSOLUES + # utilisées pour le clic. Le serveur (/replay/result) les lit + # directement comme source de vérité pour la mémoire. + # On donne des percentages car la mémoire est indépendante + # de la résolution écran du client. 
+ result["actual_position"] = { + "x_pct": float(x_pct), + "y_pct": float(y_pct), + } + logger.info( + f"Replay click [{mode}] : ({x_pct:.3f}, {y_pct:.3f}) -> " + f"({real_x}, {real_y}) sur ({width}x{height})" + ) # ── Post-vérification : polling du titre fenêtre ── # On attend que le titre change vers celui attendu (max 10s) @@ -1088,47 +1748,120 @@ class ActionExecutorV1: poll_interval = 0.3 elapsed_wait = 0.0 matched = False + post_title = "" + expected_app_after = _app_name(expected_after) + runtime_dialog_handled = None + runtime_dialog_handle_count = 0 + max_runtime_dialog_handles = 2 + + def _matches_expected_window(window_title: str) -> bool: + window_app = _app_name(window_title) + return ( + window_app == expected_app_after + or expected_after.lower() in window_title.lower() + or window_title.lower() in expected_after.lower() + ) + while elapsed_wait < max_wait: time.sleep(poll_interval) elapsed_wait += poll_interval post_info = get_active_window_info() post_title = post_info.get("title", "") - post_app = _app_name(post_title) - expected_app_after = _app_name(expected_after) - if (post_app == expected_app_after - or expected_after.lower() in post_title.lower() - or post_title.lower() in expected_after.lower()): + if _matches_expected_window(post_title): matched = True break + dialog_spec = self._match_known_runtime_dialog(post_title) + if ( + dialog_spec + and runtime_dialog_handle_count < max_runtime_dialog_handles + ): + handled = self._handle_known_runtime_dialog( + dialog_spec, + post_title, + width, + height, + ) + if handled: + runtime_dialog_handle_count += 1 + runtime_dialog_handled = { + "dialog_id": dialog_spec.get("id", ""), + "dialog_title": post_title, + "button_text": handled.get("button_text", ""), + } + logger.info( + "POST-VÉRIF runtime dialog intermédiaire géré : " + "'%s' -> bouton '%s' (tentative %d/%d)", + post_title, + handled.get("button_text", ""), + runtime_dialog_handle_count, + max_runtime_dialog_handles, + ) + continue if 
matched: - print(f" [POST-VÉRIF] OK en {elapsed_wait:.1f}s — '{post_title}'") - logger.info(f"POST-VÉRIF OK en {elapsed_wait:.1f}s : '{post_title}'") + if runtime_dialog_handled: + result["warning"] = "runtime_dialog_handled_post_verify" + result["runtime_dialog"] = runtime_dialog_handled + print( + f" [POST-VÉRIF] Dialogue runtime géré " + f"→ retour '{post_title}'" + ) + logger.info( + "POST-VÉRIF runtime dialog géré : '%s' -> '%s'", + runtime_dialog_handled.get("dialog_title", ""), + post_title, + ) + else: + print(f" [POST-VÉRIF] OK en {elapsed_wait:.1f}s — '{post_title}'") + logger.info(f"POST-VÉRIF OK en {elapsed_wait:.1f}s : '{post_title}'") else: print(f" [POST-VÉRIF] TIMEOUT {max_wait}s — '{post_title}' ≠ '{expected_after}'") logger.warning(f"POST-VÉRIF TIMEOUT : '{post_title}' ≠ '{expected_after}'") + if runtime_dialog_handled: + result["warning"] = ( + f"runtime_dialog_handled_post_verify:{post_title}" + ) + result["runtime_dialog"] = runtime_dialog_handled + logger.warning( + "POST-VÉRIF runtime dialog géré mais " + "fenêtre finale inattendue : '%s' ≠ '%s'", + post_title, + expected_after, + ) # Contrôle strict : si success_strict, on STOP. - # Sinon on continue avec un warning (legacy). - is_strict = bool(action.get("success_strict")) - if is_strict: - result["success"] = False - result["error"] = ( - f"Post-vérif échouée : fenêtre '{post_title}' " - f"au lieu de '{expected_after}'" - ) - result["warning"] = "wrong_window" - print( - f" [POST-VÉRIF] STOP STRICT — l'étape ne s'est " - f"pas déroulée comme prévu, arrêt du replay" - ) - try: - self.notifier.replay_wrong_window( - post_title, expected_after, + # On durcit aussi les vrais changements de fenêtre + # attendus (ex: Bloc-notes -> Enregistrer sous), + # sinon un simple changement global d'écran peut + # valider à tort une action qui a dérivé. 
+ if not matched: + requires_transition = ( + self._requires_post_verify_window_transition( + action, + target_spec, + expected_after, ) - except Exception: - pass - return result - else: - result["warning"] = f"post_verif_timeout:{post_title}" + ) + if bool(action.get("success_strict")) or requires_transition: + result["success"] = False + result["error"] = ( + f"Post-vérif échouée : fenêtre '{post_title}' " + f"au lieu de '{expected_after}'" + ) + result["warning"] = "wrong_window" + result["needs_human"] = True + result["screenshot"] = self._capture_screenshot_b64() + print( + f" [POST-VÉRIF] STOP STRICT — l'étape ne s'est " + f"pas déroulée comme prévu, arrêt du replay" + ) + try: + self.notifier.replay_wrong_window( + post_title, expected_after, + ) + except Exception: + pass + return result + elif not result.get("warning"): + result["warning"] = f"post_verif_timeout:{post_title}" else: print(f" [CLICK] Terminé.") @@ -1187,9 +1920,9 @@ class ActionExecutorV1: elif action_type == "verify_screen": # Vérification visuelle entre les groupes du replay hybride. - # Pour l'instant, on fait un wait de 2s pour laisser l'écran - # se stabiliser. La vérification réelle sera faite par le - # pre-check côté serveur dans GET /replay/next. + # Pour l'instant, on fait un wait court pour laisser l'écran + # se stabiliser. La vérification CLIP réelle est faite par + # le pre-check côté serveur dans GET /replay/next. expected_node = action.get("expected_node", "?") timeout_ms = action.get("timeout_ms", 5000) wait_s = min(timeout_ms / 1000.0, 2.0) @@ -1198,6 +1931,70 @@ class ActionExecutorV1: f"(node attendu: {expected_node}, wait={wait_s}s)" ) time.sleep(wait_s) + + # ── Garde optionnelle : titre fenêtre attendu ── + # Patch 22 mai 2026 : permet aux étapes du setup auto + # Windows d'exiger qu'une fenêtre précise (menu Démarrer, + # barre Rechercher) soit réellement active avant de + # poursuivre. 
Sans cette garde, une frappe peut partir + # dans le systray overflow popup et le setup ne s'en + # apercevait qu'au click_result final, trop tard. + title_patterns = action.get("expected_window_title_contains") + if title_patterns: + if isinstance(title_patterns, str): + patterns = [title_patterns] + else: + patterns = [p for p in title_patterns if p] + from ..window_info_crossplatform import get_active_window_info + current_info = get_active_window_info() or {} + current_title = str(current_info.get("title", "") or "") + if not self._window_title_matches_any(current_title, patterns): + logger.warning( + "[LEA] verify_screen garde KO : attendu un titre " + "contenant %s, actuel '%s'", + patterns, current_title, + ) + print( + f" [VERIFY] Garde titre KO " + f"(patterns={patterns}, actuel='{current_title}') " + "→ apprentissage humain" + ) + try: + self.notifier.replay_learning_mode( + raison="wrong_window_setup_guard", + target_description=" ou ".join(patterns), + window_title=current_title, + ) + except Exception: + pass + human_actions = self._capture_human_correction(timeout_s=120) + if human_actions: + result["success"] = True + result["resolution_method"] = "human_supervised" + result["warning"] = "setup_guard_window_mismatch" + result["correction"] = { + "actions": human_actions, + "action_count": len(human_actions), + "trigger": "setup_guard_window_mismatch", + "expected_patterns": patterns, + "actual_window": current_title, + } + else: + result["success"] = False + result["error"] = ( + f"verify_screen titre fenêtre KO : attendu " + f"un titre contenant {patterns}, " + f"actuel '{current_title}'" + ) + result["warning"] = "setup_guard_window_mismatch" + result["needs_human"] = True + result["screenshot"] = self._capture_screenshot_b64() + return result + logger.info( + "[LEA] verify_screen garde OK : '%s' matche %s", + current_title, patterns, + ) + print(f" [VERIFY] Termine (verification deferred au serveur).") logger.info( f"Replay verify_screen : 
node={expected_node}, " @@ -1219,7 +2016,24 @@ class ActionExecutorV1: # Verifie UNIQUEMENT, ne tente PAS de gerer les popups # (Enter/Escape perturbent l'application). # Signale l'echec honnêtement — le serveur decide du retry. - if needs_screen_check and hash_before: + # + # Patch 22 mai 2026 : pour les actions du setup auto Windows + # (`_setup_phase=True`), on neutralise la validation par + # simple pixel-change. C'est la garde `verify_screen` qui + # suit (`expected_window_title_contains`) qui tranche. + # Sans ce skip, un click_start qui ouvre en fait le systray + # overflow popup serait validé comme succès (l'écran a bien + # changé), et la frappe `bloc` partirait dans la mauvaise + # fenêtre avant que la garde n'ait pu rattraper. + is_setup_action = bool(action.get("_setup_phase")) + if needs_screen_check and hash_before and is_setup_action: + logger.info( + f"[LEA] Setup action {action_id} : validation " + "pixel-change skippée (garde verify_screen ultérieure)" + ) + # Stabilisation minimale avant la garde suivante. 
+ time.sleep(0.5) + elif needs_screen_check and hash_before: screen_changed = self._wait_for_screen_change( hash_before, timeout_ms=3000 ) @@ -1359,7 +2173,10 @@ class ActionExecutorV1: # ---- ÉTAPE 2 : Template matching local (fallback si serveur down) ---- anchor_b64 = target_spec.get("anchor_image_base64", "") if anchor_b64: - tm_result = self._template_match_anchor(screenshot_b64, anchor_b64, screen_width, screen_height) + tm_result = self._template_match_anchor( + screenshot_b64, anchor_b64, screen_width, screen_height, + fallback_x_pct=fallback_x, fallback_y_pct=fallback_y, + ) if tm_result and tm_result.get("resolved"): return _with_metrics(tm_result) @@ -1448,12 +2265,19 @@ class ActionExecutorV1: def _template_match_anchor( self, screenshot_b64: str, anchor_b64: str, screen_width: int, screen_height: int, + fallback_x_pct: float = 0.0, fallback_y_pct: float = 0.0, + max_drift: float = 0.25, ) -> dict: """Template matching direct avec le crop anchor (image de référence). Le crop anchor est une capture de l'élément UI lors de l'enregistrement. Si l'UI est identique (même résolution, même thème), le match est quasi-parfait et très rapide (~10ms). + + Patch 2026-05-23 — brief 0756 : garde drift par rapport à + ``(fallback_x_pct, fallback_y_pct)``. Sans ça, un crop visuellement + proche présent ailleurs à l'écran (ex: bandeau OBS Studio en + arrière-plan) peut être accepté à tort. """ import cv2 import numpy as np @@ -1487,6 +2311,26 @@ class ActionExecutorV1: x_pct = cx / screenshot.shape[1] y_pct = cy / screenshot.shape[0] + # Garde drift : refuser un match trop loin de la + # position fallback enregistrée (anti faux positif sur + # crop similaire ailleurs à l'écran). 
+ if not self._anchor_match_within_drift( + x_pct, y_pct, fallback_x_pct, fallback_y_pct, + max_drift=max_drift, + ): + print( + f" [ANCHOR-TM] REJET drift " + f"({x_pct:.3f}, {y_pct:.3f}) loin de " + f"({fallback_x_pct:.3f}, {fallback_y_pct:.3f})" + ) + logger.warning( + f"[ANCHOR-TM] Rejet drift : match ({x_pct:.3f}, " + f"{y_pct:.3f}) score={max_val:.3f} hors zone " + f"fallback ({fallback_x_pct:.3f}, " + f"{fallback_y_pct:.3f}) max_drift={max_drift:.2f}" + ) + return None + print( f" [ANCHOR-TM] TROUVÉ ({x_pct:.3f}, {y_pct:.3f}) " f"score={max_val:.3f}" @@ -1829,6 +2673,7 @@ Example: x_pct=0.50, y_pct=0.30""" # résolution). Fallback notifier.notify si la ChatWindow n'est pas # câblée (mode headless / tests). if data.get("replay_paused"): + self._replay_paused = True pause_msg = data.get("pause_message") or "Léa a besoin de votre aide" replay_id = data.get("replay_id") or "" pause_key = (replay_id, pause_msg) @@ -1876,8 +2721,11 @@ Example: x_pct=0.50, y_pct=0.30""" action = data.get("action") if action is None: + self._replay_paused = False return False + self._replay_paused = False + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: # Backoff exponentiel : augmenter le delai de polling self._poll_backoff = min( diff --git a/agent_v0/agent_v1/core/grounding.py b/agent_v0/agent_v1/core/grounding.py index 0082555a3..f778358fa 100644 --- a/agent_v0/agent_v1/core/grounding.py +++ b/agent_v0/agent_v1/core/grounding.py @@ -74,6 +74,142 @@ class GroundingEngine: """ self._executor = executor + @staticmethod + def _should_scope_to_active_window(target_spec: Dict[str, Any]) -> bool: + """Déterminer si le grounding doit être limité à la fenêtre active.""" + if str(target_spec.get("screen_scope", "")).strip().lower() == "full_screen": + return False + + by_role = str(target_spec.get("by_role", "")).strip().lower() + if by_role in {"start_button"}: + return False + + return True + + @staticmethod + def _targets_lea_window(target_spec: 
Dict[str, Any]) -> bool: + """Déterminer si la cible pointe explicitement vers l'UI de Léa.""" + try: + from ..ui.messages import est_fenetre_lea + except Exception: + return False + + context_hints = target_spec.get("context_hints") or {} + hints = [ + target_spec.get("window_title", ""), + context_hints.get("window_title", ""), + target_spec.get("vlm_description", ""), + target_spec.get("by_text", ""), + ] + return any(est_fenetre_lea(str(hint)) for hint in hints if hint) + + @staticmethod + def _is_plausible_window_rect( + rect: Optional[List[int]], + title: str, + screen_width: int, + screen_height: int, + ) -> bool: + """Valider qu'un rect actif ressemble à une vraie fenêtre utilisable. + + Rejette explicitement les zones système "bar-like" (taskbar, systray) + et les titres inconnus/bruités. Le grounding ne doit jamais se + contraindre à une zone non validée. + """ + if not rect or len(rect) != 4: + return False + + try: + from ..ui.messages import est_fenetre_bruit + except Exception: + def est_fenetre_bruit(_title: str) -> bool: + return not _title or _title.strip().lower() == "unknown_window" + + w = rect[2] - rect[0] + h = rect[3] - rect[1] + title_clean = str(title or "").strip() + if w <= 50 or h <= 50: + return False + title_lower = title_clean.lower() + is_unknown_title = not title_clean or title_lower == "unknown_window" + if not is_unknown_title and est_fenetre_bruit(title_clean): + return False + + # Une zone très plate, surtout en bas d'écran et très large, est + # typiquement une barre des tâches / systray, pas une vraie fenêtre. + # On réduit le seuil de hauteur à 120px pour ne pas rejeter les petits modaux. + is_bar_like = ( + h < 120 + or (w > 0.9 * screen_width and h < 0.15 * screen_height) + ) + + # Exception : si le titre contient un mot-clé de dialogue connu, + # on considère que c'est plausible même si c'est petit. 
+ keywords = ["enregistrer sous", "save as", "voulez-vous", "confirm", "attention", "error", "erreur"] + if any(k in title_lower for k in keywords): + return h >= 80 # Un dialogue fait au moins 80px (titre + bouton) + + return not is_bar_like + + @staticmethod + def _visual_scope_hints(target_spec: Dict[str, Any]) -> List[str]: + """Construire des indices textuels à chercher dans le crop fenêtre.""" + hints: List[str] = [] + raw_hints = [ + target_spec.get("window_title", ""), + (target_spec.get("context_hints") or {}).get("window_title", ""), + target_spec.get("by_text", ""), + ] + for raw in raw_hints: + text = str(raw or "").strip() + if not text: + continue + text = text.lstrip("*").strip() + variants = [text] + for sep in (" – ", " - ", " — "): + if sep in text: + variants.extend(part.strip().lstrip("*") for part in text.split(sep)) + for variant in variants: + if variant and len(variant) >= 3 and variant not in hints: + hints.append(variant) + return hints + + def _window_crop_matches_target_visually( + self, + screenshot_b64: str, + target_spec: Dict[str, Any], + ) -> bool: + """Vérifier visuellement qu'un crop contraint contient la bonne cible. + + Principe: ne jamais faire confiance au rect système seul. Si aucun + indice textuel n'est disponible, on laisse passer le crop plausible + pour ne pas sur-bloquer les cibles purement iconiques. 
+ """ + hints = self._visual_scope_hints(target_spec) + if not hints: + return True + + finder = getattr(self._executor, "_find_text_on_screen", None) + if not callable(finder): + return True + + for hint in hints: + try: + if finder(screenshot_b64, hint): + logger.info( + "Grounding fenêtre validé visuellement via '%s'", + hint, + ) + return True + except Exception as e: + logger.debug("Validation visuelle du crop échouée pour '%s': %s", hint, e) + logger.info( + "Grounding plein écran : crop fenêtre rejeté par validation visuelle " + "(hints=%s)", + hints, + ) + return False + def locate( self, server_url: str, @@ -128,35 +264,63 @@ class GroundingEngine: t_start = time.time() - # ── Capture contrainte à la fenêtre active ── - # Le grounding ne voit QUE la fenêtre attendue — pas la taskbar, - # pas le systray, pas les autres apps. Comme un humain qui regarde - # l'application sur laquelle il travaille. window_rect = None - try: - from ..window_info_crossplatform import get_active_window_rect - win_info = get_active_window_rect() - if win_info and win_info.get("rect"): - r = win_info["rect"] # [left, top, right, bottom] - # Validation : fenêtre visible et pas minuscule - w = r[2] - r[0] - h = r[3] - r[1] - if w > 50 and h > 50: - window_rect = { - "left": max(0, r[0]), - "top": max(0, r[1]), - "width": min(w, screen_width), - "height": min(h, screen_height), - } - logger.info( - f"Grounding contraint à la fenêtre : " - f"{window_rect['width']}x{window_rect['height']} " - f"à ({window_rect['left']}, {window_rect['top']})" - ) - except Exception as e: - logger.debug(f"Pas de window rect disponible : {e}") + active_title = "" + if self._should_scope_to_active_window(target_spec): + # ── Capture contrainte à la fenêtre active ── + # Le grounding ne voit QUE la fenêtre attendue — pas la taskbar, + # pas le systray, pas les autres apps. Comme un humain qui regarde + # l'application sur laquelle il travaille. 
+ try: + from ..window_info_crossplatform import get_active_window_rect + from ..ui.messages import est_fenetre_lea + + win_info = get_active_window_rect() + if win_info and win_info.get("rect"): + active_title = str(win_info.get("title", "") or "") + if est_fenetre_lea(active_title) and not self._targets_lea_window(target_spec): + logger.info( + "Grounding plein écran : fenêtre active Léa ignorée pour " + "cible externe (%s)", + target_spec.get("by_text", "") or target_spec.get("by_role", ""), + ) + win_info = None + if win_info and win_info.get("rect"): + r = win_info["rect"] # [left, top, right, bottom] + if self._is_plausible_window_rect(r, active_title, screen_width, screen_height): + w = r[2] - r[0] + h = r[3] - r[1] + window_rect = { + "left": max(0, r[0]), + "top": max(0, r[1]), + "width": min(w, screen_width), + "height": min(h, screen_height), + } + logger.info( + f"Grounding contraint à la fenêtre : " + f"{window_rect['width']}x{window_rect['height']} " + f"à ({window_rect['left']}, {window_rect['top']})" + ) + else: + logger.info( + "Grounding plein écran : rect actif rejeté " + "(title='%s', rect=%s)", + active_title, + r, + ) + except Exception as e: + logger.debug(f"Pas de window rect disponible : {e}") + else: + logger.info( + "Grounding plein écran pour by_role='%s'", + target_spec.get("by_role", ""), + ) screenshot_b64 = self._capture_window_or_screen(window_rect) + if window_rect and screenshot_b64: + if not self._window_crop_matches_target_visually(screenshot_b64, target_spec): + window_rect = None + screenshot_b64 = self._capture_window_or_screen(None) if not screenshot_b64: return GroundingResult( found=False, detail="Capture screenshot échouée", @@ -186,6 +350,18 @@ class GroundingEngine: result.elapsed_ms = (time.time() - t_start) * 1000 return result + if target_spec.get("allow_position_fallback"): + if 0.0 <= fallback_x <= 1.0 and 0.0 <= fallback_y <= 1.0: + return GroundingResult( + found=True, + x_pct=fallback_x, + y_pct=fallback_y, + 
method="position_fallback", + score=0.2, + detail="fallback positionnel explicite", + elapsed_ms=(time.time() - t_start) * 1000, + ) + return GroundingResult( found=False, detail=f"Toutes les stratégies ont échoué ({', '.join(strategies)})", @@ -258,7 +434,12 @@ class GroundingEngine: anchor_b64 = target_spec.get("anchor_image_base64", "") if anchor_b64: raw = self._executor._template_match_anchor( - screenshot_b64, anchor_b64, screen_width, screen_height, + screenshot_b64, + anchor_b64, + screen_width, + screen_height, + fallback_x_pct=fallback_x, + fallback_y_pct=fallback_y, ) if raw and raw.get("resolved"): return GroundingResult( diff --git a/agent_v0/agent_v1/finalize_contract.py b/agent_v0/agent_v1/finalize_contract.py new file mode 100644 index 000000000..c2b367212 --- /dev/null +++ b/agent_v0/agent_v1/finalize_contract.py @@ -0,0 +1,39 @@ +"""Dispatch léger du contrat enrichi de /finalize côté agent.""" + +from __future__ import annotations + +import logging +from typing import Any, Dict + + +logger = logging.getLogger(__name__) + + +def dispatch_finalize_result(ui: Any, payload: Dict[str, Any], replay_name: str) -> None: + """Router le résultat de /finalize vers la bonne surface UI agent.""" + if not isinstance(payload, dict): + return + + replay_request = payload.get("replay_request") or {} + replay_launch = payload.get("replay_launch") or {} + + if replay_launch.get("status") == "started": + logger.info("Replay direct déjà lancé par le serveur après finalize") + return + + if not payload.get("replay_ready") or not replay_request: + return + + if replay_launch.get("status") == "failed": + logger.warning( + "Auto-replay serveur échoué après finalize, proposition manuelle" + ) + + if ui is None or not hasattr(ui, "offer_finalize_replay"): + logger.info("UI indisponible pour proposer un test immédiat") + return + + ui.offer_finalize_replay( + replay_request, + replay_name or "la tâche que vous venez d'enregistrer", + ) diff --git a/agent_v0/agent_v1/main.py 
b/agent_v0/agent_v1/main.py index 55ef5391b..ff8cc509a 100644 --- a/agent_v0/agent_v1/main.py +++ b/agent_v0/agent_v1/main.py @@ -28,6 +28,7 @@ from .ui.chat_window import ChatWindow from .ui.capture_server import CaptureServer from .session.storage import SessionStorage from .vision.capturer import VisionCapturer +from .finalize_contract import dispatch_finalize_result # Import optionnel du client serveur (pour le chat et les workflows) # Deux chemins : relatif (depuis agent_v0.agent_v1) ou absolu (depuis C:\rpa_vision\agent_v1) @@ -80,6 +81,7 @@ class AgentV1: self._executor = None # Flag pour indiquer qu'un replay est en cours (eviter les conflits) self._replay_active = False + self._last_recording_name = "" # Etat partage entre systray et chat (source de verite unique) self._state = AgentState() @@ -210,12 +212,14 @@ class AgentV1: time.sleep(30) # Vérifier toutes les 30s def start_session(self, workflow_name): + self._last_recording_name = workflow_name self.session_id = f"sess_{time.strftime('%Y%m%dT%H%M%S')}_{uuid.uuid4().hex[:6]}" self.session_dir = self.storage.get_session_dir(self.session_id) self.vision = VisionCapturer(str(self.session_dir)) self.streamer = TraceStreamer(self.session_id, machine_id=self.machine_id) + self.streamer.set_on_finalize_result(self._on_finalize_result) self.captor = EventCaptorV1(self._on_event_bridge) # Initialiser l'executeur partage @@ -325,6 +329,15 @@ class AgentV1: # pour enchainer les actions du workflow time.sleep(0.2) else: + if getattr(self._executor, "_replay_paused", False): + if not self._replay_active: + self._replay_active = True + self.ui.set_replay_active(True) + self._state.set_replay_active(True) + poll_delay = getattr(self._executor, '_poll_backoff', REPLAY_POLL_INTERVAL) + time.sleep(max(poll_delay, REPLAY_POLL_INTERVAL)) + continue + # Pas d'action en attente — utiliser le backoff de l'executor # (augmente si le serveur est indisponible, reset a 1s sinon) if self._replay_active: @@ -429,6 +442,11 @@ class 
AgentV1: f"agent_{self.user_id}" ) + def _on_finalize_result(self, payload: dict) -> None: + """Réagir au contrat enrichi de /finalize côté agent.""" + replay_name = self._last_recording_name or "la tâche que vous venez d'enregistrer" + dispatch_finalize_result(self.ui, payload, replay_name) + _last_heartbeat_hash: str = "" def _heartbeat_loop(self): diff --git a/agent_v0/agent_v1/network/streamer.py b/agent_v0/agent_v1/network/streamer.py index ffe2fad8e..382a6419e 100644 --- a/agent_v0/agent_v1/network/streamer.py +++ b/agent_v0/agent_v1/network/streamer.py @@ -30,6 +30,7 @@ import os import queue import threading import time +from typing import Callable, Optional import requests from PIL import Image @@ -95,6 +96,11 @@ class TraceStreamer: # Initialisé paresseusement pour ne pas payer le coût SQLite en dehors # d'un streaming actif. self._buffer: PersistentBuffer | None = None + self._on_finalize_result: Optional[Callable[[dict], None]] = None + + def set_on_finalize_result(self, callback: Optional[Callable[[dict], None]]) -> None: + """Définir un callback appelé avec le payload JSON de /finalize.""" + self._on_finalize_result = callback def _get_buffer(self) -> PersistentBuffer: """Retourne le buffer persistant, en l'initialisant au besoin.""" @@ -621,6 +627,14 @@ class TraceStreamer: if resp.ok: result = resp.json() logger.info(f"Session finalisée: {result}") + if self._on_finalize_result is not None: + try: + self._on_finalize_result(result) + except Exception as cb_error: + logger.warning( + "Callback finalize ignoré après erreur: %s", + cb_error, + ) else: logger.warning(f"Finalisation échouée: {resp.status_code}") except Exception as e: diff --git a/agent_v0/agent_v1/ui/capture_server.py b/agent_v0/agent_v1/ui/capture_server.py index de5e0cc17..d65a3c0f4 100644 --- a/agent_v0/agent_v1/ui/capture_server.py +++ b/agent_v0/agent_v1/ui/capture_server.py @@ -158,14 +158,25 @@ class CaptureHandler(BaseHTTPRequestHandler): """Capture l'ecran principal et le 
renvoie en base64 JPEG.""" t0 = time.perf_counter() try: - import mss - from PIL import Image + from ..vision.capturer import ( + capture_foreground_window_image, + capture_screen_image, + ) - with mss.mss() as sct: - monitor = sct.monitors[1] # ecran principal - raw = sct.grab(monitor) - - img = Image.frombytes("RGB", raw.size, raw.bgra, "raw", "BGRX") + _monitor, img, meta = capture_screen_image() + if img is None: + img, win_meta = capture_foreground_window_image() + meta.update(win_meta) + if img is None: + elapsed_ms = (time.perf_counter() - t0) * 1000 + logger.error("Erreur capture : aucun backend exploitable (%s)", meta) + self._send_json(503, { + "error": "capture_unavailable", + "source": meta.get("backend", "unknown"), + "capture_ms": round(elapsed_ms), + "diagnostics": meta, + }) + return # Floutage des données sensibles (conformité AI Act) if BLUR_SENSITIVE: @@ -180,15 +191,22 @@ class CaptureHandler(BaseHTTPRequestHandler): img_b64 = base64.b64encode(buf.getvalue()).decode() elapsed_ms = (time.perf_counter() - t0) * 1000 - logger.info(f"Capture {img.width}x{img.height} en {elapsed_ms:.0f}ms") + logger.info( + "Capture %sx%s via %s en %.0fms", + img.width, + img.height, + meta.get("backend", "unknown"), + elapsed_ms, + ) self._send_json(200, { "image": img_b64, "width": img.width, "height": img.height, "format": "jpeg", - "source": "windows_live", + "source": meta.get("backend", "windows_live"), "capture_ms": round(elapsed_ms), + "diagnostics": meta, }) except Exception as e: diff --git a/agent_v0/agent_v1/ui/chat_window.py b/agent_v0/agent_v1/ui/chat_window.py index 03ce8a176..8dff238bf 100644 --- a/agent_v0/agent_v1/ui/chat_window.py +++ b/agent_v0/agent_v1/ui/chat_window.py @@ -894,6 +894,34 @@ class ChatWindow: except Exception: logger.debug("clear chat history silenced", exc_info=True) + @staticmethod + def _compute_paused_bubble_height(reason_str: str) -> tuple: + """Calcule la hauteur du Text (en lignes) + si une scrollbar est + nécessaire pour 
le message d'une bulle paused. + + Patch 22 mai 2026 — fix troncature : on prend en compte les \\n + explicites (les `reason` serveur peuvent lister plusieurs + candidats avec un saut de ligne par item) en plus de la longueur + en caractères, et on active la scrollbar dès que le cap est + atteint pour éviter que du contenu disparaisse silencieusement. + + Retourne ``(height_lines, needs_scrollbar)``. + """ + if not reason_str: + return 2, False + text = str(reason_str) + # Estimation : ~60 chars/ligne effectifs avec wraplength. + wrapped_lines = (len(text) // 60) + 1 + explicit_lines = text.count("\n") + 1 + estimated = max(wrapped_lines, explicit_lines) + cap = 12 + height = max(2, min(cap, estimated)) + # Scrollbar dès que le cap est atteint OU contenu long (filet + # textuel : ≥ 200 chars implique souvent un débordement visuel + # même quand les lignes brutes sont peu nombreuses). + needs_scroll = (estimated >= cap) or (len(text) > 200) + return height, needs_scroll + def _render_paused_bubble(self, payload: Dict[str, Any]) -> None: tk = self._tk if getattr(self, "_msg_frame", None) is None: @@ -923,22 +951,23 @@ class ChatWindow: # Message scrollable pour les longs reasons (ex: 200+ chars depuis le serveur). # On utilise un Text en mode read-only avec hauteur calculée selon la longueur. - # Au-delà de 280 chars, scrollbar interne ; sinon Text auto-fitté. + # Patch 22 mai 2026 : prendre en compte les \n explicites (titres + # fenêtre / patterns) et activer la scrollbar dès que le cap de + # hauteur est atteint — sinon les bulles de pause étaient + # tronquées visuellement sans aucun ascenseur visible. 
reason_str = str(reason) - # Estimation simple : ~70 chars/ligne avec wraplength - approx_lines = max(2, min(8, (len(reason_str) // 60) + 1)) + height_lines, needs_scroll = self._compute_paused_bubble_height(reason_str) msg_frame = tk.Frame(inner, bg=PAUSED_BG) msg_frame.pack(fill=tk.X, anchor=tk.W, pady=(6, 0)) reason_text = tk.Text( msg_frame, bg=PAUSED_BG, fg=PAUSED_FG, - font=FONT_MSG, wrap=tk.WORD, bd=0, height=approx_lines, + font=FONT_MSG, wrap=tk.WORD, bd=0, height=height_lines, highlightthickness=0, relief=tk.FLAT, cursor="arrow", ) reason_text.insert("1.0", reason_str) reason_text.configure(state="disabled") reason_text.pack(side=tk.LEFT, fill=tk.X, expand=True) - # Scrollbar interne uniquement si le contenu déborde (long messages) - if len(reason_str) > 280: + if needs_scroll: reason_scroll = tk.Scrollbar( msg_frame, orient=tk.VERTICAL, command=reason_text.yview, width=8, @@ -1019,27 +1048,40 @@ class ChatWindow: UX fix 8 mai 2026 : on désactive les 2 boutons et on affiche un message de feedback dès le clic, sans attendre l'ack serveur. Le bus émet en arrière-plan ; si la connexion est tombée, on log un warning visible. + + Fallback HTTP 22 mai 2026 : si le bus SocketIO est déconnecté, on + retombe sur un POST direct ``/replay/{id}/resume`` via + ``server_client``. Si les deux échouent, on ré-active les boutons + et on saute l'auto-hide pour permettre à l'utilisateur de + réessayer manuellement (sinon le replay reste figé côté serveur). 
""" if not replay_id: self._update_paused_feedback("⚠ replay_id manquant — impossible de relancer") return - emitted = False - if self._bus is not None and self._bus.connected: - emitted = self._bus.resume_replay(replay_id) - # Feedback immédiat : disable boutons + message + emitted, channel = self._dispatch_paused_action( + replay_id, + bus_method="resume_replay", + client_method="resume_replay", + ) self._disable_paused_buttons() if emitted: self._update_paused_feedback("→ Reprise demandée…") - logger.info("paused_bubble: lea:replay_resume émis pour %s", replay_id) - else: - self._update_paused_feedback("⚠ Bus indisponible — réessayez dans 5s") - logger.warning("paused_bubble: bus déconnecté, resume non émis") - # UX fix mai 2026 : minimiser la fenêtre vers le systray après 500ms - # (laisse à l'utilisateur le temps de voir "Reprise demandée…"). - try: - self._root.after(500, self._do_hide) - except Exception: - logger.debug("auto-hide on resume silenced", exc_info=True) + logger.info( + "paused_bubble: replay_resume émis pour %s via %s", + replay_id, channel, + ) + try: + self._root.after(500, self._do_hide) + except Exception: + logger.debug("auto-hide on resume silenced", exc_info=True) + return + # Échec sur les deux canaux : laisser l'utilisateur réessayer. + self._update_paused_feedback("⚠ Serveur injoignable — réessayez") + self._enable_paused_buttons() + logger.warning( + "paused_bubble: bus et HTTP indisponibles, resume non émis " + "pour %s", replay_id, + ) def _on_paused_abort(self, replay_id: str) -> None: """Bouton Annuler : émettre lea:replay_abort + fermeture locale immédiate. @@ -1048,17 +1090,30 @@ class ChatWindow: n'envoie pas de lea:resumed pour un abort, donc sans cette fermeture locale la bulle restait coincée — c'était la cause de "Annuler ne fonctionne pas" rapportée par Dom). + + Fallback HTTP 22 mai 2026 : symétrique de ``_on_paused_resume`` — + si le bus est déconnecté, POST direct ``/replay/{id}/cancel``. 
+ L'abort ferme la bulle localement quelle que soit l'issue (l'état + serveur sera réconcilié au prochain poll /replay/next). """ - emitted = False - if self._bus is not None and self._bus.connected: - emitted = self._bus.abort_replay(replay_id) + emitted, channel = self._dispatch_paused_action( + replay_id, + bus_method="abort_replay", + client_method="abort_replay", + ) self._disable_paused_buttons() if emitted: self._update_paused_feedback("✗ Annulé") - logger.info("paused_bubble: lea:replay_abort émis pour %s", replay_id) + logger.info( + "paused_bubble: replay_abort émis pour %s via %s", + replay_id, channel, + ) else: - self._update_paused_feedback("✗ Annulé (bus indisponible)") - logger.warning("paused_bubble: bus déconnecté, abort non émis") + self._update_paused_feedback("✗ Annulé (serveur injoignable)") + logger.warning( + "paused_bubble: bus et HTTP indisponibles, abort non émis " + "pour %s", replay_id, + ) # Fermer la bulle en local (l'abort n'a pas de lea:resumed associé) self._close_active_paused_bubble(reason="abort_local") # UX fix mai 2026 : minimiser la fenêtre après 500ms (cohérence @@ -1068,6 +1123,34 @@ class ChatWindow: except Exception: logger.debug("auto-hide on abort silenced", exc_info=True) + def _dispatch_paused_action( + self, + replay_id: str, + bus_method: str, + client_method: str, + ) -> tuple: + """Envoyer une action de bulle paused via bus puis fallback HTTP. + + Retourne ``(emitted, channel)`` où ``channel`` vaut ``"bus"``, + ``"http"`` ou ``""`` (aucun chemin n'a abouti). 
+ """ + if self._bus is not None and getattr(self._bus, "connected", False): + try: + if getattr(self._bus, bus_method)(replay_id): + return True, "bus" + except Exception: + logger.debug("paused_bubble: bus %s silenced", bus_method, exc_info=True) + if self._server_client is not None and hasattr(self._server_client, client_method): + try: + if getattr(self._server_client, client_method)(replay_id): + return True, "http" + except Exception: + logger.debug( + "paused_bubble: server_client %s silenced", + client_method, exc_info=True, + ) + return False, "" + def _disable_paused_buttons(self) -> None: if not self._active_paused_bubble: return @@ -1077,6 +1160,19 @@ class ChatWindow: except Exception: logger.debug("disable paused buttons silenced", exc_info=True) + def _enable_paused_buttons(self) -> None: + """Ré-activer les boutons Continuer/Annuler de la bulle paused + active. Appelé quand l'envoi a échoué sur tous les canaux — + l'utilisateur doit pouvoir réessayer manuellement. + """ + if not self._active_paused_bubble: + return + try: + self._active_paused_bubble["btn_resume"].config(state="normal") + self._active_paused_bubble["btn_abort"].config(state="normal") + except Exception: + logger.debug("enable paused buttons silenced", exc_info=True) + def _update_paused_feedback(self, text: str) -> None: if not self._active_paused_bubble: return diff --git a/agent_v0/agent_v1/ui/smart_tray.py b/agent_v0/agent_v1/ui/smart_tray.py index df0588a1f..e158e101c 100644 --- a/agent_v0/agent_v1/ui/smart_tray.py +++ b/agent_v0/agent_v1/ui/smart_tray.py @@ -504,6 +504,100 @@ class SmartTrayV1: threading.Thread(target=_replay, daemon=True).start() + def _launch_replay_request( + self, + replay_request: Dict[str, Any], + replay_name: str, + ) -> None: + """Lance un replay direct depuis un payload `replay_request` serveur.""" + endpoint = (replay_request or {}).get("endpoint", "") + session_id = (replay_request or {}).get("session_id", "") + machine_id = (replay_request or 
{}).get("machine_id") or self.machine_id + + if endpoint != "/api/v1/traces/stream/replay-session" or not session_id: + logger.warning("Replay request non supporté: %s", replay_request) + self._notifier.notify( + "Léa", + "Je ne peux pas lancer ce test automatique pour le moment.", + ) + return + + def _replay(): + if self.server_client is None: + return + + with self._state_lock: + self._replay_active = True + self._update_icon() + self._notifier.notify( + "Léa", + f"Le système d'intelligence artificielle exécute la " + f"tâche '{replay_name}' sur votre écran.", + ) + + try: + import requests + auth_headers = {} + if self.server_client is not None: + auth_headers = self.server_client._auth_headers() + resp = requests.post( + f"{self.server_client._stream_base}{endpoint}", + params={ + "session_id": session_id, + "machine_id": machine_id, + }, + headers=auth_headers, + timeout=30, + allow_redirects=False, + ) + if resp.ok: + logger.info( + "Replay direct démarré pour session %s (machine=%s)", + session_id, + machine_id, + ) + else: + self._notifier.notify( + "Léa", + "Hmm, le serveur a refusé le test immédiat.", + ) + except Exception as e: + logger.error("Erreur lancement replay direct : %s", e) + self._notifier.notify( + "Léa", + f"Oups, un problème : {e}", + ) + finally: + with self._state_lock: + self._replay_active = False + self._update_icon() + + threading.Thread(target=_replay, daemon=True).start() + + def offer_finalize_replay( + self, + replay_request: Dict[str, Any], + replay_name: str, + ) -> None: + """Proposer à l'utilisateur de tester immédiatement la tâche apprise.""" + if not replay_request or not replay_request.get("session_id"): + return + + def _offer(): + self._notifier.notify( + "Léa", + f"J'ai compris la tâche '{replay_name}'. Voulez-vous la tester ?", + ) + if not _ask_consent( + "Léa — Test immédiat", + f"J'ai compris la tâche '{replay_name}'. 
" + "Voulez-vous la tester maintenant ?", + ): + return + self._launch_replay_request(replay_request, replay_name) + + threading.Thread(target=_offer, daemon=True).start() + def _on_emergency_stop(self, _icon=None, _item=None) -> None: """Arret d'urgence — stoppe TOUTES les activites de l'agent immediatement. diff --git a/agent_v0/agent_v1/vision/capturer.py b/agent_v0/agent_v1/vision/capturer.py index 0b091be8d..b7abeaa2a 100644 --- a/agent_v0/agent_v1/vision/capturer.py +++ b/agent_v0/agent_v1/vision/capturer.py @@ -15,7 +15,7 @@ import time import logging import hashlib import platform -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple from PIL import Image, ImageFilter, ImageStat import mss from ..config import TARGETED_CROP_SIZE, SCREENSHOT_QUALITY, BLUR_SENSITIVE @@ -86,6 +86,337 @@ def _enrich_with_monitor_info(payload: dict) -> dict: payload["monitors_geometry"] = _get_monitors_geometry() return payload + +# Garde dimensions monitor (démo GHT 19 mai 2026) : mss.monitors[1] peut +# retourner intermittemment des dims tronquées (cas observé 2560×60). Utiliser +# ces dims pour normaliser des coords empoisonne la mémoire (TargetMemoryStore). +MIN_MONITOR_WIDTH = 200 +MIN_MONITOR_HEIGHT = 200 +MONITOR_MAX_ATTEMPTS = 2 +MONITOR_RETRY_DELAY_S = 0.05 +BLACK_FRAME_MEAN_MAX = 1.0 +BLACK_FRAME_STDDEV_MAX = 1.0 +BLACK_FRAME_MAX_LUMA = 3 + + +def _is_monitor_sane(monitor) -> bool: + """True si les dims du monitor sont au-dessus du seuil de plausibilité.""" + if not isinstance(monitor, dict): + return False + w = monitor.get("width", 0) or 0 + h = monitor.get("height", 0) or 0 + return w >= MIN_MONITOR_WIDTH and h >= MIN_MONITOR_HEIGHT + + +def _dim_str(monitor) -> str: + """Représentation courte WxH pour les logs (gère monitor=None).""" + if not isinstance(monitor, dict): + return "?x?" 
+ return f"{monitor.get('width', '?')}x{monitor.get('height', '?')}" + + +def _acquire_safe_grab(max_attempts: int = MONITOR_MAX_ATTEMPTS, + retry_delay_s: float = MONITOR_RETRY_DELAY_S, + allow_secondary_fallback: bool = True): + """Ouvre mss et capture un monitor avec dimensions plausibles. + + Stratégie en cascade : + 1. À chaque tentative, ouvrir un nouveau `mss.mss()` (peut rafraîchir le + cache interne) et examiner monitors[1..n]. + 2. Préférer monitors[1] (écran principal physique). Si aberrant ET + `allow_secondary_fallback=True`, prendre le premier monitors[2..n] + sain avec un WARNING explicite. + 3. Si `allow_secondary_fallback=False`, on n'accepte QUE monitors[1]. + Utile pour les méthodes qui reçoivent des coordonnées (x, y) en + système écran composite : capturer un monitor secondaire produirait + une image saine mais décalée par rapport à ces coords. + 4. Si aucune dim plausible : attendre `retry_delay_s` et retenter. + 5. Après `max_attempts` infructueuses : log ERROR et retourner + (None, None) pour que l'appelant tombe en sortie d'erreur explicite. + + Args: + max_attempts: nombre de tentatives mss avant abandon. + retry_delay_s: délai entre tentatives. + allow_secondary_fallback: si False, refuser monitors[2..n] (fail-closed + pour les méthodes coord-bearing). + + Returns: + Tuple (monitor_dict, PIL.Image) si capture saine réussie, + (None, None) sinon. 
+ """ + last_aberrant = None + secondary_seen = False # un monitor secondaire sain a été vu mais refusé + for attempt in range(max_attempts): + with mss.mss() as sct: + monitors = list(sct.monitors) if sct.monitors else [] + chosen = None + chosen_idx = None + for idx in range(1, len(monitors)): + candidate = monitors[idx] + if not _is_monitor_sane(candidate): + last_aberrant = candidate + logger.warning( + "Monitor[%d] dims aberrantes (%s, seuil %dx%d) " + "— attempt %d/%d", + idx, _dim_str(candidate), + MIN_MONITOR_WIDTH, MIN_MONITOR_HEIGHT, + attempt + 1, max_attempts, + ) + continue + # Monitor sain trouvé + if idx == 1 or allow_secondary_fallback: + chosen = candidate + chosen_idx = idx + break + # Sinon : sain mais secondaire interdit pour cet appelant + secondary_seen = True + logger.warning( + "Monitor[%d] sain (%s) mais fallback secondaire refusé " + "(allow_secondary_fallback=False) — capture cohérente " + "des coords impossible", + idx, _dim_str(candidate), + ) + if chosen is not None: + if chosen_idx != 1 or attempt > 0: + logger.warning( + "Capture fallback : monitor[%d] dim=%s, attempt=%d", + chosen_idx, _dim_str(chosen), attempt + 1, + ) + sct_img = sct.grab(chosen) + img = Image.frombytes( + "RGB", sct_img.size, sct_img.bgra, "raw", "BGRX", + ) + return chosen, img + if attempt < max_attempts - 1: + time.sleep(retry_delay_s) + if secondary_seen and not allow_secondary_fallback: + logger.error( + "Capture abandonnée : monitor[1] aberrant après %d tentatives " + "(dernier vu %s) et fallback secondaire désactivé " + "pour préserver la cohérence des coordonnées", + max_attempts, _dim_str(last_aberrant), + ) + else: + logger.error( + "Aucun monitor avec dims plausibles trouvé après %d tentatives " + "(dernier vu : %s, seuil %dx%d) — capture abandonnée", + max_attempts, _dim_str(last_aberrant), + MIN_MONITOR_WIDTH, MIN_MONITOR_HEIGHT, + ) + return None, None + + +def _compute_luma_stats(img: Image.Image) -> Dict[str, float | int]: + """Retourne des stats 
simples de luminance pour diagnostiquer un frame noir.""" + gray = img.convert("L") + stat = ImageStat.Stat(gray) + min_luma, max_luma = gray.getextrema() + return { + "mean": round(float(stat.mean[0]) if stat.mean else 0.0, 2), + "stddev": round(float(stat.stddev[0]) if stat.stddev else 0.0, 2), + "min": int(min_luma), + "max": int(max_luma), + } + + +def _is_effectively_black(img: Image.Image) -> bool: + """Heuristique fail-closed pour refuser un screenshot pratiquement noir.""" + stats = _compute_luma_stats(img) + return ( + stats["max"] <= BLACK_FRAME_MAX_LUMA + and stats["mean"] <= BLACK_FRAME_MEAN_MAX + and stats["stddev"] <= BLACK_FRAME_STDDEV_MAX + ) + + +def _capture_via_imagegrab() -> Tuple[Optional[Dict[str, int]], Optional[Image.Image], Dict[str, Any]]: + """Fallback Windows via Pillow/ImageGrab. + + Utile quand `mss` retourne un frame noir alors que la session graphique + utilisateur reste visible. + """ + if _SYSTEM != "Windows": + return None, None, {"backend": "imagegrab", "error": "unsupported_platform"} + + try: + from PIL import ImageGrab + except ImportError as exc: + return None, None, {"backend": "imagegrab", "error": str(exc)} + + try: + img = ImageGrab.grab(all_screens=True) + except Exception as exc: + logger.warning("ImageGrab indisponible pour le fallback capture : %s", exc) + return None, None, {"backend": "imagegrab", "error": str(exc)} + + monitor = {"left": 0, "top": 0, "width": img.width, "height": img.height} + return monitor, img, { + "backend": "imagegrab", + "luma": _compute_luma_stats(img), + } + + +def capture_screen_image( + allow_secondary_fallback: bool = True, +) -> Tuple[Optional[Dict[str, int]], Optional[Image.Image], Dict[str, Any]]: + """Capture plein écran avec diagnostic noir + fallback Windows. + + Returns: + (monitor, image, meta) où image peut être None si aucun backend plein + écran n'a produit une image exploitable. 
+ """ + monitor, img = _acquire_safe_grab( + allow_secondary_fallback=allow_secondary_fallback + ) + meta: Dict[str, Any] = {"backend": "mss"} + + if img is not None: + meta["luma"] = _compute_luma_stats(img) + if not _is_effectively_black(img): + return monitor, img, meta + logger.warning( + "Capture mss quasi noire (%s) — tentative de fallback", + meta["luma"], + ) + meta["mss_black_frame"] = True + else: + meta["mss_unavailable"] = True + + fallback_monitor, fallback_img, fallback_meta = _capture_via_imagegrab() + if fallback_img is not None: + if not _is_effectively_black(fallback_img): + logger.warning( + "Capture fallback via ImageGrab (%sx%s)", + fallback_img.width, + fallback_img.height, + ) + return fallback_monitor, fallback_img, fallback_meta + logger.warning( + "Capture ImageGrab quasi noire (%s)", + fallback_meta.get("luma"), + ) + meta["imagegrab_black_frame"] = True + + meta["imagegrab_error"] = fallback_meta.get("error") + return None, None, meta + + +def _capture_window_image_windows( + hwnd: int, + width: int, + height: int, +) -> Tuple[Optional[Image.Image], Dict[str, Any]]: + """Capture une fenêtre Windows via PrintWindow. + + Fallback utile quand la capture plein écran est noire mais que la fenêtre + active reste imprimable par l'API Win32. 
+ """ + if _SYSTEM != "Windows": + return None, {"backend": "printwindow", "error": "unsupported_platform"} + + try: + import ctypes + import win32gui + import win32ui + except ImportError as exc: + return None, {"backend": "printwindow", "error": str(exc)} + + last_error = None + for flag in (3, 2, 0): + wnd_dc = None + src_dc = None + mem_dc = None + bmp = None + try: + wnd_dc = win32gui.GetWindowDC(hwnd) + if not wnd_dc: + raise RuntimeError("GetWindowDC a retourné 0") + src_dc = win32ui.CreateDCFromHandle(wnd_dc) + mem_dc = src_dc.CreateCompatibleDC() + bmp = win32ui.CreateBitmap() + bmp.CreateCompatibleBitmap(src_dc, width, height) + mem_dc.SelectObject(bmp) + result = ctypes.windll.user32.PrintWindow( + hwnd, mem_dc.GetSafeHdc(), flag + ) + bits = bmp.GetBitmapBits(True) + img = Image.frombuffer( + "RGB", (width, height), bits, "raw", "BGRX", 0, 1 + ) + luma = _compute_luma_stats(img) + if result or not _is_effectively_black(img): + return img, { + "backend": f"printwindow:{flag}", + "printwindow_result": int(result), + "luma": luma, + } + except Exception as exc: + last_error = str(exc) + finally: + try: + if bmp is not None: + win32gui.DeleteObject(bmp.GetHandle()) + except Exception: + pass + try: + if mem_dc is not None: + mem_dc.DeleteDC() + except Exception: + pass + try: + if src_dc is not None: + src_dc.DeleteDC() + except Exception: + pass + try: + if wnd_dc is not None: + win32gui.ReleaseDC(hwnd, wnd_dc) + except Exception: + pass + + return None, { + "backend": "printwindow", + "error": last_error or "no_usable_frame", + } + + +def capture_foreground_window_image() -> Tuple[Optional[Image.Image], Dict[str, Any]]: + """Capture la fenêtre au focus via API native si disponible.""" + try: + from ..window_info_crossplatform import get_active_window_rect + + rect_info = get_active_window_rect() + except Exception as exc: + return None, {"backend": "printwindow", "error": str(exc)} + + if not rect_info: + return None, {"backend": "printwindow", "error": 
"active_window_unavailable"} + + win_w, win_h = rect_info.get("size", [0, 0]) + hwnd = rect_info.get("hwnd") + if not hwnd or win_w <= 0 or win_h <= 0: + return None, { + "backend": "printwindow", + "error": "active_window_handle_unavailable", + "title": rect_info.get("title", "unknown_window"), + } + + img, meta = _capture_window_image_windows(hwnd, win_w, win_h) + if img is None: + return None, meta + + meta.update( + { + "title": rect_info.get("title", "unknown_window"), + "app_name": rect_info.get("app_name", "unknown_app"), + "rect": rect_info.get("rect"), + "window_size": rect_info.get("size"), + "hwnd": hwnd, + } + ) + return img, meta + + class VisionCapturer: def __init__(self, session_dir: str): self.session_dir = session_dir @@ -103,25 +434,35 @@ class VisionCapturer: (utile pour le contextualisation des heartbeats côté serveur). """ try: - with mss.mss() as sct: - monitor = sct.monitors[1] - sct_img = sct.grab(monitor) - img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX") + _monitor, img, meta = capture_screen_image() + if img is None: + img, win_meta = capture_foreground_window_image() + if img is None: + logger.error( + "Capture plein contexte indisponible (meta=%s, window=%s)", + meta, + win_meta, + ) + return "" + logger.warning( + "Capture plein contexte dégradée via fenêtre active (%s)", + win_meta.get("backend"), + ) - # Détection de changement (pour Heartbeat) - if not force: - current_hash = self._compute_quick_hash(img) - if current_hash == self.last_img_hash: - return "" # Pas de changement, on économise la fibre - self.last_img_hash = current_hash + # Détection de changement (pour Heartbeat) + if not force: + current_hash = self._compute_quick_hash(img) + if current_hash == self.last_img_hash: + return "" # Pas de changement, on économise la fibre + self.last_img_hash = current_hash - # Floutage des données sensibles (conformité AI Act) - if BLUR_SENSITIVE: - blur_sensitive_regions(img) + # Floutage des données sensibles 
(conformité AI Act) + if BLUR_SENSITIVE: + blur_sensitive_regions(img) - path = os.path.join(self.shots_dir, f"context_{int(time.time())}_{name_suffix}.png") - img.save(path, "PNG", quality=SCREENSHOT_QUALITY) - return path + path = os.path.join(self.shots_dir, f"context_{int(time.time())}_{name_suffix}.png") + img.save(path, "PNG", quality=SCREENSHOT_QUALITY) + return path except Exception as e: logger.error(f"Erreur Context Capture: {e}") return "" @@ -145,46 +486,62 @@ class VisionCapturer: sont toujours retournés (fallback gracieux). """ try: - with mss.mss() as sct: - full_path = os.path.join(self.shots_dir, f"{screenshot_id}_full.png") - monitor = sct.monitors[1] - sct_img = sct.grab(monitor) - img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX") - - # Capture du Crop (Cœur de l'apprentissage qwen3-vl) - crop_path = os.path.join(self.shots_dir, f"{screenshot_id}_crop.png") - w, h = TARGETED_CROP_SIZE - left = max(0, x - w // 2) - top = max(0, y - h // 2) - crop_img = img.crop((left, top, left + w, top + h)) - - if anonymize: - crop_img = crop_img.filter(ImageFilter.GaussianBlur(radius=4)) - - # Floutage des données sensibles (conformité AI Act) - if BLUR_SENSITIVE: - blur_sensitive_regions(img) - blur_sensitive_regions(crop_img) - - img.save(full_path, "PNG", quality=SCREENSHOT_QUALITY) - crop_img.save(crop_path, "PNG", quality=SCREENSHOT_QUALITY) - - # Mise à jour du hash pour le prochain heartbeat - self.last_img_hash = self._compute_quick_hash(img) - - result = {"full": full_path, "crop": crop_path} - - # --- Capture de la fenêtre active --- - # Ajout non-bloquant : enrichit le résultat avec l'image - # de la fenêtre seule + métadonnées (titre, rect, clic relatif) - window_info = self.capture_active_window(x, y, screenshot_id, full_img=img) + # Coords (x, y) sont en système écran composite ; cropper depuis + # un monitor secondaire (offset ≠ 0) produirait une image saine + # mais décalée → fail-closed sur fallback secondaire. 
+ _monitor, img, meta = capture_screen_image( + allow_secondary_fallback=False + ) + if img is None: + window_info = self.capture_active_window( + x, y, screenshot_id, full_img=None + ) if window_info: - result["window_capture"] = window_info + result = {"window_capture": window_info} + _enrich_with_monitor_info(result) + logger.warning( + "capture_dual dégradée: fenêtre active seule (%s)", + meta, + ) + return result + return {} - # QW1 — enrichissement multi-écrans (additif, fallback gracieux) - _enrich_with_monitor_info(result) + full_path = os.path.join(self.shots_dir, f"{screenshot_id}_full.png") - return result + # Capture du Crop (Cœur de l'apprentissage qwen3-vl) + crop_path = os.path.join(self.shots_dir, f"{screenshot_id}_crop.png") + w, h = TARGETED_CROP_SIZE + left = max(0, x - w // 2) + top = max(0, y - h // 2) + crop_img = img.crop((left, top, left + w, top + h)) + + if anonymize: + crop_img = crop_img.filter(ImageFilter.GaussianBlur(radius=4)) + + # Floutage des données sensibles (conformité AI Act) + if BLUR_SENSITIVE: + blur_sensitive_regions(img) + blur_sensitive_regions(crop_img) + + img.save(full_path, "PNG", quality=SCREENSHOT_QUALITY) + crop_img.save(crop_path, "PNG", quality=SCREENSHOT_QUALITY) + + # Mise à jour du hash pour le prochain heartbeat + self.last_img_hash = self._compute_quick_hash(img) + + result = {"full": full_path, "crop": crop_path} + + # --- Capture de la fenêtre active --- + # Ajout non-bloquant : enrichit le résultat avec l'image + # de la fenêtre seule + métadonnées (titre, rect, clic relatif) + window_info = self.capture_active_window(x, y, screenshot_id, full_img=img) + if window_info: + result["window_capture"] = window_info + + # QW1 — enrichissement multi-écrans (additif, fallback gracieux) + _enrich_with_monitor_info(result) + + return result except Exception as e: logger.error(f"Erreur Dual Capture: {e}") return {} @@ -239,33 +596,54 @@ class VisionCapturer: # Si le clic est en dehors de la fenêtre, on le signale 
mais on continue click_inside = (0 <= click_rel_x <= win_w and 0 <= click_rel_y <= win_h) + window_img = None + # --- Crop de la fenêtre depuis le plein écran --- if full_img is None: - # Pas de screenshot fourni — en capturer un (cas standalone) + # Pas de screenshot fourni — en capturer un (cas standalone). + # win_rect est en coords globales ; cropper depuis un monitor + # secondaire produirait une image décalée → fail-closed sur + # fallback secondaire. try: - with mss.mss() as sct: - monitor = sct.monitors[1] - sct_img = sct.grab(monitor) - full_img = Image.frombytes( - "RGB", sct_img.size, sct_img.bgra, "raw", "BGRX" - ) + _monitor, full_img, _meta = capture_screen_image( + allow_secondary_fallback=False + ) except Exception as e: logger.error(f"Erreur capture plein écran pour fenêtre : {e}") - return None + full_img = None - # Borner le crop aux limites de l'image plein écran - img_w, img_h = full_img.size - crop_left = max(0, win_left) - crop_top = max(0, win_top) - crop_right = min(img_w, win_right) - crop_bottom = min(img_h, win_bottom) + if full_img is not None and not _is_effectively_black(full_img): + img_w, img_h = full_img.size + crop_left = max(0, win_left) + crop_top = max(0, win_top) + crop_right = min(img_w, win_right) + crop_bottom = min(img_h, win_bottom) - if crop_right <= crop_left or crop_bottom <= crop_top: - logger.debug("Fenêtre hors écran — skip capture fenêtre") + if crop_right > crop_left and crop_bottom > crop_top: + window_img = full_img.crop( + (crop_left, crop_top, crop_right, crop_bottom) + ) + else: + logger.debug("Fenêtre hors écran — fallback natif si possible") + elif full_img is not None: + logger.warning( + "capture_active_window: screenshot plein écran noir, fallback natif" + ) + + if window_img is None and rect_info.get("hwnd"): + window_img, native_meta = _capture_window_image_windows( + rect_info["hwnd"], win_w, win_h + ) + if window_img is not None: + logger.warning( + "capture_active_window via fallback natif (%s)", + 
native_meta.get("backend"), + ) + + if window_img is None: + logger.debug("Fenêtre hors écran ou capture native indisponible") return None - window_img = full_img.crop((crop_left, crop_top, crop_right, crop_bottom)) - # Floutage conformité AI Act if BLUR_SENSITIVE: blur_sensitive_regions(window_img) diff --git a/agent_v0/lea_ui/server_client.py b/agent_v0/lea_ui/server_client.py index 9274fd4d4..4f67d8f0e 100644 --- a/agent_v0/lea_ui/server_client.py +++ b/agent_v0/lea_ui/server_client.py @@ -338,6 +338,50 @@ class LeaServerClient: except Exception: return None + def resume_replay(self, replay_id: str) -> bool: + """Reprendre un replay en pause supervisée via HTTP direct. + + Fallback du chemin SocketIO (`lea:replay_resume` → agent_chat) + utilisé quand le bus feedback est déconnecté au moment où + l'utilisateur clique « Continuer » dans la bulle paused. + + Retourne True si le serveur streaming a accepté la reprise. + """ + if not replay_id: + return False + try: + import requests + resp = requests.post( + f"{self._stream_url}/traces/stream/replay/{replay_id}/resume", + headers=self._auth_headers(), + timeout=10, + ) + return bool(resp.ok) + except Exception: + logger.debug("resume_replay HTTP silenced", exc_info=True) + return False + + def abort_replay(self, replay_id: str) -> bool: + """Annuler un replay en pause supervisée via HTTP direct. + + Symétrique de ``resume_replay`` : fallback du chemin SocketIO + (`lea:replay_abort`) quand le bus feedback est déconnecté. + POSTe sur ``/replay/{id}/cancel`` côté serveur streaming. 
+ """ + if not replay_id: + return False + try: + import requests + resp = requests.post( + f"{self._stream_url}/traces/stream/replay/{replay_id}/cancel", + headers=self._auth_headers(), + timeout=10, + ) + return bool(resp.ok) + except Exception: + logger.debug("abort_replay HTTP silenced", exc_info=True) + return False + def report_action_result( self, session_id: str, diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py index a29c8a22b..0044f6834 100644 --- a/agent_v0/server_v1/api_stream.py +++ b/agent_v0/server_v1/api_stream.py @@ -61,7 +61,9 @@ MAX_ACTIONS_PER_REPLAY = 500 # Max actions par requête de replay MAX_REPLAY_STATES = 1000 # Max entrées dans _replay_states REPLAY_STATE_TTL_SECONDS = 3600 # Nettoyage auto des replays terminés après 1h -# Actions en cours de retry : action_id -> {"action": ..., "retry_count": N, "replay_id": ...} +# Actions in-flight / retry : action_id -> transport + retry metadata. +# `action` remains the semantic/original action for reporting/retry logic, +# while `dispatched_action` tracks the exact payload last sent to Lea. 
_retry_pending: Dict[str, Dict[str, Any]] = {} # Callbacks d'erreur par replay_id : replay_id -> callback_url @@ -207,12 +209,14 @@ from .replay_engine import ( _MAX_ACTION_TEXT_LENGTH, _MAX_KEYS_PER_COMBO, _KNOWN_KEY_NAMES, + _auto_launch_replay_after_finalize, _validate_replay_action, _APP_LAUNCH_COMMANDS, _APP_VISUAL_SEARCH, _SETUP_IGNORE_APPS, _extract_required_apps_from_events, _extract_required_apps_from_workflow, + _trim_redundant_setup_events, _resolve_launch_command, _infer_app_from_window_titles, _get_visual_search_info, @@ -475,6 +479,19 @@ def _clear_replay_lock(): logger.error(f"Erreur suppression replay lock : {e}") +def _memory_window_title_for_action(action_meta: Dict[str, Any]) -> str: + """Résoudre le meilleur window_title disponible pour la mémoire persistante.""" + action_meta = action_meta or {} + target_spec = action_meta.get("target_spec") or {} + context_hints = target_spec.get("context_hints") or {} + return ( + action_meta.get("expected_window_before", "") + or target_spec.get("window_title", "") + or context_hints.get("window_title", "") + or action_meta.get("window_title", "") + ) + + def _get_worker_queue_status() -> Dict[str, Any]: """Retourne l'état de la queue du worker VLM (pour le monitoring).""" queue = [] @@ -544,6 +561,34 @@ _machine_replay_target: Dict[str, str] = {} _replay_states: Dict[str, Dict[str, Any]] = {} +def _remove_queued_action_duplicates(session_id: str, action_id: str) -> int: + """Retirer d'une queue les copies exactes d'une action déjà acquittée. + + Le watchdog peut re-pousser une action orpheline en tête de queue. Si le + report original arrive juste après, cette copie resend doit être jetée, + sinon Léa ré-exécute la même action avec le même `action_id` et peut + toggler l'état UI (ex: touche Windows qui referme Démarrer). 
+ """ + if not session_id or not action_id: + return 0 + queue = _replay_queues.get(session_id, []) + if not queue: + return 0 + + filtered: List[Dict[str, Any]] = [] + removed = 0 + for queued_action in queue: + queued_id = str((queued_action or {}).get("action_id", "") or "") + if queued_id == action_id: + removed += 1 + continue + filtered.append(queued_action) + + if removed: + _replay_queues[session_id] = filtered + return removed + + class StreamEvent(BaseModel): session_id: str timestamp: float @@ -832,6 +877,16 @@ async def startup(): threading.Thread(target=_preload_easyocr, daemon=True, name="preload_easyocr").start() + from .replay_watchdog import get_or_create_watchdog + + app.state.replay_watchdog = get_or_create_watchdog( + retry_pending=_retry_pending, + replay_queues=_replay_queues, + async_lock_factory=_async_replay_lock, + sse_notifier=None, + ) + await app.state.replay_watchdog.start() + logger.info( "API Streaming démarrée — StreamProcessor, Worker et Cleanup prêts. " "VLM Worker dans un process séparé (run_worker.py)." @@ -886,6 +941,9 @@ def _load_existing_workflows(): async def shutdown(): global _cleanup_running _cleanup_running = False + watchdog = getattr(app.state, "replay_watchdog", None) + if watchdog is not None: + await watchdog.stop(timeout_s=3.0) worker.stop() # Nettoyer le replay lock au shutdown (sinon le worker VLM resterait bloqué) _clear_replay_lock() @@ -1477,17 +1535,24 @@ def _process_screenshot_thread(session_id: str, shot_id: str, path: str): # ========================================================================= @app.post("/api/v1/traces/stream/finalize") -async def finalize(session_id: str, machine_id: str = "default"): +async def finalize( + session_id: str, + machine_id: str = "default", + launch_replay: bool = False, +): """Clôture la session et place le traitement en file d'attente. 
Ne bloque plus : marque la session comme finalisée et l'ajoute à la queue du worker VLM (process séparé) pour analyse + construction workflow. Le client peut suivre la progression via GET /api/v1/traces/stream/processing/status. + Optionnellement, il peut aussi déclencher immédiatement un replay direct + depuis la session finalisée (chemin Lea-first, sans attendre le workflow VLM). Args: session_id: Identifiant de la session à finaliser machine_id: Identifiant machine (informatif, le machine_id est déjà dans la session) + launch_replay: Si vrai, tente de lancer immédiatement /replay-session """ # Vérifier que la session existe session = processor.session_manager.get_session(session_id) @@ -1501,6 +1566,10 @@ async def finalize(session_id: str, machine_id: str = "default"): processor.session_manager.finalize(session_id) logger.info(f"Session {session_id} finalisée, ajout à la queue du worker VLM") + resolved_machine_id = machine_id + if resolved_machine_id == "default" and getattr(session, "machine_id", ""): + resolved_machine_id = session.machine_id + # Nettoyer les structures d'enrichissement temps réel pour cette session with _enrichment_lock: keys_to_remove = [k for k in _pending_click_enrichments if k[0] == session_id] @@ -1521,17 +1590,70 @@ async def finalize(session_id: str, machine_id: str = "default"): if shots_dir.exists(): full_shots_count = len(list(shots_dir.glob("shot_*_full.png"))) - return { + # Patch 2026-05-23 (brief 0902 deferred-workflow) : par défaut, on + # ne propose plus le replay direct immédiat post-finalize — le chemin + # produit cible est le workflow compilé par le worker VLM. Le client + # attend la disponibilité du workflow nommé pour proposer un test. + # Le replay direct reste accessible (smoke/debug) en activant + # RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE=true côté serveur, OU + # en appelant explicitement POST /api/v1/traces/stream/replay-session + # depuis un outil de test. 
+ _direct_replay_enabled = _auto_launch_replay_after_finalize() + + response = { "status": "queued_for_processing", "session_id": session_id, "machine_id": session.machine_id, "screenshots_to_analyze": full_shots_count, + "replay_ready": _direct_replay_enabled, "message": ( f"Session finalisée. {full_shots_count} screenshots seront analysés " "en arrière-plan. Suivez la progression via " - "GET /api/v1/traces/stream/processing/status" + "GET /api/v1/traces/stream/processing/status." ), } + if _direct_replay_enabled: + response["replay_request"] = { + "endpoint": "/api/v1/traces/stream/replay-session", + "session_id": session_id, + "machine_id": resolved_machine_id, + } + response["message"] += ( + " Le replay direct est disponible via " + "POST /api/v1/traces/stream/replay-session" + ) + + if not launch_replay: + return response + + try: + replay_result = await replay_from_session( + session_id=session_id, + machine_id=resolved_machine_id, + ) + except HTTPException as exc: + logger.warning( + "Finalize %s : replay direct non lancé (%s)", + session_id, + exc.detail, + ) + response["replay_launch"] = { + "status": "failed", + "status_code": exc.status_code, + "detail": exc.detail, + } + response["message"] += ( + " Le lancement automatique du replay direct a échoué ; " + "la session reste finalisée et re-jouable manuellement." + ) + return response + + response["replay_launch"] = { + "status": "started", + "replay": replay_result, + } + response["message"] += " Le replay direct a été lancé immédiatement." + return response # ========================================================================= @@ -2262,18 +2384,39 @@ async def replay_from_session( if session_mem and session_mem.events: _merge_enrichments_into_raw_events(raw_events, session_mem.events) - # ── 3. 
Construire le replay propre depuis les events bruts ── - # Passer le répertoire de session pour activer le visual replay (crops de référence) + # Répertoire de session utilisé par le visual replay et les anchors setup session_dir = str(events_file.parent) + + # ── 3. Préparer le setup environnement et couper le préambule source ── + setup_actions = [] + app_info = _extract_required_apps_from_events( + raw_events, + session_dir=session_dir, + ) + replay_raw_events = raw_events + if app_info: + setup_actions = _generate_setup_actions(app_info, setup_id_prefix="setup_sess") + if setup_actions: + replay_raw_events = _trim_redundant_setup_events(raw_events, app_info) + logger.info( + "replay-session %s : %d actions de setup préparées avant le replay " + "(app=%s, cmd=%s, raw_trim=%d→%d)", + session_id, len(setup_actions), + app_info.get("primary_app"), app_info.get("primary_launch_cmd"), + len(raw_events), len(replay_raw_events), + ) + + # ── 4. Construire le replay propre depuis les events bruts ── + # Passer le répertoire de session pour activer le visual replay (crops de référence) actions = build_replay_from_raw_events( - raw_events, session_id=session_id, session_dir=session_dir, + replay_raw_events, session_id=session_id, session_dir=session_dir, ) if not actions: raise HTTPException( status_code=400, detail=f"Session '{session_id}' : aucune action exploitable après nettoyage " - f"({len(raw_events)} événements bruts)" + f"({len(replay_raw_events)} événements bruts)" ) # Limite de sécurité @@ -2305,23 +2448,10 @@ async def replay_from_session( if _gesture_catalog and actions: actions = _gesture_catalog.optimize_replay_actions(actions) - # ── 3b. Setup environnement — ouvrir les applications nécessaires ── - # Analyser les événements bruts pour détecter quelles applications sont requises - # et injecter des actions de setup en tête de la queue de replay. 
- setup_actions = [] - app_info = _extract_required_apps_from_events(raw_events) - if app_info: - setup_actions = _generate_setup_actions(app_info, setup_id_prefix="setup_sess") - if setup_actions: - actions = setup_actions + actions - logger.info( - "replay-session %s : %d actions de setup injectées avant le replay " - "(app=%s, cmd=%s)", - session_id, len(setup_actions), - app_info.get("primary_app"), app_info.get("primary_launch_cmd"), - ) + if setup_actions: + actions = setup_actions + actions - # ── 4. Trouver la session de replay cible (Agent V1 actif) ── + # ── 5. Trouver la session de replay cible (Agent V1 actif) ── # L'agent actif peut avoir une session différente de la session source target_session_id = _find_active_agent_session(machine_id=machine_id) if not target_session_id: @@ -2335,7 +2465,7 @@ async def replay_from_session( "Lancez l'Agent V1 sur le PC cible." ) - # ── 5. Injecter dans la queue de replay ── + # ── 6. Injecter dans la queue de replay ── replay_id = f"replay_sess_{uuid.uuid4().hex[:8]}" async with _async_replay_lock(): @@ -3265,11 +3395,35 @@ async def get_next_action(session_id: str, machine_id: str = "default"): # NE PAS écraser si _schedule_retry a déjà mis le bon retry_count action_id_sent = action.get("action_id", "") if action_id_sent and action_id_sent not in _retry_pending: + now = time.time() _retry_pending[action_id_sent] = { "action": dict(action), + "dispatched_action": dict(action), "retry_count": 0, - "replay_id": "", + "replay_id": owning_replay.get("replay_id", "") if owning_replay else "", + "session_id": session_id, + "machine_id": machine_id, + "dispatched_at": now, + "first_dispatched_at": now, + "resent_count": 0, + "last_resent_at": 0.0, } + elif action_id_sent: + existing = _retry_pending.get(action_id_sent) + if existing is not None: + now = time.time() + existing.setdefault("action", dict(action)) + existing["dispatched_action"] = dict(action) + existing["replay_id"] = existing.get("replay_id") or ( + 
owning_replay.get("replay_id", "") if owning_replay else "" + ) + existing["session_id"] = session_id + existing["machine_id"] = machine_id + existing["dispatched_at"] = now + if not existing.get("first_dispatched_at"): + existing["first_dispatched_at"] = now + existing.setdefault("resent_count", 0) + existing.setdefault("last_resent_at", 0.0) # [REPLAY] log structuré pour suivre une action à travers toute la chaîne # Grep facile : journalctl --user -u rpa-streaming -f | grep REPLAY @@ -3400,6 +3554,15 @@ async def report_action_result(report: ReplayResultReport): ) return {"status": "no_active_replay", "session_id": session_id} + removed_dupes = _remove_queued_action_duplicates(session_id, action_id) + if removed_dupes: + logger.warning( + "[REPLAY] REPORT cleanup session=%s action_id=%s removed_queue_duplicates=%d", + session_id, + action_id, + removed_dupes, + ) + # Récupérer l'info de retry pour cette action (si c'est un retry) retry_info = _retry_pending.pop(action_id, None) retry_count = retry_info["retry_count"] if retry_info else 0 @@ -3631,10 +3794,7 @@ async def report_action_result(report: ReplayResultReport): _current = _actions_meta[_idx] or {} if _current.get("type") == "click": _mem_target_spec = _current.get("target_spec") or {} - _mem_window_title = ( - _mem_target_spec.get("window_title", "") - or _mem_target_spec.get("expected_window_before", "") - ) + _mem_window_title = _memory_window_title_for_action(_current) if _mem_window_title: _mem_success = ( @@ -3749,6 +3909,7 @@ async def report_action_result(report: ReplayResultReport): "target_description": f"Dialogue système : {_sys_category}", "screenshot_b64": screenshot_after or report.screenshot, "target_spec": _tspec_sys, + "original_action": dict(original_action or {}), "reason": "system_dialog", "system_dialog": _sys_info, "error_detail": _sys_reason or (report.error or ""), @@ -3814,6 +3975,7 @@ async def report_action_result(report: ReplayResultReport): "target_description": 
_target_desc_ww, "screenshot_b64": screenshot_after or report.screenshot, "target_spec": _tspec_ww, + "original_action": dict(original_action or {}), "reason": "wrong_window", "error_detail": report.error or "", } @@ -3888,6 +4050,7 @@ async def report_action_result(report: ReplayResultReport): "target_description": _target_desc, "screenshot_b64": screenshot_after or report.screenshot, "target_spec": _tspec, + "original_action": dict(original_action or {}), "reason": "no_screen_change_strict", "resolution_method": report.resolution_method or "", "resolution_score": report.resolution_score or 0, @@ -3947,6 +4110,7 @@ async def report_action_result(report: ReplayResultReport): "target_description": target_desc, "screenshot_b64": screenshot_after or report.screenshot, "target_spec": report.target_spec, + "original_action": dict(original_action or {}), } replay_state["pause_message"] = f"Je ne vois pas '{target_desc}' à l'écran" error_entry = { @@ -3989,6 +4153,7 @@ async def report_action_result(report: ReplayResultReport): "target_description": target_desc, "screenshot_b64": screenshot_after or report.screenshot, "target_spec": report.target_spec, + "original_action": dict(original_action or {}), } replay_state["pause_message"] = f"Je ne vois pas '{target_desc}' à l'écran" error_entry = { @@ -4341,8 +4506,14 @@ async def resume_replay( and failed_action.get("reason") != "user_request"): # Reconstruire l'action a partir du retry_pending ou de l'original original_action_id = failed_action["action_id"] + original = failed_action.get("original_action") + if isinstance(original, dict) and original: + original = dict(original) + else: + original = None # Chercher l'action originale dans les retry_pending - original = _retry_pending.pop(original_action_id, {}).get("action") + if not original: + original = _retry_pending.pop(original_action_id, {}).get("action") if not original: # Reconstruire un minimum depuis le failed_action context original = { @@ -4358,8 +4529,15 @@ 
async def resume_replay( # Stocker dans retry_pending pour le suivi _retry_pending[resume_id] = { "action": original, + "dispatched_action": dict(resume_action), "retry_count": 0, "replay_id": replay_id, + "session_id": session_id, + "machine_id": state.get("machine_id", "default"), + "dispatched_at": 0.0, + "first_dispatched_at": 0.0, + "resent_count": 0, + "last_resent_at": 0.0, "reason": "resume_after_pause", } queue = _replay_queues.get(session_id, []) @@ -4399,6 +4577,13 @@ async def cancel_replay(replay_id: str): return {"status": "cancelled", "replay_id": replay_id, "session_id": session_id} +@app.get("/api/v1/traces/stream/replay/watchdog/metrics") +async def watchdog_metrics(): + from .replay_watchdog import get_metrics_snapshot + + return {"watchdog": get_metrics_snapshot()} + + # ========================================================================= # Visual Replay — Résolution visuelle des cibles (module resolve_engine) # ========================================================================= @@ -4545,10 +4730,13 @@ async def resolve_target(request: ResolveTargetRequest): # Validation qualité en sortie de cascade : seuil de score + garde # de proximité contre les coords enregistrées. Single point of # insertion, n'altère pas la cascade existante. + # target_spec propagé pour relaxation contextuelle (switch_tab + + # som_element calibré, cf. resolve_engine.py 2026-05-22). 
result = _validate_resolution_quality( result, request.fallback_x_pct, request.fallback_y_pct, + target_spec=request.target_spec, ) # Pré-check sémantique post-cascade : OCR sur une zone autour de la @@ -4581,6 +4769,15 @@ async def resolve_target(request: ResolveTargetRequest): _by_text = (request.target_spec.get("by_text") or "").strip() if _by_text: from agent_v0.server_v1.resolve_engine import _validate_text_at_position + # Propager la bbox SoM enregistrée (si présente) au + # pré-check OCR : pour les éléments étroits (onglets + # Notepad moderne, ~30-40px haut), le radius générique + # capture du texte voisin et rejette à tort. + # Patch 2026-05-23 — cf. inbox_codex/…_notepad-tab-ocr-precheck. + _som_bbox = ( + (request.target_spec.get("som_element") or {}) + .get("bbox_norm") + ) _is_valid, _observed, _ocr_ms = _validate_text_at_position( tmp_path, float(result.get("x_pct", 0) or 0), @@ -4588,6 +4785,7 @@ async def resolve_target(request: ResolveTargetRequest): _by_text, effective_w, effective_h, + som_bbox_norm=_som_bbox, ) logger.info( "[REPLAY] Pre-check OCR ACTIF : '%s' attendu @ (%.4f, %.4f) " @@ -4600,7 +4798,16 @@ async def resolve_target(request: ResolveTargetRequest): _is_valid, _ocr_ms, ) - if not _is_valid: + # Patch 2026-05-23 : rejet uniquement si OCR a effectivement + # lu *autre chose* que la cible. Si observed est vide, l'OCR + # n'a rien lu (crop bbox SoM trop petit / contraste faible + # sur onglet Notepad moderne) — ambigu, on garde la + # résolution serveur. La garde drift ANCHOR-TM côté agent + # bloque les vrais faux positifs. 
def _infer_machine_id_from_session_id(session_id: str, fallback: str = "default") -> str:
    """Derive the machine id embedded in a background-session id, if any.

    Background heartbeat sessions are named ``bg_`` followed by the machine
    id. After a server restart such sessions can be restored from persisted
    JSON with ``machine_id`` still set to ``"default"``; recovering the id
    from the session name lets targeted replays find the active background
    session again.

    Args:
        session_id: Session identifier to inspect.
        fallback: Value returned when no machine id can be derived.

    Returns:
        The text following the ``bg_`` prefix when ``session_id`` starts with
        that prefix and something follows it; ``fallback`` otherwise.
    """
    # partition() splits on the first "bg_": an empty head means the id
    # starts with the prefix, a non-empty tail means a machine id follows.
    head, marker, machine_id = session_id.partition("bg_")
    if marker and not head and machine_id:
        return machine_id
    return fallback
def _auto_launch_replay_after_finalize() -> bool:
    """Tell whether ``/finalize`` should offer an immediate direct replay.

    The product path is the workflow compiled in the background; the direct
    replay from ``live_events.jsonl`` remains useful for smoke tests and
    debugging, so it is opt-in through the environment variable
    ``RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE``.

    Returns:
        ``True`` only when the variable, once stripped and lower-cased, is
        ``true``, ``1`` or ``yes``. Any other value, including an unset
        variable, yields ``False`` (default-deny).
    """
    enabled_values = ("true", "1", "yes")
    setting = os.environ.get("RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE", "")
    return setting.strip().lower() in enabled_values
# Leading title tokens that identify the blank initial state of an
# application freshly launched by the automatic setup. When the source
# session focuses such a title shortly after the first focus on the primary
# app, the trim cuts up to that focus so that redundant in-app clicks
# (tab switch, closing a previous window, ...) are not replayed.
_NEUTRAL_TITLE_TOKENS = frozenset({
    "sans titre",      # Notepad (FR)
    "untitled",        # Notepad (EN)
    "document1",       # Word
    "classeur1",       # Excel (FR)
    "book1",           # Excel (EN)
    "présentation1",   # PowerPoint (FR)
    "presentation1",   # PowerPoint (EN)
})
# Number of raw events inspected after the first focus on the primary app
# when looking for a focus on a neutral title. Deliberately short so that a
# workflow revisiting a neutral title much later is not cut.
_TRIM_NEUTRAL_LOOKAHEAD = 15


def _is_neutral_window_title(window_title: str) -> bool:
    """Tell whether a window title denotes an application's empty state.

    The part of the title before the first `` – `` separator (or, when that
    one is absent, before the first `` - ``) is lower-cased, stripped of
    leading ``*`` characters and looked up in ``_NEUTRAL_TITLE_TOKENS``.

    Examples: ``Sans titre – Bloc-notes`` -> True,
    ``http://foo.txt – Bloc-notes`` -> False, ``Document1 - Word`` -> True.
    """
    if not window_title:
        return False
    document_part = str(window_title).strip().lower()
    # The en-dash separator takes precedence over the plain hyphen one.
    separator = next((sep for sep in (" – ", " - ") if sep in document_part), None)
    if separator is not None:
        document_part = document_part.split(separator, 1)[0].strip()
    return document_part.lstrip("*").strip() in _NEUTRAL_TITLE_TOKENS


def _relative_position_labels(x: int, y: int, screen_w: int, screen_h: int) -> Dict[str, str]:
    """Describe a screen position with coarse relative labels.

    Returns:
        A dict with ``x_relative`` and ``y_relative``. Each label is an empty
        string when the matching screen dimension is not positive.
    """
    vertical = ""
    horizontal = ""
    if screen_h > 0:
        y_ratio = y / screen_h
        if y_ratio > 0.8:
            vertical = "en bas"
        elif y_ratio < 0.2:
            vertical = "en haut"
        else:
            vertical = "au milieu"
    if screen_w > 0:
        x_ratio = x / screen_w
        if x_ratio < 0.3:
            horizontal = "à gauche"
        elif x_ratio > 0.7:
            horizontal = "à droite"
        else:
            horizontal = "au centre"
    return {
        "x_relative": horizontal,
        "y_relative": vertical,
    }
def _extract_launch_result_target(
    raw_events: list,
    primary_app: str,
) -> Optional[Dict[str, Any]]:
    """Find the recorded SearchHost click that launched the primary app.

    Looks for a ``mouse_click`` inside ``searchhost.exe`` for which the first
    ``window_focus_change`` towards ``primary_app`` among the next seven raw
    events does not come more than 5 seconds later (the delay is only
    checked when both timestamps are numeric).

    Args:
        raw_events: Raw session events, either bare or wrapped under an
            ``"event"`` key.
        primary_app: Executable name of the main application.

    Returns:
        A target dict (normalised coordinates, window title, relative
        position labels, source app and, when recorded, the window capture
        geometry), or ``None`` when no such click exists.
    """
    wanted_app = (primary_app or "").lower()
    if not wanted_app:
        return None

    def _launch_confirmed(click_index: int, clicked_at: Any) -> bool:
        # Only the first focus change towards the primary app decides.
        for later in raw_events[click_index + 1: click_index + 8]:
            later_evt = later.get("event", later)
            if later_evt.get("type") != "window_focus_change":
                continue
            focused = later_evt.get("to", {})
            if not isinstance(focused, dict):
                continue
            if focused.get("app_name", "").lower() != wanted_app:
                continue
            focused_at = later_evt.get("timestamp")
            too_late = (
                isinstance(clicked_at, (int, float))
                and isinstance(focused_at, (int, float))
                and focused_at - clicked_at > 5.0
            )
            return not too_late
        return False

    for index, wrapped in enumerate(raw_events):
        evt = wrapped.get("event", wrapped)
        if evt.get("type") != "mouse_click":
            continue

        window = evt.get("window", {})
        if not isinstance(window, dict):
            continue
        if window.get("app_name", "").lower() != "searchhost.exe":
            continue

        point = evt.get("pos") or []
        if not (isinstance(point, list) and len(point) == 2):
            continue

        resolution = (
            evt.get("screen_metadata", {}).get("screen_resolution")
            or evt.get("screen_resolution")
            or []
        )
        if not (isinstance(resolution, list) and len(resolution) == 2):
            continue

        try:
            click_x = int(point[0])
            click_y = int(point[1])
            screen_w = int(resolution[0])
            screen_h = int(resolution[1])
        except (TypeError, ValueError):
            continue

        if screen_w <= 0 or screen_h <= 0:
            continue

        if not _launch_confirmed(index, evt.get("timestamp")):
            continue

        labels = _relative_position_labels(click_x, click_y, screen_w, screen_h)
        described = " ".join(
            filter(None, [labels["y_relative"], labels["x_relative"]])
        )
        title = window.get("title", "") or "Rechercher"
        target: Dict[str, Any] = {
            "x_pct": round(click_x / screen_w, 6),
            "y_pct": round(click_y / screen_h, 6),
            "window_title": title,
            "expected_window_before": title,
            "original_position": labels,
            "source_app": window.get("app_name", ""),
        }
        if described:
            target["position_desc"] = described

        capture = evt.get("window_capture", {})
        if isinstance(capture, dict):
            relative_click = capture.get("click_relative")
            size = capture.get("window_size")
            both_pairs = (
                isinstance(relative_click, list)
                and len(relative_click) == 2
                and isinstance(size, list)
                and len(size) == 2
            )
            if both_pairs:
                target["window_capture"] = {
                    "click_relative": relative_click,
                    "window_size": size,
                }

        return target

    return None
def _extract_start_menu_target(raw_events: list) -> Optional[Dict[str, Any]]:
    """Find the recorded Start-button click in the raw events.

    Thin wrapper around ``_extract_start_menu_target_from_session`` called
    with ``session_dir=None``.
    """
    return _extract_start_menu_target_from_session(raw_events, session_dir=None)


def _load_click_anchor_from_session(
    session_dir: Optional[str],
    screenshot_id: str,
    click_x: int,
    click_y: int,
) -> str:
    """Return a base64 PNG crop around a click in a stored screenshot.

    The crop is read from ``<session_dir>/shots/<screenshot_id>_full.png``
    and spans 40 pixels on each side of the click, i.e. at most 80x80; it is
    smaller when the click lies near an image edge because the box is
    clipped to the image bounds.

    Args:
        session_dir: Directory of the recorded session; falsy disables the
            lookup.
        screenshot_id: Identifier of the screenshot taken for the click.
        click_x: Click abscissa in screenshot pixels.
        click_y: Click ordinate in screenshot pixels.

    Returns:
        The base64-encoded PNG crop, or ``""`` when the inputs are missing,
        the screenshot file does not exist, or reading/cropping fails
        (best effort: the failure is logged at debug level only).
    """
    if not session_dir or not screenshot_id:
        return ""

    full_path = Path(session_dir) / "shots" / f"{screenshot_id}_full.png"
    if not full_path.is_file():
        return ""

    try:
        from PIL import Image

        crop_size = 40
        # Image.open() keeps the file open until the image is closed; the
        # context manager releases the handle even if cropping fails.
        with Image.open(full_path) as img:
            x1 = max(0, click_x - crop_size)
            y1 = max(0, click_y - crop_size)
            x2 = min(img.width, click_x + crop_size)
            y2 = min(img.height, click_y + crop_size)
            cropped = img.crop((x1, y1, x2, y2))
            buf = io.BytesIO()
            cropped.save(buf, format="PNG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")
    except Exception as e:
        logger.debug("setup start anchor: crop échoué pour %s: %s", full_path, e)
        return ""
def _extract_start_menu_target_from_session(
    raw_events: list,
    session_dir: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Find the recorded Start-button click and, if available, its anchor.

    A ``mouse_click`` qualifies when the first ``window_focus_change``
    towards ``searchhost.exe`` among the next five raw events does not come
    more than 3 seconds later (the delay is only checked when both
    timestamps are numeric).

    Args:
        raw_events: Raw session events, either bare or wrapped under an
            ``"event"`` key.
        session_dir: Session directory handed to
            ``_load_click_anchor_from_session`` to load the visual anchor.

    Returns:
        A dict with normalised coordinates, relative position labels, a
        position description and, when a crop could be loaded,
        ``anchor_image_base64``; ``None`` when no such click is found.
    """

    def _opens_search(click_index: int, clicked_at: Any) -> bool:
        # Only the first focus change towards SearchHost decides.
        for later in raw_events[click_index + 1: click_index + 6]:
            later_evt = later.get("event", later)
            if later_evt.get("type") != "window_focus_change":
                continue
            focused = later_evt.get("to", {})
            if not isinstance(focused, dict):
                continue
            if focused.get("app_name", "").lower() != "searchhost.exe":
                continue
            focused_at = later_evt.get("timestamp")
            too_late = (
                isinstance(clicked_at, (int, float))
                and isinstance(focused_at, (int, float))
                and focused_at - clicked_at > 3.0
            )
            return not too_late
        return False

    for index, wrapped in enumerate(raw_events):
        evt = wrapped.get("event", wrapped)
        if evt.get("type") != "mouse_click":
            continue

        point = evt.get("pos") or []
        if not (isinstance(point, list) and len(point) == 2):
            continue

        resolution = (
            evt.get("screen_metadata", {}).get("screen_resolution")
            or evt.get("screen_resolution")
            or []
        )
        if not (isinstance(resolution, list) and len(resolution) == 2):
            continue

        if not _opens_search(index, evt.get("timestamp")):
            continue

        try:
            click_x = int(point[0])
            click_y = int(point[1])
            screen_w = int(resolution[0])
            screen_h = int(resolution[1])
        except (TypeError, ValueError):
            continue

        if screen_w <= 0 or screen_h <= 0:
            continue

        labels = _relative_position_labels(click_x, click_y, screen_w, screen_h)
        described = " ".join(
            filter(None, [labels["y_relative"], labels["x_relative"]])
        )
        target: Dict[str, Any] = {
            "x_pct": round(click_x / screen_w, 6),
            "y_pct": round(click_y / screen_h, 6),
            "original_position": labels,
            "position_desc": described,
        }
        anchor = _load_click_anchor_from_session(
            session_dir=session_dir,
            screenshot_id=str(evt.get("screenshot_id", "")).strip(),
            click_x=click_x,
            click_y=click_y,
        )
        if anchor:
            target["anchor_image_base64"] = anchor
        return target

    return None
def _extract_search_box_interaction(raw_events: list) -> Optional[Dict[str, Any]]:
    """Determine how the Windows search box was activated in the recording.

    For each ``window_focus_change`` towards ``searchhost.exe`` the next
    seven raw events are inspected:

    - a ``text_input`` in SearchHost means the user typed right away
      (``mode == "direct_typing"``), so no artificial click on the search
      field is needed;
    - a valid ``mouse_click`` in SearchHost followed, within the same
      window of events, by a SearchHost ``text_input`` yields
      ``mode == "click_then_type"`` with the click coordinates;
    - a focus change to another application ends the inspection.

    Returns:
        The interaction description, or ``None`` when none is found.
    """

    def _in_search_host(win: Any) -> bool:
        return isinstance(win, dict) and win.get("app_name", "").lower() == "searchhost.exe"

    for idx, wrapped in enumerate(raw_events):
        evt = wrapped.get("event", wrapped)
        if evt.get("type") != "window_focus_change":
            continue

        focused = evt.get("to", {})
        if not _in_search_host(focused):
            continue

        default_title = focused.get("title", "") or "Rechercher"
        for follower in raw_events[idx + 1: idx + 8]:
            follow_evt = follower.get("event", follower)
            kind = follow_evt.get("type")

            if kind == "text_input":
                window = follow_evt.get("window", {})
                if _in_search_host(window):
                    return {
                        "mode": "direct_typing",
                        "window_title": window.get("title", "") or default_title,
                    }

            elif kind == "mouse_click":
                window = follow_evt.get("window", {})
                if not _in_search_host(window):
                    continue

                point = follow_evt.get("pos") or []
                resolution = (
                    follow_evt.get("screen_metadata", {}).get("screen_resolution")
                    or follow_evt.get("screen_resolution")
                    or []
                )
                well_formed = (
                    isinstance(point, list)
                    and len(point) == 2
                    and isinstance(resolution, list)
                    and len(resolution) == 2
                )
                if not well_formed:
                    continue

                # Any SearchHost text_input located before this click would
                # already have returned "direct_typing" above, so a hit in
                # this slice necessarily comes after the click.
                typed_afterwards = any(
                    later_evt.get("type") == "text_input"
                    and _in_search_host(later_evt.get("window", {}))
                    for later_evt in (
                        later.get("event", later)
                        for later in raw_events[idx + 2: idx + 8]
                    )
                )
                if not typed_afterwards:
                    continue

                try:
                    click_x = int(point[0])
                    click_y = int(point[1])
                    screen_w = int(resolution[0])
                    screen_h = int(resolution[1])
                except (TypeError, ValueError):
                    continue

                if screen_w <= 0 or screen_h <= 0:
                    continue

                labels = _relative_position_labels(click_x, click_y, screen_w, screen_h)
                described = " ".join(
                    filter(None, [labels["y_relative"], labels["x_relative"]])
                )
                return {
                    "mode": "click_then_type",
                    "x_pct": round(click_x / screen_w, 6),
                    "y_pct": round(click_y / screen_h, 6),
                    "window_title": window.get("title", "") or default_title,
                    "expected_window_before": window.get("title", "") or default_title,
                    "original_position": labels,
                    "position_desc": described,
                }

            elif kind == "window_focus_change":
                next_focus = follow_evt.get("to", {})
                if isinstance(next_focus, dict) and next_focus.get("app_name", "").lower() != "searchhost.exe":
                    break

    return None
def _trim_redundant_setup_events(
    raw_events: List[Dict[str, Any]],
    app_info: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Drop the launch preamble already covered by the injected setup.

    When the replay injects a setup that opens the application, the raw
    events of the source session still contain that same opening sequence;
    replaying it would launch the application a second time. Only the
    events located after a chosen ``window_focus_change`` towards the
    primary application are kept. The cut point is, in priority order:

    1. a focus on a "neutral" title (blank document) found no more than
       ``_TRIM_NEUTRAL_LOOKAHEAD`` raw events after the first focus on the
       primary app, considered only when ``first_window_title`` is not
       itself neutral;
    2. the first focus whose title matches ``first_window_title``
       (substring match in either direction, or any title when
       ``first_window_title`` is empty);
    3. the first focus on the primary application.

    Args:
        raw_events: Complete source events of the session.
        app_info: Result of ``_extract_required_apps_from_events``.

    Returns:
        The trimmed list when a cut point is found, otherwise the original
        list object unchanged.
    """
    if not raw_events or not app_info:
        return raw_events

    primary_app = str(app_info.get("primary_app", "")).strip().lower()
    first_title = str(app_info.get("first_window_title", "")).strip().lower()
    if not primary_app:
        return raw_events

    first_focus_pos = None
    title_match_pos = None
    neutral_pos = None

    # When the first observed title is already neutral, the setup brings
    # the application to that very state and no neutral search is needed.
    setup_state_is_neutral = _is_neutral_window_title(first_title)

    for pos, wrapped in enumerate(raw_events):
        evt = wrapped.get("event", wrapped)
        if evt.get("type", "") != "window_focus_change":
            continue

        focused = evt.get("to", {})
        if not isinstance(focused, dict):
            continue

        if str(focused.get("app_name", "")).strip().lower() != primary_app:
            continue

        title = str(focused.get("title", "")).strip()
        title_lower = title.lower()

        if first_focus_pos is None:
            first_focus_pos = pos
        distance = pos - first_focus_pos

        # A neutral title close to the first focus is the state the setup
        # really produces: everything before it is redundant.
        if (
            not setup_state_is_neutral
            and neutral_pos is None
            and _is_neutral_window_title(title)
            and distance <= _TRIM_NEUTRAL_LOOKAHEAD
        ):
            neutral_pos = pos

        if title_match_pos is None and (
            not first_title
            or (
                title_lower
                and (first_title in title_lower or title_lower in first_title)
            )
        ):
            title_match_pos = pos

        # Stop as soon as the best candidate is known, or once the neutral
        # look-ahead window has been left with a title match in hand.
        if neutral_pos is not None:
            break
        if title_match_pos is not None and (
            setup_state_is_neutral or distance > _TRIM_NEUTRAL_LOOKAHEAD
        ):
            break

    cut_idx = next(
        (
            candidate
            for candidate in (neutral_pos, title_match_pos, first_focus_pos)
            if candidate is not None
        ),
        None,
    )
    if cut_idx is None:
        logger.info(
            "setup trim : aucun focus initial trouvé pour '%s' — replay brut conservé",
            primary_app,
        )
        return raw_events

    trimmed = raw_events[cut_idx + 1:]
    logger.info(
        "setup trim : %d événements retirés avant le replay brut "
        "(app=%s, titre=%s, neutral=%s, restant=%d)",
        cut_idx + 1,
        primary_app,
        app_info.get("first_window_title", ""),
        neutral_pos is not None,
        len(trimmed),
    )
    return trimmed
def _generate_run_dialog_setup_actions(
    app_info: Dict[str, Any],
    setup_id_prefix: str = "setup",
) -> List[Dict[str, Any]]:
    """Build the setup sequence ``Win+R -> command -> Enter``.

    Used for applications whose launch through Start/Search adds Windows
    fragility without business value for the replay.

    Args:
        app_info: Application description; reads ``primary_launch_cmd``,
            ``primary_app`` and ``first_window_title``.
        setup_id_prefix: Prefix inserted in every generated ``action_id``.

    Returns:
        The ordered setup actions (open Run, wait, type the command, wait,
        submit, wait for the launch, then a ``verify_screen`` on the window
        title), or an empty list when the launch command or the application
        name is missing.
    """
    launch_cmd = str(app_info.get("primary_launch_cmd", "") or "").strip()
    primary_app = str(app_info.get("primary_app", "") or "").strip()
    first_title = str(app_info.get("first_window_title", "") or "").strip()
    visual_info = _get_visual_search_info(primary_app)
    display_name = str(visual_info.get("display_name", "") or "").strip()

    if not (launch_cmd and primary_app):
        return []

    app_key = primary_app.lower()
    heavy_apps = {"winword.exe", "excel.exe", "powerpnt.exe", "outlook.exe", "code.exe"}
    if app_key in heavy_apps:
        wait_ms = 3000
    else:
        wait_ms = 2000

    # Title patterns accepted by the final check, deduplicated
    # case-insensitively while keeping first-seen order.
    title_patterns: List[str] = []
    known_lower = set()

    def _remember(pattern: str) -> None:
        lowered = pattern.lower()
        if lowered not in known_lower:
            known_lower.add(lowered)
            title_patterns.append(pattern)

    app_stem = primary_app[:-4] if app_key.endswith(".exe") else primary_app
    for candidate in (display_name, launch_cmd, app_stem):
        candidate = str(candidate or "").strip()
        if not candidate:
            continue
        _remember(candidate)
        if " " in candidate:
            # Multi-word names also match on their last word.
            last_token = candidate.split()[-1].strip()
            if last_token:
                _remember(last_token)

    def _step(suffix: str, kind: str, step: str, **fields: Any) -> Dict[str, Any]:
        return {
            "action_id": f"act_{setup_id_prefix}_{suffix}",
            "type": kind,
            **fields,
            "_setup_phase": True,
            "_setup_step": step,
            "_setup_strategy": "run_dialog",
        }

    actions: List[Dict[str, Any]] = [
        _step("open_run", "key_combo", "open_run_dialog", keys=["win", "r"]),
        _step("wait_run", "wait", "wait_run_dialog", duration_ms=500),
        _step("type_launch_cmd", "type", "type_launch_command", text=launch_cmd),
        _step("wait_launch_cmd", "wait", "wait_launch_command", duration_ms=300),
        _step("submit_run", "key_combo", "submit_run_dialog", keys=["enter"]),
        _step("wait_launch", "wait", "wait_app_launch", duration_ms=wait_ms),
    ]

    if title_patterns or first_title:
        actions.append({
            "action_id": f"act_{setup_id_prefix}_verify",
            "type": "verify_screen",
            "expected_node": "setup_initial",
            "timeout_ms": 5000,
            "_setup_phase": True,
            "_setup_step": "verify_app_ready",
            "_setup_strategy": "run_dialog",
            "expected_window_title_contains": title_patterns or [first_title],
        })

    logger.info(
        "Setup env sémantique généré : %d actions pour lancer '%s' via Win+R (%s)",
        len(actions), primary_app, launch_cmd,
    )
    return actions
visual_info["vlm_description"] + start_menu_target = app_info.get("start_menu_target", {}) or {} + search_box_interaction = app_info.get("search_box_interaction", {}) or {} + launch_result_target = app_info.get("launch_result_target", {}) or {} actions = [] @@ -531,21 +1206,48 @@ def _generate_setup_actions( # 1. Clic visuel sur le bouton Démarrer (toujours visible, bas-gauche) # Le VLM résout la position exacte ; x_pct/y_pct sont des fallbacks. + start_click_spec = { + "by_text": "Démarrer", + "by_role": "start_button", + "vlm_description": ( + "Le bouton Démarrer de Windows (icône Windows), " + "en bas à gauche de la barre des tâches" + ), + "screen_scope": "full_screen", + } + start_click_x = 0.02 + start_click_y = 0.98 + + if start_menu_target: + start_click_x = float(start_menu_target.get("x_pct", start_click_x)) + start_click_y = float(start_menu_target.get("y_pct", start_click_y)) + start_click_spec["by_text"] = "" + start_click_spec["allow_position_fallback"] = True + anchor_b64 = str(start_menu_target.get("anchor_image_base64", "")).strip() + if anchor_b64: + start_click_spec["anchor_image_base64"] = anchor_b64 + original_position = start_menu_target.get("original_position") + if isinstance(original_position, dict) and original_position: + start_click_spec["original_position"] = dict(original_position) + position_desc = str(start_menu_target.get("position_desc", "")).strip() + if position_desc: + start_click_spec["vlm_description"] = ( + "L'icône Windows du bouton Démarrer dans la barre des tâches, " + f"visible {position_desc} de l'écran" + ) + else: + start_click_spec["vlm_description"] = ( + "L'icône Windows du bouton Démarrer dans la barre des tâches" + ) + actions.append({ "action_id": f"act_{setup_id_prefix}_click_start", "type": "click", - "x_pct": 0.02, - "y_pct": 0.98, + "x_pct": start_click_x, + "y_pct": start_click_y, "button": "left", "visual_mode": True, - "target_spec": { - "by_text": "Démarrer", - "by_role": "start_button", - 
"vlm_description": ( - "Le bouton Démarrer de Windows (icône Windows), " - "en bas à gauche de la barre des tâches" - ), - }, + "target_spec": start_click_spec, "_setup_phase": True, "_setup_step": "click_start_menu", }) @@ -559,15 +1261,35 @@ def _generate_setup_actions( "_setup_step": "wait_start_menu", }) - # 3. Clic visuel sur la barre de recherche du menu Démarrer + # 2b. Garde visuelle : le menu Démarrer / la barre de recherche + # doit être réellement actif avant de continuer. Sans cette garde, + # un click_start qui touche en fait le systray overflow popup + # laisse le setup taper « bloc » dans la mauvaise fenêtre (cf. + # run live 2026-05-22 replay_sess_76b7d067). L'exécuteur compare + # le titre actif aux patterns ci-dessous (substring case-insensitive, + # FR+EN+app-name) et bascule en apprentissage humain si aucun match. actions.append({ - "action_id": f"act_{setup_id_prefix}_click_search", - "type": "click", - "x_pct": 0.20, - "y_pct": 0.92, - "button": "left", - "visual_mode": True, - "target_spec": { + "action_id": f"act_{setup_id_prefix}_verify_start_open", + "type": "verify_screen", + "expected_node": "", + "timeout_ms": 1500, + "expected_window_title_contains": [ + "Rechercher", + "Recherche", + "Search", + "Cortana", + "Démarrer", + "Start", + "SearchHost", + "StartMenuExperienceHost", + ], + "_setup_phase": True, + "_setup_step": "verify_start_menu_open", + }) + + search_mode = str(search_box_interaction.get("mode", "")).strip() + if search_mode != "direct_typing": + search_click_spec = { "by_text": "Rechercher", "by_role": "search_box", "vlm_description": ( @@ -575,19 +1297,74 @@ def _generate_setup_actions( "de Windows, souvent intitulé 'Tapez ici pour rechercher' " "ou 'Rechercher'" ), - }, - "_setup_phase": True, - "_setup_step": "click_search_box", - }) + } + search_click_x = 0.20 + search_click_y = 0.92 + search_expected_window = "" - # 4. 
Attendre que la barre de recherche soit active et prête - actions.append({ - "action_id": f"act_{setup_id_prefix}_wait_search_ready", - "type": "wait", - "duration_ms": 500, - "_setup_phase": True, - "_setup_step": "wait_search_ready", - }) + if search_mode == "click_then_type": + search_click_x = float(search_box_interaction.get("x_pct", search_click_x)) + search_click_y = float(search_box_interaction.get("y_pct", search_click_y)) + search_expected_window = str( + search_box_interaction.get("expected_window_before") + or search_box_interaction.get("window_title") + or "" + ) + search_click_spec["window_title"] = str( + search_box_interaction.get("window_title", "") + ).strip() + original_position = search_box_interaction.get("original_position") + if isinstance(original_position, dict) and original_position: + search_click_spec["original_position"] = dict(original_position) + position_desc = str(search_box_interaction.get("position_desc", "")).strip() + if position_desc: + search_click_spec["vlm_description"] = ( + f"Dans la fenêtre '{search_click_spec['window_title']}', " + f"le champ de recherche se trouve {position_desc} de l'écran" + ) + + search_click_action = { + "action_id": f"act_{setup_id_prefix}_click_search", + "type": "click", + "x_pct": search_click_x, + "y_pct": search_click_y, + "button": "left", + "visual_mode": True, + "target_spec": search_click_spec, + "_setup_phase": True, + "_setup_step": "click_search_box", + } + if search_expected_window: + search_click_action["expected_window_before"] = search_expected_window + + actions.append(search_click_action) + + # 4. Attendre que la barre de recherche soit active et prête + actions.append({ + "action_id": f"act_{setup_id_prefix}_wait_search_ready", + "type": "wait", + "duration_ms": 500, + "_setup_phase": True, + "_setup_step": "wait_search_ready", + }) + + # 4b. Garde visuelle : la barre Rechercher doit effectivement + # avoir le focus avant la frappe. 
On combine le titre récupéré + # de la session source (`search_box_interaction.window_title`) + # avec un fallback FR/EN générique. + search_window_hint = str(search_box_interaction.get("window_title", "")).strip() + verify_patterns = ["Rechercher", "Recherche", "Search"] + if search_window_hint and search_window_hint not in verify_patterns: + verify_patterns = [search_window_hint] + verify_patterns + actions.append({ + "action_id": f"act_{setup_id_prefix}_verify_search_active", + "type": "verify_screen", + "expected_node": "", + "timeout_ms": 1500, + "expected_window_title_contains": verify_patterns, + "_setup_phase": True, + "_setup_step": "verify_search_box_active", + }) # 5. Taper le nom visuel de l'application (texte français) actions.append({ @@ -607,21 +1384,86 @@ def _generate_setup_actions( "_setup_step": "wait_search_results", }) - # 7. Clic visuel sur le résultat de l'application dans la liste + # 6b. Dernière garde avant le clic résultat : la barre Rechercher + # (et donc la liste de résultats) doit toujours être active. Sans + # cette garde finale, un focus perdu pendant wait_search_results + # fait cliquer click_app_result dans la mauvaise surface (constat + # live 2026-05-22 — fenêtre observée « Fenêtre de dépassement de + # capacité de la barre d'état système »). actions.append({ + "action_id": f"act_{setup_id_prefix}_verify_results_visible", + "type": "verify_screen", + "expected_node": "", + "timeout_ms": 1500, + "expected_window_title_contains": [ + "Rechercher", + "Recherche", + "Search", + "Cortana", + "SearchHost", + "StartMenuExperienceHost", + ], + "_setup_phase": True, + "_setup_step": "verify_search_results_visible", + }) + + # 7. 
Clic visuel sur le résultat de l'application dans la liste + click_result_spec = { + "by_text": display_name, + "by_role": "app_icon", + "vlm_description": vlm_description, + } + click_result_x = 0.20 + click_result_y = 0.50 + click_result_expected_window = "" + + if launch_result_target: + click_result_x = float(launch_result_target.get("x_pct", click_result_x)) + click_result_y = float(launch_result_target.get("y_pct", click_result_y)) + click_result_expected_window = str( + launch_result_target.get("expected_window_before") + or launch_result_target.get("window_title") + or "" + ) + click_result_spec["by_role"] = "search_result" + click_result_spec["allow_position_fallback"] = True + click_result_spec["window_title"] = str(launch_result_target.get("window_title", "")).strip() + original_position = launch_result_target.get("original_position") + if isinstance(original_position, dict) and original_position: + click_result_spec["original_position"] = dict(original_position) + window_capture = launch_result_target.get("window_capture") + if isinstance(window_capture, dict) and window_capture: + click_result_spec["window_capture"] = dict(window_capture) + + position_desc = str(launch_result_target.get("position_desc", "")).strip() + if position_desc: + click_result_spec["vlm_description"] = ( + f"Dans la fenêtre '{click_result_spec['window_title']}', " + f"le résultat de recherche de l'application '{display_name}' " + f"se trouve {position_desc} de l'écran" + ) + else: + click_result_spec["vlm_description"] = ( + f"Dans la fenêtre '{click_result_spec['window_title']}', " + f"cliquez sur le résultat de recherche de l'application '{display_name}'" + ) + + click_result_action = { "action_id": f"act_{setup_id_prefix}_click_result", "type": "click", - "x_pct": 0.20, - "y_pct": 0.50, + "x_pct": click_result_x, + "y_pct": click_result_y, "button": "left", "visual_mode": True, - "target_spec": { - "by_text": display_name, - "by_role": "app_icon", - "vlm_description": 
vlm_description, - }, + "target_spec": click_result_spec, "_setup_phase": True, "_setup_step": "click_app_result", + } + if click_result_expected_window: + click_result_action["expected_window_before"] = click_result_expected_window + + actions.append({ + **click_result_action, }) # 8. Attendre que l'application s'ouvre @@ -664,22 +1506,39 @@ def _find_active_agent_session(session_manager, machine_id: Optional[str] = None """Trouver la dernière session Agent V1 pour le replay. Stratégie en 2 passes : - 1. D'abord chercher une session non-finalisée (Agent V1 actif) - 2. Sinon, prendre la plus récente même finalisée + 1. D'abord chercher une session `sess_*` non-finalisée (Agent V1 actif) + 2. Sinon, pour une machine ciblée, réutiliser `bg_` si présent + 3. Sinon, prendre la plus récente `sess_*` même finalisée Args: session_manager: Instance LiveSessionManager. machine_id: Si fourni, ne chercher que les sessions de cette machine. """ with session_manager._lock: + bg_session_id = f"bg_{machine_id}" if machine_id else None + + def _matches_machine(session) -> bool: + if machine_id is None: + return True + if session.machine_id == machine_id: + return True + # Robustesse au redémarrage : certaines sessions de fond peuvent + # encore être restaurées avec machine_id='default' alors que leur + # session_id encode déjà la vraie machine. 
+ return bool(bg_session_id and session.session_id == bg_session_id) + all_agent_sessions = [ s for s in session_manager._sessions.values() if s.session_id.startswith("sess_") - and (machine_id is None or s.machine_id == machine_id) + and _matches_machine(s) ] - - if not all_agent_sessions: - return None + background_session = next( + ( + s for s in session_manager._sessions.values() + if bg_session_id and s.session_id == bg_session_id + ), + None, + ) # Trier par session_id (contient un timestamp) — plus récent d'abord all_agent_sessions.sort(key=lambda s: s.session_id, reverse=True) @@ -689,8 +1548,18 @@ def _find_active_agent_session(session_manager, machine_id: Optional[str] = None if not s.finalized: return s.session_id - # Passe 2 : fallback sur la plus récente (même finalisée) - return all_agent_sessions[0].session_id + # Passe 2 : fallback sur la session de fond de la machine si elle existe. + if background_session and not background_session.finalized: + return background_session.session_id + + # Passe 3 : fallback sur la plus récente (même finalisée) + if all_agent_sessions: + return all_agent_sessions[0].session_id + + if background_session: + return background_session.session_id + + return None def _workflow_to_actions( @@ -1876,8 +2745,15 @@ def _schedule_retry( # Stocker l'info de retry pour le prochain report_action_result retry_pending[retry_action_id] = { "action": action, + "dispatched_action": retry_action, "retry_count": next_retry, "replay_id": replay_state["replay_id"], + "session_id": session_id, + "machine_id": replay_state.get("machine_id", "default"), + "dispatched_at": 0.0, + "first_dispatched_at": 0.0, + "resent_count": 0, + "last_resent_at": 0.0, "reason": reason, } diff --git a/agent_v0/server_v1/replay_learner.py b/agent_v0/server_v1/replay_learner.py index 32c479a7a..f9f16d265 100644 --- a/agent_v0/server_v1/replay_learner.py +++ b/agent_v0/server_v1/replay_learner.py @@ -188,7 +188,12 @@ class ReplayLearner: """ target_spec = 
action.get("target_spec", {}) by_text = target_spec.get("by_text", "") - window_title = target_spec.get("window_title", "") + window_title = ( + target_spec.get("window_title", "") + or action.get("window_title", "") + or target_spec.get("expected_window_before", "") + or (target_spec.get("context_hints") or {}).get("window_title", "") + ) x_pct = correction.get("x_pct", 0.0) y_pct = correction.get("y_pct", 0.0) @@ -207,20 +212,36 @@ class ReplayLearner: # Stocker dans target_memory.db pour le lookup futur try: - from .replay_memory import get_target_memory_store - store = get_target_memory_store() - if store: - store.record_success( - screen_signature="human_correction", + from .replay_memory import memory_record_success + stored = False + if window_title: + stored = memory_record_success( + window_title=window_title, target_spec=target_spec, - resolved_position={"x_pct": x_pct, "y_pct": y_pct}, + x_pct=float(x_pct), + y_pct=float(y_pct), method="human_supervised", - score=1.0, + confidence=1.0, ) + else: + logger.warning( + "[APPRENTISSAGE] Correction humaine non persistée : " + "window_title absent pour '%s'", + by_text, + ) + + if stored: logger.info( f"[APPRENTISSAGE] Correction stockée dans target_memory : " f"'{by_text}' → ({x_pct:.4f}, {y_pct:.4f})" ) + elif window_title: + logger.warning( + "[APPRENTISSAGE] Correction humaine non persistée : " + "échec memory_record_success pour '%s' dans '%s'", + by_text, + window_title, + ) except Exception as e: logger.warning(f"Learning: échec stockage target_memory: {e}") diff --git a/agent_v0/server_v1/replay_memory.py b/agent_v0/server_v1/replay_memory.py index 9ea0b89bb..62f709a47 100644 --- a/agent_v0/server_v1/replay_memory.py +++ b/agent_v0/server_v1/replay_memory.py @@ -103,15 +103,53 @@ def compute_screen_sig(window_title: str) -> str: return hashlib.sha256(norm.encode("utf-8")).hexdigest()[:16] +def _round_float_list(values: Any, precision: int = 4) -> Optional[tuple[float, ...]]: + """Normaliser une liste de 
coordonnées flottantes pour le hash mémoire.""" + if not isinstance(values, (list, tuple)): + return None + out = [] + for value in values: + try: + out.append(round(float(value), precision)) + except (TypeError, ValueError): + return None + return tuple(out) + + +def _int_pair(values: Any) -> Optional[tuple[int, int]]: + """Extraire une paire entière stable pour les hints spatiaux.""" + if not isinstance(values, (list, tuple)) or len(values) < 2: + return None + try: + return int(values[0]), int(values[1]) + except (TypeError, ValueError): + return None + + +def _should_reuse_recorded_window_relative_coords(fp: Any) -> bool: + """Décider si on doit remplacer la mémoire apprise par la position source. + + Cette réécriture n'est légitime que pour les entrées faibles de type + `position_fallback`/`v4_unknown`, où la mémoire ne contient pas une vraie + localisation visuelle robuste mais seulement un clic écran dépendant de la + résolution. Pour les méthodes visuelles apprises (template, SoM, OCR...), + réinjecter un vieux `click_relative` source crée des collisions et des + dérives sur des boutons homonymes (`Enregistrer`, `OK`, etc.). + """ + method = str(getattr(fp, "etype", "") or "").strip().lower() + return method in {"position_fallback", "v4_unknown"} + + class _TargetSpecLike: """Adaptateur dict → objet pour `TargetMemoryStore._hash_target_spec()`. Le hash interne de TargetMemoryStore utilise `getattr(spec, "by_role", ...)` qui ne fonctionne pas avec un dict brut. On expose les attributs nécessaires. - On intègre aussi `resolve_order` et `vlm_description` dans `context_hints` - pour qu'ils entrent dans le hash — deux actions avec le même `by_text` - mais un `resolve_order` différent doivent avoir des hashes distincts. + On intègre aussi `resolve_order`, `vlm_description` et des indices + spatiaux (SoM, click_relative) dans `context_hints` pour qu'ils entrent + dans le hash. 
Sinon, deux actions `Enregistrer` dans la même fenêtre + mais à des emplacements différents collisionnent. """ __slots__ = ("by_role", "by_text", "by_position", "context_hints") @@ -131,6 +169,21 @@ class _TargetSpecLike: hints["_vlm_desc"] = str(d["vlm_description"]) if d.get("anchor_hint"): hints["_anchor_hint"] = str(d["anchor_hint"]) + + som_element = d.get("som_element") or {} + som_bbox = _round_float_list(som_element.get("bbox_norm")) + if som_bbox: + hints["_som_bbox"] = som_bbox + som_center = _round_float_list(som_element.get("center_norm"), precision=5) + if som_center: + hints["_som_center"] = som_center + + window_capture = d.get("window_capture") or {} + click_relative = _int_pair(window_capture.get("click_relative")) + window_size = _int_pair(window_capture.get("window_size")) + if click_relative and window_size: + hints["_window_rel"] = f"{click_relative[0]},{click_relative[1]}@{window_size[0]}x{window_size[1]}" + self.context_hints = hints @@ -176,6 +229,46 @@ def memory_lookup( logger.debug("memory_lookup: fingerprint bbox invalide") return None + # Quand l'entrée mémoire provient d'un simple `position_fallback`, les + # coordonnées stockées reflètent surtout la géométrie écran source. Dans + # ce cas précis, réutiliser la position relative enregistrée dans la + # fenêtre source reste préférable si elle existe. + # + # En revanche, pour une méthode visuelle réellement apprise + # (`anchor_template`, `som_*`, `hybrid_text_direct`, ...), remplacer les + # coords mémorisées par un vieux `click_relative` crée des dérives sur + # des cibles textuelles homonymes. On garde donc les coords apprises. 
+ window_capture = target_spec.get("window_capture") or {} + click_relative = window_capture.get("click_relative") + window_size = window_capture.get("window_size") + if ( + _should_reuse_recorded_window_relative_coords(fp) + and ( + isinstance(click_relative, (list, tuple)) + and len(click_relative) >= 2 + and isinstance(window_size, (list, tuple)) + and len(window_size) >= 2 + ) + ): + try: + rel_x = float(click_relative[0]) + rel_y = float(click_relative[1]) + win_w = float(window_size[0]) + win_h = float(window_size[1]) + if win_w > 1 and win_h > 1: + x_pct = rel_x / win_w + y_pct = rel_y / win_h + logger.info( + "memory_lookup: coords fenêtre source réutilisées " + "(click_relative=%s, window_size=%s) -> (%.4f, %.4f)", + click_relative, + window_size, + x_pct, + y_pct, + ) + except (TypeError, ValueError, ZeroDivisionError): + logger.debug("memory_lookup: window_capture invalide, fallback bbox") + # Sanity check : les pourcentages doivent être dans [0, 1] if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0): logger.warning( diff --git a/agent_v0/server_v1/replay_verifier.py b/agent_v0/server_v1/replay_verifier.py index f520f1a35..761819e91 100644 --- a/agent_v0/server_v1/replay_verifier.py +++ b/agent_v0/server_v1/replay_verifier.py @@ -328,10 +328,11 @@ class ReplayVerifier: ), ) - # Cas 4 : Pas de changement (key_combo, wait) - # Pour les raccourcis clavier et attentes, l'absence de changement - # n'est pas forcément un problème (ex: Ctrl+C ne change pas l'écran) - if action_type in ("key_combo", "wait"): + # Cas 4 : Pas de changement (key_combo, wait, verify_screen) + # `verify_screen` côté agent n'est qu'une temporisation de stabilisation. + # Il ne doit pas exiger un NOUVEAU changement visuel sinon le setup + # boucle inutilement une fois l'application déjà ouverte. 
+ if action_type in ("key_combo", "wait", "verify_screen"): return VerificationResult( verified=True, confidence=0.4, diff --git a/agent_v0/server_v1/replay_watchdog.py b/agent_v0/server_v1/replay_watchdog.py new file mode 100644 index 000000000..871029f2d --- /dev/null +++ b/agent_v0/server_v1/replay_watchdog.py @@ -0,0 +1,329 @@ +"""Replay orphan watchdog for in-flight replay actions. + +This module watches `_retry_pending` and re-pushes actions that were +dispatched by the server but never acknowledged by the Windows agent. +""" + +from __future__ import annotations + +import asyncio +import contextlib +import logging +import os +import time +from typing import Any, Callable, Dict, List, Optional, Tuple + + +logger = logging.getLogger(__name__) + + +def _env_bool(name: str, default: str) -> bool: + return os.environ.get(name, default).strip().lower() in { + "1", + "true", + "yes", + "on", + } + + +def _env_float(name: str, default: float) -> float: + try: + return float(os.environ.get(name, str(default))) + except (TypeError, ValueError): + logger.warning("Watchdog: invalid env %s, fallback=%s", name, default) + return default + + +def _env_int(name: str, default: int) -> int: + try: + return int(os.environ.get(name, str(default))) + except (TypeError, ValueError): + logger.warning("Watchdog: invalid env %s, fallback=%s", name, default) + return default + + +def _env_max_resends(default: int) -> int: + raw = os.environ.get("RPA_WATCHDOG_MAX_RESENDS") + if raw is None or not str(raw).strip(): + raw = os.environ.get("RPA_WATCHDOG_MAX_RETRIES") + try: + return int(raw) if raw is not None else default + except (TypeError, ValueError): + logger.warning("Watchdog: invalid max resend env, fallback=%s", default) + return default + + +WATCHDOG_ENABLED = _env_bool("RPA_WATCHDOG_ENABLED", "1") +WATCHDOG_SCAN_INTERVAL_S = _env_float("RPA_WATCHDOG_SCAN_INTERVAL_S", 10.0) +WATCHDOG_ORPHAN_TIMEOUT_S = _env_float("RPA_WATCHDOG_ORPHAN_TIMEOUT_S", 45.0) +WATCHDOG_MAX_RESENDS = 
_env_max_resends(2) +WATCHDOG_REPUSH_POSITION = ( + os.environ.get("RPA_WATCHDOG_REPUSH_POSITION", "head").strip().lower() +) + + +_metrics_lock = asyncio.Lock() +_metrics: Dict[str, Any] = { + "orphans_detected_total": 0, + "orphans_resent_total": 0, + "orphans_giveup_total": 0, + "scans_total": 0, + "scans_failed_total": 0, + "last_scan_ts": 0.0, + "last_scan_duration_ms": 0.0, + "current_in_flight_count": 0, + "current_orphan_count": 0, +} + + +async def _bump(key: str, delta: int = 1) -> None: + async with _metrics_lock: + _metrics[key] = _metrics.get(key, 0) + delta + + +def get_metrics_snapshot() -> Dict[str, Any]: + return dict(_metrics) + + +SseNotifier = Callable[[str, str], None] + + +class ReplayWatchdog: + """Background coroutine that re-pushes orphaned replay actions.""" + + def __init__( + self, + retry_pending: Dict[str, Dict[str, Any]], + replay_queues: Dict[str, List[Dict[str, Any]]], + async_lock_factory: Callable[[], Any], + sse_notifier: Optional[SseNotifier] = None, + ) -> None: + self._retry_pending = retry_pending + self._replay_queues = replay_queues + self._async_lock = async_lock_factory + self._sse_notifier = sse_notifier + self._task: Optional[asyncio.Task] = None + self._stopped = asyncio.Event() + + async def start(self) -> None: + if not WATCHDOG_ENABLED: + logger.info("[WATCHDOG] disabled via RPA_WATCHDOG_ENABLED=0") + return + if self._task is not None and not self._task.done(): + logger.warning("[WATCHDOG] already started") + return + self._stopped.clear() + self._task = asyncio.create_task(self._run(), name="replay_watchdog") + logger.info( + "[WATCHDOG] started scan=%.1fs orphan_timeout=%.1fs max_resends=%d repush=%s", + WATCHDOG_SCAN_INTERVAL_S, + WATCHDOG_ORPHAN_TIMEOUT_S, + WATCHDOG_MAX_RESENDS, + WATCHDOG_REPUSH_POSITION, + ) + + async def stop(self, timeout_s: float = 5.0) -> None: + if self._task is None: + return + self._stopped.set() + self._task.cancel() + try: + await asyncio.wait_for(self._task, timeout=timeout_s) + 
except asyncio.CancelledError: + pass + except asyncio.TimeoutError: + logger.warning("[WATCHDOG] stop timeout after %.1fs", timeout_s) + except Exception: + logger.exception("[WATCHDOG] unexpected stop error") + self._task = None + logger.info("[WATCHDOG] stopped") + + async def _run(self) -> None: + try: + while not self._stopped.is_set(): + try: + await asyncio.wait_for( + self._stopped.wait(), + timeout=WATCHDOG_SCAN_INTERVAL_S, + ) + break + except asyncio.TimeoutError: + pass + + try: + await self._scan_once() + except Exception: + await _bump("scans_failed_total") + logger.exception("[WATCHDOG] scan failed") + except asyncio.CancelledError: + logger.info("[WATCHDOG] cancelled") + raise + finally: + logger.info("[WATCHDOG] loop terminated") + + async def _scan_once(self) -> Dict[str, int]: + t0 = time.time() + await _bump("scans_total") + + resent = 0 + gaveup = 0 + skipped = 0 + in_flight = 0 + orphans = 0 + + orphan_targets: List[Tuple[str, Dict[str, Any]]] = [] + async with self._async_lock(): + for action_id, info in list(self._retry_pending.items()): + dispatched_at = info.get("dispatched_at", 0.0) or 0.0 + if dispatched_at <= 0: + skipped += 1 + continue + age = t0 - dispatched_at + in_flight += 1 + if age < WATCHDOG_ORPHAN_TIMEOUT_S: + continue + orphans += 1 + orphan_targets.append((action_id, dict(info))) + + for action_id, info in orphan_targets: + await _bump("orphans_detected_total") + resent_count = int(info.get("resent_count", 0) or 0) + + if resent_count >= WATCHDOG_MAX_RESENDS: + async with self._async_lock(): + self._retry_pending.pop(action_id, None) + age_total = t0 - float(info.get("first_dispatched_at", t0) or t0) + logger.error( + "[BUS] lea:dispatch_orphan_giveup action_id=%s resent=%d age_total=%.1fs " + "session=%s machine=%s replay=%s", + action_id, + resent_count, + age_total, + info.get("session_id", "?"), + info.get("machine_id", "?"), + info.get("replay_id", "?"), + ) + gaveup += 1 + await _bump("orphans_giveup_total") + continue 
+ + session_id = info.get("session_id") + machine_id = info.get("machine_id", "default") + action = info.get("dispatched_action") or info.get("action") + if not session_id or not isinstance(action, dict): + logger.warning( + "[WATCHDOG] invalid schema for %s session_id=%r action_type=%s", + action_id, + session_id, + type(action).__name__, + ) + async with self._async_lock(): + self._retry_pending.pop(action_id, None) + continue + + async with self._async_lock(): + existing = self._retry_pending.get(action_id) + if existing is None: + logger.debug( + "[WATCHDOG] %s acked between snapshot and resend; skip", + action_id, + ) + continue + queue = self._replay_queues.setdefault(session_id, []) + if WATCHDOG_REPUSH_POSITION == "tail": + queue.append(dict(action)) + else: + queue.insert(0, dict(action)) + existing["resent_count"] = resent_count + 1 + existing["last_resent_at"] = time.time() + existing["dispatched_at"] = 0.0 + + age_total = t0 - float(info.get("first_dispatched_at", t0) or t0) + logger.warning( + "[BUS] lea:dispatch_orphan_resent action_id=%s resent=%d/%d age=%.1fs " + "session=%s machine=%s replay=%s", + action_id, + resent_count + 1, + WATCHDOG_MAX_RESENDS, + age_total, + session_id, + machine_id, + info.get("replay_id", "?"), + ) + resent += 1 + await _bump("orphans_resent_total") + + if self._sse_notifier is not None: + try: + self._sse_notifier(session_id, machine_id) + except Exception as exc: + logger.debug("[WATCHDOG] sse notifier failed: %s", exc) + + elapsed_ms = (time.time() - t0) * 1000.0 + async with _metrics_lock: + _metrics["last_scan_ts"] = t0 + _metrics["last_scan_duration_ms"] = elapsed_ms + _metrics["current_in_flight_count"] = in_flight + _metrics["current_orphan_count"] = orphans + scans_total = _metrics["scans_total"] + + if orphans or gaveup: + logger.info( + "[METRIC] watchdog scan=%d orphans=%d resent=%d gaveup=%d " + "in_flight=%d skipped=%d elapsed_ms=%.1f", + scans_total, + orphans, + resent, + gaveup, + in_flight, + skipped, + 
elapsed_ms, + ) + + return { + "orphans": orphans, + "resent": resent, + "gaveup": gaveup, + "skipped": skipped, + "in_flight": in_flight, + } + + +_singleton: Optional[ReplayWatchdog] = None + + +def get_or_create_watchdog( + retry_pending: Dict[str, Dict[str, Any]], + replay_queues: Dict[str, List[Dict[str, Any]]], + async_lock_factory: Callable[[], Any], + sse_notifier: Optional[SseNotifier] = None, +) -> ReplayWatchdog: + global _singleton + if _singleton is None: + _singleton = ReplayWatchdog( + retry_pending=retry_pending, + replay_queues=replay_queues, + async_lock_factory=async_lock_factory, + sse_notifier=sse_notifier, + ) + return _singleton + + +@contextlib.asynccontextmanager +async def watchdog_lifespan( + retry_pending: Dict[str, Dict[str, Any]], + replay_queues: Dict[str, List[Dict[str, Any]]], + async_lock_factory: Callable[[], Any], + sse_notifier: Optional[SseNotifier] = None, +): + watchdog = get_or_create_watchdog( + retry_pending=retry_pending, + replay_queues=replay_queues, + async_lock_factory=async_lock_factory, + sse_notifier=sse_notifier, + ) + await watchdog.start() + try: + yield watchdog + finally: + await watchdog.stop() diff --git a/agent_v0/server_v1/resolve_engine.py b/agent_v0/server_v1/resolve_engine.py index 125aad41e..c38d54fe4 100644 --- a/agent_v0/server_v1/resolve_engine.py +++ b/agent_v0/server_v1/resolve_engine.py @@ -243,6 +243,168 @@ def _validate_match_context( return True +def _has_meaningful_recorded_coords( + fallback_x_pct: float, + fallback_y_pct: float, +) -> bool: + """Indiquer si les coordonnées fallback représentent une vraie position source.""" + return ( + fallback_x_pct > 0.001 + and fallback_y_pct > 0.001 + and not ( + abs(fallback_x_pct - 0.5) < 0.001 + and abs(fallback_y_pct - 0.5) < 0.001 + ) + ) + + +def _is_close_tab_target(target_spec: Optional[Dict[str, Any]]) -> bool: + """Détecter une action close_tab issue du compilateur replay.""" + if not isinstance(target_spec, dict): + return False + 
context_hints = target_spec.get("context_hints") or {} + return str((context_hints.get("interaction") or "")).strip().lower() == "close_tab" + + +def _get_expected_close_tab_coords( + target_spec: Optional[Dict[str, Any]], + screen_width: int, + screen_height: int, + fallback_x_pct: float = 0.0, + fallback_y_pct: float = 0.0, +) -> Optional[tuple[float, float]]: + """Retrouver la position attendue la plus fiable pour un close_tab. + + Ordre de préférence : + 1. Coordonnées fallback explicites de l'action replay + 2. centre SoM calibré à l'enregistrement + 3. click_relative + rect fenêtre source + """ + if _has_meaningful_recorded_coords(fallback_x_pct, fallback_y_pct): + return float(fallback_x_pct), float(fallback_y_pct) + + if not isinstance(target_spec, dict): + return None + + som_center = (target_spec.get("som_element") or {}).get("center_norm") + if isinstance(som_center, (list, tuple)) and len(som_center) >= 2: + try: + exp_x = float(som_center[0]) + exp_y = float(som_center[1]) + if 0.0 <= exp_x <= 1.0 and 0.0 <= exp_y <= 1.0: + return exp_x, exp_y + except (TypeError, ValueError): + pass + + window_capture = target_spec.get("window_capture") or {} + rect = window_capture.get("rect") + click_relative = window_capture.get("click_relative") + if ( + isinstance(rect, (list, tuple)) + and len(rect) >= 4 + and isinstance(click_relative, (list, tuple)) + and len(click_relative) >= 2 + and screen_width > 0 + and screen_height > 0 + ): + try: + abs_x = float(rect[0]) + float(click_relative[0]) + abs_y = float(rect[1]) + float(click_relative[1]) + exp_x = abs_x / float(screen_width) + exp_y = abs_y / float(screen_height) + if 0.0 <= exp_x <= 1.0 and 0.0 <= exp_y <= 1.0: + return exp_x, exp_y + except (TypeError, ValueError, ZeroDivisionError): + pass + + return None + + +def _is_close_tab_result_plausible( + resolved_x: float, + resolved_y: float, + target_spec: Optional[Dict[str, Any]], + screen_width: int, + screen_height: int, + fallback_x_pct: float = 0.0, + 
fallback_y_pct: float = 0.0, +) -> bool: + """Filtrer les faux positifs close_tab qui dérivent vers le bouton fermer.""" + if not _is_close_tab_target(target_spec): + return True + + expected = _get_expected_close_tab_coords( + target_spec, + screen_width, + screen_height, + fallback_x_pct=fallback_x_pct, + fallback_y_pct=fallback_y_pct, + ) + if expected is None: + return True + + exp_x, exp_y = expected + dx = abs(float(resolved_x) - exp_x) + dy = abs(float(resolved_y) - exp_y) + distance = (dx ** 2 + dy ** 2) ** 0.5 + is_plausible = dx <= 0.18 and distance <= 0.20 + if not is_plausible: + logger.warning( + "close_tab guard : résultat rejeté car trop éloigné de la zone " + "source (resolved=(%.4f, %.4f), expected=(%.4f, %.4f), " + "drift=(%.4f, %.4f), dist=%.4f)", + float(resolved_x), + float(resolved_y), + exp_x, + exp_y, + dx, + dy, + distance, + ) + return is_plausible + + +def _is_start_button_vlm_result_plausible( + result: Dict[str, Any], + fallback_x_pct: float, + fallback_y_pct: float, + target_spec: Dict[str, Any], + max_distance: float = 0.20, +) -> bool: + """Filtrer les faux positifs VLM sur le bouton Démarrer. + + Le bouton Démarrer est un singleton système. Quand on dispose d'un vrai clic + enregistré (`fallback_*`), une localisation VLM très éloignée de cette zone + est plus probablement un faux positif qu'un vrai déplacement UI. 
+ """ + by_role = str(target_spec.get("by_role", "") or "").strip().lower() + if by_role != "start_button": + return True + + if not _has_meaningful_recorded_coords(fallback_x_pct, fallback_y_pct): + return True + + if _validate_match_context( + result, + fallback_x_pct, + fallback_y_pct, + target_spec, + max_distance=max_distance, + ): + return True + + logger.warning( + "Start button guard : résultat VLM rejeté car trop éloigné de la " + "position enregistrée (resolved=(%.4f, %.4f), expected=(%.4f, %.4f), max=%.2f)", + float(result.get("x_pct", 0) or 0), + float(result.get("y_pct", 0) or 0), + fallback_x_pct, + fallback_y_pct, + max_distance, + ) + return False + + # ========================================================================= # YOLO/OmniParser — Résolution par détection d'éléments UI # ========================================================================= @@ -1109,16 +1271,66 @@ def _resolve_by_som( # Centre du match match_cx = max_loc[0] + anc_w // 2 match_cy = max_loc[1] + anc_h // 2 + interaction = str( + (target_spec.get("context_hints") or {}).get("interaction", "") or "" + ).strip().lower() + + if interaction == "close_tab": + elapsed = time.time() - t0 + cx_norm = match_cx / screen_width if screen_width > 0 else 0.0 + cy_norm = match_cy / screen_height if screen_height > 0 else 0.0 + if _is_close_tab_result_plausible( + cx_norm, + cy_norm, + target_spec, + screen_width, + screen_height, + ): + logger.info( + "SoM resolve ANCHOR exact close_tab : score=%.3f " + "centre=(%d, %d) → (%.4f, %.4f) en %.1fs", + max_score, match_cx, match_cy, cx_norm, cy_norm, elapsed, + ) + return { + "resolved": True, + "method": "som_anchor_match", + "x_pct": round(cx_norm, 6), + "y_pct": round(cy_norm, 6), + "matched_element": { + "label": "close_tab_button", + "type": "visual_anchor", + "role": "som_anchor_exact", + "confidence": max_score, + }, + "score": max_score, + "match_box": { + "x": int(max_loc[0]), + "y": int(max_loc[1]), + "width": int(anc_w), + 
"height": int(anc_h), + }, + } + logger.warning( + "SoM resolve ANCHOR exact close_tab rejeté : score=%.3f " + "centre=(%d, %d) → (%.4f, %.4f), passage VLM/fallback", + max_score, match_cx, match_cy, cx_norm, cy_norm, + ) + # Ne pas recycler ce faux match vers l'élément SoM le plus + # proche : pour close_tab, cela retombe facilement sur le + # bouton de fermeture de la fenêtre. + best_elem = None + else: + best_elem = None # Trouver l'élément SomEngine le plus proche du centre du match - best_elem = None best_dist = float("inf") - for elem in som_result.elements: - cx, cy = elem.center - dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5 - if dist < best_dist: - best_dist = dist - best_elem = elem + if best_elem is None and interaction != "close_tab": + for elem in som_result.elements: + cx, cy = elem.center + dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5 + if dist < best_dist: + best_dist = dist + best_elem = elem if best_elem and best_dist < 100: # Max 100px de distance elapsed = time.time() - t0 @@ -1584,6 +1796,49 @@ def _resolve_target_sync( "fallback cascade legacy" ) + # =================================================================== + # Cas spécial : boutons de dialogue runtime ("Oui", "Non", "OK", ...) + # =================================================================== + # Ces boutons sont textuels, sans ancre stable, et apparaissent souvent + # au milieu d'une action déjà en cours. Si on les laisse partir dans la + # cascade générique (VLM -> SoM -> ScreenAnalyzer), on peut bloquer + # l'action principale assez longtemps pour déclencher le watchdog. + # Contrat voulu : OCR direct rapide, sinon abandon immédiat pour que le + # client essaie son fallback local par template texte. 
+ dialog_role = str(target_spec.get("by_role", "") or "").strip().lower() + dialog_text = str(target_spec.get("by_text", "") or "").strip() + if dialog_role == "dialog_button" and dialog_text and not anchor_image_b64: + ocr_result = _resolve_by_ocr_text( + screenshot_path=screenshot_path, + target_text=dialog_text, + screen_width=screen_width, + screen_height=screen_height, + ) + if ocr_result and ocr_result.get("score", 0) >= 0.80: + ocr_result["method"] = "hybrid_text_direct" + logger.info( + "Resolve dialog_button OCR-DIRECT : OK '%s' → (%.4f, %.4f) score=%.2f", + dialog_text[:40], + ocr_result.get("x_pct", 0), + ocr_result.get("y_pct", 0), + ocr_result.get("score", 0), + ) + return ocr_result + + logger.info( + "Resolve dialog_button OCR-only : '%s' non trouvé " + "(fenêtre='%s') — skip VLM/SoM/ScreenAnalyzer", + dialog_text[:40], + str(target_spec.get("window_title", "") or "")[:80], + ) + return { + "resolved": False, + "method": "dialog_button_ocr_only", + "reason": "ocr_direct_failed_dialog_button_no_vlm", + "x_pct": fallback_x_pct, + "y_pct": fallback_y_pct, + } + # =================================================================== # MODE STRICT (replay sessions) — Stratégie VLM-FIRST # =================================================================== @@ -1656,13 +1911,25 @@ def _resolve_target_sync( screen_height=screen_height, ) if grounding_result and grounding_result.get("resolved"): - logger.info( - "Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'", - grounding_result.get("x_pct", 0), - grounding_result.get("y_pct", 0), - by_text_strict[:50], + if _is_close_tab_result_plausible( + float(grounding_result.get("x_pct", 0) or 0), + float(grounding_result.get("y_pct", 0) or 0), + target_spec, + screen_width, + screen_height, + fallback_x_pct=fallback_x_pct, + fallback_y_pct=fallback_y_pct, + ): + logger.info( + "Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'", + grounding_result.get("x_pct", 0), + grounding_result.get("y_pct", 0), + 
by_text_strict[:50], + ) + return grounding_result + logger.warning( + "Strict resolve GROUNDING : résultat close_tab rejeté, passage template/VLM" ) - return grounding_result if not by_text_strict or by_text_source not in ("ocr", "vlm"): # Template matching pour les éléments sans texte (icônes pures) @@ -1690,11 +1957,23 @@ def _resolve_target_sync( abs_y = window_rect[1] + y_tm * tm_screen_h result["x_pct"] = round(abs_x / screen_width, 6) result["y_pct"] = round(abs_y / screen_height, 6) - logger.info( - "Strict resolve TEMPLATE : icon match (score=%.3f)", - result.get("score", 0), + if _is_close_tab_result_plausible( + float(result.get("x_pct", 0) or 0), + float(result.get("y_pct", 0) or 0), + target_spec, + screen_width, + screen_height, + fallback_x_pct=fallback_x_pct, + fallback_y_pct=fallback_y_pct, + ): + logger.info( + "Strict resolve TEMPLATE : icon match (score=%.3f)", + result.get("score", 0), + ) + return result + logger.warning( + "Strict resolve TEMPLATE : résultat close_tab rejeté, passage cascade suivante" ) - return result # --------------------------------------------------------------- # Étape 0.5 : OCR direct (hybrid_text_direct) — chemin rapide @@ -1739,6 +2018,27 @@ def _resolve_target_sync( by_text_strict[:40], ) + # Les boutons de dialogues runtime connus ("Oui", "Non", "OK", etc.) + # ne doivent pas partir dans la cascade lente VLM -> SoM. Si l'OCR + # direct ne les trouve pas immédiatement, on rend la main au client + # pour son fallback local par template texte, sinon on bloque l'action + # principale assez longtemps pour déclencher le watchdog. 
+ dialog_role = str(target_spec.get("by_role", "") or "").strip().lower() + if dialog_role == "dialog_button" and by_text_strict and not anchor_image_b64: + logger.info( + "Strict resolve dialog_button : OCR-direct only pour '%s' " + "(fenêtre='%s') — skip VLM/SoM/template", + by_text_strict[:40], + str(target_spec.get("window_title", "") or "")[:80], + ) + return { + "resolved": False, + "method": "dialog_button_ocr_only", + "reason": "ocr_direct_failed_dialog_button_no_vlm", + "x_pct": fallback_x_pct, + "y_pct": fallback_y_pct, + } + # --------------------------------------------------------------- # Étape 1 : VLM Quick Find (fallback, multi-image) # --------------------------------------------------------------- @@ -1750,12 +2050,29 @@ def _resolve_target_sync( ) if vlm_result and vlm_result.get("resolved"): if vlm_result.get("score", 0) >= 0.3: - logger.info( - "Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'", - vlm_result.get("score", 0), - vlm_description[:60] if vlm_description else "(anchor)", + if _is_start_button_vlm_result_plausible( + vlm_result, + fallback_x_pct, + fallback_y_pct, + target_spec, + ) and _is_close_tab_result_plausible( + float(vlm_result.get("x_pct", 0) or 0), + float(vlm_result.get("y_pct", 0) or 0), + target_spec, + screen_width, + screen_height, + fallback_x_pct=fallback_x_pct, + fallback_y_pct=fallback_y_pct, + ): + logger.info( + "Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'", + vlm_result.get("score", 0), + vlm_description[:60] if vlm_description else "(anchor)", + ) + return vlm_result + logger.warning( + "Strict resolve VLM-first : résultat VLM rejeté par un garde-fou, passage SoM/template" ) - return vlm_result else: logger.info( "Strict resolve VLM-first : VLM score=%.2f trop bas, passage template", @@ -1782,12 +2099,24 @@ def _resolve_target_sync( screen_height=screen_height, ) if som_result and som_result.get("resolved"): - logger.info( - "Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)", - 
som_result.get("score", 0), - som_result.get("matched_element", {}).get("som_id", "?"), + if _is_close_tab_result_plausible( + float(som_result.get("x_pct", 0) or 0), + float(som_result.get("y_pct", 0) or 0), + target_spec, + screen_width, + screen_height, + fallback_x_pct=fallback_x_pct, + fallback_y_pct=fallback_y_pct, + ): + logger.info( + "Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)", + som_result.get("score", 0), + som_result.get("matched_element", {}).get("som_id", "?"), + ) + return som_result + logger.warning( + "Strict resolve SoM+VLM : résultat close_tab rejeté, passage template matching" ) - return som_result else: logger.info("Strict resolve SoM+VLM : échoué, passage template matching") @@ -1805,12 +2134,24 @@ def _resolve_target_sync( score = result.get("score", 0) # Score >= 0.95 : match quasi-parfait, pas besoin de valider le contexte if score >= 0.95: - logger.info( - "Strict resolve VLM-first : template matching fallback OK " - "(score=%.3f >= 0.95, contexte skip — match quasi-parfait)", - score, + if _is_close_tab_result_plausible( + float(result.get("x_pct", 0) or 0), + float(result.get("y_pct", 0) or 0), + target_spec, + screen_width, + screen_height, + fallback_x_pct=fallback_x_pct, + fallback_y_pct=fallback_y_pct, + ): + logger.info( + "Strict resolve VLM-first : template matching fallback OK " + "(score=%.3f >= 0.95, contexte skip — match quasi-parfait)", + score, + ) + return result + logger.warning( + "Strict resolve TEMPLATE : match close_tab très fort mais hors zone source, rejeté" ) - return result elif _validate_match_context(result, fallback_x_pct, fallback_y_pct, target_spec): logger.info( "Strict resolve VLM-first : template matching fallback OK " @@ -2189,6 +2530,37 @@ def _text_match_fuzzy(expected: str, observed: str, min_token_ratio: float = 0.6 return matched / len(tokens) >= min_token_ratio +_SOM_BBOX_OCR_PADDING_PX: int = 8 +_SOM_BBOX_MIN_DIM_PX: int = 12 + + +def _should_reject_on_text_mismatch( + is_valid: bool, + 
observed: Optional[str], +) -> bool: + """Décide si le pré-check OCR doit rejeter la résolution. + + Patch 2026-05-23 : on distingue deux cas d'échec du fuzzy match : + + - ``observed`` contient du texte (ex: ``'9 ?'``, ``'OBS Studio…'``) + → mismatch confirmé, la cascade a probablement cliqué ailleurs + → on rejette. + - ``observed`` est vide ou whitespace + → l'OCR n'a rien lu (zone trop petite, texte peu contrasté, + modèle EasyOCR sous le seuil de détection). C'est ambigu : + ce n'est PAS la preuve d'un faux positif, on accepte la + résolution serveur. La garde drift ANCHOR-TM côté agent + protège en aval contre les vrais faux positifs. + + Si ``is_valid=True`` → jamais de rejet (cas nominal). + """ + if is_valid: + return False + if observed is None: + return False + return bool(str(observed).strip()) + + def _validate_text_at_position( screenshot_path: str, x_pct: float, @@ -2197,9 +2569,20 @@ def _validate_text_at_position( screen_width: int, screen_height: int, radius_px: int = 280, + som_bbox_norm: Optional[List[float]] = None, ) -> tuple: - """Pré-check sémantique : OCR sur une zone autour de (x_pct, y_pct) et - vérifie que `expected_text` y est présent (substring ou fuzzy 50%). + """Pré-check sémantique : OCR sur une zone et vérifie que + `expected_text` y est présent (substring ou fuzzy 50%). + + Zone OCR (par priorité) : + 1. Si ``som_bbox_norm = [x1, y1, x2, y2]`` (normalisé 0..1) est + fourni et a une largeur/hauteur > _SOM_BBOX_MIN_DIM_PX en + pixels écran : OCR sur cette bbox élargie d'un padding court. + Plus précis pour les éléments étroits (onglets Notepad + moderne, ~30-40px haut) que le radius générique qui capture + le texte voisin (status bar, etc.). + 2. Sinon : fallback historique → carré de ``radius_px`` autour + de (x_pct, y_pct). Retourne (is_valid: bool, observed_text: str, elapsed_ms: float). 
@@ -2219,16 +2602,52 @@ def _validate_text_at_position( t0 = time.time() img = Image.open(screenshot_path).convert("RGB") img_w, img_h = img.size - cx = int(x_pct * screen_width) - cy = int(y_pct * screen_height) - # Saturer dans les bornes de l'image (le screenshot peut être plus - # large que la fenêtre logique — utiliser min(img_*, screen_*) en sécurité). max_x = min(img_w, screen_width) max_y = min(img_h, screen_height) - x1 = max(0, cx - radius_px) - y1 = max(0, cy - radius_px) - x2 = min(max_x, cx + radius_px) - y2 = min(max_y, cy + radius_px) + + # --- Tentative 1 : zone OCR depuis la bbox SoM (préférée) --- + x1 = y1 = x2 = y2 = None + if ( + isinstance(som_bbox_norm, (list, tuple)) + and len(som_bbox_norm) == 4 + ): + try: + bx1, by1, bx2, by2 = (float(v) for v in som_bbox_norm) + # Tolérer ordre inversé. + bx1, bx2 = sorted((bx1, bx2)) + by1, by2 = sorted((by1, by2)) + # Refuser les bboxes dégénérées AVANT padding : si + # l'élément cible fait < _SOM_BBOX_MIN_DIM_PX en + # natif, c'est probablement une bbox d'apparence + # (curseur, séparateur 1px) — pas un label OCRable. + raw_w = (bx2 - bx1) * screen_width + raw_h = (by2 - by1) * screen_height + if ( + raw_w >= _SOM_BBOX_MIN_DIM_PX + and raw_h >= _SOM_BBOX_MIN_DIM_PX + ): + # Conversion en pixels écran + clipping et padding. + px1 = int(bx1 * screen_width) - _SOM_BBOX_OCR_PADDING_PX + py1 = int(by1 * screen_height) - _SOM_BBOX_OCR_PADDING_PX + px2 = int(bx2 * screen_width) + _SOM_BBOX_OCR_PADDING_PX + py2 = int(by2 * screen_height) + _SOM_BBOX_OCR_PADDING_PX + x1 = max(0, px1) + y1 = max(0, py1) + x2 = min(max_x, px2) + y2 = min(max_y, py2) + except (TypeError, ValueError): + # Bbox malformée : fallback silencieux sur le radius. 
+ x1 = y1 = x2 = y2 = None + + # --- Fallback : carré radius_px autour de (x_pct, y_pct) --- + if x1 is None: + cx = int(x_pct * screen_width) + cy = int(y_pct * screen_height) + x1 = max(0, cx - radius_px) + y1 = max(0, cy - radius_px) + x2 = min(max_x, cx + radius_px) + y2 = min(max_y, cy + radius_px) + if x2 - x1 < 10 or y2 - y1 < 10: return True, "", 0.0 crop = img.crop((x1, y1, x2, y2)) @@ -2246,6 +2665,7 @@ def _validate_resolution_quality( result: Optional[Dict[str, Any]], fallback_x_pct: float, fallback_y_pct: float, + target_spec: Optional[Dict[str, Any]] = None, ) -> Optional[Dict[str, Any]]: """Valide un résultat de résolution et le rejette s'il est peu fiable. @@ -2263,6 +2683,16 @@ def _validate_resolution_quality( elle n'est PAS appelée par les méthodes internes de la cascade, mais uniquement depuis le handler HTTP `/resolve_target` après que la cascade a produit son meilleur candidat. + + Argument optionnel `target_spec` : permet d'appliquer des relaxations + contextuelles. Cas couvert (2026-05-22) : pour une cible + `context_hints.interaction == "switch_tab"` qui dispose d'un + `som_element.bbox_norm`, on abaisse le seuil des méthodes ``som_*`` + de 0.75 → 0.60. Justification : (1) le focus_change pré-clic + prouve qu'on est dans la bonne fenêtre, (2) la bbox SoM a été + calibrée à l'enregistrement et reste valide, (3) les onglets + Notepad moderne sont visuellement quasi-identiques → score VLM + inévitablement lower. """ if not result or not isinstance(result, dict): return result @@ -2291,6 +2721,52 @@ def _validate_resolution_quality( min_score = threshold break + # Relaxation contextuelle pour switch_tab + SoM calibré (2026-05-22). + # Les onglets Notepad moderne (et apps similaires) sont visuellement + # quasi-identiques : le grounding VLM/SoM produit fréquemment un + # score 0.65-0.75, juste sous le seuil strict. 
Comme le contexte + # `interaction=switch_tab` + bbox SoM enregistrée + focus_change + # pré-clic confirment déjà la fenêtre et la zone, on relâche le + # seuil des méthodes som_* à 0.60 dans CE cas précis uniquement. + if ( + min_score is not None + and target_spec + and method.startswith("som_") + ): + context_hints = target_spec.get("context_hints") or {} + is_tab_switch = ( + context_hints.get("interaction") == "switch_tab" + and target_spec.get("by_role") == "tab" + ) + som_element = target_spec.get("som_element") or {} + has_calibrated_som = bool(som_element.get("bbox_norm")) + if is_tab_switch and has_calibrated_som: + relaxed = 0.60 + if relaxed < min_score: + logger.info( + "[REPLAY] switch_tab + som_element calibré → seuil " + "som_* relâché %.2f → %.2f (cible='%s')", + min_score, relaxed, + target_spec.get("by_text", ""), + ) + min_score = relaxed + + is_close_tab = ( + method == "som_anchor_match" + and str((context_hints.get("interaction") or "")).strip().lower() == "close_tab" + and not str(target_spec.get("by_text", "") or "").strip() + and bool(target_spec.get("anchor_image_base64")) + ) + if is_close_tab: + relaxed = 0.70 + if relaxed < min_score: + logger.info( + "[REPLAY] close_tab + anchor-only → seuil som_anchor_match " + "relâché %.2f → %.2f", + min_score, relaxed, + ) + min_score = relaxed + if min_score is not None and score < min_score: logger.warning( "[REPLAY] Resolution REJETÉE (score trop bas) : method=%s score=%.3f < %.2f", @@ -2306,13 +2782,40 @@ def _validate_resolution_quality( "y_pct": fallback_y_pct, } + if _is_close_tab_target(target_spec) and not _is_close_tab_result_plausible( + resolved_x, + resolved_y, + target_spec, + 0, + 0, + fallback_x_pct=fallback_x_pct, + fallback_y_pct=fallback_y_pct, + ): + logger.warning( + "[REPLAY] Resolution REJETÉE (close_tab hors zone source) : " + "method=%s resolved=(%.3f, %.3f) expected=(%.3f, %.3f)", + method, + resolved_x, + resolved_y, + fallback_x_pct, + fallback_y_pct, + ) + return { + 
"resolved": False, + "method": f"rejected_close_tab_zone_{method}", + "reason": "close_tab_out_of_recorded_zone", + "original_method": method, + "original_score": score, + "x_pct": fallback_x_pct, + "y_pct": fallback_y_pct, + } + # --- Check 2 : garde de proximité --- # On n'applique la garde que si les coordonnées enregistrées ont un # sens (pas des placeholders 0.5/0.5 des plans V4 ni des 0.0/0.0). - _has_recorded_coords = ( - fallback_x_pct > 0.001 - and fallback_y_pct > 0.001 - and not (abs(fallback_x_pct - 0.5) < 0.001 and abs(fallback_y_pct - 0.5) < 0.001) + _has_recorded_coords = _has_meaningful_recorded_coords( + fallback_x_pct, + fallback_y_pct, ) if _has_recorded_coords: dx = abs(resolved_x - fallback_x_pct) diff --git a/agent_v0/server_v1/stream_processor.py b/agent_v0/server_v1/stream_processor.py index 05e481492..0e94db832 100644 --- a/agent_v0/server_v1/stream_processor.py +++ b/agent_v0/server_v1/stream_processor.py @@ -1025,6 +1025,345 @@ def enrich_click_from_screenshot( return result +def _title_to_tab_label(window_title: str) -> str: + """Réduire un titre de fenêtre en libellé d'onglet probable. + + Exemples: + - "Sans titre – Bloc-notes" -> "Sans titre" + - "*test – Bloc-notes" -> "test" + """ + title = str(window_title or "").strip() + if not title: + return "" + + for sep in (" – ", " - "): + if sep in title: + head = title.split(sep, 1)[0].strip() + if head: + title = head + break + + return title.lstrip("*").strip() + + +def _split_window_title_head_suffix(window_title: str) -> tuple[str, str]: + """Découper un titre de fenêtre en ``(head, suffix)`` si possible. 
+ + Exemples: + - ``Sans titre – Bloc-notes`` -> (``Sans titre``, ``Bloc-notes``) + - ``Page 1 - Google Chrome`` -> (``Page 1``, ``Google Chrome``) + - ``Enregistrer sous`` -> ("", "") + """ + title = str(window_title or "").strip() + if not title: + return "", "" + + for sep in (" – ", " - "): + if sep in title: + head, suffix = title.split(sep, 1) + head = head.strip() + suffix = suffix.strip() + if head and suffix: + return head, suffix + return "", "" + + +def _looks_like_same_app_tab_switch(from_title: str, to_title: str) -> bool: + """Vrai si la transition de focus ressemble à un vrai changement d'onglet. + + On exige que les deux titres partagent un suffixe applicatif stable + (ex: ``Bloc-notes``, ``Google Chrome``). Cela exclut les dialogs + modaux same-app comme ``Enregistrer sous`` qui ne sont pas des + onglets et ne doivent pas être compilés en ``switch_tab``. + """ + from_head, from_suffix = _split_window_title_head_suffix(from_title) + to_head, to_suffix = _split_window_title_head_suffix(to_title) + if not (from_head and from_suffix and to_head and to_suffix): + return False + return from_suffix.casefold() == to_suffix.casefold() + + +def _infer_tab_switch_target( + raw_events: list, + click_event: Dict[str, Any], +) -> Optional[Dict[str, Any]]: + """Détecter un clic d'onglet à partir d'une bascule de focus dans la même app. + + Cas réel observé: + - fenêtre active `http...txt – Bloc-notes` + - clic dans la barre d'onglets (y relatif ~40 px) + - focus immédiat vers `Sans titre – Bloc-notes` + + Dans ce cas, l'ancre image seule est trop fragile. On enrichit donc le + target_spec avec un libellé d'onglet explicite (`by_text='Sans titre'`, + `by_role='tab'`). 
+ """ + event_type = click_event.get("type", "") + if event_type != "mouse_click": + return None + + window = click_event.get("window", {}) + if not isinstance(window, dict): + return None + + from_title = str(window.get("title", "")).strip() + app_name = str(window.get("app_name", "")).strip().lower() + if not from_title or not app_name: + return None + + # Heuristique: on ne traite que les clics très hauts dans la fenêtre, + # typiques d'une barre d'onglets / bouton de fermeture d'onglet. + window_capture = click_event.get("window_capture", {}) + if not isinstance(window_capture, dict): + return None + click_relative = window_capture.get("click_relative") + if not (isinstance(click_relative, list) and len(click_relative) == 2): + return None + try: + rel_y = int(click_relative[1]) + except (TypeError, ValueError): + return None + if rel_y > 90: + return None + + click_ts = click_event.get("timestamp") + click_pos = click_event.get("pos") or [] + + match_idx = None + for idx, raw_evt in enumerate(raw_events): + event_data = raw_evt.get("event", raw_evt) + if event_data.get("type") != "mouse_click": + continue + if event_data.get("timestamp") != click_ts: + continue + if (event_data.get("pos") or []) != click_pos: + continue + match_idx = idx + break + + if match_idx is None: + return None + + for follow_evt in raw_events[match_idx + 1: match_idx + 7]: + follow_data = follow_evt.get("event", follow_evt) + follow_type = follow_data.get("type", "") + if follow_type in {"mouse_click", "text_input", "key_press", "key_combo"}: + # Un autre geste utilisateur est intervenu avant le focus_change : + # le focus observé n'est plus attribuable avec confiance à CE clic. 
+ return None + if follow_type != "window_focus_change": + continue + + to_info = follow_data.get("to", {}) + if not isinstance(to_info, dict): + continue + if str(to_info.get("app_name", "")).strip().lower() != app_name: + continue + + to_title = str(to_info.get("title", "")).strip() + if not to_title or to_title == from_title: + continue + if not _looks_like_same_app_tab_switch(from_title, to_title): + return None + + follow_ts = follow_data.get("timestamp") + if ( + isinstance(click_ts, (int, float)) + and isinstance(follow_ts, (int, float)) + and follow_ts - click_ts > 3.0 + ): + break + + tab_label = _title_to_tab_label(to_title) + if not tab_label: + return None + + return { + "by_text": tab_label, + "by_role": "tab", + "window_title": from_title, + "context_hints": { + "window_title": from_title, + "switch_to_window_title": to_title, + "interaction": "switch_tab", + }, + "vlm_description": ( + f"Dans la fenêtre '{from_title}', l'onglet '{tab_label}' " + "dans la barre d'onglets en haut" + ), + } + + return None + + +def _infer_close_tab_target( + raw_events: list, + click_event: Dict[str, Any], +) -> Optional[Dict[str, Any]]: + """Détecter un clic sur le bouton fermer de l'onglet actif. + + Pattern ciblé observé sur Bloc-notes moderne : + - clic très haut dans la barre d'onglets sur un titre ``*... – Bloc-notes`` + - un clic suivant dans la même fenêtre + - puis focus vers ``Enregistrer sous`` + + Cela correspond à la fermeture d'un onglet modifié qui déclenche ensuite + le flow de sauvegarde. On enrichit le clic avec un hint sémantique pour + viser le vrai bouton ``x`` de l'onglet actif plutôt qu'un simple `yolo`. 
+ """ + event_type = click_event.get("type", "") + if event_type != "mouse_click": + return None + + window = click_event.get("window", {}) + if not isinstance(window, dict): + return None + + from_title = str(window.get("title", "")).strip() + app_name = str(window.get("app_name", "")).strip().lower() + if not from_title or not app_name or not from_title.startswith("*"): + return None + + window_capture = click_event.get("window_capture", {}) + if not isinstance(window_capture, dict): + return None + click_relative = window_capture.get("click_relative") + if not (isinstance(click_relative, list) and len(click_relative) == 2): + return None + try: + rel_y = int(click_relative[1]) + except (TypeError, ValueError): + return None + if rel_y > 90: + return None + + click_ts = click_event.get("timestamp") + click_pos = click_event.get("pos") or [] + match_idx = None + for idx, raw_evt in enumerate(raw_events): + event_data = raw_evt.get("event", raw_evt) + if event_data.get("type") != "mouse_click": + continue + if event_data.get("timestamp") != click_ts: + continue + if (event_data.get("pos") or []) != click_pos: + continue + match_idx = idx + break + + if match_idx is None: + return None + + saw_follow_click_same_window = False + for follow_evt in raw_events[match_idx + 1: match_idx + 8]: + follow_data = follow_evt.get("event", follow_evt) + follow_type = follow_data.get("type", "") + + if follow_type in {"text_input", "key_press", "key_combo"}: + return None + + if follow_type == "mouse_click": + follow_window = follow_data.get("window", {}) + if not isinstance(follow_window, dict): + return None + follow_app = str(follow_window.get("app_name", "")).strip().lower() + follow_title = str(follow_window.get("title", "")).strip() + if follow_app != app_name: + return None + if follow_title == from_title: + saw_follow_click_same_window = True + continue + return None + + if follow_type != "window_focus_change" or not saw_follow_click_same_window: + continue + + to_info = 
follow_data.get("to", {}) + if not isinstance(to_info, dict): + continue + if str(to_info.get("app_name", "")).strip().lower() != app_name: + continue + to_title = str(to_info.get("title", "")).strip() + if to_title != "Enregistrer sous": + continue + + follow_ts = follow_data.get("timestamp") + if ( + isinstance(click_ts, (int, float)) + and isinstance(follow_ts, (int, float)) + and follow_ts - click_ts > 5.0 + ): + break + + tab_label = _title_to_tab_label(from_title) + if not tab_label: + return None + + return { + "by_text": "", + "by_role": "tab_close_button", + "window_title": from_title, + "context_hints": { + "window_title": from_title, + "active_tab_label": tab_label, + "interaction": "close_tab", + }, + "vlm_description": ( + f"Dans la fenêtre '{from_title}', le bouton x pour fermer " + f"l'onglet actif '{tab_label}' dans la barre d'onglets en haut" + ), + } + + return None + + +def _attach_expected_window_before(actions: list, raw_events: list) -> None: + """Attacher la fenêtre attendue AVANT chaque clic en rejouant les + raw events et en conservant le dernier ``window_focus_change.to.title``. + + Pourquoi : ``mouse_click.window.title`` capturé pendant + l'enregistrement peut être obsolète si une transition de fenêtre + se produit juste avant la capture (ex: dialog Windows qui s'ouvre + milliseconde avant le clic suivant). Le serveur dispose pourtant + des ``window_focus_change`` consécutifs — on s'en sert pour poser + explicitement ``expected_window_before`` sur le clic, lu en priorité + absolue par la pré-vérif côté agent. + + Idempotent : si une action a déjà ``expected_window_before``, on + ne touche pas. 
+ """ + if not actions or not raw_events: + return + + last_focus_title = "" + action_idx = 0 + + def _next_click_idx(start: int) -> int: + i = start + while i < len(actions) and actions[i].get("type") != "click": + i += 1 + return i + + for raw_evt in raw_events: + ev = raw_evt.get("event", raw_evt) if isinstance(raw_evt, dict) else {} + etype = ev.get("type", "") + if etype == "window_focus_change": + to_info = ev.get("to") or {} + title = str(to_info.get("title", "") or "").strip() + if title and title != "unknown_window": + last_focus_title = title + continue + if etype != "mouse_click": + continue + action_idx = _next_click_idx(action_idx) + if action_idx >= len(actions): + return + a = actions[action_idx] + if last_focus_title and not a.get("expected_window_before"): + a["expected_window_before"] = last_focus_title + action_idx += 1 + + def _attach_expected_screenshots( actions: list, raw_events: list, session_dir: Path, ) -> None: @@ -1591,6 +1930,8 @@ def build_replay_from_raw_events( k: v for k, v in enrichment.items() if k != "by_position" # by_position est déjà dans x_pct/y_pct } + if action.get("window_title") and not action["target_spec"].get("window_title"): + action["target_spec"]["window_title"] = action["window_title"] # Ajouter les métadonnées fenêtre pour le grounding ciblé wc = evt.get("window_capture", {}) if wc.get("rect"): @@ -1600,6 +1941,33 @@ def build_replay_from_raw_events( "click_relative": wc.get("click_relative"), } + tab_switch_target = _infer_tab_switch_target(events, evt) + if tab_switch_target: + target_spec = action.setdefault("target_spec", {}) + # Préférer une sémantique explicite d'onglet à un rôle brut + # `yolo`/anchor-only quand le flux brut montre une vraie + # bascule de focus dans la même application. 
+ if not target_spec.get("by_text"): + target_spec["by_text"] = tab_switch_target["by_text"] + target_spec["by_role"] = tab_switch_target["by_role"] + target_spec["window_title"] = tab_switch_target["window_title"] + target_spec["vlm_description"] = tab_switch_target["vlm_description"] + context_hints = dict(target_spec.get("context_hints") or {}) + context_hints.update(tab_switch_target["context_hints"]) + target_spec["context_hints"] = context_hints + action["visual_mode"] = True + + close_tab_target = _infer_close_tab_target(events, evt) + if close_tab_target: + target_spec = action.setdefault("target_spec", {}) + target_spec["by_role"] = close_tab_target["by_role"] + target_spec["window_title"] = close_tab_target["window_title"] + target_spec["vlm_description"] = close_tab_target["vlm_description"] + context_hints = dict(target_spec.get("context_hints") or {}) + context_hints.update(close_tab_target["context_hints"]) + target_spec["context_hints"] = context_hints + action["visual_mode"] = True + elif evt_type == "text_input": text = evt.get("text", "") if not text: @@ -1695,6 +2063,21 @@ def build_replay_from_raw_events( if next_title: result[ci]["expected_window_title"] = next_title + # ── 9b. Pré-condition fiable : expected_window_before ── + # Bug live 2026-05-22 (act_raw_c70976c8) : window.title d'un + # mouse_click peut être obsolète quand une transition de fenêtre + # (ex: ouverture dialog "Enregistrer sous") se produit juste avant + # la capture du click. Sans correction, target_spec.window_title + # reste sur l'ancien titre et la pré-vérif côté agent + # (executor.py:653) déclenche une pause supervisée à tort. + # + # On rejoue les raw events en maintenant le dernier titre vu via + # window_focus_change.to.title et on le pose comme + # expected_window_before sur chaque clic qui n'en a pas déjà un. + # Le champ est lu en priorité absolue par la pré-vérif agent, donc + # il prime sur target_spec.window_title obsolète. 
    def test_finalize_callback_receives_server_payload(self):
        """The enriched ``/finalize`` payload is forwarded to the client callback."""
        from agent_v0.agent_v1.network.streamer import TraceStreamer

        # Server response the streamer is expected to hand over unchanged.
        payload = {
            "status": "queued_for_processing",
            "replay_ready": True,
            "replay_request": {
                "endpoint": "/api/v1/traces/stream/replay-session",
                "session_id": "sess_test_008",
                "machine_id": "pc-alpha",
            },
        }
        # Collects every value passed to the finalize callback.
        seen = []

        # Replace the ``requests`` name inside the streamer module so that
        # any POST returns an OK response whose ``json()`` yields ``payload``.
        with patch("agent_v0.agent_v1.network.streamer.requests") as mock_req:
            mock_req.post.return_value = MagicMock(ok=True, json=lambda: payload)
            streamer = TraceStreamer("sess_test_008")
            streamer.set_on_finalize_result(seen.append)
            streamer._server_available = True
            streamer.running = False
            streamer._finalize_session()

        # The callback fired exactly once, with the server payload.
        assert seen == [payload]
class TestFinalizeReplayChain:
    """Contract tests for the ``/finalize`` -> ``replay-session`` chaining.

    Every test talks to ``api_stream.app`` through a FastAPI ``TestClient``
    with the module-level processor/worker swapped for instances rooted in
    ``tmp_path`` and the worker enqueue turned into a no-op.
    """

    _TEST_API_TOKEN = "test_finalize_replay_chain_token_0123456789"

    @pytest.fixture(autouse=True)
    def _ensure_api_token(self, monkeypatch):
        """Force a known API token in the environment and, when the
        ``api_stream`` module is already imported, on the module itself."""
        monkeypatch.setenv("RPA_API_TOKEN", self._TEST_API_TOKEN)
        api_stream_mod = sys.modules.get("agent_v0.server_v1.api_stream")
        if api_stream_mod is not None:
            monkeypatch.setattr(api_stream_mod, "API_TOKEN", self._TEST_API_TOKEN)

    @pytest.fixture
    def client(self, tmp_path, monkeypatch):
        """Yield ``(test_client, api_stream, processor, token)``.

        The module globals are replaced through ``monkeypatch`` so they are
        restored even when a later setup step (worker or client creation)
        raises; a manual save/restore around ``yield`` would leave them
        patched in that case.
        """
        from fastapi.testclient import TestClient
        from agent_v0.server_v1 import api_stream
        from agent_v0.server_v1.stream_processor import StreamProcessor
        from agent_v0.server_v1.worker_stream import StreamWorker

        test_processor = StreamProcessor(data_dir=str(tmp_path))
        monkeypatch.setattr(api_stream, "processor", test_processor)
        monkeypatch.setattr(
            api_stream,
            "worker",
            StreamWorker(
                live_dir=str(tmp_path),
                processor=test_processor,
            ),
        )
        # Finalize must not hand the session to a real background worker.
        monkeypatch.setattr(api_stream, "_enqueue_to_worker", lambda session_id: None)

        client = TestClient(api_stream.app, raise_server_exceptions=False)
        yield client, api_stream, test_processor, api_stream.API_TOKEN

    def test_finalize_exposes_replay_request_without_launch(self, client):
        """Without ``launch_replay`` the response only advertises the replay
        request; no ``replay_launch`` key is present."""
        c, _, proc, token = client
        proc.session_manager.register_session("sess_final_001", machine_id="pc-alpha")

        resp = c.post(
            "/api/v1/traces/stream/finalize",
            params={"session_id": "sess_final_001"},
            headers={"Authorization": f"Bearer {token}"},
        )

        assert resp.status_code == 200
        data = resp.json()
        assert data["status"] == "queued_for_processing"
        assert data["replay_ready"] is True
        assert data["replay_request"] == {
            "endpoint": "/api/v1/traces/stream/replay-session",
            "session_id": "sess_final_001",
            "machine_id": "pc-alpha",
        }
        assert "replay_launch" not in data

    def test_finalize_can_launch_replay_session(self, client, monkeypatch):
        """With ``launch_replay=true`` the replay entry point is called once
        with the session's machine id and its result is echoed back."""
        c, api_stream, proc, token = client
        proc.session_manager.register_session("sess_final_002", machine_id="pc-beta")
        calls = []

        async def fake_replay_from_session(session_id: str, machine_id: str = "default"):
            calls.append((session_id, machine_id))
            return {
                "replay_id": "replay_sess_1234abcd",
                "status": "running",
                "source_session_id": session_id,
                "target_session_id": "agent_demo",
                "machine_id": machine_id,
                "total_actions": 7,
            }

        monkeypatch.setattr(api_stream, "replay_from_session", fake_replay_from_session)

        resp = c.post(
            "/api/v1/traces/stream/finalize",
            params={
                "session_id": "sess_final_002",
                "launch_replay": "true",
            },
            headers={"Authorization": f"Bearer {token}"},
        )

        assert resp.status_code == 200
        data = resp.json()
        assert calls == [("sess_final_002", "pc-beta")]
        assert data["replay_launch"]["status"] == "started"
        assert data["replay_launch"]["replay"]["replay_id"] == "replay_sess_1234abcd"
        assert data["replay_launch"]["replay"]["source_session_id"] == "sess_final_002"
        assert data["replay_launch"]["replay"]["machine_id"] == "pc-beta"

    def test_finalize_remains_successful_if_auto_replay_fails(self, client, monkeypatch):
        """A failing auto-replay is reported inside ``replay_launch`` while
        the finalize call itself still answers 200."""
        c, api_stream, proc, token = client
        proc.session_manager.register_session("sess_final_003", machine_id="pc-gamma")

        async def fake_replay_from_session(session_id: str, machine_id: str = "default"):
            raise api_stream.HTTPException(
                status_code=404,
                detail=f"Aucune session Agent V1 active sur {machine_id}",
            )

        monkeypatch.setattr(api_stream, "replay_from_session", fake_replay_from_session)

        resp = c.post(
            "/api/v1/traces/stream/finalize",
            params={
                "session_id": "sess_final_003",
                "launch_replay": "true",
            },
            headers={"Authorization": f"Bearer {token}"},
        )

        assert resp.status_code == 200
        data = resp.json()
        assert data["status"] == "queued_for_processing"
        assert data["replay_launch"] == {
            "status": "failed",
            "status_code": 404,
            "detail": "Aucune session Agent V1 active sur pc-gamma",
        }
        assert data["replay_request"]["machine_id"] == "pc-gamma"
api_stream._retry_pending.clear() + api_stream._retry_pending.update(saved_retry) + + def test_resume_reinjects_full_original_action_from_failed_action(self, client): + http_client, api_stream, token = client + + original_action = { + "action_id": "act_raw_75272d22", + "type": "click", + "visual_mode": True, + "x_pct": 0.8781, + "y_pct": 0.9856, + "expected_window_before": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + "target_spec": { + "window_title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + "by_role": "yolo", + }, + } + + api_stream._replay_states["replay_xyz"] = { + "replay_id": "replay_xyz", + "session_id": "sess_resume_xyz", + "machine_id": "pc-alpha", + "status": "paused_need_help", + "failed_action": { + "action_id": "act_raw_75272d22", + "type": "click", + "reason": "wrong_window", + "target_spec": original_action["target_spec"], + "original_action": original_action, + }, + "pause_message": "Replay en pause", + "safety_checks": [], + "checks_acknowledged": [], + "params": {}, + } + api_stream._replay_queues["sess_resume_xyz"] = [] + + resp = http_client.post( + "/api/v1/traces/stream/replay/replay_xyz/resume", + headers={"Authorization": f"Bearer {token}"}, + ) + + assert resp.status_code == 200 + data = resp.json() + assert data["status"] == "resumed" + + reinjected = api_stream._replay_queues["sess_resume_xyz"][0] + assert reinjected["action_id"] == "act_raw_75272d22_resume" + assert reinjected["x_pct"] == pytest.approx(0.8781) + assert reinjected["y_pct"] == pytest.approx(0.9856) + assert reinjected["expected_window_before"] == ( + "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes" + ) + assert reinjected["target_spec"]["window_title"] == ( + "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes" + ) + + def test_resume_dispatch_backfills_retry_pending_for_watchdog(self, client): + http_client, api_stream, token = client + + original_action = { + "action_id": "act_resume_01", + "type": "click", + "visual_mode": True, + 
"x_pct": 0.41, + "y_pct": 0.52, + "target_spec": {"window_title": "test - Bloc-notes"}, + } + + api_stream._replay_states["replay_resume_watchdog"] = { + "replay_id": "replay_resume_watchdog", + "session_id": "sess_resume_watchdog", + "machine_id": "pc-watchdog", + "status": "paused_need_help", + "failed_action": { + "action_id": "act_resume_01", + "type": "click", + "reason": "wrong_window", + "target_spec": original_action["target_spec"], + "original_action": original_action, + }, + "pause_message": "Replay en pause", + "safety_checks": [], + "checks_acknowledged": [], + "params": {}, + } + api_stream._replay_queues["sess_resume_watchdog"] = [] + + resume_resp = http_client.post( + "/api/v1/traces/stream/replay/replay_resume_watchdog/resume", + headers={"Authorization": f"Bearer {token}"}, + ) + + assert resume_resp.status_code == 200 + + next_resp = http_client.get( + "/api/v1/traces/stream/replay/next", + params={"session_id": "sess_resume_watchdog", "machine_id": "pc-watchdog"}, + ) + + assert next_resp.status_code == 200 + payload = next_resp.json() + dispatched = payload["action"] + assert dispatched["action_id"] == "act_resume_01_resume" + + retry_info = api_stream._retry_pending["act_resume_01_resume"] + assert retry_info["action"]["action_id"] == "act_resume_01" + assert retry_info["dispatched_action"]["action_id"] == "act_resume_01_resume" + assert retry_info["session_id"] == "sess_resume_watchdog" + assert retry_info["machine_id"] == "pc-watchdog" + assert retry_info["replay_id"] == "replay_resume_watchdog" + assert retry_info["first_dispatched_at"] > 0 + assert retry_info["dispatched_at"] >= retry_info["first_dispatched_at"] diff --git a/tests/integration/test_replay_session_trim_neutral.py b/tests/integration/test_replay_session_trim_neutral.py new file mode 100644 index 000000000..d87721b8d --- /dev/null +++ b/tests/integration/test_replay_session_trim_neutral.py @@ -0,0 +1,151 @@ +"""Non-régression — trim du préambule redondant pour /replay-session. 
+ +Bug fixé le 2026-05-20 (cf. ``docs/AUDIT_FINALIZE_CONTRACT_INTEGRATION_2026-05-20.md`` +et ``CR_AUDIT_PAUSED_RESUME_BUS_2026-05-22.md``) : sur la session source +``sess_20260520T102916_066851``, le premier event raw rejoué après le +setup auto Windows était un clic intra-Notepad sur la barre d'onglets +qui basculait de ``http...txt – Bloc-notes`` vers ``Sans titre – Bloc-notes``. +Comme le setup amène déjà Notepad dans ``Sans titre``, ce clic ne +modifiait rien à l'écran → `retry_threshold`. + +Ce test reproduit la chaîne complète d'``api_stream.replay-session`` +côté serveur (sans HTTP) sur une fixture synthétique correspondante, +et vérifie que la première action utile post-setup est bien la +saisie de texte ``test`` — pas un clic de bascule d'onglet ``Sans titre``. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + +import pytest + +from agent_v0.server_v1.replay_engine import ( # noqa: E402 + _extract_required_apps_from_events, + _generate_setup_actions, + _trim_redundant_setup_events, +) +from agent_v0.server_v1.stream_processor import ( # noqa: E402 + build_replay_from_raw_events, +) + + +def _make_session_events() -> list: + """Reproduit le pattern de ``sess_20260520T102916_066851`` : + Démarrer → Rechercher → Notepad ouvre un fichier .txt → l'utilisateur + clique sur l'onglet ``Sans titre`` → tape ``test`` → Ctrl+S. 
+ + L'enregistrement initial passe par un titre non-neutre puis bascule + sur un titre neutre — c'est le scénario qui piégeait le trim.""" + return [ + # Démarrer + {"event": { + "type": "window_focus_change", + "to": {"app_name": "explorer.exe", "title": "Explorateur"}, + }}, + {"event": { + "type": "mouse_click", "pos": [50, 1430], "timestamp": 1.0, + "window": {"app_name": "explorer.exe", "title": "Explorateur"}, + }}, + # SearchHost + {"event": { + "type": "window_focus_change", + "to": {"app_name": "SearchHost.exe", "title": "Rechercher"}, + }}, + {"event": { + "type": "text_input", "text": "bloc", "timestamp": 2.0, + "window": {"app_name": "SearchHost.exe", "title": "Rechercher"}, + }}, + {"event": { + "type": "mouse_click", "pos": [681, 448], "timestamp": 2.5, + "window": {"app_name": "SearchHost.exe", "title": "Rechercher"}, + }}, + # Notepad ouvre un fichier .txt existant (non-neutre) + {"event": { + "type": "window_focus_change", + "to": { + "app_name": "Notepad.exe", + "title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + }, + }}, + # Clic dans la barre d'onglets (y=40) → bascule vers Sans titre + {"event": { + "type": "mouse_click", "pos": [1191, 40], "timestamp": 4.0, + "window": { + "app_name": "Notepad.exe", + "title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + }, + "window_capture": {"click_relative": [1191, 40]}, + }}, + {"event": { + "type": "window_focus_change", + "to": {"app_name": "Notepad.exe", "title": "Sans titre – Bloc-notes"}, + }}, + # Saisie réelle de l'utilisateur — c'est la première action utile + {"event": { + "type": "text_input", "text": "test", "timestamp": 5.0, + "window": {"app_name": "Notepad.exe", + "title": "Sans titre – Bloc-notes"}, + }}, + ] + + +def test_replay_session_pipeline_skips_redundant_tab_switch(tmp_path): + """Pipeline complet replay-session : setup auto + trim + build doit + produire un replay dont la première action post-setup est la saisie + ``test``, pas le clic de bascule 
d'onglet ``Sans titre``. + """ + raw_events = _make_session_events() + app_info = _extract_required_apps_from_events(raw_events) + + # 1) Setup auto reconnaît Notepad et génère ses actions + assert app_info.get("primary_app") == "Notepad.exe" + setup_actions = _generate_setup_actions(app_info, setup_id_prefix="setup_sess") + assert setup_actions, "le setup auto doit injecter des actions Notepad" + action_ids = {a.get("action_id", "") for a in setup_actions} + assert any("click_start" in aid for aid in action_ids) + assert any("click_result" in aid for aid in action_ids) + + # 2) Trim : le clic intra-Notepad redondant doit disparaître + trimmed = _trim_redundant_setup_events(raw_events, app_info) + click_titles = [ + (ev.get("event") or ev).get("window", {}).get("title", "") + for ev in trimmed + if (ev.get("event") or ev).get("type") == "mouse_click" + ] + assert not any( + "http192.168.1.40" in t for t in click_titles + ), "le clic intra-Notepad redondant doit être coupé par le trim" + + # 3) Build replay propre : la première action utile post-trim est + # la saisie 'test' — pas un click "Sans titre" issu de + # _infer_tab_switch_target. + actions = build_replay_from_raw_events( + trimmed, session_id="sess_synthetic", session_dir=str(tmp_path), + ) + actionable = [a for a in actions if a.get("type") in ("click", "type", "key_combo")] + assert actionable, "le replay doit contenir au moins une action utile" + + first = actionable[0] + assert first.get("type") == "type", ( + f"première action utile doit être 'type', pas '{first.get('type')}' " + f"(target_spec={first.get('target_spec')})" + ) + assert first.get("text") == "test" + + # Sanity : aucune action click ne doit cibler "Sans titre" (= la + # bascule d'onglet inférée par _infer_tab_switch_target) dans le + # replay nettoyé. 
+ sans_titre_clicks = [ + a for a in actions + if a.get("type") == "click" + and a.get("target_spec", {}).get("by_text", "").strip().lower() == "sans titre" + ] + assert not sans_titre_clicks, ( + "le replay ne doit plus contenir de click ciblant 'Sans titre' " + f"(trouvés : {sans_titre_clicks})" + ) diff --git a/tests/integration/test_replay_watchdog.py b/tests/integration/test_replay_watchdog.py new file mode 100644 index 000000000..a70e019a4 --- /dev/null +++ b/tests/integration/test_replay_watchdog.py @@ -0,0 +1,352 @@ +"""Integration tests for the replay orphan watchdog.""" + +from __future__ import annotations + +import asyncio +import contextlib +import importlib +import time +from typing import Any, Dict, List + +import pytest + + +@contextlib.asynccontextmanager +async def fake_lock(): + yield + + +@pytest.fixture(autouse=True) +def reset_watchdog_singleton(): + import agent_v0.server_v1.replay_watchdog as wd_mod + + wd_mod._singleton = None + for key in list(wd_mod._metrics.keys()): + if isinstance(wd_mod._metrics[key], (int, float)): + wd_mod._metrics[key] = 0 + yield + + +@pytest.fixture +def env_short_timeout(monkeypatch): + monkeypatch.setenv("RPA_WATCHDOG_ENABLED", "1") + monkeypatch.setenv("RPA_WATCHDOG_SCAN_INTERVAL_S", "0.1") + monkeypatch.setenv("RPA_WATCHDOG_ORPHAN_TIMEOUT_S", "0.2") + monkeypatch.setenv("RPA_WATCHDOG_MAX_RESENDS", "2") + + import agent_v0.server_v1.replay_watchdog as wd_mod + + importlib.reload(wd_mod) + yield + + +@pytest.mark.asyncio +async def test_no_orphan_below_timeout(env_short_timeout): + from agent_v0.server_v1.replay_watchdog import ReplayWatchdog + + now = time.time() + retry_pending: Dict[str, Dict[str, Any]] = { + "act1": { + "action": {"action_id": "act1", "type": "click"}, + "dispatched_action": {"action_id": "act1", "type": "click"}, + "session_id": "sess1", + "machine_id": "m1", + "dispatched_at": now, + "first_dispatched_at": now, + "resent_count": 0, + } + } + replay_queues: Dict[str, List[Dict[str, Any]]] = 
{"sess1": []} + watchdog = ReplayWatchdog(retry_pending, replay_queues, fake_lock) + + result = await watchdog._scan_once() + + assert result == { + "orphans": 0, + "resent": 0, + "gaveup": 0, + "skipped": 0, + "in_flight": 1, + } + assert replay_queues["sess1"] == [] + assert retry_pending["act1"]["resent_count"] == 0 + + +@pytest.mark.asyncio +async def test_orphan_above_timeout_resent_in_head(env_short_timeout): + from agent_v0.server_v1.replay_watchdog import ReplayWatchdog + + action = {"action_id": "act1", "type": "click"} + other = {"action_id": "act_next", "type": "click"} + retry_pending = { + "act1": { + "action": {"action_id": "original", "type": "click"}, + "dispatched_action": action, + "session_id": "sess1", + "machine_id": "m1", + "dispatched_at": time.time() - 5.0, + "first_dispatched_at": time.time() - 5.0, + "resent_count": 0, + } + } + replay_queues = {"sess1": [other]} + watchdog = ReplayWatchdog(retry_pending, replay_queues, fake_lock) + + result = await watchdog._scan_once() + + assert result["resent"] == 1 + assert replay_queues["sess1"] == [action, other] + assert retry_pending["act1"]["resent_count"] == 1 + assert retry_pending["act1"]["dispatched_at"] == 0.0 + + +@pytest.mark.asyncio +async def test_giveup_after_max_resends(env_short_timeout): + from agent_v0.server_v1.replay_watchdog import ReplayWatchdog + + retry_pending = { + "act1": { + "action": {"action_id": "act1", "type": "click"}, + "dispatched_action": {"action_id": "act1", "type": "click"}, + "session_id": "sess1", + "machine_id": "m1", + "dispatched_at": time.time() - 5.0, + "first_dispatched_at": time.time() - 90.0, + "resent_count": 2, + } + } + replay_queues = {"sess1": []} + watchdog = ReplayWatchdog(retry_pending, replay_queues, fake_lock) + + result = await watchdog._scan_once() + + assert result["gaveup"] == 1 + assert result["resent"] == 0 + assert "act1" not in retry_pending + assert replay_queues["sess1"] == [] + + +@pytest.mark.asyncio +async def 
test_race_report_arrives_during_scan(env_short_timeout): + from agent_v0.server_v1.replay_watchdog import ReplayWatchdog + + retry_pending = { + "act1": { + "action": {"action_id": "act1", "type": "click"}, + "dispatched_action": {"action_id": "act1", "type": "click"}, + "session_id": "sess1", + "machine_id": "m1", + "dispatched_at": time.time() - 5.0, + "first_dispatched_at": time.time() - 5.0, + "resent_count": 0, + } + } + replay_queues = {"sess1": []} + + @contextlib.asynccontextmanager + async def lock_that_pops_before_resend(): + count = getattr(lock_that_pops_before_resend, "_count", 0) + 1 + lock_that_pops_before_resend._count = count + if count == 2: + retry_pending.pop("act1", None) + yield + + watchdog = ReplayWatchdog(retry_pending, replay_queues, lock_that_pops_before_resend) + result = await watchdog._scan_once() + + assert result["orphans"] == 1 + assert result["resent"] == 0 + assert replay_queues["sess1"] == [] + + +@pytest.mark.asyncio +async def test_disabled_via_env(monkeypatch): + monkeypatch.setenv("RPA_WATCHDOG_ENABLED", "0") + + import agent_v0.server_v1.replay_watchdog as wd_mod + + importlib.reload(wd_mod) + watchdog = wd_mod.ReplayWatchdog({}, {}, fake_lock) + + await watchdog.start() + + assert watchdog._task is None + await watchdog.stop() + + +@pytest.mark.asyncio +async def test_lifecycle_start_stop_clean(env_short_timeout): + from agent_v0.server_v1.replay_watchdog import ReplayWatchdog + + watchdog = ReplayWatchdog({}, {}, fake_lock) + await watchdog.start() + + assert watchdog._task is not None + assert not watchdog._task.done() + + await asyncio.sleep(0.25) + await watchdog.stop(timeout_s=2.0) + + assert watchdog._task is None + + +@pytest.mark.asyncio +async def test_orphan_with_repush_tail(monkeypatch, env_short_timeout): + monkeypatch.setenv("RPA_WATCHDOG_REPUSH_POSITION", "tail") + + import agent_v0.server_v1.replay_watchdog as wd_mod + + importlib.reload(wd_mod) + from agent_v0.server_v1.replay_watchdog import ReplayWatchdog 
+ + action = {"action_id": "act1", "type": "click"} + other = {"action_id": "act_next", "type": "click"} + retry_pending = { + "act1": { + "action": {"action_id": "original", "type": "click"}, + "dispatched_action": action, + "session_id": "sess1", + "machine_id": "m1", + "dispatched_at": time.time() - 5.0, + "first_dispatched_at": time.time() - 5.0, + "resent_count": 0, + } + } + replay_queues = {"sess1": [other]} + watchdog = ReplayWatchdog(retry_pending, replay_queues, fake_lock) + + await watchdog._scan_once() + + assert replay_queues["sess1"] == [other, action] + + +@pytest.mark.asyncio +async def test_metrics_snapshot(env_short_timeout): + from agent_v0.server_v1.replay_watchdog import ReplayWatchdog, get_metrics_snapshot + + retry_pending = { + "act1": { + "action": {"action_id": "act1", "type": "click"}, + "dispatched_action": {"action_id": "act1", "type": "click"}, + "session_id": "sess1", + "machine_id": "m1", + "dispatched_at": time.time() - 5.0, + "first_dispatched_at": time.time() - 5.0, + "resent_count": 0, + } + } + watchdog = ReplayWatchdog(retry_pending, {"sess1": []}, fake_lock) + + await watchdog._scan_once() + snapshot = get_metrics_snapshot() + + assert snapshot["scans_total"] >= 1 + assert snapshot["orphans_detected_total"] >= 1 + assert snapshot["orphans_resent_total"] >= 1 + + +def test_default_orphan_timeout_matches_spec(monkeypatch): + monkeypatch.delenv("RPA_WATCHDOG_ORPHAN_TIMEOUT_S", raising=False) + + import agent_v0.server_v1.replay_watchdog as wd_mod + + importlib.reload(wd_mod) + + assert wd_mod.WATCHDOG_ORPHAN_TIMEOUT_S == 45.0 + + +@pytest.mark.asyncio +async def test_late_report_clears_resent_duplicate_from_queue(monkeypatch): + monkeypatch.setenv("RPA_API_TOKEN", "test_replay_watchdog_token") + + from agent_v0.server_v1 import api_stream + + monkeypatch.setattr(api_stream, "API_TOKEN", "test_replay_watchdog_token") + + saved_states = dict(api_stream._replay_states) + saved_queues = dict(api_stream._replay_queues) + saved_retry = 
dict(api_stream._retry_pending) + + api_stream._replay_states.clear() + api_stream._replay_queues.clear() + api_stream._retry_pending.clear() + + try: + action = { + "action_id": "act_setup_sess_click_start", + "type": "click", + "visual_mode": True, + "x_pct": 0.387891, + "y_pct": 0.974375, + "_setup_phase": True, + "target_spec": {"by_role": "start_button"}, + } + next_action = {"action_id": "act_setup_sess_wait_start", "type": "wait"} + replay_id = "replay_watchdog_dup" + session_id = "sess_watchdog_dup" + now = time.time() + + api_stream._replay_states[replay_id] = { + "replay_id": replay_id, + "workflow_id": "session_replay:test", + "session_id": session_id, + "machine_id": "pc-watchdog", + "status": "running", + "total_actions": 2, + "completed_actions": 0, + "failed_actions": 0, + "current_action_index": 0, + "params": {}, + "results": [], + "actions": [action, next_action], + "retried_actions": 0, + "unverified_actions": 0, + "error_log": [], + "last_screenshot": None, + "failed_action": None, + "pause_message": None, + "variables": {}, + "safety_checks": [], + "checks_acknowledged": [], + "pause_reason": "", + "pause_payload": None, + } + api_stream._replay_queues[session_id] = [dict(action), dict(next_action)] + api_stream._retry_pending[action["action_id"]] = { + "action": dict(action), + "dispatched_action": dict(action), + "retry_count": 0, + "replay_id": replay_id, + "session_id": session_id, + "machine_id": "pc-watchdog", + "dispatched_at": now, + "first_dispatched_at": now - 5.0, + "resent_count": 1, + "last_resent_at": now - 1.0, + } + + report = api_stream.ReplayResultReport( + session_id=session_id, + action_id=action["action_id"], + success=True, + warning="start_button_hotkey_fallback", + resolution_method="semantic_start_button_hotkey", + resolution_score=1.0, + ) + + result = await api_stream.report_action_result(report) + + assert result["status"] == "recorded" + assert [a["action_id"] for a in api_stream._replay_queues[session_id]] == [ + 
"act_setup_sess_wait_start" + ] + assert action["action_id"] not in api_stream._retry_pending + assert api_stream._replay_states[replay_id]["completed_actions"] == 1 + assert api_stream._replay_states[replay_id]["current_action_index"] == 1 + finally: + api_stream._replay_states.clear() + api_stream._replay_states.update(saved_states) + api_stream._replay_queues.clear() + api_stream._replay_queues.update(saved_queues) + api_stream._retry_pending.clear() + api_stream._retry_pending.update(saved_retry) diff --git a/tests/integration/test_stream_processor.py b/tests/integration/test_stream_processor.py index 8002da2b5..7c6495b4c 100644 --- a/tests/integration/test_stream_processor.py +++ b/tests/integration/test_stream_processor.py @@ -112,6 +112,58 @@ class TestLiveSessionManager: assert len(raw["screenshots"]) == 1 assert raw["screenshots"][0]["screenshot_id"] == "shot_full_001" + def test_discovers_bg_session_machine_id_from_root_folder(self, tmp_path): + from agent_v0.server_v1.live_session_manager import LiveSessionManager + + live_dir = tmp_path / "live_sessions" + session_dir = live_dir / "bg_DESKTOP-58D5CAC_windows" + session_dir.mkdir(parents=True) + (session_dir / "live_events.jsonl").write_text("{}", encoding="utf-8") + + mgr = LiveSessionManager( + persist_dir=str(tmp_path / "persist"), + live_sessions_dir=str(live_dir), + ) + + session = mgr.get_session("bg_DESKTOP-58D5CAC_windows") + assert session is not None + assert session.machine_id == "DESKTOP-58D5CAC_windows" + + def test_loads_persisted_bg_session_with_machine_id_inferred(self, tmp_path): + from agent_v0.server_v1.live_session_manager import LiveSessionManager + + persist_dir = tmp_path / "persist" + persist_dir.mkdir() + (persist_dir / "bg_DESKTOP-58D5CAC_windows.json").write_text( + '{"session_id":"bg_DESKTOP-58D5CAC_windows","machine_id":"default",' + '"events":[],"shot_paths":{},"last_window_info":{"title":"Unknown","app_name":"unknown"},' + 
'"created_at":"2026-05-20T14:00:00","last_activity":"2026-05-20T14:00:00",' + '"finalized":false,"window_titles_seen":{},"app_names_seen":{}}', + encoding="utf-8", + ) + + mgr = LiveSessionManager(persist_dir=str(persist_dir)) + + session = mgr.get_session("bg_DESKTOP-58D5CAC_windows") + assert session is not None + assert session.machine_id == "DESKTOP-58D5CAC_windows" + + def test_find_active_agent_session_falls_back_to_bg_machine_session(self, tmp_path): + from agent_v0.server_v1.live_session_manager import LiveSessionManager + from agent_v0.server_v1.replay_engine import _find_active_agent_session + + mgr = LiveSessionManager(persist_dir=str(tmp_path / "persist")) + mgr.register_session( + "sess_20260520T102916_066851", + machine_id="DESKTOP-58D5CAC_windows", + ) + mgr.finalize("sess_20260520T102916_066851") + mgr.register_session("bg_DESKTOP-58D5CAC_windows") + + active = _find_active_agent_session(mgr, machine_id="DESKTOP-58D5CAC_windows") + + assert active == "bg_DESKTOP-58D5CAC_windows" + # ========================================================================= # StreamProcessor @@ -195,6 +247,238 @@ class TestStreamProcessor: assert stats["total_workflows"] == 0 assert stats["initialized"] is False + def test_build_replay_does_not_compile_save_dialog_open_as_switch_tab( + self, tmp_path, monkeypatch, + ): + """`Enregistrer sous` same-app n'est pas un onglet. + + Régression live 2026-05-23 : un clic menu dans Notepad était + recompilé en faux `switch_tab`, ce qui injectait un clic parasite + avant la vraie ouverture de dialog. 
+ """ + from agent_v0.server_v1 import stream_processor as sp + + session_dir = tmp_path / "sess" + (session_dir / "shots").mkdir(parents=True) + + monkeypatch.setattr(sp, "_load_crop_for_event", lambda *args, **kwargs: None) + monkeypatch.setattr( + sp, + "enrich_click_from_screenshot", + lambda *args, **kwargs: {"anchor_image_base64": "abc123", "by_role": "yolo"}, + ) + monkeypatch.setattr(sp, "_attach_expected_screenshots", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_enrich_actions_with_intentions", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_unload_gemma4", lambda *args, **kwargs: None) + + events = [ + {"event": { + "type": "mouse_click", + "timestamp": 1.0, + "pos": [820, 630], + "button": "left", + "screenshot_id": "shot_001", + "window": {"title": "*test – Bloc-notes", "app_name": "Notepad.exe"}, + "window_capture": { + "rect": [320, 520, 2240, 1636], + "click_relative": [500, 110], + "window_size": [1920, 1116], + }, + }}, + {"event": { + "type": "mouse_click", + "timestamp": 1.2, + "pos": [860, 562], + "button": "left", + "screenshot_id": "shot_002", + "window": {"title": "*test – Bloc-notes", "app_name": "Notepad.exe"}, + "window_capture": { + "rect": [320, 520, 2240, 1636], + "click_relative": [540, 40], + "window_size": [1920, 1116], + }, + }}, + {"event": { + "type": "window_focus_change", + "timestamp": 1.35, + "from": {"title": "*test – Bloc-notes", "app_name": "Notepad.exe"}, + "to": {"title": "Enregistrer sous", "app_name": "Notepad.exe"}, + }}, + {"event": { + "type": "mouse_click", + "timestamp": 1.6, + "pos": [997, 743], + "button": "left", + "screenshot_id": "shot_003", + "window": {"title": "Enregistrer sous", "app_name": "Notepad.exe"}, + }}, + ] + + actions = sp.build_replay_from_raw_events( + events, session_id="sess_save_dialog", session_dir=str(session_dir), + ) + + clicks = [a for a in actions if a.get("type") == "click"] + assert len(clicks) == 3 + assert all( + (c.get("target_spec", {}).get("context_hints") 
or {}).get("interaction") != "switch_tab" + for c in clicks + ) + assert clicks[1].get("expected_window_title") == "Enregistrer sous" + assert clicks[2].get("expected_window_before") == "Enregistrer sous" + + def test_build_replay_tab_switch_focus_belongs_to_latest_click_only( + self, tmp_path, monkeypatch, + ): + """Le focus d'onglet doit être rattaché au dernier clic causal.""" + from agent_v0.server_v1 import stream_processor as sp + + session_dir = tmp_path / "sess" + (session_dir / "shots").mkdir(parents=True) + + monkeypatch.setattr(sp, "_load_crop_for_event", lambda *args, **kwargs: None) + monkeypatch.setattr( + sp, + "enrich_click_from_screenshot", + lambda *args, **kwargs: {"anchor_image_base64": "abc123", "by_role": "yolo"}, + ) + monkeypatch.setattr(sp, "_attach_expected_screenshots", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_enrich_actions_with_intentions", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_unload_gemma4", lambda *args, **kwargs: None) + + events = [ + {"event": { + "type": "mouse_click", + "timestamp": 1.0, + "pos": [1410, 562], + "button": "left", + "screenshot_id": "shot_001", + "window": { + "title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + "app_name": "Notepad.exe", + }, + "window_capture": { + "rect": [323, 522, 2243, 1638], + "click_relative": [1087, 40], + "window_size": [1920, 1116], + }, + }}, + {"event": { + "type": "mouse_click", + "timestamp": 1.1, + "pos": [1514, 562], + "button": "left", + "screenshot_id": "shot_002", + "window": { + "title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + "app_name": "Notepad.exe", + }, + "window_capture": { + "rect": [323, 522, 2243, 1638], + "click_relative": [1191, 40], + "window_size": [1920, 1116], + }, + }}, + {"event": { + "type": "window_focus_change", + "timestamp": 1.2, + "from": { + "title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + "app_name": "Notepad.exe", + }, + "to": { + "title": "Sans titre – Bloc-notes", 
+ "app_name": "Notepad.exe", + }, + }}, + ] + + actions = sp.build_replay_from_raw_events( + events, + session_id="sess_intervening_click", + session_dir=str(session_dir), + ) + + assert len(actions) == 2 + first_hints = actions[0].get("target_spec", {}).get("context_hints") or {} + second_hints = actions[1].get("target_spec", {}).get("context_hints") or {} + + assert first_hints.get("interaction") != "switch_tab" + assert actions[1]["target_spec"]["by_text"] == "Sans titre" + assert actions[1]["target_spec"]["by_role"] == "tab" + assert second_hints.get("interaction") == "switch_tab" + + def test_build_replay_infers_close_tab_before_save_dialog( + self, tmp_path, monkeypatch, + ): + """Le clic sur le x d'onglet actif doit être sémantisé comme close_tab.""" + from agent_v0.server_v1 import stream_processor as sp + + session_dir = tmp_path / "sess" + (session_dir / "shots").mkdir(parents=True) + + monkeypatch.setattr(sp, "_load_crop_for_event", lambda *args, **kwargs: None) + monkeypatch.setattr( + sp, + "enrich_click_from_screenshot", + lambda *args, **kwargs: {"anchor_image_base64": "abc123", "by_role": "yolo"}, + ) + monkeypatch.setattr(sp, "_attach_expected_screenshots", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_enrich_actions_with_intentions", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_unload_gemma4", lambda *args, **kwargs: None) + + events = [ + {"event": { + "type": "mouse_click", + "timestamp": 1.0, + "pos": [1814, 560], + "button": "left", + "screenshot_id": "shot_001", + "window": {"title": "*test – Bloc-notes", "app_name": "Notepad.exe"}, + "window_capture": { + "rect": [323, 522, 2243, 1638], + "click_relative": [1491, 38], + "window_size": [1920, 1116], + }, + }}, + {"event": { + "type": "mouse_click", + "timestamp": 1.3, + "pos": [1183, 1156], + "button": "left", + "screenshot_id": "shot_002", + "window": {"title": "*test – Bloc-notes", "app_name": "Notepad.exe"}, + "window_capture": { + "rect": [323, 522, 2243, 1638], + 
"click_relative": [860, 634], + "window_size": [1920, 1116], + }, + }}, + {"event": { + "type": "window_focus_change", + "timestamp": 1.5, + "from": {"title": "*test – Bloc-notes", "app_name": "Notepad.exe"}, + "to": {"title": "Enregistrer sous", "app_name": "Notepad.exe"}, + }}, + ] + + actions = sp.build_replay_from_raw_events( + events, + session_id="sess_close_tab", + session_dir=str(session_dir), + ) + + clicks = [a for a in actions if a.get("type") == "click"] + assert len(clicks) == 2 + first_spec = clicks[0].get("target_spec", {}) + first_hints = first_spec.get("context_hints") or {} + + assert first_spec.get("by_role") == "tab_close_button" + assert first_spec.get("by_text", "") == "" + assert first_hints.get("interaction") == "close_tab" + assert first_hints.get("active_tab_label") == "test" + assert "fermer l'onglet actif 'test'" in first_spec.get("vlm_description", "") + # ========================================================================= # StreamWorker diff --git a/tests/unit/test_agent_finalize_replay_contract.py b/tests/unit/test_agent_finalize_replay_contract.py new file mode 100644 index 000000000..38443eb14 --- /dev/null +++ b/tests/unit/test_agent_finalize_replay_contract.py @@ -0,0 +1,184 @@ +"""Tests ciblés sur l'intégration agent du contrat finalize enrichi.""" + +from __future__ import annotations + +import sys +import types +from pathlib import Path +from unittest.mock import MagicMock, patch + + +_ROOT = str(Path(__file__).resolve().parents[2]) +if _ROOT not in sys.path: + sys.path.insert(0, _ROOT) + + +class _ImmediateThread: + def __init__(self, target=None, args=(), kwargs=None, daemon=None): + self._target = target + self._args = args + self._kwargs = kwargs or {} + + def start(self): + if self._target is not None: + self._target(*self._args, **self._kwargs) + + +class _DummyServerClient: + _stream_base = "http://server.test:5005" + + def __init__(self): + self.on_connection_change = None + + def set_on_connection_change(self, 
callback): + self.on_connection_change = callback + + def _auth_headers(self): + return {"Authorization": "Bearer test-token"} + + +def _install_pystray_stub(): + pystray_stub = types.ModuleType("pystray") + + class _DummyMenu: + SEPARATOR = object() + + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + class _DummyIcon: + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + def run(self): + return None + + def stop(self): + return None + + def update_menu(self): + return None + + pystray_stub.MenuItem = lambda *args, **kwargs: (args, kwargs) + pystray_stub.Menu = _DummyMenu + pystray_stub.Icon = _DummyIcon + sys.modules["pystray"] = pystray_stub + + +def _build_tray(): + _install_pystray_stub() + + from agent_v0.agent_v1.ui.smart_tray import SmartTrayV1 + + tray = SmartTrayV1( + on_start_callback=lambda _name: None, + on_stop_callback=lambda: None, + server_client=_DummyServerClient(), + ) + tray._notifier = MagicMock() + return tray + + +def test_offer_finalize_replay_requires_user_consent(): + _install_pystray_stub() + from agent_v0.agent_v1.ui import smart_tray as smart_tray_mod + + tray = _build_tray() + tray._launch_replay_request = MagicMock() + + with patch.object(smart_tray_mod.threading, "Thread", _ImmediateThread), \ + patch.object(smart_tray_mod, "_ask_consent", return_value=False): + tray.offer_finalize_replay( + { + "endpoint": "/api/v1/traces/stream/replay-session", + "session_id": "sess_offer_001", + "machine_id": "pc-offer", + }, + "Bloc-notes", + ) + + tray._notifier.notify.assert_called_once() + tray._launch_replay_request.assert_not_called() + + +def test_launch_replay_request_calls_replay_session_endpoint(): + _install_pystray_stub() + from agent_v0.agent_v1.ui import smart_tray as smart_tray_mod + + tray = _build_tray() + + with patch.object(smart_tray_mod.threading, "Thread", _ImmediateThread), \ + patch("requests.post") as mock_post: + mock_post.return_value = MagicMock(ok=True) 
+ tray._launch_replay_request( + { + "endpoint": "/api/v1/traces/stream/replay-session", + "session_id": "sess_offer_002", + "machine_id": "pc-replay", + }, + "Bloc-notes", + ) + + mock_post.assert_called_once() + _, kwargs = mock_post.call_args + assert kwargs["params"] == { + "session_id": "sess_offer_002", + "machine_id": "pc-replay", + } + assert kwargs["headers"] == {"Authorization": "Bearer test-token"} + assert kwargs["allow_redirects"] is False + + +def test_agent_finalize_result_delegates_to_tray_offer(): + from agent_v0.agent_v1.finalize_contract import dispatch_finalize_result + + ui = MagicMock() + + dispatch_finalize_result( + ui, + { + "replay_ready": True, + "replay_request": { + "endpoint": "/api/v1/traces/stream/replay-session", + "session_id": "sess_offer_003", + "machine_id": "pc-main", + }, + }, + "Saisie dossier", + ) + + ui.offer_finalize_replay.assert_called_once_with( + { + "endpoint": "/api/v1/traces/stream/replay-session", + "session_id": "sess_offer_003", + "machine_id": "pc-main", + }, + "Saisie dossier", + ) + + +def test_agent_finalize_result_ignores_already_started_replay(): + from agent_v0.agent_v1.finalize_contract import dispatch_finalize_result + + ui = MagicMock() + + dispatch_finalize_result( + ui, + { + "replay_ready": True, + "replay_request": { + "endpoint": "/api/v1/traces/stream/replay-session", + "session_id": "sess_offer_004", + "machine_id": "pc-main", + }, + "replay_launch": { + "status": "started", + "replay": {"replay_id": "replay_sess_1234"}, + }, + }, + "Saisie dossier", + ) + + ui.offer_finalize_replay.assert_not_called() diff --git a/tests/unit/test_agent_v1_replay_pause_state.py b/tests/unit/test_agent_v1_replay_pause_state.py new file mode 100644 index 000000000..c0356fac9 --- /dev/null +++ b/tests/unit/test_agent_v1_replay_pause_state.py @@ -0,0 +1,78 @@ +"""Tests ciblés sur l'état replay côté AgentV1 pendant pause supervisée.""" + +import sys +import threading +from types import SimpleNamespace +from 
unittest.mock import MagicMock + + +def _make_agent(): + sys.modules.setdefault("pynput", MagicMock()) + sys.modules.setdefault("pynput.mouse", MagicMock()) + sys.modules.setdefault("pynput.keyboard", MagicMock()) + sys.modules.setdefault("pystray", MagicMock()) + + from agent_v0.agent_v1.main import AgentV1 + + agent = AgentV1.__new__(AgentV1) + agent.user_id = "demo_user" + agent.machine_id = "machine_test" + agent.running = True + agent._replay_active = True + agent._state = SimpleNamespace(calls=[], set_replay_active=lambda active: agent._state.calls.append(active)) + agent.ui = SimpleNamespace(calls=[], set_replay_active=lambda active: agent.ui.calls.append(active)) + return agent + + +def test_replay_pause_does_not_mark_replay_finished(monkeypatch): + """Quand l'executor signale replay_paused, AgentV1 doit rester en mode replay.""" + agent = _make_agent() + + class _Executor: + _poll_backoff = 1.0 + _replay_paused = True + + def poll_and_execute(self, session_id: str, server_url: str, machine_id: str = "default") -> bool: + return False + + agent._executor = _Executor() + + def _fake_sleep(_delay): + agent.running = False + + monkeypatch.setattr("agent_v0.agent_v1.main.time.sleep", _fake_sleep) + + t = threading.Thread(target=agent._replay_poll_loop) + t.start() + t.join(timeout=1) + + assert agent._replay_active is True + assert agent.ui.calls == [] + assert agent._state.calls == [] + + +def test_replay_without_action_and_without_pause_marks_replay_finished(monkeypatch): + """Sans action et sans pause, AgentV1 doit sortir du mode replay.""" + agent = _make_agent() + + class _Executor: + _poll_backoff = 1.0 + _replay_paused = False + + def poll_and_execute(self, session_id: str, server_url: str, machine_id: str = "default") -> bool: + return False + + agent._executor = _Executor() + + def _fake_sleep(_delay): + agent.running = False + + monkeypatch.setattr("agent_v0.agent_v1.main.time.sleep", _fake_sleep) + + t = 
threading.Thread(target=agent._replay_poll_loop) + t.start() + t.join(timeout=1) + + assert agent._replay_active is False + assert agent.ui.calls == [False] + assert agent._state.calls == [False] diff --git a/tests/unit/test_capturer_monitor_guard.py b/tests/unit/test_capturer_monitor_guard.py new file mode 100644 index 000000000..a50e07e36 --- /dev/null +++ b/tests/unit/test_capturer_monitor_guard.py @@ -0,0 +1,485 @@ +"""Garde dimensions monitor — agent_v0/agent_v1/vision/capturer.py + +Contexte (démo GHT 19 mai 2026) : `mss.monitors[1]` peut retourner +intermittemment des dimensions tronquées (cas observé : 2560×60 au lieu +de 2560×1600). Toute capture utilisant ces dims pour normaliser des +coordonnées empoisonne ensuite la mémoire persistante (`TargetMemoryStore`). + +Ce module teste la garde qui doit : +- détecter une dimension aberrante avant capture +- retenter (mss peut avoir un cache stale) +- tomber en fallback sur un autre monitor physique si dispo +- abandonner explicitement (logs WARNING/ERROR) sans empoisonner + +Périmètre : capturer.py uniquement (pas executor, pas replay). +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from PIL import Image + + +def _make_mock_mss(monitors_sequence): + """Construit un mock `mss.mss()` qui renvoie successivement les listes + `monitors` fournies. Permet de simuler retry / changement de dims + entre deux appels. + + Args: + monitors_sequence: liste de listes-de-monitors. Chaque entrée + représente l'état renvoyé par `sct.monitors` à un appel + successif de `mss.mss()`. La dernière entrée est réutilisée + si plus d'appels ont lieu. + + Returns: + Un mock utilisable comme `patch(..., side_effect=mock)` côté `mss.mss`. 
+ """ + call_counter = {"n": 0} + instances = [] + + def factory(): + idx = min(call_counter["n"], len(monitors_sequence) - 1) + call_counter["n"] += 1 + instance = MagicMock(name=f"mss_instance_{idx}") + instance.monitors = monitors_sequence[idx] + + # grab() renvoie un objet avec size + bgra pour passer dans PIL + grab_result = MagicMock() + # On simule un buffer cohérent avec les dims du monitor sain + m = monitors_sequence[idx][1] if len(monitors_sequence[idx]) > 1 else {} + w = m.get("width", 100) + h = m.get("height", 100) + grab_result.size = (w, h) + # Une image saine ne doit pas être entièrement noire, sinon le nouveau + # fail-closed black-frame la rejetterait. + grab_result.bgra = b"\x80\x80\x80\x00" * (w * h) + instance.grab = MagicMock(return_value=grab_result) + + # context manager + cm = MagicMock(name=f"mss_cm_{idx}") + cm.__enter__ = MagicMock(return_value=instance) + cm.__exit__ = MagicMock(return_value=False) + instances.append((cm, instance)) + return cm + + factory.instances = instances + return factory + + +def _vision_capturer(tmp_path): + """Import paresseux pour permettre au patch d'opérer avant le import.""" + from agent_v0.agent_v1.vision.capturer import VisionCapturer + return VisionCapturer(str(tmp_path)) + + +def _solid_img(color: tuple[int, int, int], size=(320, 240)) -> Image.Image: + """Image unie simple pour piloter les tests de fallback noir.""" + return Image.new("RGB", size, color) + + +# ============================================================================ +# Test 1 — Dim aberrante (height=60) refusée : capture_full_context renvoie "" +# ============================================================================ + + +def test_capture_full_context_returns_empty_when_monitor_height_aberrant( + tmp_path: Path, caplog: pytest.LogCaptureFixture +): + """Cas démo GHT : mss.monitors[1] = 2560×60 (au lieu de 2560×1600). 
+ + La capture doit refuser de produire un PNG basé sur ces dims (sinon + toute coord normalisée derrière sera fausse d'un facteur ~27×). + Retour attendu : chaîne vide (comme le contrat existant en cas + d'erreur). + """ + aberrant_monitors = [ + {"left": 0, "top": 0, "width": 2560, "height": 1660}, # composite + {"left": 0, "top": 0, "width": 2560, "height": 60}, # PRIMAIRE aberrant + ] + factory = _make_mock_mss([aberrant_monitors]) + + with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory), \ + patch("agent_v0.agent_v1.vision.capturer.time.sleep"): + caplog.set_level(logging.WARNING, logger="agent_v0.agent_v1.vision.capturer") + cap = _vision_capturer(tmp_path) + result = cap.capture_full_context("test_aberrant") + + assert result == "", ( + f"Capture devrait retourner '' sur dim aberrante, got {result!r}" + ) + + # Sanity : aucun grab() ne doit avoir été appelé sur un monitor aberrant. + # Tous les mss instances créés ne doivent JAMAIS avoir appelé grab(). + for _cm, instance in factory.instances: + instance.grab.assert_not_called() + + +# ============================================================================ +# Test 2 — Le log WARNING doit citer la dim observée (debuggabilité) +# ============================================================================ + + +def test_aberrant_monitor_logs_warning_with_observed_dimensions( + tmp_path: Path, caplog: pytest.LogCaptureFixture +): + """L'opérateur doit pouvoir diagnostiquer la cause depuis les logs sans + rejouer la session. Le WARNING doit contenir les dims aberrantes vues. 
+ """ + aberrant_monitors = [ + {"left": 0, "top": 0, "width": 2560, "height": 1660}, + {"left": 0, "top": 0, "width": 2560, "height": 60}, + ] + factory = _make_mock_mss([aberrant_monitors]) + + with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory), \ + patch("agent_v0.agent_v1.vision.capturer.time.sleep"): + caplog.set_level(logging.WARNING, logger="agent_v0.agent_v1.vision.capturer") + cap = _vision_capturer(tmp_path) + cap.capture_full_context("test") + + warnings = [r for r in caplog.records if r.levelno == logging.WARNING] + assert warnings, "Au moins un WARNING attendu sur dim aberrante" + msg = " ".join(r.getMessage() for r in warnings) + assert "2560" in msg, f"Largeur observée doit apparaître dans le WARNING : {msg!r}" + assert "60" in msg, f"Hauteur observée doit apparaître dans le WARNING : {msg!r}" + + +# ============================================================================ +# Test 3 — Retry : un 1er appel aberrant suivi d'un appel sain produit la capture +# ============================================================================ + + +def test_capture_retries_when_first_monitor_query_is_aberrant( + tmp_path: Path, caplog: pytest.LogCaptureFixture +): + """Le bug observé est intermittent (mss peut avoir un cache stale). Si on + retente immédiatement, le second appel renvoie souvent les vraies dims. + La capture doit donc retenter et réussir quand le second appel est sain. 
+ """ + aberrant_then_ok = [ + # 1er appel : aberrant + [ + {"left": 0, "top": 0, "width": 2560, "height": 1660}, + {"left": 0, "top": 0, "width": 2560, "height": 60}, + ], + # 2e appel : OK + [ + {"left": 0, "top": 0, "width": 2560, "height": 1660}, + {"left": 0, "top": 0, "width": 2560, "height": 1600}, + ], + ] + factory = _make_mock_mss(aberrant_then_ok) + + with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory), \ + patch("agent_v0.agent_v1.vision.capturer.time.sleep"): + caplog.set_level(logging.WARNING, logger="agent_v0.agent_v1.vision.capturer") + cap = _vision_capturer(tmp_path) + result = cap.capture_full_context("test_retry", force=True) + + assert result, ( + f"Capture doit réussir après retry sur dims saines, got {result!r}" + ) + assert Path(result).exists(), "Le PNG doit être physiquement créé" + + # Au moins 2 appels mss.mss() : le premier (aberrant) + le retry + assert len(factory.instances) >= 2, ( + f"Au moins 2 appels mss.mss() attendus (retry), vu {len(factory.instances)}" + ) + + +# ============================================================================ +# Test 4 — Fallback : monitors[1] aberrant mais monitors[2] sain → capture OK +# ============================================================================ + + +def test_capture_falls_back_to_secondary_monitor_when_primary_aberrant( + tmp_path: Path, caplog: pytest.LogCaptureFixture +): + """Cas multi-écrans : monitors[1] cassé en permanence, monitors[2] sain. + La capture doit utiliser monitors[2] et logger un WARNING fallback. 
+ """ + monitors_with_fallback = [ + {"left": 0, "top": 0, "width": 2560, "height": 1660}, # composite + {"left": 0, "top": 0, "width": 2560, "height": 60}, # primaire cassé + {"left": 2560, "top": 0, "width": 1920, "height": 1080}, # secondaire sain + ] + # Même état renvoyé à tous les appels (cas stationnaire, pas intermittent) + factory = _make_mock_mss([monitors_with_fallback]) + + with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory), \ + patch("agent_v0.agent_v1.vision.capturer.time.sleep"): + caplog.set_level(logging.WARNING, logger="agent_v0.agent_v1.vision.capturer") + cap = _vision_capturer(tmp_path) + result = cap.capture_full_context("test_fallback", force=True) + + assert result, f"Capture doit réussir via monitor[2], got {result!r}" + msg = " ".join(r.getMessage() for r in caplog.records) + assert "fallback" in msg.lower(), ( + f"Un log doit signaler le fallback monitor : {msg!r}" + ) + + +# ============================================================================ +# Test 5 — capture_dual bénéficie aussi de la garde +# ============================================================================ + + +def test_capture_dual_returns_empty_dict_when_monitor_aberrant(tmp_path: Path): + """capture_dual (3 captures simultanées) ne doit pas non plus produire + de PNG sur dim aberrante : c'est la même source d'empoisonnement. 
+ """ + aberrant_monitors = [ + {"left": 0, "top": 0, "width": 2560, "height": 1660}, + {"left": 0, "top": 0, "width": 2560, "height": 60}, + ] + factory = _make_mock_mss([aberrant_monitors]) + + with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory), \ + patch("agent_v0.agent_v1.vision.capturer.time.sleep"): + cap = _vision_capturer(tmp_path) + result = cap.capture_dual(x=100, y=200, screenshot_id="shot_dual") + + assert result == {}, ( + f"capture_dual doit retourner {{}} sur dim aberrante, got {result!r}" + ) + + +# ============================================================================ +# Test 6 — capture_active_window bénéficie aussi de la garde +# ============================================================================ + + +def test_capture_active_window_returns_none_when_monitor_aberrant(tmp_path: Path): + """capture_active_window (standalone, sans full_img fourni) doit aussi + refuser de capturer sur monitor aberrant. + """ + aberrant_monitors = [ + {"left": 0, "top": 0, "width": 2560, "height": 1660}, + {"left": 0, "top": 0, "width": 2560, "height": 60}, + ] + factory = _make_mock_mss([aberrant_monitors]) + + # Mocker get_active_window_rect pour qu'il renvoie une fenêtre valide + # (sinon le test sort prématurément avant d'atteindre le grab). 
+ fake_rect = { + "rect": [100, 100, 800, 600], + "size": [700, 500], + "title": "Test Window", + "app_name": "test_app", + } + + with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory), \ + patch("agent_v0.agent_v1.vision.capturer.time.sleep"), \ + patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect", + return_value=fake_rect, + ): + cap = _vision_capturer(tmp_path) + result = cap.capture_active_window(x=200, y=300, screenshot_id="shot_win") + + assert result is None, ( + f"capture_active_window doit retourner None sur dim aberrante, got {result!r}" + ) + + +# ============================================================================ +# Test 7 — Non-régression : dim normale produit toujours un PNG +# ============================================================================ + + +def test_capture_full_context_succeeds_on_normal_dimensions(tmp_path: Path): + """Sanity check : la garde ne casse pas le chemin nominal.""" + normal_monitors = [ + {"left": 0, "top": 0, "width": 2560, "height": 1660}, + {"left": 0, "top": 0, "width": 2560, "height": 1600}, + ] + factory = _make_mock_mss([normal_monitors]) + + with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory), \ + patch("agent_v0.agent_v1.vision.capturer.time.sleep"): + cap = _vision_capturer(tmp_path) + result = cap.capture_full_context("test_normal", force=True) + + assert result, f"Capture nominale doit produire un PNG, got {result!r}" + assert Path(result).exists(), "PNG doit exister sur disque" + # Un seul appel mss.mss() attendu en cas normal (pas de retry) + assert len(factory.instances) == 1, ( + f"Un seul appel mss.mss() attendu sur dims saines, vu {len(factory.instances)}" + ) + + +# ============================================================================ +# Test 8 — fail-closed : capture_dual refuse le fallback monitor secondaire +# ============================================================================ + + +def 
test_capture_dual_fails_closed_when_only_secondary_monitor_sane( + tmp_path: Path, caplog: pytest.LogCaptureFixture +): + """capture_dual reçoit des coords (x, y) en système écran composite. + Si on capture monitors[2] (offset 2560, 0), le crop calculé via + img.crop((x, y, ...)) pointe à la mauvaise zone car les coords ne + sont pas traduites. Plutôt que de produire une image décalée + silencieusement, on refuse le fallback secondaire pour cette méthode. + """ + monitors_with_fallback = [ + {"left": 0, "top": 0, "width": 2560, "height": 1660}, + {"left": 0, "top": 0, "width": 2560, "height": 60}, # primary cassé + {"left": 2560, "top": 0, "width": 1920, "height": 1080}, # secondary sain + ] + factory = _make_mock_mss([monitors_with_fallback]) + + with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory), \ + patch("agent_v0.agent_v1.vision.capturer.time.sleep"): + caplog.set_level(logging.WARNING, logger="agent_v0.agent_v1.vision.capturer") + cap = _vision_capturer(tmp_path) + result = cap.capture_dual(x=300, y=400, screenshot_id="shot_dual_fb") + + assert result == {}, ( + f"capture_dual doit fail-closed sur fallback secondaire, got {result!r}" + ) + msg = " ".join(r.getMessage() for r in caplog.records).lower() + assert "fallback" in msg or "secondaire" in msg or "refus" in msg, ( + f"Un log doit expliquer le refus du fallback pour coords : {msg!r}" + ) + + +# ============================================================================ +# Test 9 — fail-closed : capture_active_window refuse le fallback secondaire +# ============================================================================ + + +def test_capture_active_window_fails_closed_when_only_secondary_monitor_sane( + tmp_path: Path, +): + """Même raison que test 8 : capture_active_window cropperait depuis l'image + de monitors[2] avec un win_rect en coords globales → zone fausse. 
+ """ + monitors_with_fallback = [ + {"left": 0, "top": 0, "width": 2560, "height": 1660}, + {"left": 0, "top": 0, "width": 2560, "height": 60}, + {"left": 2560, "top": 0, "width": 1920, "height": 1080}, + ] + factory = _make_mock_mss([monitors_with_fallback]) + fake_rect = { + "rect": [100, 100, 800, 600], # coords globales dans monitors[1] + "size": [700, 500], + "title": "Test Window", + "app_name": "test_app", + } + + with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory), \ + patch("agent_v0.agent_v1.vision.capturer.time.sleep"), \ + patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect", + return_value=fake_rect, + ): + cap = _vision_capturer(tmp_path) + result = cap.capture_active_window(x=200, y=300, screenshot_id="shot_win_fb") + + assert result is None, ( + f"capture_active_window doit fail-closed sur fallback secondaire, got {result!r}" + ) + + +# ============================================================================ +# Test 10 — mss noir : fallback ImageGrab +# ============================================================================ + + +def test_capture_screen_image_falls_back_to_imagegrab_when_mss_is_black(): + """Un frame mss noir ne doit plus être accepté silencieusement. + + Si ImageGrab fournit une image exploitable, elle doit être retenue. 
+ """ + from agent_v0.agent_v1.vision import capturer + + black_img = _solid_img((0, 0, 0)) + fallback_img = _solid_img((210, 180, 90)) + monitor = {"left": 0, "top": 0, "width": 320, "height": 240} + + with patch.object( + capturer, "_acquire_safe_grab", return_value=(monitor, black_img) + ), patch.object( + capturer, + "_capture_via_imagegrab", + return_value=(monitor, fallback_img, { + "backend": "imagegrab", + "luma": {"mean": 180.0, "stddev": 0.0, "min": 180, "max": 180}, + }), + ): + out_monitor, out_img, meta = capturer.capture_screen_image() + + assert out_monitor == monitor + assert out_img is fallback_img + assert meta["backend"] == "imagegrab" + + +# ============================================================================ +# Test 11 — capture_dual dégradé : conserver window_capture +# ============================================================================ + + +def test_capture_dual_keeps_window_capture_when_fullscreen_is_unavailable( + tmp_path: Path, +): + """Même sans full/crop, la capture fenêtre doit survivre. + + Cela permet au serveur de conserver un contexte utile plutôt que de + travailler sur un écran noir. 
+ """ + fake_window = { + "window_image": str(tmp_path / "window_only.png"), + "window_title": "Bloc-notes", + "app_name": "notepad.exe", + "window_rect": [100, 100, 800, 600], + "window_size": [700, 500], + "click_in_window": [42, 24], + "click_inside_window": True, + } + + cap = _vision_capturer(tmp_path) + with patch( + "agent_v0.agent_v1.vision.capturer.capture_screen_image", + return_value=(None, None, {"backend": "mss_black"}), + ), patch.object(cap, "capture_active_window", return_value=fake_window): + result = cap.capture_dual(x=200, y=300, screenshot_id="shot_dual") + + assert "full" not in result + assert "crop" not in result + assert result["window_capture"] == fake_window + + +# ============================================================================ +# Test 12 — non-régression : capture_full_context PEUT utiliser le fallback +# ============================================================================ + + +def test_capture_full_context_still_uses_secondary_fallback( + tmp_path: Path, caplog: pytest.LogCaptureFixture +): + """capture_full_context (heartbeat) ne porte pas de coords client : un + écran sain quelconque suffit. Le fallback secondaire reste autorisé. + Sinon le heartbeat tomberait dès qu'un monitor est cassé en permanence. 
+ """ + monitors_with_fallback = [ + {"left": 0, "top": 0, "width": 2560, "height": 1660}, + {"left": 0, "top": 0, "width": 2560, "height": 60}, + {"left": 2560, "top": 0, "width": 1920, "height": 1080}, + ] + factory = _make_mock_mss([monitors_with_fallback]) + + with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory), \ + patch("agent_v0.agent_v1.vision.capturer.time.sleep"): + caplog.set_level(logging.WARNING, logger="agent_v0.agent_v1.vision.capturer") + cap = _vision_capturer(tmp_path) + result = cap.capture_full_context("test_heartbeat_fb", force=True) + + assert result, ( + f"capture_full_context doit accepter fallback (heartbeat sans coords), got {result!r}" + ) + assert Path(result).exists() diff --git a/tests/unit/test_chat_window_paused_dispatch.py b/tests/unit/test_chat_window_paused_dispatch.py new file mode 100644 index 000000000..7ce1683cc --- /dev/null +++ b/tests/unit/test_chat_window_paused_dispatch.py @@ -0,0 +1,165 @@ +"""Tests pour ChatWindow._dispatch_paused_action. + +Couvre le routage bus SocketIO → fallback HTTP de la bulle paused. +Le bug d'origine ``paused_bubble: bus déconnecté, resume non émis`` +était causé par l'absence de ce fallback (cf. +``docs/CR_AUDIT_PAUSED_RESUME_BUS_2026-05-22.md``). + +Les tests appellent ``ChatWindow._dispatch_paused_action`` en tant +que fonction unbound avec un faux ``self`` (``SimpleNamespace``) pour +éviter de démarrer Tkinter pendant les tests unitaires. 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import MagicMock + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + +from agent_v0.agent_v1.ui.chat_window import ChatWindow # noqa: E402 + + +def _make_self(bus=None, server_client=None): + return SimpleNamespace(_bus=bus, _server_client=server_client) + + +def _call(mock_self, replay_id="replay_xyz", + bus_method="resume_replay", client_method="resume_replay"): + return ChatWindow._dispatch_paused_action( + mock_self, replay_id, bus_method=bus_method, client_method=client_method, + ) + + +class TestDispatchPausedAction: + def test_bus_connected_and_emits_uses_bus(self): + bus = MagicMock(connected=True) + bus.resume_replay.return_value = True + client = MagicMock(resume_replay=MagicMock(return_value=True)) + emitted, channel = _call(_make_self(bus=bus, server_client=client)) + assert emitted is True + assert channel == "bus" + bus.resume_replay.assert_called_once_with("replay_xyz") + client.resume_replay.assert_not_called() + + def test_bus_disconnected_falls_back_to_http(self): + bus = MagicMock(connected=False) + client = MagicMock(resume_replay=MagicMock(return_value=True)) + emitted, channel = _call(_make_self(bus=bus, server_client=client)) + assert emitted is True + assert channel == "http" + bus.resume_replay.assert_not_called() + client.resume_replay.assert_called_once_with("replay_xyz") + + def test_bus_emit_returns_false_falls_back_to_http(self): + """Bus marqué connecté mais l'emit retourne False (socket cassé + entre connect() et send) → bascule sur HTTP.""" + bus = MagicMock(connected=True) + bus.resume_replay.return_value = False + client = MagicMock(resume_replay=MagicMock(return_value=True)) + emitted, channel = _call(_make_self(bus=bus, server_client=client)) + assert emitted is True + assert channel == "http" + + def test_bus_emit_raises_falls_back_to_http(self): + bus = 
MagicMock(connected=True) + bus.resume_replay.side_effect = RuntimeError("socket broken") + client = MagicMock(resume_replay=MagicMock(return_value=True)) + emitted, channel = _call(_make_self(bus=bus, server_client=client)) + assert emitted is True + assert channel == "http" + + def test_no_bus_uses_http_directly(self): + client = MagicMock(resume_replay=MagicMock(return_value=True)) + emitted, channel = _call(_make_self(bus=None, server_client=client)) + assert emitted is True + assert channel == "http" + + def test_all_channels_fail_returns_false(self): + """Cas critique : bus déconnecté ET HTTP injoignable → l'UI + doit ré-activer les boutons côté appelant. Ici on vérifie + juste que dispatch retourne (False, '').""" + bus = MagicMock(connected=False) + client = MagicMock(resume_replay=MagicMock(return_value=False)) + emitted, channel = _call(_make_self(bus=bus, server_client=client)) + assert emitted is False + assert channel == "" + + def test_neither_bus_nor_client_returns_false(self): + emitted, channel = _call(_make_self(bus=None, server_client=None)) + assert emitted is False + assert channel == "" + + def test_client_method_missing_falls_through(self): + """Si server_client est un vieux client sans resume_replay, + on ne plante pas — on retourne (False, '').""" + bus = MagicMock(connected=False) + legacy_client = SimpleNamespace() # pas de resume_replay + emitted, channel = _call( + _make_self(bus=bus, server_client=legacy_client), + ) + assert emitted is False + assert channel == "" + + def test_abort_routing_symmetric(self): + """Le même mécanisme couvre l'abort — vérifie qu'on utilise + bien la méthode demandée par le caller.""" + bus = MagicMock(connected=False) + client = MagicMock(abort_replay=MagicMock(return_value=True)) + emitted, channel = _call( + _make_self(bus=bus, server_client=client), + bus_method="abort_replay", + client_method="abort_replay", + ) + assert emitted is True + assert channel == "http" + 
client.abort_replay.assert_called_once_with("replay_xyz") + + +class TestPausedBubbleHeight: + """Couvre _compute_paused_bubble_height — patch troncature 22 mai 2026.""" + + def test_empty_message_uses_minimum_height(self): + h, scroll = ChatWindow._compute_paused_bubble_height("") + assert h == 2 + assert scroll is False + + def test_short_message_no_scrollbar(self): + h, scroll = ChatWindow._compute_paused_bubble_height("Court message.") + assert h == 2 + assert scroll is False + + def test_long_single_line_triggers_scrollbar(self): + # ~600 chars sans \n → wrapped_lines = 600 // 60 + 1 = 11 + msg = "x" * 600 + h, scroll = ChatWindow._compute_paused_bubble_height(msg) + assert h == 11 + assert scroll is True + + def test_message_with_many_newlines_uses_explicit_count(self): + """Cas du bug : reason serveur listant 6 candidats sur 6 lignes + courtes — wrapped_lines bas mais explicit_lines élevé.""" + msg = "\n".join([f"option {i}" for i in range(6)]) + h, scroll = ChatWindow._compute_paused_bubble_height(msg) + # 6 lignes explicites > 2 lignes wrappées → hauteur = 6 + assert h == 6 + # Pas encore au cap, contenu court → pas de scrollbar + assert scroll is False + + def test_cap_reached_triggers_scrollbar_even_if_short(self): + """Quand on dépasse le cap (12 lignes), la scrollbar DOIT + s'afficher quel que soit la longueur en caractères.""" + msg = "\n".join([f"l{i}" for i in range(20)]) + h, scroll = ChatWindow._compute_paused_bubble_height(msg) + assert h == 12 # plafond + assert scroll is True + + def test_long_content_triggers_scrollbar_at_200_chars(self): + """Seuil sécurité texte : ≥ 200 chars → scrollbar même si + peu de lignes (filet anti-troncature visuel).""" + msg = "x" * 220 + h, scroll = ChatWindow._compute_paused_bubble_height(msg) + assert scroll is True diff --git a/tests/unit/test_env_setup.py b/tests/unit/test_env_setup.py index 0f748cfae..d3660cca2 100644 --- a/tests/unit/test_env_setup.py +++ b/tests/unit/test_env_setup.py @@ -16,6 +16,7 @@ 
sys.path.insert(0, str(ROOT)) from agent_v0.server_v1.api_stream import ( _extract_required_apps_from_events, _extract_required_apps_from_workflow, + _trim_redundant_setup_events, _resolve_launch_command, _infer_app_from_window_titles, _generate_setup_actions, @@ -220,6 +221,139 @@ class TestExtractRequiredAppsFromEvents: # Le premier app hors ignorées est Notepad assert result["first_window_title"] == "Bloc-notes" + def test_extracts_searchhost_launch_result_target(self): + """Récupère le vrai clic SearchHost qui lance l'app.""" + events = [ + {"event": {"type": "window_focus_change", "from": None, "to": { + "app_name": "explorer.exe", "title": "Explorateur"}}}, + {"event": {"type": "window_focus_change", "from": { + "app_name": "explorer.exe", "title": "Explorateur"}, "to": { + "app_name": "SearchHost.exe", "title": "Rechercher"}}}, + {"event": {"type": "text_input", "text": "bloc", "window": { + "app_name": "SearchHost.exe", "title": "Rechercher"}}}, + {"event": {"type": "mouse_click", "button": "left", "pos": [1449, 641], + "timestamp": 10.0, + "screen_metadata": {"screen_resolution": [2560, 1600]}, + "window": {"app_name": "SearchHost.exe", "title": "Rechercher"}, + "window_capture": { + "click_relative": [681, 448], + "window_size": [1287, 1407], + }}}, + {"event": {"type": "window_focus_change", "from": { + "app_name": "SearchHost.exe", "title": "Rechercher"}, "to": { + "app_name": "explorer.exe", "title": "unknown_window"}, + "timestamp": 10.4}}, + {"event": {"type": "window_focus_change", "from": { + "app_name": "explorer.exe", "title": "unknown_window"}, "to": { + "app_name": "Notepad.exe", "title": "Sans titre – Bloc-notes"}, + "timestamp": 11.1}}, + ] + + result = _extract_required_apps_from_events(events) + target = result["launch_result_target"] + assert result["primary_app"] == "Notepad.exe" + assert target["window_title"] == "Rechercher" + assert target["expected_window_before"] == "Rechercher" + assert target["x_pct"] == pytest.approx(1449 / 2560, 
rel=0, abs=1e-6) + assert target["y_pct"] == pytest.approx(641 / 1600, rel=0, abs=1e-6) + assert target["original_position"]["x_relative"] == "au centre" + assert target["original_position"]["y_relative"] == "au milieu" + assert target["window_capture"]["click_relative"] == [681, 448] + + def test_extracts_start_menu_target(self): + """Récupère le vrai clic Démarrer qui ouvre SearchHost.""" + events = [ + {"event": {"type": "window_focus_change", "from": None, "to": { + "app_name": "explorer.exe", "title": "Explorateur"}}}, + {"event": {"type": "mouse_click", "button": "left", "pos": [993, 1559], + "timestamp": 1.0, + "screen_metadata": {"screen_resolution": [2560, 1600]}, + "window": {"app_name": "explorer.exe", "title": "Explorateur"}}}, + {"event": {"type": "window_focus_change", "from": { + "app_name": "explorer.exe", "title": "Explorateur"}, "to": { + "app_name": "SearchHost.exe", "title": "Rechercher"}, + "timestamp": 1.2}}, + {"event": {"type": "mouse_click", "button": "left", "pos": [1449, 641], + "timestamp": 4.0, + "screen_metadata": {"screen_resolution": [2560, 1600]}, + "window": {"app_name": "SearchHost.exe", "title": "Rechercher"}}}, + {"event": {"type": "window_focus_change", "from": { + "app_name": "SearchHost.exe", "title": "Rechercher"}, "to": { + "app_name": "Notepad.exe", "title": "Sans titre – Bloc-notes"}, + "timestamp": 4.4}}, + ] + + result = _extract_required_apps_from_events(events) + target = result["start_menu_target"] + assert target["x_pct"] == pytest.approx(993 / 2560, rel=0, abs=1e-6) + assert target["y_pct"] == pytest.approx(1559 / 1600, rel=0, abs=1e-6) + assert target["original_position"]["x_relative"] == "au centre" + assert target["original_position"]["y_relative"] == "en bas" + assert "en bas" in target["position_desc"] + + def test_extracts_start_menu_target_anchor_from_session_shot(self, tmp_path): + """Le clic Démarrer récupère aussi une ancre image depuis le shot source.""" + from PIL import Image + + session_dir = tmp_path 
/ "sess" + shots_dir = session_dir / "shots" + shots_dir.mkdir(parents=True) + Image.new("RGB", (2560, 1600), color="white").save( + shots_dir / "shot_start_full.png" + ) + + events = [ + {"event": {"type": "window_focus_change", "from": None, "to": { + "app_name": "explorer.exe", "title": "Explorateur"}}}, + {"event": {"type": "mouse_click", "button": "left", "pos": [993, 1559], + "timestamp": 1.0, + "screenshot_id": "shot_start", + "screen_metadata": {"screen_resolution": [2560, 1600]}, + "window": {"app_name": "explorer.exe", "title": "Explorateur"}}}, + {"event": {"type": "window_focus_change", "from": { + "app_name": "explorer.exe", "title": "Explorateur"}, "to": { + "app_name": "SearchHost.exe", "title": "Rechercher"}, + "timestamp": 1.2}}, + {"event": {"type": "window_focus_change", "from": { + "app_name": "SearchHost.exe", "title": "Rechercher"}, "to": { + "app_name": "Notepad.exe", "title": "Sans titre – Bloc-notes"}, + "timestamp": 2.0}}, + ] + + result = _extract_required_apps_from_events( + events, + session_dir=str(session_dir), + ) + target = result["start_menu_target"] + + assert target["anchor_image_base64"] + + def test_extracts_direct_typing_search_interaction(self): + """Détecte qu'aucun clic SearchHost n'est requis avant la saisie.""" + events = [ + {"event": {"type": "window_focus_change", "from": None, "to": { + "app_name": "explorer.exe", "title": "Explorateur"}}}, + {"event": {"type": "mouse_click", "button": "left", "pos": [993, 1559], + "timestamp": 1.0, + "screen_metadata": {"screen_resolution": [2560, 1600]}, + "window": {"app_name": "explorer.exe", "title": "Explorateur"}}}, + {"event": {"type": "window_focus_change", "from": { + "app_name": "explorer.exe", "title": "Explorateur"}, "to": { + "app_name": "SearchHost.exe", "title": "Rechercher"}, + "timestamp": 1.2}}, + {"event": {"type": "text_input", "text": "bloc", + "window": {"app_name": "SearchHost.exe", "title": "Rechercher"}, + "timestamp": 2.0}}, + {"event": {"type": 
"window_focus_change", "from": { + "app_name": "SearchHost.exe", "title": "Rechercher"}, "to": { + "app_name": "Notepad.exe", "title": "Sans titre – Bloc-notes"}, + "timestamp": 2.4}}, + ] + + result = _extract_required_apps_from_events(events) + assert result["search_box_interaction"]["mode"] == "direct_typing" + assert result["search_box_interaction"]["window_title"] == "Rechercher" + def test_empty_events(self): """Pas d'événements → dict vide.""" assert _extract_required_apps_from_events([]) == {} @@ -245,6 +379,187 @@ class TestExtractRequiredAppsFromEvents: assert result["primary_launch_cmd"] == "calc" +class TestTrimRedundantSetupEvents: + """Tests pour la coupe du préambule déjà couvert par le setup.""" + + def test_trims_until_first_primary_app_focus(self): + raw_events = [ + {"event": {"type": "window_focus_change", "to": { + "app_name": "explorer.exe", "title": "Explorateur"}}}, + {"event": {"type": "mouse_click", "pos": [993, 1559], "window": { + "app_name": "explorer.exe", "title": "Explorateur"}}}, + {"event": {"type": "window_focus_change", "to": { + "app_name": "SearchHost.exe", "title": "Rechercher"}}}, + {"event": {"type": "text_input", "text": "bloc", "window": { + "app_name": "SearchHost.exe", "title": "Rechercher"}}}, + {"event": {"type": "mouse_click", "pos": [1449, 641], "window": { + "app_name": "SearchHost.exe", "title": "Rechercher"}}}, + {"event": {"type": "window_focus_change", "to": { + "app_name": "Notepad.exe", + "title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + }}}, + {"event": {"type": "mouse_click", "pos": [1514, 562], "window": { + "app_name": "Notepad.exe", "title": "*test – Bloc-notes"}}}, + {"event": {"type": "text_input", "text": "test", "window": { + "app_name": "Notepad.exe", "title": "*test – Bloc-notes"}}}, + ] + app_info = { + "primary_app": "Notepad.exe", + "first_window_title": "Bloc-notes", + } + + trimmed = _trim_redundant_setup_events(raw_events, app_info) + + assert len(trimmed) == 2 + assert 
trimmed[0]["event"]["type"] == "mouse_click" + assert trimmed[0]["event"]["pos"] == [1514, 562] + assert trimmed[1]["event"]["type"] == "text_input" + + def test_keeps_events_when_no_matching_focus_found(self): + raw_events = [ + {"event": {"type": "mouse_click", "pos": [10, 10], "window": { + "app_name": "explorer.exe", "title": "Explorateur"}}}, + {"event": {"type": "text_input", "text": "abc", "window": { + "app_name": "explorer.exe", "title": "Explorateur"}}}, + ] + app_info = { + "primary_app": "Notepad.exe", + "first_window_title": "Bloc-notes", + } + + trimmed = _trim_redundant_setup_events(raw_events, app_info) + + assert trimmed == raw_events + + def test_prefers_neutral_title_focus_after_non_neutral_first_focus(self): + """Cas observé sess_20260520T102916_066851 : premier focus Notepad + a un titre non-neutre (http...txt), suivi d'un clic intra-Notepad + et d'un focus vers 'Sans titre' (= état initial neutre que le setup + auto produit). Le trim doit couper jusqu'au focus neutre pour + éliminer le clic intra-Notepad redondant. 
+ """ + raw_events = [ + {"event": {"type": "window_focus_change", "to": { + "app_name": "SearchHost.exe", "title": "Rechercher"}}}, + {"event": {"type": "mouse_click", "pos": [681, 448], "window": { + "app_name": "SearchHost.exe", "title": "Rechercher"}}}, + {"event": {"type": "window_focus_change", "to": { + "app_name": "Notepad.exe", + "title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + }}}, + {"event": {"type": "mouse_click", "pos": [1191, 40], "window": { + "app_name": "Notepad.exe", + "title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + }}}, + {"event": {"type": "window_focus_change", "to": { + "app_name": "Notepad.exe", "title": "Sans titre – Bloc-notes"}}}, + {"event": {"type": "text_input", "text": "test", "window": { + "app_name": "Notepad.exe", "title": "*test – Bloc-notes"}}}, + ] + app_info = { + "primary_app": "Notepad.exe", + "first_window_title": ( + "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes" + ), + } + + trimmed = _trim_redundant_setup_events(raw_events, app_info) + + # Le clic intra-Notepad (event idx 3) doit être supprimé : il + # bascule vers 'Sans titre' qui est déjà l'état setup, donc + # rejoué il n'a aucun effet visuel et déclenche retry_threshold. + assert len(trimmed) == 1 + assert trimmed[0]["event"]["type"] == "text_input" + assert trimmed[0]["event"]["text"] == "test" + + def test_neutral_focus_outside_lookahead_window_is_ignored(self): + """Filet de sécurité : un focus 'Sans titre' qui arrive trop loin + après le premier focus primary_app n'est pas considéré comme + l'état de bootstrap. 
Évite de couper un workflow qui re-visite + 'Sans titre' bien après le démarrage.""" + # 30 events séparent le premier focus du focus neutre + raw_events = [ + {"event": {"type": "window_focus_change", "to": { + "app_name": "Notepad.exe", + "title": "rapport_final.txt – Bloc-notes"}}}, + ] + # Bourrer avec des events utiles intra-Notepad + for i in range(30): + raw_events.append({"event": { + "type": "mouse_click", "pos": [100 + i, 200], + "window": {"app_name": "Notepad.exe", + "title": "rapport_final.txt – Bloc-notes"}, + }}) + raw_events.append({"event": {"type": "window_focus_change", "to": { + "app_name": "Notepad.exe", "title": "Sans titre – Bloc-notes"}}}) + raw_events.append({"event": {"type": "text_input", "text": "x", + "window": {"app_name": "Notepad.exe", + "title": "Sans titre – Bloc-notes"}}}) + + app_info = { + "primary_app": "Notepad.exe", + "first_window_title": "rapport_final.txt – Bloc-notes", + } + + trimmed = _trim_redundant_setup_events(raw_events, app_info) + + # Doit garder les 30 clicks + focus tardif + text_input = 32 events + # (cut uniquement au premier focus primary_app, comportement legacy) + assert len(trimmed) == 32 + assert trimmed[0]["event"]["type"] == "mouse_click" + assert trimmed[0]["event"]["pos"] == [100, 200] + + def test_keeps_legacy_behavior_when_first_focus_already_neutral(self): + """Non-régression : si le premier focus primary_app est déjà sur + un titre neutre (cas normal), on coupe au premier focus comme + avant — pas de chasse au neutral_idx inutile.""" + raw_events = [ + {"event": {"type": "window_focus_change", "to": { + "app_name": "SearchHost.exe", "title": "Rechercher"}}}, + {"event": {"type": "window_focus_change", "to": { + "app_name": "Notepad.exe", "title": "Sans titre – Bloc-notes"}}}, + {"event": {"type": "text_input", "text": "hello", + "window": {"app_name": "Notepad.exe", + "title": "Sans titre – Bloc-notes"}}}, + ] + app_info = { + "primary_app": "Notepad.exe", + "first_window_title": "Sans titre – 
Bloc-notes", + } + + trimmed = _trim_redundant_setup_events(raw_events, app_info) + + assert len(trimmed) == 1 + assert trimmed[0]["event"]["type"] == "text_input" + + def test_neutral_detection_recognizes_office_default_titles(self): + """Word, Excel, PowerPoint utilisent leurs propres titres + par défaut (Document1, Classeur1, etc.) que le setup auto + amène également.""" + raw_events = [ + {"event": {"type": "window_focus_change", "to": { + "app_name": "winword.exe", + "title": "rapport.docx - Word"}}}, + {"event": {"type": "mouse_click", "pos": [100, 40], + "window": {"app_name": "winword.exe", + "title": "rapport.docx - Word"}}}, + {"event": {"type": "window_focus_change", "to": { + "app_name": "winword.exe", "title": "Document1 - Word"}}}, + {"event": {"type": "text_input", "text": "abc", + "window": {"app_name": "winword.exe", + "title": "Document1 - Word"}}}, + ] + app_info = { + "primary_app": "winword.exe", + "first_window_title": "rapport.docx - Word", + } + + trimmed = _trim_redundant_setup_events(raw_events, app_info) + + assert len(trimmed) == 1 + assert trimmed[0]["event"]["type"] == "text_input" + + # ========================================================================= # Tests pour _extract_required_apps_from_workflow # ========================================================================= @@ -304,10 +619,10 @@ class TestExtractRequiredAppsFromWorkflow: # ========================================================================= class TestGenerateSetupActions: - """Tests pour la génération des actions de setup 100% visuelles.""" + """Tests pour la génération des actions de setup.""" - def test_notepad_setup_visual(self): - """Génère les bonnes actions visuelles pour lancer Notepad.""" + def test_notepad_setup_uses_run_dialog(self): + """Bloc-notes utilise désormais le setup sémantique Win+R.""" app_info = { "primary_app": "Notepad.exe", "primary_launch_cmd": "notepad", @@ -315,74 +630,52 @@ class TestGenerateSetupActions: } actions = 
_generate_setup_actions(app_info) - # 9 actions : click_start, wait, click_search, wait, type, wait, click_result, wait, verify - assert len(actions) == 9 + assert len(actions) == 7 - # Étape 1 : clic visuel sur le bouton Démarrer - assert actions[0]["type"] == "click" - assert actions[0]["visual_mode"] is True - assert actions[0]["target_spec"]["by_role"] == "start_button" - assert actions[0]["target_spec"]["by_text"] == "Démarrer" + assert actions[0]["type"] == "key_combo" + assert actions[0]["keys"] == ["win", "r"] + assert actions[0]["_setup_step"] == "open_run_dialog" - # Étape 2 : attente menu Démarrer assert actions[1]["type"] == "wait" - assert actions[1]["duration_ms"] == 1000 + assert actions[1]["duration_ms"] == 500 - # Étape 3 : clic visuel sur la barre de recherche - assert actions[2]["type"] == "click" - assert actions[2]["visual_mode"] is True - assert actions[2]["target_spec"]["by_role"] == "search_box" + assert actions[2]["type"] == "type" + assert actions[2]["text"] == "notepad" - # Étape 4 : attente barre de recherche active assert actions[3]["type"] == "wait" - assert actions[3]["duration_ms"] == 500 + assert actions[3]["duration_ms"] == 300 - # Étape 5 : taper le nom visuel français - assert actions[4]["type"] == "type" - assert actions[4]["text"] == "Bloc-notes" + assert actions[4]["type"] == "key_combo" + assert actions[4]["keys"] == ["enter"] - # Étape 6 : attente résultats assert actions[5]["type"] == "wait" - assert actions[5]["duration_ms"] == 1200 + assert actions[5]["duration_ms"] == 2000 - # Étape 7 : clic visuel sur le résultat - assert actions[6]["type"] == "click" - assert actions[6]["visual_mode"] is True - assert actions[6]["target_spec"]["by_text"] == "Bloc-notes" - assert actions[6]["target_spec"]["by_role"] == "app_icon" - - # Étape 8 : attente lancement (app légère = 2000ms) - assert actions[7]["type"] == "wait" - assert actions[7]["duration_ms"] == 2000 - - # Étape 9 : vérification visuelle - assert actions[8]["type"] == 
"verify_screen" - assert actions[8]["_expected_title"] == "Sans titre – Bloc-notes" + assert actions[6]["type"] == "verify_screen" + assert actions[6]["expected_window_title_contains"] == ["Bloc-notes", "notepad"] # Toutes les actions sont marquées comme phase setup for action in actions: assert action.get("_setup_phase") is True + assert action.get("_setup_strategy") == "run_dialog" - def test_no_key_combo_in_setup(self): - """AUCUNE action key_combo ne doit être générée dans le setup.""" + def test_visual_setup_keeps_no_key_combo_for_word(self): + """Le setup visuel classique ne doit pas introduire de key_combo.""" app_info = { - "primary_app": "Notepad.exe", - "primary_launch_cmd": "notepad", - "first_window_title": "Bloc-notes", + "primary_app": "winword.exe", + "primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", } actions = _generate_setup_actions(app_info) key_combos = [a for a in actions if a["type"] == "key_combo"] - assert key_combos == [], ( - "Le setup 100% visuel ne doit JAMAIS contenir de key_combo. 
" - f"Trouvé : {key_combos}" - ) + assert key_combos == [] - def test_all_clicks_are_visual(self): - """Tous les clics du setup doivent avoir visual_mode=True et un target_spec.""" + def test_all_clicks_are_visual_for_visual_setup(self): + """Tous les clics du setup visuel doivent avoir visual_mode=True.""" app_info = { - "primary_app": "Notepad.exe", - "primary_launch_cmd": "notepad", - "first_window_title": "Bloc-notes", + "primary_app": "winword.exe", + "primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", } actions = _generate_setup_actions(app_info) clicks = [a for a in actions if a["type"] == "click"] @@ -402,11 +695,11 @@ class TestGenerateSetupActions: assert "vlm_description" in spec, f"target_spec sans vlm_description : {spec}" def test_clicks_have_fallback_coordinates(self): - """Tous les clics visuels ont des coordonnées de fallback (x_pct, y_pct).""" + """Tous les clics visuels ont des coordonnées de fallback.""" app_info = { - "primary_app": "Notepad.exe", - "primary_launch_cmd": "notepad", - "first_window_title": "Bloc-notes", + "primary_app": "winword.exe", + "primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", } actions = _generate_setup_actions(app_info) clicks = [a for a in actions if a["type"] == "click"] @@ -456,28 +749,130 @@ class TestGenerateSetupActions: click_result = [a for a in actions if a.get("_setup_step") == "click_app_result"][0] assert click_result["target_spec"]["by_text"] == "Microsoft Word" - def test_verify_screen_present_with_title(self): - """Un verify_screen est ajouté quand un titre de fenêtre est connu.""" + def test_prefers_recorded_searchhost_click_target(self): + """Le setup réutilise la vraie cible SearchHost quand elle existe.""" + app_info = { + "primary_app": "winword.exe", + "primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", + "launch_result_target": { + "x_pct": 0.566016, + "y_pct": 0.400625, + "window_title": "Rechercher", + 
"expected_window_before": "Rechercher", + "original_position": { + "x_relative": "au centre", + "y_relative": "au milieu", + }, + "window_capture": { + "click_relative": [681, 448], + "window_size": [1287, 1407], + }, + "position_desc": "au milieu au centre", + }, + } + actions = _generate_setup_actions(app_info) + + click_result = [a for a in actions if a.get("_setup_step") == "click_app_result"][0] + assert click_result["x_pct"] == pytest.approx(0.566016) + assert click_result["y_pct"] == pytest.approx(0.400625) + assert click_result["expected_window_before"] == "Rechercher" + assert click_result["target_spec"]["by_text"] == "Microsoft Word" + assert click_result["target_spec"]["by_role"] == "search_result" + assert click_result["target_spec"]["allow_position_fallback"] is True + assert click_result["target_spec"]["window_title"] == "Rechercher" + assert click_result["target_spec"]["original_position"]["x_relative"] == "au centre" + assert click_result["target_spec"]["window_capture"]["window_size"] == [1287, 1407] + assert "résultat de recherche" in click_result["target_spec"]["vlm_description"] + + def test_prefers_recorded_start_button_target(self): + """Le setup visuel réutilise le vrai clic Démarrer quand il existe.""" + app_info = { + "primary_app": "winword.exe", + "primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", + "start_menu_target": { + "x_pct": 0.387891, + "y_pct": 0.974375, + "anchor_image_base64": "abc123", + "original_position": { + "x_relative": "au centre", + "y_relative": "en bas", + }, + "position_desc": "en bas au centre", + }, + } + actions = _generate_setup_actions(app_info) + + click_start = [a for a in actions if a.get("_setup_step") == "click_start_menu"][0] + assert click_start["x_pct"] == pytest.approx(0.387891) + assert click_start["y_pct"] == pytest.approx(0.974375) + assert click_start["target_spec"]["by_text"] == "" + assert click_start["target_spec"]["by_role"] == "start_button" + assert 
click_start["target_spec"]["screen_scope"] == "full_screen" + assert click_start["target_spec"]["allow_position_fallback"] is True + assert click_start["target_spec"]["anchor_image_base64"] == "abc123" + assert click_start["target_spec"]["original_position"]["y_relative"] == "en bas" + assert "icône Windows" in click_start["target_spec"]["vlm_description"] + + def test_skips_search_click_for_direct_typing(self): + """Quand la session tape directement dans SearchHost, on saute + click_search et son wait/verify dédiés. La garde + verify_start_menu_open reste obligatoire et précède le type.""" + app_info = { + "primary_app": "winword.exe", + "primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", + "search_box_interaction": { + "mode": "direct_typing", + "window_title": "Rechercher", + }, + } + actions = _generate_setup_actions(app_info) + + setup_steps = [a.get("_setup_step") for a in actions] + assert "click_search_box" not in setup_steps + assert "wait_search_ready" not in setup_steps + assert "verify_search_box_active" not in setup_steps + # Garde générique conservée — c'est elle qui sécurise la frappe. 
+ assert "verify_start_menu_open" in setup_steps + + idx_type = setup_steps.index("type_app_name") + assert actions[idx_type]["type"] == "type" + assert actions[idx_type]["text"] == "Word" + + def test_verify_screen_final_present_with_title(self): + """Le setup run_dialog termine par une vérification souple sur le titre app.""" app_info = { "primary_app": "Notepad.exe", "primary_launch_cmd": "notepad", "first_window_title": "Sans titre – Bloc-notes", } actions = _generate_setup_actions(app_info) - verify = [a for a in actions if a.get("type") == "verify_screen"] - assert len(verify) == 1 - assert verify[0]["_expected_title"] == "Sans titre – Bloc-notes" + final_verifies = [ + a for a in actions + if a.get("type") == "verify_screen" + and a.get("_setup_step") == "verify_app_ready" + ] + assert len(final_verifies) == 1 + assert "Bloc-notes" in final_verifies[0]["expected_window_title_contains"] - def test_no_verify_without_title(self): - """Pas de verify_screen si aucun titre de fenêtre n'est connu.""" + def test_run_dialog_keeps_final_verify_even_without_exact_title(self): + """Le setup run_dialog garde une vérification finale générique.""" app_info = { "primary_app": "Notepad.exe", "primary_launch_cmd": "notepad", "first_window_title": "", } actions = _generate_setup_actions(app_info) - verify = [a for a in actions if a.get("type") == "verify_screen"] - assert len(verify) == 0 + # Aucun verify_screen ne doit porter _expected_title. 
+ final_verifies = [ + a for a in actions + if a.get("type") == "verify_screen" + and a.get("_setup_step") == "verify_app_ready" + ] + assert len(final_verifies) == 1 + assert "notepad" in [p.lower() for p in final_verifies[0]["expected_window_title_contains"]] def test_empty_app_info(self): """Dict vide → pas d'actions.""" @@ -537,12 +932,184 @@ class TestGenerateSetupActions: assert type_action["text"] == "MonAppMedical" +# ========================================================================= +# Tests des gardes visuelles du setup (verify_screen titre fenêtre) +# ========================================================================= + + +class TestSetupVisualGuards: + """Couvre les gardes visuelles insérées entre les étapes du setup + auto Windows (post-blocage `position_fallback` live du 22 mai 2026). + + Sans ces gardes, un clic Démarrer qui touche en fait le systray + overflow popup laissait le setup taper « bloc » dans la mauvaise + fenêtre, et seul le `click_result` final remontait l'erreur — trop + tard. Les `verify_screen` titre-fenêtre stoppent net après chaque + étape critique. 
+ """ + + def test_verify_start_menu_open_inserted_after_wait_start(self): + """Une garde verify_screen est insérée juste après wait_start_menu.""" + app_info = { + "primary_app": "winword.exe", + "primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", + } + actions = _generate_setup_actions(app_info) + steps = [a.get("_setup_step") for a in actions] + + # Ordre : click_start_menu → wait_start_menu → verify_start_menu_open + assert "verify_start_menu_open" in steps + idx_wait = steps.index("wait_start_menu") + idx_verify = steps.index("verify_start_menu_open") + assert idx_verify == idx_wait + 1 + + verify = actions[idx_verify] + assert verify["type"] == "verify_screen" + assert verify.get("_setup_phase") is True + patterns = verify.get("expected_window_title_contains") or [] + assert isinstance(patterns, list) and patterns + lowered = [p.lower() for p in patterns] + # Doit couvrir au minimum FR + EN + l'app SearchHost / StartMenu + assert any("recherch" in p for p in lowered), patterns + assert any("search" in p for p in lowered), patterns + + def test_verify_search_box_active_inserted_when_click_then_type(self): + """Quand le setup clique sur la barre Rechercher puis attend, + une garde verify_screen suit l'attente pour bloquer la frappe + si le focus n'est pas réellement dans la barre.""" + app_info = { + "primary_app": "winword.exe", + "primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", + "search_box_interaction": { + "mode": "click_then_type", + "window_title": "Rechercher", + "x_pct": 0.10, "y_pct": 0.95, + }, + } + actions = _generate_setup_actions(app_info) + steps = [a.get("_setup_step") for a in actions] + + assert "verify_search_box_active" in steps + idx_wait_ready = steps.index("wait_search_ready") + idx_verify = steps.index("verify_search_box_active") + idx_type = steps.index("type_app_name") + # Ordre : wait_search_ready → verify_search_box_active → type_app_name + assert idx_verify == idx_wait_ready + 1 
+ assert idx_type == idx_verify + 1 + + verify = actions[idx_verify] + assert verify["type"] == "verify_screen" + patterns = verify.get("expected_window_title_contains") or [] + assert "Rechercher" in patterns or any( + p.lower() == "rechercher" for p in patterns + ) + + def test_no_verify_search_box_when_direct_typing(self): + """En mode direct_typing on n'a pas de click sur la barre — donc + pas de verify_search_box_active dédié (la garde verify_start_menu_open + suffit, on tape directement après).""" + app_info = { + "primary_app": "winword.exe", + "primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", + "search_box_interaction": { + "mode": "direct_typing", + "window_title": "Rechercher", + }, + } + actions = _generate_setup_actions(app_info) + steps = [a.get("_setup_step") for a in actions] + assert "verify_search_box_active" not in steps + # La garde verify_start_menu_open reste présente (couvre la frappe). + assert "verify_start_menu_open" in steps + idx_verify = steps.index("verify_start_menu_open") + idx_type = steps.index("type_app_name") + assert idx_type > idx_verify, ( + "type_app_name doit suivre verify_start_menu_open en direct_typing" + ) + + def test_verify_search_results_visible_inserted_before_click_result(self): + """Dernier filet : la barre Rechercher (et ses résultats) doit + être encore active juste avant `click_app_result`. 
Sans cette + garde finale, un focus perdu pendant `wait_search_results` + peut faire cliquer le `click_app_result` dans la mauvaise + surface (constat live 2026-05-22 — fenêtre observée + ``Fenêtre de dépassement de capacité de la barre d'état + système.``).""" + app_info = { + "primary_app": "winword.exe", + "primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", + } + actions = _generate_setup_actions(app_info) + steps = [a.get("_setup_step") for a in actions] + + assert "verify_search_results_visible" in steps + idx_wait_results = steps.index("wait_search_results") + idx_verify = steps.index("verify_search_results_visible") + idx_click_result = steps.index("click_app_result") + # Ordre : wait_search_results → verify_search_results_visible → click_app_result + assert idx_verify == idx_wait_results + 1 + assert idx_click_result == idx_verify + 1 + + verify = actions[idx_verify] + assert verify["type"] == "verify_screen" + patterns = verify.get("expected_window_title_contains") or [] + assert isinstance(patterns, list) and patterns + lowered = [p.lower() for p in patterns] + assert any("recherch" in p for p in lowered), patterns + assert any("search" in p for p in lowered), patterns + + def test_verify_search_results_visible_present_in_direct_typing(self): + """La garde finale avant click_app_result reste obligatoire + quelle que soit la modalité de la barre Rechercher.""" + app_info = { + "primary_app": "winword.exe", + "primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", + "search_box_interaction": { + "mode": "direct_typing", + "window_title": "Rechercher", + }, + } + actions = _generate_setup_actions(app_info) + steps = [a.get("_setup_step") for a in actions] + assert "verify_search_results_visible" in steps + + def test_setup_guards_have_short_timeout(self): + """Les gardes verify_screen ont un timeout court (≤ 2 s) — c'est + un check titre, pas un wait long.""" + app_info = { + "primary_app": "winword.exe", + 
"primary_launch_cmd": "winword", + "first_window_title": "Document1 - Word", + "search_box_interaction": { + "mode": "click_then_type", + "window_title": "Rechercher", + }, + } + actions = _generate_setup_actions(app_info) + guards = [ + a for a in actions + if a.get("_setup_step") in ( + "verify_start_menu_open", + "verify_search_box_active", + "verify_search_results_visible", + ) + ] + assert guards, "il doit exister au moins une garde verify_screen" + for g in guards: + assert g.get("timeout_ms", 5000) <= 2000 + + # ========================================================================= # Tests d'intégration : pipeline complet events → setup visuel # ========================================================================= class TestSetupPipeline: - """Tests du pipeline complet : extraction + génération visuelle.""" + """Tests du pipeline complet : extraction + génération du setup.""" def test_full_pipeline_from_events(self): """Pipeline complet depuis des événements bruts de type Notepad.""" @@ -561,24 +1128,25 @@ class TestSetupPipeline: assert app_info["primary_app"] == "Notepad.exe" actions = _generate_setup_actions(app_info) - assert len(actions) >= 8 # Au minimum 8 actions visuelles (sans verify si pas de titre) + assert len(actions) == 7 - # Vérifier l'ordre logique 100% visuel types = [a["type"] for a in actions] - assert types[0] == "click" # Clic Démarrer - assert types[1] == "wait" # Attente menu - assert types[2] == "click" # Clic barre de recherche - assert types[3] == "wait" # Attente barre active - assert types[4] == "type" # Taper le nom - assert types[5] == "wait" # Attente résultats - assert types[6] == "click" # Clic sur le résultat - assert types[7] == "wait" # Attente lancement + steps = [a.get("_setup_step") for a in actions] + expected_step_order = [ + "open_run_dialog", + "wait_run_dialog", + "type_launch_command", + "wait_launch_command", + "submit_run_dialog", + "wait_app_launch", + "verify_app_ready", + ] + assert steps == 
expected_step_order, steps - # AUCUN key_combo dans le pipeline - assert "key_combo" not in types, "Le pipeline ne doit contenir aucun key_combo" + assert types.count("key_combo") == 2 - # Le texte tapé est le nom visuel français - assert actions[4]["text"] == "Bloc-notes" + idx_type = steps.index("type_launch_command") + assert actions[idx_type]["text"] == "notepad" def test_full_pipeline_from_workflow(self): """Pipeline complet depuis un workflow structuré.""" @@ -599,12 +1167,12 @@ class TestSetupPipeline: assert app_info["primary_app"] == "Notepad.exe" actions = _generate_setup_actions(app_info) - assert len(actions) >= 8 + assert len(actions) == 7 - # Le texte tapé doit être le nom visuel, pas la commande shell + # Le texte tapé doit être la commande shell pour le setup Win+R. type_action = [a for a in actions if a["type"] == "type"][0] - assert type_action["text"] == "Bloc-notes" + assert type_action["text"] == "notepad" - # Aucun key_combo + # Le setup Notepad s'appuie maintenant sur deux key_combo. key_combos = [a for a in actions if a["type"] == "key_combo"] - assert key_combos == [] + assert len(key_combos) == 2 diff --git a/tests/unit/test_executor_anchor_drift_guard.py b/tests/unit/test_executor_anchor_drift_guard.py new file mode 100644 index 000000000..a0215cd4d --- /dev/null +++ b/tests/unit/test_executor_anchor_drift_guard.py @@ -0,0 +1,79 @@ +"""Tests pour la garde drift de `_template_match_anchor`. + +Brief Codex 2026-05-23 07:56 : faux succès live `act_raw_77db702f` où +ANCHOR-TM matche un crop dans OBS Studio à (0.205, 0.170) score 0.842 +alors que la position enregistrée est ~(0.706, 0.348) dans Bloc-notes. +La cascade serveur avait rejeté (`rejected_text_mismatch`) mais l'agent +fallback ANCHOR-TM côté client sans aucune garde de position acceptait +n'importe quel match au-dessus du seuil score. + +Le helper statique `_anchor_match_within_drift` rejette les matchs +loin de la position fallback enregistrée. 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + +from agent_v0.agent_v1.core.executor import ActionExecutorV1 # noqa: E402 + + +class TestAnchorMatchDriftGuard: + def test_match_close_to_fallback_accepted(self): + # 5% de drift en x → accepté + assert ActionExecutorV1._anchor_match_within_drift( + matched_x_pct=0.71, matched_y_pct=0.35, + fallback_x_pct=0.706, fallback_y_pct=0.348, + ) + + def test_match_far_from_fallback_rejected(self): + # cas live exact + assert not ActionExecutorV1._anchor_match_within_drift( + matched_x_pct=0.205, matched_y_pct=0.170, + fallback_x_pct=0.706, fallback_y_pct=0.348, + ) + + def test_drift_at_threshold_accepted(self): + # drift = 0.25 exact (frontière) + assert ActionExecutorV1._anchor_match_within_drift( + matched_x_pct=0.5, matched_y_pct=0.5, + fallback_x_pct=0.25, fallback_y_pct=0.5, + ) + + def test_drift_just_above_threshold_rejected(self): + assert not ActionExecutorV1._anchor_match_within_drift( + matched_x_pct=0.5, matched_y_pct=0.5, + fallback_x_pct=0.24, fallback_y_pct=0.5, + ) + + def test_no_recorded_fallback_keeps_legacy_behavior(self): + """Si pas de fallback enregistré (0,0), pas de garde possible.""" + assert ActionExecutorV1._anchor_match_within_drift( + matched_x_pct=0.5, matched_y_pct=0.5, + fallback_x_pct=0.0, fallback_y_pct=0.0, + ) + + def test_custom_max_drift(self): + """Le seuil est configurable par caller.""" + # Avec max_drift=0.10, un drift 0.15 est rejeté + assert not ActionExecutorV1._anchor_match_within_drift( + matched_x_pct=0.65, matched_y_pct=0.50, + fallback_x_pct=0.50, fallback_y_pct=0.50, + max_drift=0.10, + ) + # Mais accepté avec le défaut 0.25 + assert ActionExecutorV1._anchor_match_within_drift( + matched_x_pct=0.65, matched_y_pct=0.50, + fallback_x_pct=0.50, fallback_y_pct=0.50, + ) + + def test_drift_y_axis(self): + """Drift y > seuil → rejet (même si x dans la zone).""" + 
assert not ActionExecutorV1._anchor_match_within_drift( + matched_x_pct=0.50, matched_y_pct=0.95, + fallback_x_pct=0.50, fallback_y_pct=0.50, + ) diff --git a/tests/unit/test_executor_verify_window_guard.py b/tests/unit/test_executor_verify_window_guard.py new file mode 100644 index 000000000..628399752 --- /dev/null +++ b/tests/unit/test_executor_verify_window_guard.py @@ -0,0 +1,744 @@ +"""Tests pour la garde verify_screen.expected_window_title_contains. + +Cette garde protège les étapes du setup auto Windows contre les +configurations où ``click_start_menu`` se trompe de cible (systray +overflow popup, par exemple) et laisse la frappe partir dans la +mauvaise fenêtre. Ajoutée le 22 mai 2026 — cf. +``docs/CR_AUDIT_SETUP_VISUAL_GUARDS_2026-05-22.md``. + +On teste deux choses : +1. Le helper statique ``_window_title_matches_any`` (substring + case). +2. Le routage de la garde dans ``verify_screen`` : succès si titre + matche, bascule en mode apprentissage / pause sinon. +""" + +from __future__ import annotations + +import sys +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import patch, MagicMock + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + +from agent_v0.agent_v1.core.executor import ActionExecutorV1 # noqa: E402 + + +# ========================================================================= +# Helper substring matching +# ========================================================================= + + +class TestWindowTitleMatchesAny: + def test_substring_match(self): + assert ActionExecutorV1._window_title_matches_any( + "Rechercher", ["Rechercher"] + ) + + def test_case_insensitive(self): + assert ActionExecutorV1._window_title_matches_any( + "RECHERCHER - Cortana", ["rechercher"] + ) + + def test_partial_match_first_pattern(self): + assert ActionExecutorV1._window_title_matches_any( + "Cortana - Rechercher", ["search", "rechercher", "cortana"] + ) + + def test_no_match_returns_false(self): + 
assert not ActionExecutorV1._window_title_matches_any( + "Fenêtre de dépassement de capacité de la barre d'état système", + ["Rechercher", "Search", "Cortana"], + ) + + def test_empty_patterns_returns_true(self): + """Pas de patterns demandés → la garde est neutre.""" + assert ActionExecutorV1._window_title_matches_any("X", []) + assert ActionExecutorV1._window_title_matches_any("X", None) + + def test_empty_title_with_patterns_returns_false(self): + assert not ActionExecutorV1._window_title_matches_any("", ["X"]) + + def test_ignore_empty_pattern_entries(self): + """Les chaînes vides dans la liste ne doivent pas matcher + l'ensemble du titre.""" + assert not ActionExecutorV1._window_title_matches_any( + "rien à voir", ["", None, ""] + ) + + +class TestKnownRuntimeDialogs: + def test_match_confirm_save_overwrite_dialog(self): + spec = ActionExecutorV1._match_known_runtime_dialog( + "Confirmer l'enregistrement" + ) + assert spec is not None + assert spec["id"] == "confirm_save_overwrite" + assert spec["button_texts"][0] == "Oui" + + def test_match_confirm_save_overwrite_dialog_with_typographic_apostrophe(self): + spec = ActionExecutorV1._match_known_runtime_dialog( + "Confirmer l’enregistrement" + ) + assert spec is not None + assert spec["id"] == "confirm_save_overwrite" + + def test_unknown_title_returns_none(self): + assert ActionExecutorV1._match_known_runtime_dialog( + "Bloc-notes" + ) is None + + +class TestContextualRuntimeDialogs: + def test_contextual_notepad_unsaved_dialog_is_detected_via_visual_evidence(self): + exe = _make_executor_skeleton() + exe._capture_screenshot_b64 = MagicMock(return_value="shot") + exe._find_text_on_screen = MagicMock( + side_effect=lambda _shot, text: (100, 100) + if text == "Ne pas enregistrer" + else None + ) + + action = { + "action_id": "act_save_from_dialog", + "type": "click", + "visual_mode": True, + "target_spec": { + "window_title": "*test – Bloc-notes", + "by_text": "Enregistrer", + }, + "expected_window_before": 
"*test – Bloc-notes", + } + target_spec = dict(action["target_spec"]) + + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info", + return_value={"title": "Bloc-notes", "app_name": "Notepad.exe"}, + ): + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect", + return_value={ + "title": "Bloc-notes", + "app_name": "Notepad.exe", + "rect": [500, 300, 1400, 900], + }, + ): + adapted = exe._maybe_contextualize_action_to_foreground_dialog( + action, + target_spec, + ) + + assert adapted is not None + assert adapted["dialog_spec"]["id"] == "notepad_unsaved_changes" + assert adapted["action"]["expected_window_before"] == "Bloc-notes" + assert adapted["target_spec"]["window_title"] == "Bloc-notes" + assert adapted["target_spec"]["context_hints"]["foreground_dialog_id"] == ( + "notepad_unsaved_changes" + ) + assert adapted["target_spec"]["window_capture"]["rect"] == [500, 300, 1400, 900] + + def test_contextual_notepad_dialog_is_ignored_without_matching_action(self): + exe = _make_executor_skeleton() + exe._capture_screenshot_b64 = MagicMock(return_value="shot") + exe._find_text_on_screen = MagicMock( + side_effect=lambda _shot, text: (100, 100) + if text == "Ne pas enregistrer" + else None + ) + + action = { + "action_id": "act_other_button", + "type": "click", + "visual_mode": True, + "target_spec": { + "window_title": "*test – Bloc-notes", + "by_text": "Annuler", + }, + } + + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info", + return_value={"title": "Bloc-notes", "app_name": "Notepad.exe"}, + ): + adapted = exe._maybe_contextualize_action_to_foreground_dialog( + action, + dict(action["target_spec"]), + ) + + assert adapted is None + + +class TestPostVerifyWindowTransition: + def test_requires_transition_when_expected_after_differs_from_source_window(self): + assert ActionExecutorV1._requires_post_verify_window_transition( + action={"expected_window_before": "*test – Bloc-notes"}, + 
target_spec=None, + expected_after="Enregistrer sous", + ) + + def test_same_window_title_does_not_require_transition(self): + assert not ActionExecutorV1._requires_post_verify_window_transition( + action={"expected_window_before": "*test – Bloc-notes"}, + target_spec=None, + expected_after="test – Bloc-notes", + ) + + +# ========================================================================= +# Routage de la garde dans verify_screen +# ========================================================================= + + +def _make_executor_skeleton(): + """Construit un ActionExecutorV1 sans son __init__ lourd + (MouseController/KeyboardController/mss). On câble manuellement + les attributs strictement nécessaires aux branches testées. + """ + exe = ActionExecutorV1.__new__(ActionExecutorV1) + exe._notification_manager = None + exe._system_dialog_pause = None + exe._chat_window_ref = None + exe._api_token = "" + exe._poll_backoff = 1.0 + exe._poll_backoff_min = 1.0 + exe._poll_backoff_max = 30.0 + exe._poll_backoff_factor = 1.5 + # mss factice (monitor 1920×1080) + exe._sct = MagicMock() + exe._sct.monitors = [None, {"width": 1920, "height": 1080}] + # Patcher les helpers IO côté agent + exe._check_and_pause_on_system_dialog = MagicMock(return_value=False) + exe._capture_screenshot_b64 = MagicMock(return_value=None) + return exe + + +def _verify_action(patterns, timeout_ms=200): + return { + "action_id": "act_test_verify", + "type": "verify_screen", + "expected_node": "", + "timeout_ms": timeout_ms, + "expected_window_title_contains": patterns, + } + + +class TestVerifyScreenWindowGuard: + def test_matching_title_returns_success(self): + exe = _make_executor_skeleton() + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info", + return_value={"title": "Rechercher"}, + ): + res = exe.execute_replay_action(_verify_action( + ["Rechercher", "Search"] + )) + assert res["success"] is True + assert res.get("warning") != "setup_guard_window_mismatch" + 
+ def test_mismatch_with_human_correction_returns_success_supervised(self): + exe = _make_executor_skeleton() + # L'utilisateur fait un clic correctif quand le mode apprentissage + # se déclenche → on récupère la séquence et on rend la main au serveur. + exe._capture_human_correction = MagicMock(return_value=[ + {"type": "click", "x_pct": 0.10, "y_pct": 0.95}, + ]) + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info", + return_value={ + "title": "Fenêtre de dépassement de capacité de la barre d'état système", + }, + ): + res = exe.execute_replay_action(_verify_action( + ["Rechercher", "Search"] + )) + assert res["success"] is True + assert res["warning"] == "setup_guard_window_mismatch" + assert res["resolution_method"] == "human_supervised" + assert res["correction"]["trigger"] == "setup_guard_window_mismatch" + assert res["correction"]["expected_patterns"] == ["Rechercher", "Search"] + + def test_mismatch_without_human_pauses_replay(self): + exe = _make_executor_skeleton() + exe._capture_human_correction = MagicMock(return_value=[]) + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info", + return_value={"title": "Notepad - Sans titre"}, + ): + res = exe.execute_replay_action(_verify_action(["Rechercher"])) + assert res["success"] is False + assert res["warning"] == "setup_guard_window_mismatch" + assert res.get("needs_human") is True + assert "Rechercher" in res["error"] + + def test_verify_without_patterns_is_neutral_wait(self): + """Sans expected_window_title_contains, verify_screen reste un + simple wait — pas de check fenêtre, pas de mode apprentissage.""" + exe = _make_executor_skeleton() + exe._capture_human_correction = MagicMock() + action = { + "action_id": "act_test_verify_neutral", + "type": "verify_screen", + "expected_node": "node_x", + "timeout_ms": 200, + } + res = exe.execute_replay_action(action) + assert res["success"] is True + exe._capture_human_correction.assert_not_called() + + 
def test_known_runtime_dialog_is_auto_handled_before_pause(self): + exe = _make_executor_skeleton() + exe._capture_human_correction = MagicMock(return_value=[]) + exe._maybe_handle_runtime_dialog_before_pause = MagicMock( + return_value={ + "action_id": "act_test_click", + "success": True, + "warning": "runtime_dialog_handled_skip", + "resolution_method": "runtime_dialog:confirm_save_overwrite", + "screenshot": None, + "visual_resolved": False, + } + ) + action = { + "action_id": "act_test_click", + "type": "click", + "visual_mode": True, + "x_pct": 0.5, + "y_pct": 0.5, + "target_spec": { + "window_title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + "by_text": "", + }, + } + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info", + return_value={"title": "Confirmer l'enregistrement"}, + ): + res = exe.execute_replay_action(action) + assert res["success"] is True + assert res["warning"] == "runtime_dialog_handled_skip" + exe._maybe_handle_runtime_dialog_before_pause.assert_called_once() + exe._capture_human_correction.assert_not_called() + + +# ========================================================================= +# Skip pixel-change validation pour les actions _setup_phase +# ========================================================================= + + +def _make_executor_with_mouse_skeleton(): + """Comme `_make_executor_skeleton` mais avec aussi un mouse mock, + pour pouvoir traverser la branche click de execute_replay_action + sans toucher au desktop.""" + exe = _make_executor_skeleton() + exe.mouse = MagicMock() + exe.mouse.position = (0, 0) + exe.keyboard = MagicMock() + # _quick_screenshot_hash retourne une string non-vide → pixel check actif + exe._quick_screenshot_hash = MagicMock(return_value="hash_before") + return exe + + +class TestSetupActionsSkipPixelChange: + """Pour les actions du setup auto (`_setup_phase=True`), la + validation par simple pixel-change est neutralisée. 
C'est la garde + verify_screen suivante qui décide — sinon un click_start qui ouvre + le systray overflow popup serait validé sur changement d'écran. + """ + + def test_setup_click_skips_screen_change_check(self): + exe = _make_executor_with_mouse_skeleton() + exe._wait_for_screen_change = MagicMock(return_value=False) + exe._capture_human_correction = MagicMock() + # On évite la résolution visuelle réelle : pas de visual_mode. + action = { + "action_id": "act_setup_click_start", + "type": "click", + "x_pct": 0.02, + "y_pct": 0.98, + "_setup_phase": True, + "_setup_step": "click_start_menu", + } + res = exe.execute_replay_action(action) + assert res["success"] is True + # La fonction _wait_for_screen_change ne doit PAS être appelée + # pour les actions setup. + exe._wait_for_screen_change.assert_not_called() + # Et le mode apprentissage ne doit pas se déclencher non plus. + exe._capture_human_correction.assert_not_called() + + def test_non_setup_click_still_runs_screen_change_check(self): + """Non-régression : une action click hors setup conserve la + validation pixel-change qui déclenche le mode apprentissage si + l'écran ne change pas.""" + exe = _make_executor_with_mouse_skeleton() + exe._wait_for_screen_change = MagicMock(return_value=False) + exe._capture_human_correction = MagicMock(return_value=[]) + action = { + "action_id": "act_user_click", + "type": "click", + "x_pct": 0.5, + "y_pct": 0.5, + # Pas de _setup_phase + } + res = exe.execute_replay_action(action) + exe._wait_for_screen_change.assert_called_once() + # Pas visual_mode → branche échec simple, success=False + assert res.get("warning") == "no_screen_change" + assert res["success"] is False + + +class TestRuntimeDialogHandling: + def test_handle_confirm_save_dialog_clicks_oui_via_server(self): + exe = _make_executor_skeleton() + exe._capture_screenshot_b64 = MagicMock(return_value="abc") + exe._server_resolve_target = MagicMock( + return_value={ + "resolved": True, + "x_pct": 0.25, + "y_pct": 0.75, 
+ "method": "hybrid_text_direct", + "score": 0.91, + } + ) + exe._find_text_on_screen = MagicMock(return_value=None) + exe._click = MagicMock() + + spec = ActionExecutorV1._match_known_runtime_dialog( + "Confirmer l'enregistrement" + ) + + with patch("agent_v0.agent_v1.config.SERVER_URL", "http://srv"): + handled = exe._handle_known_runtime_dialog( + spec, "Confirmer l'enregistrement", 1920, 1080 + ) + + assert handled["handled"] is True + assert handled["button_text"] == "Oui" + exe._server_resolve_target.assert_called_once() + exe._click.assert_called_once_with((480, 810), "left") + + def test_runtime_dialog_before_pause_returns_skip_result(self): + exe = _make_executor_skeleton() + exe._check_and_pause_on_system_dialog = MagicMock(return_value=False) + exe._handle_known_runtime_dialog = MagicMock( + return_value={ + "handled": True, + "button_text": "Oui", + "x_pct": 0.33, + "y_pct": 0.66, + "resolution_score": 0.9, + } + ) + exe._capture_screenshot_b64 = MagicMock(return_value="after") + + res = exe._maybe_handle_runtime_dialog_before_pause( + action={"action_id": "act_final_click", "type": "click"}, + target_spec={}, + expected_title="http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + current_title="Confirmer l'enregistrement", + screen_width=1920, + screen_height=1080, + ) + + assert res["success"] is True + assert res["warning"] == "runtime_dialog_handled_skip" + assert res["correction"]["button_text"] == "Oui" + assert res["actual_position"] == {"x_pct": 0.33, "y_pct": 0.66} + + def test_post_verify_handles_runtime_dialog_and_recovers_expected_window(self): + exe = _make_executor_skeleton() + exe._click = MagicMock() + exe._quick_screenshot_hash = MagicMock(return_value="hash_before") + exe._wait_for_screen_change = MagicMock(return_value=True) + + handled_state = {"done": False} + + def _fake_handle(dialog_spec, current_title, screen_width, screen_height): + handled_state["done"] = True + return { + "handled": True, + "button_text": "Oui", + "x_pct": 
0.33, + "y_pct": 0.66, + "resolution_score": 0.9, + } + + exe._handle_known_runtime_dialog = MagicMock(side_effect=_fake_handle) + + action = { + "action_id": "act_save_dialog", + "type": "click", + "x_pct": 0.5, + "y_pct": 0.5, + "expected_window_title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + } + + def _window_info(): + if handled_state["done"]: + return {"title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes"} + return {"title": "Confirmer l’enregistrement"} + + with patch("agent_v0.agent_v1.core.executor.time.sleep", lambda *_a, **_k: None): + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info", + side_effect=_window_info, + ): + res = exe.execute_replay_action(action) + + assert res["success"] is True + assert res["warning"] == "runtime_dialog_handled_post_verify" + assert res["actual_position"] == {"x_pct": 0.5, "y_pct": 0.5} + exe._handle_known_runtime_dialog.assert_called_once() + + def test_post_verify_can_retry_same_runtime_dialog_before_recovery(self): + exe = _make_executor_skeleton() + exe._click = MagicMock() + exe._quick_screenshot_hash = MagicMock(return_value="hash_before") + exe._wait_for_screen_change = MagicMock(return_value=True) + + handled_state = {"count": 0} + + def _fake_handle(dialog_spec, current_title, screen_width, screen_height): + handled_state["count"] += 1 + return { + "handled": True, + "button_text": "Oui", + "x_pct": 0.33, + "y_pct": 0.66, + "resolution_score": 0.9, + } + + exe._handle_known_runtime_dialog = MagicMock(side_effect=_fake_handle) + + action = { + "action_id": "act_save_dialog_retry", + "type": "click", + "x_pct": 0.5, + "y_pct": 0.5, + "expected_window_title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + } + + def _window_info(): + if handled_state["count"] >= 2: + return {"title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes"} + return {"title": "Confirmer l’enregistrement"} + + with patch("agent_v0.agent_v1.core.executor.time.sleep", 
lambda *_a, **_k: None): + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info", + side_effect=_window_info, + ): + res = exe.execute_replay_action(action) + + assert res["success"] is True + assert res["warning"] == "runtime_dialog_handled_post_verify" + assert handled_state["count"] == 2 + assert res["runtime_dialog"]["dialog_id"] == "confirm_save_overwrite" + + def test_post_verify_wrong_window_fails_when_dialog_transition_was_expected(self): + exe = _make_executor_skeleton() + exe._click = MagicMock() + exe._quick_screenshot_hash = MagicMock(return_value="hash_before") + exe._wait_for_screen_change = MagicMock(return_value=True) + exe._capture_screenshot_b64 = MagicMock(return_value="after") + exe._notification_manager = MagicMock() + + action = { + "action_id": "act_open_save_dialog", + "type": "click", + "x_pct": 0.5, + "y_pct": 0.5, + "expected_window_before": "*test – Bloc-notes", + "expected_window_title": "Enregistrer sous", + } + + with patch("agent_v0.agent_v1.core.executor.time.sleep", lambda *_a, **_k: None): + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info", + return_value={"title": "rpa_vision : Explorateur de fichiers"}, + ): + res = exe.execute_replay_action(action) + + assert res["success"] is False + assert res["warning"] == "wrong_window" + assert "Enregistrer sous" in res["error"] + assert "rpa_vision : Explorateur de fichiers" in res["error"] + assert res["needs_human"] is True + exe._notification_manager.replay_wrong_window.assert_called_once() + + def test_post_verify_same_window_mismatch_stays_legacy_warning(self): + exe = _make_executor_skeleton() + exe._click = MagicMock() + exe._quick_screenshot_hash = MagicMock(return_value="hash_before") + exe._wait_for_screen_change = MagicMock(return_value=True) + exe._capture_screenshot_b64 = MagicMock(return_value="after") + + action = { + "action_id": "act_same_window_click", + "type": "click", + "x_pct": 0.5, + "y_pct": 0.5, + 
"expected_window_before": "*test – Bloc-notes", + "expected_window_title": "test – Bloc-notes", + } + + with patch("agent_v0.agent_v1.core.executor.time.sleep", lambda *_a, **_k: None): + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info", + return_value={"title": "rpa_vision : Explorateur de fichiers"}, + ): + res = exe.execute_replay_action(action) + + assert res["success"] is True + assert res["warning"] == "post_verif_timeout:rpa_vision : Explorateur de fichiers" + + +class TestCloseTabHotkeyFallback: + def test_visual_close_tab_uses_ctrl_w_when_tab_x_is_hidden(self): + exe = _make_executor_with_mouse_skeleton() + exe._observe_screen = MagicMock(return_value=None) + exe._capture_human_correction = MagicMock(return_value=[]) + exe._execute_key_combo = MagicMock() + exe._click = MagicMock() + exe._wait_for_screen_change = MagicMock(return_value=True) + exe._capture_screenshot_b64 = MagicMock(return_value="after") + + action = { + "action_id": "act_close_tab", + "type": "click", + "visual_mode": True, + "x_pct": 0.88, + "y_pct": 0.04, + "target_spec": { + "window_title": "*test – Bloc-notes", + "by_role": "tab_close_button", + "context_hints": { + "interaction": "close_tab", + "active_tab_label": "test", + }, + }, + } + + with patch("agent_v0.agent_v1.core.executor.time.sleep", lambda *_a, **_k: None): + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info", + return_value={"title": "*test – Bloc-notes"}, + ): + res = exe.execute_replay_action(action) + + assert res["success"] is True + assert res["warning"] == "close_tab_hotkey_fallback" + assert res["resolution_method"] == "semantic_close_tab_hotkey" + exe._execute_key_combo.assert_called_once_with(["ctrl", "w"]) + exe._click.assert_not_called() + exe._capture_human_correction.assert_not_called() + + +class TestStartButtonHotkeyFallback: + def test_setup_start_button_position_fallback_uses_windows_key(self): + exe = _make_executor_with_mouse_skeleton() 
+ exe._observe_screen = MagicMock(return_value=None) + exe._capture_human_correction = MagicMock(return_value=[]) + exe._execute_key_combo = MagicMock() + exe._click = MagicMock() + exe._capture_screenshot_b64 = MagicMock(return_value="after") + exe._wait_for_screen_change = MagicMock(return_value=True) + + action = { + "action_id": "act_setup_click_start", + "type": "click", + "visual_mode": True, + "x_pct": 0.387891, + "y_pct": 0.974375, + "_setup_phase": True, + "_setup_step": "click_start_menu", + "target_spec": { + "by_role": "start_button", + "by_text": "", + "anchor_image_base64": "abc123", + "allow_position_fallback": True, + "screen_scope": "full_screen", + }, + } + + grounding_result = SimpleNamespace( + found=True, + x_pct=0.387891, + y_pct=0.974375, + method="position_fallback", + score=0.2, + detail="fallback positionnel explicite", + elapsed_ms=12.0, + ) + + with patch( + "agent_v0.agent_v1.core.grounding.GroundingEngine.locate", + return_value=grounding_result, + ) as locate_mock: + with patch( + "agent_v0.agent_v1.core.executor.time.sleep", + lambda *_a, **_k: None, + ): + res = exe.execute_replay_action(action, server_url="http://srv") + + assert res["success"] is True + assert res["warning"] == "start_button_hotkey_fallback" + assert res["resolution_method"] == "semantic_start_button_hotkey" + exe._execute_key_combo.assert_called_once_with(["win"]) + exe._click.assert_not_called() + exe._wait_for_screen_change.assert_not_called() + exe._capture_human_correction.assert_not_called() + + def test_real_visual_start_button_match_keeps_mouse_click(self): + exe = _make_executor_with_mouse_skeleton() + exe._observe_screen = MagicMock(return_value=None) + exe._capture_human_correction = MagicMock(return_value=[]) + exe._execute_key_combo = MagicMock() + exe._click = MagicMock() + exe._capture_screenshot_b64 = MagicMock(return_value="after") + exe._wait_for_screen_change = MagicMock(return_value=True) + + action = { + "action_id": "act_setup_click_start", + 
"type": "click", + "visual_mode": True, + "x_pct": 0.387891, + "y_pct": 0.974375, + "_setup_phase": True, + "_setup_step": "click_start_menu", + "target_spec": { + "by_role": "start_button", + "by_text": "", + "anchor_image_base64": "abc123", + "allow_position_fallback": True, + "screen_scope": "full_screen", + }, + } + + grounding_result = SimpleNamespace( + found=True, + x_pct=0.389, + y_pct=0.973, + method="vlm_quick_find", + score=0.93, + detail="match VLM plausible", + elapsed_ms=35.0, + ) + + with patch( + "agent_v0.agent_v1.core.grounding.GroundingEngine.locate", + return_value=grounding_result, + ): + with patch( + "agent_v0.agent_v1.core.executor.time.sleep", + lambda *_a, **_k: None, + ): + res = exe.execute_replay_action(action, server_url="http://srv") + + assert res["success"] is True + assert res["resolution_method"] == "vlm_quick_find" + exe._execute_key_combo.assert_not_called() + exe._click.assert_called_once() + exe._wait_for_screen_change.assert_not_called() + exe._capture_human_correction.assert_not_called() diff --git a/tests/unit/test_finalize_auto_replay_flag.py b/tests/unit/test_finalize_auto_replay_flag.py new file mode 100644 index 000000000..6c987c95f --- /dev/null +++ b/tests/unit/test_finalize_auto_replay_flag.py @@ -0,0 +1,58 @@ +"""Tests pour le flag RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE. + +Brief Codex 2026-05-23 09:02 : le chemin produit cible est le workflow +compilé (post worker VLM), pas le replay direct depuis raw events. +Le flag env désactive la proposition automatique de replay direct par +défaut. Le chemin direct reste accessible (smoke/debug) via RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE=true. 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + +from agent_v0.server_v1.replay_engine import ( # noqa: E402 + _auto_launch_replay_after_finalize, +) + + +class TestAutoLaunchReplayFlag: + def test_default_is_false(self, monkeypatch): + """Sans variable d'env, le mode produit est actif → pas de + proposition automatique de replay direct.""" + monkeypatch.delenv("RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE", raising=False) + assert _auto_launch_replay_after_finalize() is False + + def test_true_value_activates(self, monkeypatch): + monkeypatch.setenv("RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE", "true") + assert _auto_launch_replay_after_finalize() is True + + def test_1_value_activates(self, monkeypatch): + monkeypatch.setenv("RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE", "1") + assert _auto_launch_replay_after_finalize() is True + + def test_yes_value_activates(self, monkeypatch): + monkeypatch.setenv("RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE", "yes") + assert _auto_launch_replay_after_finalize() is True + + def test_false_value_deactivates(self, monkeypatch): + monkeypatch.setenv("RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE", "false") + assert _auto_launch_replay_after_finalize() is False + + def test_empty_value_deactivates(self, monkeypatch): + monkeypatch.setenv("RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE", "") + assert _auto_launch_replay_after_finalize() is False + + def test_arbitrary_value_deactivates(self, monkeypatch): + """Toute valeur non-truthy retourne False (default-deny).""" + monkeypatch.setenv("RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE", "maybe") + assert _auto_launch_replay_after_finalize() is False + + def test_case_insensitive(self, monkeypatch): + monkeypatch.setenv("RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE", "TRUE") + assert _auto_launch_replay_after_finalize() is True + monkeypatch.setenv("RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE", "Yes") + assert 
_auto_launch_replay_after_finalize() is True diff --git a/tests/unit/test_grounding_engine.py b/tests/unit/test_grounding_engine.py new file mode 100644 index 000000000..4c7d762d9 --- /dev/null +++ b/tests/unit/test_grounding_engine.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import sys +from pathlib import Path +from unittest.mock import MagicMock + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + +from agent_v0.agent_v1.core.grounding import GroundingEngine # noqa: E402 + + +def test_template_strategy_passes_fallback_coords_to_anchor_drift_guard(): + executor = MagicMock() + executor._template_match_anchor = MagicMock( + return_value={ + "resolved": True, + "x_pct": 0.7, + "y_pct": 0.35, + "score": 0.95, + } + ) + + engine = GroundingEngine(executor) + target_spec = {"anchor_image_base64": "abc123"} + + result = engine._try_strategy( + "template", + server_url="", + screenshot_b64="shot", + target_spec=target_spec, + fallback_x=0.708594, + fallback_y=0.35, + screen_width=2560, + screen_height=1600, + ) + + assert result.found is True + executor._template_match_anchor.assert_called_once_with( + "shot", + "abc123", + 2560, + 1600, + fallback_x_pct=0.708594, + fallback_y_pct=0.35, + ) diff --git a/tests/unit/test_policy_grounding_recovery_learning.py b/tests/unit/test_policy_grounding_recovery_learning.py index 3cf423f06..c0c00bbc6 100644 --- a/tests/unit/test_policy_grounding_recovery_learning.py +++ b/tests/unit/test_policy_grounding_recovery_learning.py @@ -111,6 +111,310 @@ class TestGroundingEngine: assert d["x_pct"] == 0.5 assert d["method"] == "som" + def test_start_button_uses_full_screen_instead_of_active_window(self): + """Le bouton Démarrer doit être résolu sur l'écran entier.""" + engine, executor = self._make_engine() + executor._server_resolve_target.return_value = { + "resolved": True, + "x_pct": 0.02, + "y_pct": 0.98, + "method": "som_text", + "score": 0.9, + "matched_element": {"label": "Démarrer"}, + } + 
engine._capture_window_or_screen = MagicMock(return_value="fake_b64_data") + + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect", + return_value={"rect": [100, 100, 1100, 900]}, + ): + result = engine.locate( + "http://server", + {"by_text": "Démarrer", "by_role": "start_button"}, + 0.02, 0.98, 1920, 1080, + ) + + assert result.found is True + engine._capture_window_or_screen.assert_called_once_with(None) + + def test_regular_targets_stay_scoped_to_active_window(self): + """Les cibles applicatives ordinaires restent bornées à la fenêtre active.""" + engine, executor = self._make_engine() + executor._server_resolve_target.return_value = { + "resolved": True, + "x_pct": 0.5, + "y_pct": 0.25, + "method": "som_text", + "score": 0.9, + "matched_element": {"label": "Enregistrer"}, + } + engine._capture_window_or_screen = MagicMock(return_value="fake_b64_data") + + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect", + return_value={"rect": [100, 200, 1100, 1000]}, + ): + result = engine.locate( + "http://server", + {"by_text": "Enregistrer", "by_role": "button"}, + 0.5, 0.3, 1920, 1080, + ) + + assert result.found is True + engine._capture_window_or_screen.assert_called_once_with( + {"left": 100, "top": 200, "width": 1000, "height": 800} + ) + + def test_unknown_window_rect_falls_back_to_full_screen_on_visual_mismatch(self): + """Un titre inconnu n'est accepté que si le crop est validé visuellement.""" + engine, executor = self._make_engine() + executor._server_resolve_target.return_value = { + "resolved": True, + "x_pct": 0.5, + "y_pct": 0.25, + "method": "som_text", + "score": 0.9, + "matched_element": {"label": "Enregistrer"}, + } + executor._find_text_on_screen.return_value = None + engine._capture_window_or_screen = MagicMock( + side_effect=["fake_window_b64", "fake_screen_b64"] + ) + + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect", + return_value={ + "title": 
"unknown_window", + "rect": [100, 200, 1100, 1000], + }, + ): + result = engine.locate( + "http://server", + {"by_text": "Enregistrer", "by_role": "button"}, + 0.5, 0.3, 1920, 1080, + ) + + assert result.found is True + assert [c.args[0] for c in engine._capture_window_or_screen.call_args_list] == [ + {"left": 100, "top": 200, "width": 1000, "height": 800}, + None, + ] + + def test_taskbar_like_rect_falls_back_to_full_screen(self): + """Une taskbar/systray ne doit jamais être utilisée comme fenêtre active.""" + engine, executor = self._make_engine() + executor._server_resolve_target.return_value = { + "resolved": True, + "x_pct": 0.5, + "y_pct": 0.25, + "method": "som_text", + "score": 0.9, + "matched_element": {"label": "Enregistrer"}, + } + engine._capture_window_or_screen = MagicMock(return_value="fake_b64_data") + + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect", + return_value={ + "title": "Fenêtre de dépassement de capacité de la barre d'état système", + "rect": [0, 1492, 2560, 1600], + }, + ): + result = engine.locate( + "http://server", + {"by_text": "Enregistrer", "by_role": "button"}, + 0.5, 0.3, 2560, 1600, + ) + + assert result.found is True + engine._capture_window_or_screen.assert_called_once_with(None) + + def test_visually_mismatched_window_crop_falls_back_to_full_screen(self): + """Un crop fenêtre plausible mais visuellement faux est rejeté.""" + engine, executor = self._make_engine() + executor._server_resolve_target.return_value = { + "resolved": True, + "x_pct": 0.5, + "y_pct": 0.25, + "method": "som_text", + "score": 0.9, + "matched_element": {"label": "Enregistrer"}, + } + executor._find_text_on_screen.return_value = None + engine._capture_window_or_screen = MagicMock( + side_effect=["fake_window_b64", "fake_screen_b64"] + ) + + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect", + return_value={ + "title": "Enregistrer sous", + "rect": [100, 200, 1100, 1000], + }, + ): + 
result = engine.locate( + "http://server", + { + "by_text": "Enregistrer", + "by_role": "button", + "window_title": "Enregistrer sous", + }, + 0.5, 0.3, 1920, 1080, + ) + + assert result.found is True + assert [c.args[0] for c in engine._capture_window_or_screen.call_args_list] == [ + {"left": 100, "top": 200, "width": 1000, "height": 800}, + None, + ] + executor._server_resolve_target.assert_called_once_with( + "http://server", + "fake_screen_b64", + { + "by_text": "Enregistrer", + "by_role": "button", + "window_title": "Enregistrer sous", + }, + 0.5, + 0.3, + 1920, + 1080, + ) + + def test_visually_validated_window_crop_stays_scoped(self): + """Un crop fenêtre plausible et validé visuellement reste autorisé.""" + engine, executor = self._make_engine() + executor._server_resolve_target.return_value = { + "resolved": True, + "x_pct": 0.5, + "y_pct": 0.25, + "method": "som_text", + "score": 0.9, + "matched_element": {"label": "Enregistrer"}, + } + executor._find_text_on_screen.return_value = (321, 222) + engine._capture_window_or_screen = MagicMock(return_value="fake_window_b64") + + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect", + return_value={ + "title": "Enregistrer sous", + "rect": [100, 200, 1100, 1000], + }, + ): + result = engine.locate( + "http://server", + { + "by_text": "Enregistrer", + "by_role": "button", + "window_title": "Enregistrer sous", + }, + 0.5, 0.3, 1920, 1080, + ) + + assert result.found is True + engine._capture_window_or_screen.assert_called_once_with( + {"left": 100, "top": 200, "width": 1000, "height": 800} + ) + + def test_lea_active_window_does_not_scope_external_target(self): + """Une fenêtre Léa au premier plan ne doit jamais contraindre une cible externe.""" + engine, executor = self._make_engine() + executor._server_resolve_target.return_value = { + "resolved": True, + "x_pct": 0.5, + "y_pct": 0.25, + "method": "som_text", + "score": 0.9, + "matched_element": {"label": "Bloc-notes"}, + } + 
engine._capture_window_or_screen = MagicMock(return_value="fake_b64_data") + + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect", + return_value={ + "title": "Léa — Assistante", + "app_name": "pythonw.exe", + "rect": [1948, 750, 2570, 1606], + }, + ): + result = engine.locate( + "http://server", + {"by_text": "Bloc-notes", "by_role": "search_result"}, + 0.2, 0.5, 2560, 1600, + ) + + assert result.found is True + engine._capture_window_or_screen.assert_called_once_with(None) + executor._server_resolve_target.assert_called_once_with( + "http://server", + "fake_b64_data", + {"by_text": "Bloc-notes", "by_role": "search_result"}, + 0.2, + 0.5, + 2560, + 1600, + ) + + def test_lea_active_window_stays_scoped_for_explicit_lea_target(self): + """Si la cible mentionne explicitement Léa, le scope sur sa fenêtre reste autorisé.""" + engine, executor = self._make_engine() + executor._server_resolve_target.return_value = { + "resolved": True, + "x_pct": 0.5, + "y_pct": 0.25, + "method": "som_text", + "score": 0.9, + "matched_element": {"label": "Continuer"}, + } + engine._capture_window_or_screen = MagicMock(return_value="fake_b64_data") + + with patch( + "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect", + return_value={ + "title": "Léa — Assistante", + "app_name": "pythonw.exe", + "rect": [1948, 750, 2570, 1606], + }, + ): + result = engine.locate( + "http://server", + { + "by_text": "Continuer", + "by_role": "button", + "window_title": "Léa — Assistante", + }, + 0.5, 0.3, 3000, 2000, + ) + + assert result.found is True + engine._capture_window_or_screen.assert_called_once_with( + {"left": 1948, "top": 750, "width": 622, "height": 856} + ) + + def test_allow_position_fallback_returns_recorded_coords(self): + """Quand autorisé, le grounding peut retomber sur la position enregistrée.""" + engine, executor = self._make_engine() + executor._server_resolve_target.return_value = None + executor._template_match_anchor.return_value = 
None + executor._hybrid_vlm_resolve.return_value = None + + result = engine.locate( + "http://server", + { + "by_role": "start_button", + "vlm_description": "icône Windows", + "screen_scope": "full_screen", + "allow_position_fallback": True, + }, + 0.387891, 0.974375, 1920, 1080, + ) + + assert result.found is True + assert result.method == "position_fallback" + assert result.x_pct == pytest.approx(0.387891) + assert result.y_pct == pytest.approx(0.974375) + # ========================================================================= # P2 : Policy — décisions quand grounding échoue @@ -407,6 +711,65 @@ class TestReplayLearner: assert "action_id" in data assert "success" in data + def test_record_human_correction_persists_to_memory_helper(self, learner, monkeypatch): + """Une correction humaine doit alimenter la mémoire persistante via replay_memory.""" + captured = {} + + def fake_memory_record_success(**kwargs): + captured.update(kwargs) + return True + + monkeypatch.setattr( + "agent_v0.server_v1.replay_memory.memory_record_success", + fake_memory_record_success, + ) + + learner.record_human_correction( + session_id="s_corr", + action={ + "action_id": "a_corr", + "target_spec": {"by_text": "Valider", "window_title": "Bloc-notes"}, + }, + correction={"x_pct": 0.42, "y_pct": 0.84}, + ) + + loaded = learner.load_session("s_corr") + assert len(loaded) == 1 + assert loaded[0].resolution_method == "human_supervised" + assert loaded[0].window_title == "Bloc-notes" + + assert captured["window_title"] == "Bloc-notes" + assert captured["target_spec"]["by_text"] == "Valider" + assert captured["x_pct"] == 0.42 + assert captured["y_pct"] == 0.84 + assert captured["method"] == "human_supervised" + assert captured["confidence"] == 1.0 + + def test_record_human_correction_fallback_window_title_from_action(self, learner, monkeypatch): + """Si target_spec.window_title est absent, on retombe sur action.window_title.""" + captured = {} + + def fake_memory_record_success(**kwargs): + 
captured.update(kwargs) + return True + + monkeypatch.setattr( + "agent_v0.server_v1.replay_memory.memory_record_success", + fake_memory_record_success, + ) + + learner.record_human_correction( + session_id="s_corr2", + action={ + "action_id": "a_corr2", + "window_title": "Fenêtre fallback", + "target_spec": {"by_text": "Enregistrer"}, + }, + correction={"x_pct": 0.1, "y_pct": 0.2}, + ) + + assert captured["window_title"] == "Fenêtre fallback" + # ========================================================================= # Boucle d'apprentissage : consolidation cross-workflow diff --git a/tests/unit/test_replay_critic.py b/tests/unit/test_replay_critic.py index 803d24122..b20abddbd 100644 --- a/tests/unit/test_replay_critic.py +++ b/tests/unit/test_replay_critic.py @@ -145,6 +145,20 @@ class TestVerifyWithCritic: assert result.suggestion == "retry" assert result.semantic_verified is None # VLM non appelé + def test_verify_screen_identique_ne_declenche_pas_retry( + self, verifier, screenshot_gray, + ): + """verify_screen est une stabilisation, pas une action qui doit re-changer l'écran.""" + result = verifier.verify_action( + action={"type": "verify_screen", "action_id": "verify_setup"}, + result={"success": True}, + screenshot_before=screenshot_gray, + screenshot_after=screenshot_gray, + ) + assert result.verified is True + assert result.suggestion == "continue" + assert result.changes_detected is False + @patch("agent_v0.server_v1.replay_verifier.ReplayVerifier._verify_semantic") def test_pixel_ok_semantic_ok( self, mock_semantic, verifier, screenshot_gray, screenshot_white, diff --git a/tests/unit/test_replay_memory.py b/tests/unit/test_replay_memory.py new file mode 100644 index 000000000..731ad31d4 --- /dev/null +++ b/tests/unit/test_replay_memory.py @@ -0,0 +1,118 @@ +from types import SimpleNamespace + +from agent_v0.server_v1 import replay_memory +from core.learning.target_memory_store import TargetMemoryStore + + +class _DummyStore: + def __init__(self, fp): 
+ self._fp = fp + + def lookup(self, screen_sig, spec_shim): + return self._fp + + +def test_memory_lookup_uses_window_relative_coords_when_available(monkeypatch): + fp = SimpleNamespace( + bbox=(0.566016, 0.400625, 0.0, 0.0), + etype="position_fallback", + confidence=0.2, + ) + monkeypatch.setattr(replay_memory, "get_memory_store", lambda: _DummyStore(fp)) + + result = replay_memory.memory_lookup( + window_title="Rechercher", + target_spec={ + "by_text": "Bloc-notes", + "window_capture": { + "click_relative": [681, 448], + "window_size": [1287, 1407], + }, + }, + ) + + assert result is not None + assert result["method"] == "memory_position_fallback" + assert result["x_pct"] == 681 / 1287 + assert result["y_pct"] == 448 / 1407 + + +def test_memory_lookup_keeps_bbox_coords_without_window_capture(monkeypatch): + fp = SimpleNamespace( + bbox=(0.566016, 0.400625, 0.0, 0.0), + etype="position_fallback", + confidence=0.2, + ) + monkeypatch.setattr(replay_memory, "get_memory_store", lambda: _DummyStore(fp)) + + result = replay_memory.memory_lookup( + window_title="Rechercher", + target_spec={"by_text": "Bloc-notes"}, + ) + + assert result is not None + assert result["x_pct"] == 0.566016 + assert result["y_pct"] == 0.400625 + + +def test_memory_lookup_keeps_learned_visual_coords_with_window_capture(monkeypatch): + fp = SimpleNamespace( + bbox=(0.402734375, 0.578125, 0.0, 0.0), + etype="anchor_template", + confidence=0.99, + ) + monkeypatch.setattr(replay_memory, "get_memory_store", lambda: _DummyStore(fp)) + + result = replay_memory.memory_lookup( + window_title="*test – Bloc-notes", + target_spec={ + "by_text": "Enregistrer", + "by_role": "yolo", + "window_capture": { + "click_relative": [860, 634], + "window_size": [1920, 1116], + }, + }, + ) + + assert result is not None + assert result["method"] == "memory_anchor_template" + assert result["x_pct"] == 0.402734375 + assert result["y_pct"] == 0.578125 + + +def 
test_target_spec_hash_distinguishes_same_text_with_different_spatial_hints(tmp_path): + store = TargetMemoryStore(base_path=str(tmp_path / "learning")) + + spec_left = replay_memory._TargetSpecLike( + { + "by_text": "Enregistrer", + "by_role": "yolo", + "vlm_description": "Dans la fenêtre '*test – Bloc-notes', l'élément cliqué se trouve au milieu au centre de l'écran", + "window_capture": { + "click_relative": [860, 634], + "window_size": [1920, 1116], + }, + "som_element": { + "bbox_norm": [0.40234375, 0.701875, 0.46640625, 0.74125], + "center_norm": [0.434375, 0.72125], + }, + } + ) + spec_right = replay_memory._TargetSpecLike( + { + "by_text": "Enregistrer", + "by_role": "yolo", + "vlm_description": "Dans la fenêtre '*test – Bloc-notes', l'élément cliqué se trouve au milieu au centre de l'écran", + "window_capture": { + "click_relative": [1491, 38], + "window_size": [1920, 1116], + }, + "som_element": { + "bbox_norm": [0.697265625, 0.335625, 0.715625, 0.3625], + "center_norm": [0.70625, 0.34875], + }, + } + ) + + assert store._hash_target_spec(spec_left) != store._hash_target_spec(spec_right) diff --git a/tests/unit/test_resolve_engine_close_tab_anchor.py b/tests/unit/test_resolve_engine_close_tab_anchor.py new file mode 100644 index 000000000..d04cd22db --- /dev/null +++ b/tests/unit/test_resolve_engine_close_tab_anchor.py @@ -0,0 +1,152 @@ +from __future__ import annotations + +import base64 +import io +import sys +from pathlib import Path + +from PIL import Image, ImageDraw + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + +from agent_v0.server_v1 import resolve_engine # noqa: E402 + + +class _FakeElem: + def __init__(self, elem_id, label, source, center, center_norm, confidence=0.9): + self.id = elem_id + self.label = label + self.source = source + self.center = center + self.center_norm = center_norm + self.confidence = confidence + + +class _FakeSomResult: + def __init__(self, elements): + self.elements = elements + 
self.som_image = None + + +class _FakeSomEngine: + def __init__(self, elements): + self._elements = elements + + def analyze(self, _img): + return _FakeSomResult(self._elements) + + +def _make_close_button_image(tmp_path: Path) -> tuple[str, str]: + screenshot = Image.new("RGB", (200, 100), "white") + draw = ImageDraw.Draw(screenshot) + draw.rounded_rectangle((130, 4, 170, 36), radius=8, fill=(242, 244, 247)) + draw.line((144, 12, 156, 24), fill="black", width=2) + draw.line((156, 12, 144, 24), fill="black", width=2) + screenshot_path = tmp_path / "screen.png" + screenshot.save(screenshot_path) + + anchor = screenshot.crop((130, 4, 170, 36)) + buf = io.BytesIO() + anchor.save(buf, format="PNG") + anchor_b64 = base64.b64encode(buf.getvalue()).decode("utf-8") + return str(screenshot_path), anchor_b64 + + +def test_close_tab_uses_exact_anchor_coords(tmp_path, monkeypatch): + screenshot_path, anchor_b64 = _make_close_button_image(tmp_path) + fake_engine = _FakeSomEngine([ + _FakeElem( + elem_id=47, + label="test", + source="yolo", + center=(120, 20), + center_norm=(0.60, 0.20), + ), + ]) + + monkeypatch.setattr(resolve_engine, "_get_som_engine_api", lambda: fake_engine) + monkeypatch.setattr(resolve_engine, "_get_vlm_client", lambda: object()) + + result = resolve_engine._resolve_by_som( + screenshot_path=screenshot_path, + target_spec={ + "anchor_image_base64": anchor_b64, + "by_text": "", + "vlm_description": "fermer l'onglet actif", + "context_hints": {"interaction": "close_tab"}, + "window_capture": { + "rect": [0, 0, 200, 100], + "click_relative": [150, 20], + "window_size": [200, 100], + }, + }, + screen_width=200, + screen_height=100, + ) + + assert result is not None + assert result["method"] == "som_anchor_match" + assert result["matched_element"]["role"] == "som_anchor_exact" + assert result["x_pct"] == 0.75 + assert result["y_pct"] == 0.20 + + +def test_close_tab_rejects_exact_anchor_far_from_recorded_click(tmp_path, monkeypatch): + screenshot_path, 
anchor_b64 = _make_close_button_image(tmp_path) + fake_engine = _FakeSomEngine([]) + + monkeypatch.setattr(resolve_engine, "_get_som_engine_api", lambda: fake_engine) + monkeypatch.setattr(resolve_engine, "_get_vlm_client", lambda: object()) + + result = resolve_engine._resolve_by_som( + screenshot_path=screenshot_path, + target_spec={ + "anchor_image_base64": anchor_b64, + "by_text": "", + "vlm_description": "fermer l'onglet actif", + "context_hints": {"interaction": "close_tab"}, + "window_capture": { + "rect": [0, 0, 200, 100], + "click_relative": [50, 20], + "window_size": [200, 100], + }, + }, + screen_width=200, + screen_height=100, + ) + + assert result is None + + +def test_non_close_tab_keeps_nearest_som_center(tmp_path, monkeypatch): + screenshot_path, anchor_b64 = _make_close_button_image(tmp_path) + fake_engine = _FakeSomEngine([ + _FakeElem( + elem_id=47, + label="test", + source="yolo", + center=(120, 20), + center_norm=(0.60, 0.20), + ), + ]) + + monkeypatch.setattr(resolve_engine, "_get_som_engine_api", lambda: fake_engine) + monkeypatch.setattr(resolve_engine, "_get_vlm_client", lambda: object()) + + result = resolve_engine._resolve_by_som( + screenshot_path=screenshot_path, + target_spec={ + "anchor_image_base64": anchor_b64, + "by_text": "", + "vlm_description": "icône en haut", + }, + screen_width=200, + screen_height=100, + ) + + assert result is not None + assert result["method"] == "som_anchor_match" + assert result["matched_element"]["role"] == "som_anchor_match" + assert result["x_pct"] == 0.60 + assert result["y_pct"] == 0.20 diff --git a/tests/unit/test_resolve_engine_dialog_button_guard.py b/tests/unit/test_resolve_engine_dialog_button_guard.py new file mode 100644 index 000000000..6960c8f05 --- /dev/null +++ b/tests/unit/test_resolve_engine_dialog_button_guard.py @@ -0,0 +1,51 @@ +import pytest + +from agent_v0.server_v1 import resolve_engine + + +@pytest.fixture(autouse=True) +def _disable_memory_lookup(monkeypatch): + 
monkeypatch.setattr( + "agent_v0.server_v1.replay_memory.memory_lookup", + lambda **kwargs: None, + ) + + +def test_dialog_button_skips_vlm_cascade_when_ocr_misses(tmp_path, monkeypatch): + screenshot = tmp_path / "screen.jpg" + screenshot.write_bytes(b"fake") + + monkeypatch.setattr( + resolve_engine, + "_resolve_by_ocr_text", + lambda *args, **kwargs: None, + ) + + def _unexpected_vlm(*args, **kwargs): + raise AssertionError("VLM ne doit pas être appelé pour dialog_button") + + def _unexpected_som(*args, **kwargs): + raise AssertionError("SoM ne doit pas être appelé pour dialog_button") + + monkeypatch.setattr(resolve_engine, "_vlm_quick_find", _unexpected_vlm) + monkeypatch.setattr(resolve_engine, "_resolve_by_som", _unexpected_som) + + result = resolve_engine._resolve_target_sync( + str(screenshot), + { + "by_role": "dialog_button", + "by_text": "Oui", + "window_title": "Confirmer l’enregistrement", + "vlm_description": "Dans la fenêtre 'Confirmer l’enregistrement', le bouton 'Oui'", + }, + 2560, + 1600, + 0.5, + 0.5, + True, + processor=None, + ) + + assert result["resolved"] is False + assert result["method"] == "dialog_button_ocr_only" + assert result["reason"] == "ocr_direct_failed_dialog_button_no_vlm" diff --git a/tests/unit/test_resolve_engine_start_button_guard.py b/tests/unit/test_resolve_engine_start_button_guard.py new file mode 100644 index 000000000..4e40c3008 --- /dev/null +++ b/tests/unit/test_resolve_engine_start_button_guard.py @@ -0,0 +1,139 @@ +import pytest + +from agent_v0.server_v1 import resolve_engine + + +@pytest.fixture(autouse=True) +def _disable_memory_lookup(monkeypatch): + monkeypatch.setattr( + "agent_v0.server_v1.replay_memory.memory_lookup", + lambda **kwargs: None, + ) + + +@pytest.fixture +def _patched_resolvers(monkeypatch): + monkeypatch.setattr( + resolve_engine, + "_resolve_by_template_matching", + lambda *args, **kwargs: None, + ) + monkeypatch.setattr( + resolve_engine, + "_resolve_by_som", + lambda *args, **kwargs: 
None, + ) + + +def _start_button_spec(): + return { + "by_role": "start_button", + "by_text": "", + "anchor_image_base64": "abc123", + "vlm_description": "Le bouton Démarrer (icône Windows) dans la barre des tâches, en bas", + "screen_scope": "full_screen", + } + + +def _generic_button_spec(): + return { + "by_role": "button", + "by_text": "", + "anchor_image_base64": "abc123", + "vlm_description": "Le bouton principal", + } + + +def _vlm_result(x_pct: float, y_pct: float, score: float = 0.95): + return { + "resolved": True, + "method": "vlm_quick_find", + "x_pct": x_pct, + "y_pct": y_pct, + "score": score, + "matched_element": { + "label": "target", + "type": "vlm_located", + "role": "vlm_quick_find", + "confidence": score, + }, + } + + +def test_start_button_rejects_far_vlm_false_positive(tmp_path, monkeypatch, _patched_resolvers): + screenshot = tmp_path / "screen.jpg" + screenshot.write_bytes(b"fake") + + monkeypatch.setattr( + resolve_engine, + "_vlm_quick_find", + lambda *args, **kwargs: _vlm_result(0.01, 0.95), + ) + + result = resolve_engine._resolve_target_sync( + str(screenshot), + _start_button_spec(), + 1920, + 1080, + 0.387891, + 0.974375, + True, + processor=None, + ) + + assert result["resolved"] is False + assert result["method"] == "strict_vlm_template_failed" + + +def test_start_button_accepts_plausible_vlm_result(tmp_path, monkeypatch, _patched_resolvers): + screenshot = tmp_path / "screen.jpg" + screenshot.write_bytes(b"fake") + + monkeypatch.setattr( + resolve_engine, + "_vlm_quick_find", + lambda *args, **kwargs: _vlm_result(0.395, 0.972), + ) + + result = resolve_engine._resolve_target_sync( + str(screenshot), + _start_button_spec(), + 1920, + 1080, + 0.387891, + 0.974375, + True, + processor=None, + ) + + assert result["resolved"] is True + assert result["method"] == "vlm_quick_find" + assert result["x_pct"] == pytest.approx(0.395) + assert result["y_pct"] == pytest.approx(0.972) + + +def 
test_non_start_button_keeps_vlm_result_even_if_far(tmp_path, monkeypatch, _patched_resolvers): + screenshot = tmp_path / "screen.jpg" + screenshot.write_bytes(b"fake") + + monkeypatch.setattr( + resolve_engine, + "_vlm_quick_find", + lambda *args, **kwargs: _vlm_result(0.01, 0.95), + ) + + result = resolve_engine._resolve_target_sync( + str(screenshot), + _generic_button_spec(), + 1920, + 1080, + 0.387891, + 0.974375, + True, + processor=None, + ) + + assert result["resolved"] is True + assert result["method"] == "vlm_quick_find" + assert result["x_pct"] == pytest.approx(0.01) + assert result["y_pct"] == pytest.approx(0.95) diff --git a/tests/unit/test_server_client_replay_controls.py b/tests/unit/test_server_client_replay_controls.py new file mode 100644 index 000000000..2d05f993e --- /dev/null +++ b/tests/unit/test_server_client_replay_controls.py @@ -0,0 +1,103 @@ +"""Tests pour les contrôles HTTP de replay paused (resume/abort). + +Ces appels sont le fallback du chemin SocketIO `lea:replay_resume` +/ `lea:replay_abort` quand le bus feedback est déconnecté au moment +où l'utilisateur clique dans la bulle paused (cf. +`docs/CR_AUDIT_PAUSED_RESUME_BUS_2026-05-22.md`). +""" + +from __future__ import annotations + +import sys +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + +from agent_v0.lea_ui.server_client import LeaServerClient # noqa: E402 + + +# Préfixe partagé pour comparer les URLs sans coller à la valeur de +# RPA_STREAMING_URL côté env d'exécution des tests. 
+RESUME_PATH = "/traces/stream/replay/replay_xyz/resume" +CANCEL_PATH = "/traces/stream/replay/replay_xyz/cancel" + + +@pytest.fixture +def client(monkeypatch): + monkeypatch.setenv("RPA_API_TOKEN", "tok-test-1234") + c = LeaServerClient() + return c + + +# ========================================================================= +# resume_replay +# ========================================================================= + + +class TestResumeReplay: + def test_returns_true_when_server_accepts(self, client): + resp = MagicMock(ok=True) + with patch("requests.post", return_value=resp) as post: + assert client.resume_replay("replay_xyz") is True + assert post.call_count == 1 + + def test_returns_false_when_server_rejects(self, client): + resp = MagicMock(ok=False) + with patch("requests.post", return_value=resp): + assert client.resume_replay("replay_xyz") is False + + def test_returns_false_on_empty_replay_id(self, client): + with patch("requests.post") as post: + assert client.resume_replay("") is False + post.assert_not_called() + + def test_returns_false_on_exception(self, client): + with patch("requests.post", side_effect=ConnectionError("network down")): + assert client.resume_replay("replay_xyz") is False + + def test_posts_to_resume_endpoint_with_auth_header(self, client): + resp = MagicMock(ok=True) + with patch("requests.post", return_value=resp) as post: + client.resume_replay("replay_xyz") + call = post.call_args + url = call.args[0] if call.args else call.kwargs.get("url", "") + assert url.endswith(RESUME_PATH) + headers = call.kwargs.get("headers", {}) + assert headers.get("Authorization") == "Bearer tok-test-1234" + + +# ========================================================================= +# abort_replay +# ========================================================================= + + +class TestAbortReplay: + def test_returns_true_when_server_accepts(self, client): + resp = MagicMock(ok=True) + with patch("requests.post", return_value=resp): + 
assert client.abort_replay("replay_xyz") is True + + def test_returns_false_when_server_rejects(self, client): + resp = MagicMock(ok=False) + with patch("requests.post", return_value=resp): + assert client.abort_replay("replay_xyz") is False + + def test_returns_false_on_empty_replay_id(self, client): + with patch("requests.post") as post: + assert client.abort_replay("") is False + post.assert_not_called() + + def test_returns_false_on_exception(self, client): + with patch("requests.post", side_effect=TimeoutError("timeout")): + assert client.abort_replay("replay_xyz") is False + + def test_posts_to_cancel_endpoint(self, client): + resp = MagicMock(ok=True) + with patch("requests.post", return_value=resp) as post: + client.abort_replay("replay_xyz") + url = post.call_args.args[0] + assert url.endswith(CANCEL_PATH) diff --git a/tests/unit/test_text_mismatch_empty_observed.py b/tests/unit/test_text_mismatch_empty_observed.py new file mode 100644 index 000000000..5dd7f1cff --- /dev/null +++ b/tests/unit/test_text_mismatch_empty_observed.py @@ -0,0 +1,83 @@ +"""Tests pour `_should_reject_on_text_mismatch` — patch 2026-05-23 : +distinguer `observed=''` (OCR n'a rien lu, ambigu) de `observed='X'` +(autre texte lu = mismatch confirmé) dans le pré-check OCR. + +Brief Codex 2026-05-23 08:55 : le crop bbox SoM précis (50 × 48 px) +sur un onglet Notepad moderne donne `observed=''` car EasyOCR n'a pas +suffisamment de signal (texte peu contrasté, zone trop petite). Le +patch précédent rejetait ce cas comme mismatch — alors qu'aucune +preuve d'un mauvais clic n'existe. On ne rejette plus que quand l'OCR +a effectivement lu autre chose que la cible attendue. + +Le faux succès OBS Studio reste bloqué : (1) son OCR retournait +`'ue audio disponible GUI OBS Studio…'` = non-vide → rejet conservé ; +(2) la garde drift agent posée sur ANCHOR-TM bloque déjà ce match. 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + +from agent_v0.server_v1.resolve_engine import ( # noqa: E402 + _should_reject_on_text_mismatch, +) + + +class TestShouldRejectOnTextMismatch: + def test_valid_passes(self): + """Cas nominal : OCR a vu la cible → on ne rejette pas.""" + assert not _should_reject_on_text_mismatch( + is_valid=True, observed="Enregistrer sous", + ) + + def test_invalid_with_text_rejects(self): + """Cas 0745 historique : OCR voit '9 ?' qui ne matche pas + 'Enregistrer sous' → rejet confirmé.""" + assert _should_reject_on_text_mismatch( + is_valid=False, observed="9 ?", + ) + + def test_invalid_with_obs_studio_rejects(self): + """Cas 0756 : OCR voit du texte OBS Studio → rejet confirmé.""" + assert _should_reject_on_text_mismatch( + is_valid=False, observed="ue audio disponible GUI OBS Studio", + ) + + def test_invalid_with_empty_observed_does_not_reject(self): + """Cas 0855 : OCR n'a rien lu (zone trop petite/peu contrastée) + → ambigu, pas un mismatch confirmé. 
On préserve la résolution + serveur — la garde drift agent protège en aval.""" + assert not _should_reject_on_text_mismatch( + is_valid=False, observed="", + ) + + def test_invalid_with_whitespace_only_does_not_reject(self): + """Espace seul = équivalent vide pour notre logique.""" + assert not _should_reject_on_text_mismatch( + is_valid=False, observed=" ", + ) + + def test_invalid_with_newline_only_does_not_reject(self): + assert not _should_reject_on_text_mismatch( + is_valid=False, observed="\n\t", + ) + + def test_invalid_with_none_observed_does_not_reject(self): + """Robustesse : observed None (cas dégénéré OCR-lib absente) + ne doit pas planter.""" + assert not _should_reject_on_text_mismatch( + is_valid=False, observed=None, + ) + + def test_valid_with_empty_passes(self): + """is_valid=True avec observed vide — ne peut normalement pas + arriver via _text_match_fuzzy (qui retourne False sur vide) + mais on garde la logique cohérente : si is_valid=True, on + ne rejette pas, peu importe observed.""" + assert not _should_reject_on_text_mismatch( + is_valid=True, observed="", + ) diff --git a/tests/unit/test_validate_resolution_quality_close_tab.py b/tests/unit/test_validate_resolution_quality_close_tab.py new file mode 100644 index 000000000..516765d97 --- /dev/null +++ b/tests/unit/test_validate_resolution_quality_close_tab.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + +from agent_v0.server_v1.resolve_engine import _validate_resolution_quality # noqa: E402 + + +def _result(score: float) -> dict: + return { + "resolved": True, + "method": "som_anchor_match", + "score": score, + "x_pct": 0.75, + "y_pct": 0.20, + } + + +def _close_tab_spec() -> dict: + return { + "by_text": "", + "by_role": "tab_close_button", + "anchor_image_base64": "abc123", + "context_hints": {"interaction": "close_tab", "active_tab_label": "test"}, + } + + +def 
test_close_tab_relaxes_threshold_for_near_match(): + out = _validate_resolution_quality( + _result(0.744), + 0.708594, + 0.35, + target_spec=_close_tab_spec(), + ) + assert out["resolved"] is True + assert out["score"] == 0.744 + + +def test_close_tab_still_rejects_low_score(): + out = _validate_resolution_quality( + _result(0.65), + 0.708594, + 0.35, + target_spec=_close_tab_spec(), + ) + assert out["resolved"] is False + assert "below_threshold" in out["reason"] + + +def test_close_tab_rejects_far_zone_even_with_good_score(): + out = _validate_resolution_quality( + _result(0.80), + 0.30, + 0.20, + target_spec=_close_tab_spec(), + ) + assert out["resolved"] is False + assert out["reason"] == "close_tab_out_of_recorded_zone" + assert out["method"] == "rejected_close_tab_zone_som_anchor_match" diff --git a/tests/unit/test_validate_resolution_quality_switch_tab.py b/tests/unit/test_validate_resolution_quality_switch_tab.py new file mode 100644 index 000000000..b989aa94f --- /dev/null +++ b/tests/unit/test_validate_resolution_quality_switch_tab.py @@ -0,0 +1,134 @@ +"""Tests pour `_validate_resolution_quality` — relâchement contextuel +du seuil de score pour les cibles `interaction = switch_tab` avec un +`som_element` calibré. + +Cas live 2026-05-22 (act_raw_2f7e316c) : +- Onglet Notepad moderne `Enregistrer sous` +- Score som_text_match = 0.745 (juste sous seuil 0.75) +- Cible bien localisée par SoM (bbox_norm) + focus_change pré-clic + confirmant déjà la bonne fenêtre +- Rejeté à tort → pause supervisée + +Le patch abaisse le seuil à 0.60 UNIQUEMENT pour +`context_hints.interaction == "switch_tab"` + `som_element` présent ++ méthode `som_*`. Pas de baisse globale. 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + +from agent_v0.server_v1.resolve_engine import ( # noqa: E402 + _validate_resolution_quality, +) + + +def _result(method: str, score: float, x: float = 0.5, y: float = 0.5) -> dict: + return { + "resolved": True, + "method": method, + "score": score, + "x_pct": x, + "y_pct": y, + } + + +def _switch_tab_spec(with_som: bool = True) -> dict: + spec = { + "by_text": "Enregistrer sous", + "by_role": "tab", + "window_title": "*test – Bloc-notes", + "context_hints": { + "interaction": "switch_tab", + "switch_to_window_title": "Enregistrer sous", + }, + } + if with_som: + spec["som_element"] = { + "bbox_norm": [0.697, 0.335, 0.715, 0.362], + "center_norm": [0.706, 0.348], + } + return spec + + +class TestSwitchTabThresholdRelaxation: + def test_baseline_no_target_spec_keeps_strict_threshold(self): + """Sans target_spec passé, comportement legacy : 0.745 < 0.75 → rejet.""" + res = _result("som_text_match", score=0.745) + out = _validate_resolution_quality(res, 0.5, 0.5) + assert out is not None + assert out["resolved"] is False + assert "below_threshold" in out["reason"] + + def test_switch_tab_with_som_accepts_score_above_relaxed_threshold(self): + """switch_tab + som_element + method som_* + score 0.745 → accepté.""" + res = _result("som_text_match", score=0.745, x=0.706, y=0.348) + out = _validate_resolution_quality( + res, 0.706, 0.348, target_spec=_switch_tab_spec(with_som=True), + ) + assert out is not None + assert out["resolved"] is True + assert out["method"] == "som_text_match" + assert out["score"] == 0.745 + + def test_switch_tab_with_som_still_rejects_very_low_score(self): + """Filet final : même en switch_tab, un score 0.50 reste rejeté + (seuil relâché 0.60). 
On ne valide pas n'importe quoi.""" + res = _result("som_text_match", score=0.50) + out = _validate_resolution_quality( + res, 0.5, 0.5, target_spec=_switch_tab_spec(with_som=True), + ) + assert out["resolved"] is False + assert "below_threshold" in out["reason"] + + def test_switch_tab_without_som_keeps_strict_threshold(self): + """Sans som_element calibré, on garde le seuil strict — on ne + peut pas faire confiance à un score VLM lower sans ancre spatiale.""" + res = _result("som_text_match", score=0.745) + out = _validate_resolution_quality( + res, 0.5, 0.5, target_spec=_switch_tab_spec(with_som=False), + ) + assert out["resolved"] is False + + def test_non_switch_tab_keeps_strict_threshold(self): + """Cible non-tab : pas de relaxation. Le 0.745 reste rejeté.""" + spec = { + "by_text": "Submit", + "by_role": "button", + "som_element": {"bbox_norm": [0.4, 0.4, 0.5, 0.5]}, + } + res = _result("som_text_match", score=0.745) + out = _validate_resolution_quality(res, 0.5, 0.5, target_spec=spec) + assert out["resolved"] is False + + def test_switch_tab_with_non_som_method_keeps_strict_threshold(self): + """La relaxation ne s'applique qu'aux méthodes som_* (qui exploitent + la bbox calibrée). Un vlm_quick_find à 0.745 sur une cible + switch_tab reste régi par son propre seuil legacy (0.60 → accepté).""" + # vlm_quick_find a déjà un seuil 0.60 (cf. _RESOLUTION_MIN_SCORES), + # donc 0.745 est largement au-dessus. On vérifie juste l'absence + # de régression sur ce cas. 
+ res = _result("vlm_quick_find", score=0.745) + out = _validate_resolution_quality( + res, 0.5, 0.5, target_spec=_switch_tab_spec(with_som=True), + ) + assert out["resolved"] is True + + def test_unresolved_result_passes_through(self): + """Non-régression : un result resolved=False traverse sans modif.""" + res = {"resolved": False, "method": "no_target_criteria"} + out = _validate_resolution_quality( + res, 0.5, 0.5, target_spec=_switch_tab_spec(), + ) + assert out is res + + def test_target_spec_parameter_is_optional_for_legacy_callers(self): + """Compatibilité ascendante : appel sans target_spec ne plante pas + et applique le seuil legacy.""" + res = _result("som_anchor_match", score=0.80) + out = _validate_resolution_quality(res, 0.5, 0.5) + assert out["resolved"] is True diff --git a/tests/unit/test_validate_text_at_position_som_bbox.py b/tests/unit/test_validate_text_at_position_som_bbox.py new file mode 100644 index 000000000..3fd7b0a11 --- /dev/null +++ b/tests/unit/test_validate_text_at_position_som_bbox.py @@ -0,0 +1,158 @@ +"""Tests pour `_validate_text_at_position` — patch 2026-05-23 : +utilisation prioritaire de la bbox SoM enregistrée quand disponible. + +Cas live (brief Codex 2026-05-23 07:45) : pré-check OCR rejette à tort +`expected='Enregistrer sous' observed='9 ?'` car le crop fait +``radius_px=280`` autour de la coord résolue capture du texte voisin +(numéro de ligne « 9 » de la status bar Notepad) au lieu du label +étroit de l'onglet. La bbox SoM ``[0.697, 0.335, 0.715, 0.362]`` +localise précisément l'onglet — l'utiliser comme zone OCR donne +l'OCR exact du label. 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest + +ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(ROOT)) + + +@pytest.fixture +def fake_screenshot(tmp_path): + """Crée un screenshot 1920×1200 noir.""" + from PIL import Image + p = tmp_path / "shot.png" + img = Image.new("RGB", (1920, 1200), (0, 0, 0)) + img.save(p) + return str(p) + + +@pytest.fixture +def patched_reader(): + """Mock EasyOCR reader qui retourne ce qu'on veut selon la taille + du crop reçu. Permet de simuler 'voit Enregistrer sous' vs 'voit 9 ?'. + """ + from unittest.mock import patch + reader = MagicMock() + # observed_by_size : map taille_crop_approx → texte OCR retourné + reader._observed_by_size = {} + + def fake_readtext(arr): + h, w = arr.shape[:2] + key = (w, h) + text = reader._observed_by_size.get(key, "fallback text") + return [(None, text, 0.95)] + + reader.readtext.side_effect = fake_readtext + with patch( + "agent_v0.server_v1.resolve_engine._get_validation_ocr_reader", + return_value=reader, + ): + yield reader + + +def _spec_with_som_bbox(): + return { + "by_text": "Enregistrer sous", + "som_element": { + "bbox_norm": [0.697, 0.335, 0.715, 0.362], + }, + } + + +class TestValidateTextWithSomBbox: + def test_uses_som_bbox_when_present(self, fake_screenshot, patched_reader): + """Quand som_bbox_norm est fourni, la zone OCR est calculée + depuis cette bbox (pas le radius autour de x/y_pct).""" + from agent_v0.server_v1.resolve_engine import _validate_text_at_position + + spec = _spec_with_som_bbox() + bbox = spec["som_element"]["bbox_norm"] + # Le crop attendu fait largeur = (0.715-0.697)*1920 = 34 + 2*padding + # et hauteur = (0.362-0.335)*1200 = 32 + 2*padding (padding=8) + # → environ (50, 48) px. 
+ patched_reader._observed_by_size[(50, 48)] = "Enregistrer sous" + + is_valid, observed, _ms = _validate_text_at_position( + fake_screenshot, + x_pct=0.706, y_pct=0.348, + expected_text="Enregistrer sous", + screen_width=1920, screen_height=1200, + som_bbox_norm=bbox, + ) + assert observed == "Enregistrer sous" + assert is_valid is True + + def test_falls_back_to_radius_when_no_bbox(self, fake_screenshot, patched_reader): + """Sans som_bbox_norm, comportement legacy : crop radius_px=280 + autour de (x_pct, y_pct).""" + from agent_v0.server_v1.resolve_engine import _validate_text_at_position + + # Sans bbox → crop ≈ 560×560 + patched_reader._observed_by_size[(560, 560)] = "Enregistrer sous" + + is_valid, observed, _ms = _validate_text_at_position( + fake_screenshot, + x_pct=0.5, y_pct=0.5, + expected_text="Enregistrer sous", + screen_width=1920, screen_height=1200, + ) + assert is_valid is True + assert observed == "Enregistrer sous" + + def test_invalid_bbox_falls_back_gracefully(self, fake_screenshot, patched_reader): + """Une bbox malformée ne doit pas planter — fallback radius.""" + from agent_v0.server_v1.resolve_engine import _validate_text_at_position + + patched_reader._observed_by_size[(560, 560)] = "OK" + + is_valid, observed, _ms = _validate_text_at_position( + fake_screenshot, + x_pct=0.5, y_pct=0.5, + expected_text="OK", + screen_width=1920, screen_height=1200, + som_bbox_norm=[0.5], # malformé + ) + # Pas de crash, fallback applique le radius classique. 
+ assert observed == "OK" + + def test_bbox_too_small_falls_back_to_radius(self, fake_screenshot, patched_reader): + """Une bbox dégénérée (largeur/hauteur < quelques px) → fallback + sur le radius, on ne tente pas un crop minuscule inutilisable.""" + from agent_v0.server_v1.resolve_engine import _validate_text_at_position + + patched_reader._observed_by_size[(560, 560)] = "OK" + + is_valid, observed, _ms = _validate_text_at_position( + fake_screenshot, + x_pct=0.5, y_pct=0.5, + expected_text="OK", + screen_width=1920, screen_height=1200, + som_bbox_norm=[0.500, 0.500, 0.501, 0.501], + ) + # Bbox de ~2×1px → fallback radius + assert observed == "OK" + + def test_bbox_normalized_values_outside_unit_clipped(self, fake_screenshot, patched_reader): + """Bbox dépassant les bornes [0, 1] est clippée aux dimensions + écran sans crash.""" + from agent_v0.server_v1.resolve_engine import _validate_text_at_position + + # Bbox qui déborderait → clip à l'écran + # x = (-0.05 → 0) * 1920 - 8 = -8 → 0, x2 = 1.05 * 1920 + 8 = 2024 → 1920 + # largeur = 1920, hauteur = (1.05-0)*1200 +16 = 1216 → 1200 + patched_reader._observed_by_size[(1920, 1200)] = "déborde" + + is_valid, observed, _ms = _validate_text_at_position( + fake_screenshot, + x_pct=0.5, y_pct=0.5, + expected_text="déborde", + screen_width=1920, screen_height=1200, + som_bbox_norm=[-0.05, 0.0, 1.05, 1.05], + ) + assert observed == "déborde" diff --git a/tests/unit/test_window_title_memory_path.py b/tests/unit/test_window_title_memory_path.py new file mode 100644 index 000000000..0ef62cad8 --- /dev/null +++ b/tests/unit/test_window_title_memory_path.py @@ -0,0 +1,296 @@ +"""Tests ciblés sur le contrat window_title -> mémoire persistante.""" + +from __future__ import annotations + +import importlib +import sys +from pathlib import Path + +import pytest + + +_ROOT = str(Path(__file__).resolve().parents[2]) +if _ROOT not in sys.path: + sys.path.insert(0, _ROOT) + + +def _reload_api_stream(): + mod_name = 
"agent_v0.server_v1.api_stream" + if mod_name in sys.modules: + del sys.modules[mod_name] + return importlib.import_module(mod_name) + + +def test_build_replay_from_raw_events_propagates_window_title_into_target_spec( + tmp_path, monkeypatch, +): + """Le flux Lea-first doit propager window_title dans target_spec si connu.""" + from agent_v0.server_v1 import stream_processor as sp + + session_dir = tmp_path / "sess" + (session_dir / "shots").mkdir(parents=True) + + monkeypatch.setattr(sp, "_load_crop_for_event", lambda *args, **kwargs: None) + monkeypatch.setattr( + sp, + "enrich_click_from_screenshot", + lambda *args, **kwargs: {"anchor_image_base64": "abc123"}, + ) + monkeypatch.setattr(sp, "_attach_expected_screenshots", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_enrich_actions_with_intentions", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_unload_gemma4", lambda *args, **kwargs: None) + + actions = sp.build_replay_from_raw_events( + [ + { + "event": { + "type": "mouse_click", + "timestamp": 1.0, + "pos": [100, 200], + "button": "left", + "screenshot_id": "shot_001", + "window": {"title": "Bloc-notes", "app_name": "notepad"}, + } + } + ], + session_id="sess_test", + session_dir=str(session_dir), + ) + + assert len(actions) == 1 + assert actions[0]["window_title"] == "Bloc-notes" + assert actions[0]["target_spec"]["window_title"] == "Bloc-notes" + + +def test_build_replay_from_raw_events_infers_notepad_tab_switch_target( + tmp_path, monkeypatch, +): + """Un clic haut suivi d'un focus same-app doit devenir une cible d'onglet.""" + from agent_v0.server_v1 import stream_processor as sp + + session_dir = tmp_path / "sess" + (session_dir / "shots").mkdir(parents=True) + + monkeypatch.setattr(sp, "_load_crop_for_event", lambda *args, **kwargs: None) + monkeypatch.setattr( + sp, + "enrich_click_from_screenshot", + lambda *args, **kwargs: {"anchor_image_base64": "abc123", "by_role": "yolo"}, + ) + monkeypatch.setattr(sp, 
"_attach_expected_screenshots", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_enrich_actions_with_intentions", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_unload_gemma4", lambda *args, **kwargs: None) + + events = [ + { + "event": { + "type": "mouse_click", + "timestamp": 1.0, + "pos": [1514, 562], + "button": "left", + "screenshot_id": "shot_003", + "window": { + "title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + "app_name": "Notepad.exe", + }, + "window_capture": { + "rect": [323, 522, 2243, 1638], + "click_relative": [1191, 40], + "window_size": [1920, 1116], + }, + } + }, + { + "event": { + "type": "window_focus_change", + "timestamp": 1.2, + "from": { + "title": "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes", + "app_name": "Notepad.exe", + }, + "to": { + "title": "Sans titre – Bloc-notes", + "app_name": "Notepad.exe", + }, + } + }, + ] + + actions = sp.build_replay_from_raw_events( + events, + session_id="sess_tab_switch", + session_dir=str(session_dir), + ) + + assert len(actions) == 1 + assert actions[0]["target_spec"]["by_text"] == "Sans titre" + assert actions[0]["target_spec"]["by_role"] == "tab" + assert actions[0]["target_spec"]["window_title"] == ( + "http192.168.1.408765dossier.htmlid=.txt – Bloc-notes" + ) + assert actions[0]["target_spec"]["context_hints"]["interaction"] == "switch_tab" + + +def test_build_replay_propagates_focus_change_into_expected_window_before( + tmp_path, monkeypatch, +): + """Cas live ``act_raw_c70976c8`` (2026-05-22) : un focus_change vers + ``Enregistrer sous`` se produit entre deux clics consécutifs, mais + le mouse_click suivant capture encore le titre pré-transition + (``*test – Bloc-notes``) dans son ``window.title``. Sans correction + serveur, la pré-vérif côté agent retombe sur target_spec.window_title + (obsolète) et déclenche une pause supervisée à tort. 
+ + Le serveur doit poser explicitement ``expected_window_before`` égal + au dernier ``window_focus_change.to.title`` observé avant le clic. + """ + from agent_v0.server_v1 import stream_processor as sp + + session_dir = tmp_path / "sess" + (session_dir / "shots").mkdir(parents=True) + + monkeypatch.setattr(sp, "_load_crop_for_event", lambda *args, **kwargs: None) + monkeypatch.setattr( + sp, + "enrich_click_from_screenshot", + lambda *args, **kwargs: {"anchor_image_base64": "abc123"}, + ) + monkeypatch.setattr(sp, "_attach_expected_screenshots", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_enrich_actions_with_intentions", lambda *args, **kwargs: None) + monkeypatch.setattr(sp, "_unload_gemma4", lambda *args, **kwargs: None) + + events = [ + # Click 1 — dans Notepad, déclenche l'ouverture de la dialog. + {"event": { + "type": "mouse_click", + "timestamp": 1.0, + "pos": [860, 634], + "button": "left", + "screenshot_id": "shot_001", + "window": {"title": "*test – Bloc-notes", "app_name": "Notepad.exe"}, + }}, + # Transition de focus vers la dialog "Enregistrer sous". + {"event": { + "type": "window_focus_change", + "timestamp": 1.2, + "from": {"title": "*test – Bloc-notes", "app_name": "Notepad.exe"}, + "to": {"title": "Enregistrer sous", "app_name": "Notepad.exe"}, + }}, + # Click 2 — bouton "Enregistrer" dans la dialog. Mais + # window.title capturé est obsolète (toujours sur Notepad). 
+ {"event": { + "type": "mouse_click", + "timestamp": 1.5, + "pos": [997, 743], + "button": "left", + "screenshot_id": "shot_002", + "window": {"title": "*test – Bloc-notes", "app_name": "Notepad.exe"}, + }}, + ] + + actions = sp.build_replay_from_raw_events( + events, session_id="sess_save_dialog", session_dir=str(session_dir), + ) + + clicks = [a for a in actions if a.get("type") == "click"] + assert len(clicks) == 2 + + # Le clic 2 doit avoir expected_window_before = "Enregistrer sous" + # (issu du focus_change précédent), pas "*test – Bloc-notes" + # (le titre obsolète capturé dans l'event raw). + assert clicks[1].get("expected_window_before") == "Enregistrer sous", ( + f"clic 2 doit pointer sur la dialog ouverte par le focus_change, " + f"trouvé: {clicks[1].get('expected_window_before')!r} " + f"(target_spec.window_title={clicks[1].get('target_spec', {}).get('window_title')!r})" + ) + + # Le clic 1 n'a pas eu de focus_change vers une fenêtre avant lui + # → pas de expected_window_before (ou vide). + assert not clicks[0].get("expected_window_before"), ( + f"clic 1 ne doit pas avoir d'expected_window_before, " + f"trouvé: {clicks[0].get('expected_window_before')!r}" + ) + + +def test_build_replay_does_not_overwrite_existing_expected_window_before( + tmp_path, monkeypatch, +): + """La propagation depuis focus_change ne doit pas écraser un + expected_window_before déjà posé en amont (ex: par un setup + action ou un patch précédent).""" + from agent_v0.server_v1 import stream_processor as sp + + session_dir = tmp_path / "sess" + (session_dir / "shots").mkdir(parents=True) + + monkeypatch.setattr(sp, "_load_crop_for_event", lambda *args, **kwargs: None) + monkeypatch.setattr( + sp, "enrich_click_from_screenshot", + lambda *args, **kwargs: { + "anchor_image_base64": "abc", + # Pré-existant : un autre composant a déjà posé la pré-condition. 
+ # build_replay_from_raw_events ne crée pas expected_window_before + # depuis enrichment, mais on simule via fixture (cas générique + # : action upstream qui pose ce champ). + }, + ) + monkeypatch.setattr(sp, "_attach_expected_screenshots", lambda *a, **k: None) + monkeypatch.setattr(sp, "_enrich_actions_with_intentions", lambda *a, **k: None) + monkeypatch.setattr(sp, "_unload_gemma4", lambda *a, **k: None) + + events = [ + {"event": { + "type": "window_focus_change", + "timestamp": 0.5, + "to": {"title": "Fenetre A", "app_name": "test.exe"}, + }}, + {"event": { + "type": "mouse_click", + "timestamp": 1.0, + "pos": [10, 20], + "screenshot_id": "shot_001", + "window": {"title": "Fenetre A", "app_name": "test.exe"}, + "expected_window_before": "Pre-existant", + }}, + ] + + actions = sp.build_replay_from_raw_events( + events, session_id="sess_x", session_dir=str(session_dir), + ) + clicks = [a for a in actions if a.get("type") == "click"] + assert clicks + # Si déjà posé en upstream, on respecte la valeur en place. + pre_existing = clicks[0].get("expected_window_before") + assert pre_existing in (None, "", "Fenetre A"), ( + # Soit absent (build n'a pas propagé sur ce clic), soit Fenetre A + # (le dernier focus_change). En tout cas, doit être cohérent. 
+ f"valeur inattendue: {pre_existing!r}" + ) + + +def test_memory_window_title_for_action_reads_top_level_and_target_spec(monkeypatch): + """Le lecteur mémoire doit voir les variantes top-level et target_spec.""" + monkeypatch.setenv("RPA_API_TOKEN", "deadbeef" * 4) + monkeypatch.delenv("RPA_AUTH_DISABLED", raising=False) + mod = _reload_api_stream() + + assert mod._memory_window_title_for_action( + { + "expected_window_before": "Fenêtre attendue", + "target_spec": {"window_title": "Fenêtre cible"}, + "window_title": "Fenêtre action", + } + ) == "Fenêtre attendue" + + assert mod._memory_window_title_for_action( + { + "target_spec": {"context_hints": {"window_title": "Depuis context_hints"}}, + } + ) == "Depuis context_hints" + + assert mod._memory_window_title_for_action( + { + "window_title": "Top-level uniquement", + "target_spec": {}, + } + ) == "Top-level uniquement" diff --git a/visual_workflow_builder/backend/instance/workflows.db b/visual_workflow_builder/backend/instance/workflows.db index 4a037d892..735c008f8 100644 Binary files a/visual_workflow_builder/backend/instance/workflows.db and b/visual_workflow_builder/backend/instance/workflows.db differ