snapshot: WIP 5j replay reliability (B1 watchdog + dialog handlers + grounding drift)
Snapshot avant correction du blocage relance Léa (3 incidents 24h: SSH refusé, polls morts ×2). Point de rollback stable. Contenu: - agent_v1/core/executor.py: 5 patchs dialog handling (saveas drift, close_tab hotkey fallback, confirm_save Unicode apostrophe, foreground dialog recontextualization, runtime_dialog in-loop) + helpers normalize_window_hint, requires_post_verify_window_transition - agent_v1/core/grounding.py: garde drift template fix (fallback_x/y plumbed) - server_v1/replay_watchdog.py (NEW): orphan watchdog B1, scan 10s timeout 30s - server_v1/api_stream.py: dispatched_action plumbing, watchdog lifespan, metrics endpoint - server_v1/replay_engine.py: _schedule_retry préserve original_action + dispatched_action - stream_processor.py: gardes _infer_tab_switch_target (no false switch_tab on save_as dialog open) + _attach_expected_window_before - tests/integration: test_replay_watchdog.py (8 cas), test_stream_processor.py - tests/unit: test_executor_verify_window_guard.py (start_button, close_tab, runtime_dialog, post_verify, transition fallbacks) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1025,6 +1025,345 @@ def enrich_click_from_screenshot(
|
||||
return result
|
||||
|
||||
|
||||
def _title_to_tab_label(window_title: str) -> str:
|
||||
"""Réduire un titre de fenêtre en libellé d'onglet probable.
|
||||
|
||||
Exemples:
|
||||
- "Sans titre – Bloc-notes" -> "Sans titre"
|
||||
- "*test – Bloc-notes" -> "test"
|
||||
"""
|
||||
title = str(window_title or "").strip()
|
||||
if not title:
|
||||
return ""
|
||||
|
||||
for sep in (" – ", " - "):
|
||||
if sep in title:
|
||||
head = title.split(sep, 1)[0].strip()
|
||||
if head:
|
||||
title = head
|
||||
break
|
||||
|
||||
return title.lstrip("*").strip()
|
||||
|
||||
|
||||
def _split_window_title_head_suffix(window_title: str) -> tuple[str, str]:
|
||||
"""Découper un titre de fenêtre en ``(head, suffix)`` si possible.
|
||||
|
||||
Exemples:
|
||||
- ``Sans titre – Bloc-notes`` -> (``Sans titre``, ``Bloc-notes``)
|
||||
- ``Page 1 - Google Chrome`` -> (``Page 1``, ``Google Chrome``)
|
||||
- ``Enregistrer sous`` -> ("", "")
|
||||
"""
|
||||
title = str(window_title or "").strip()
|
||||
if not title:
|
||||
return "", ""
|
||||
|
||||
for sep in (" – ", " - "):
|
||||
if sep in title:
|
||||
head, suffix = title.split(sep, 1)
|
||||
head = head.strip()
|
||||
suffix = suffix.strip()
|
||||
if head and suffix:
|
||||
return head, suffix
|
||||
return "", ""
|
||||
|
||||
|
||||
def _looks_like_same_app_tab_switch(from_title: str, to_title: str) -> bool:
    """Tell whether a focus transition looks like a genuine tab switch.

    Both titles must split into a non-empty ``(head, suffix)`` pair and
    share the same suffix (e.g. ``Bloc-notes``, ``Google Chrome``),
    compared case-insensitively. Titles without a suffix, such as the
    ``Enregistrer sous`` dialog, therefore never qualify, which keeps
    them from being compiled into a ``switch_tab``.

    Args:
        from_title: Title of the window that had focus before the change.
        to_title: Title of the window that received focus.

    Returns:
        ``True`` when both titles split successfully and their suffixes
        match, ``False`` otherwise.
    """
    source_parts = _split_window_title_head_suffix(from_title)
    target_parts = _split_window_title_head_suffix(to_title)

    # Every one of the four parts (two heads, two suffixes) must be non-empty.
    if not all(source_parts + target_parts):
        return False

    source_suffix = source_parts[1].casefold()
    target_suffix = target_parts[1].casefold()
    return source_suffix == target_suffix
|
||||
|
||||
|
||||
def _infer_tab_switch_target(
|
||||
raw_events: list,
|
||||
click_event: Dict[str, Any],
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Détecter un clic d'onglet à partir d'une bascule de focus dans la même app.
|
||||
|
||||
Cas réel observé:
|
||||
- fenêtre active `http...txt – Bloc-notes`
|
||||
- clic dans la barre d'onglets (y relatif ~40 px)
|
||||
- focus immédiat vers `Sans titre – Bloc-notes`
|
||||
|
||||
Dans ce cas, l'ancre image seule est trop fragile. On enrichit donc le
|
||||
target_spec avec un libellé d'onglet explicite (`by_text='Sans titre'`,
|
||||
`by_role='tab'`).
|
||||
"""
|
||||
event_type = click_event.get("type", "")
|
||||
if event_type != "mouse_click":
|
||||
return None
|
||||
|
||||
window = click_event.get("window", {})
|
||||
if not isinstance(window, dict):
|
||||
return None
|
||||
|
||||
from_title = str(window.get("title", "")).strip()
|
||||
app_name = str(window.get("app_name", "")).strip().lower()
|
||||
if not from_title or not app_name:
|
||||
return None
|
||||
|
||||
# Heuristique: on ne traite que les clics très hauts dans la fenêtre,
|
||||
# typiques d'une barre d'onglets / bouton de fermeture d'onglet.
|
||||
window_capture = click_event.get("window_capture", {})
|
||||
if not isinstance(window_capture, dict):
|
||||
return None
|
||||
click_relative = window_capture.get("click_relative")
|
||||
if not (isinstance(click_relative, list) and len(click_relative) == 2):
|
||||
return None
|
||||
try:
|
||||
rel_y = int(click_relative[1])
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
if rel_y > 90:
|
||||
return None
|
||||
|
||||
click_ts = click_event.get("timestamp")
|
||||
click_pos = click_event.get("pos") or []
|
||||
|
||||
match_idx = None
|
||||
for idx, raw_evt in enumerate(raw_events):
|
||||
event_data = raw_evt.get("event", raw_evt)
|
||||
if event_data.get("type") != "mouse_click":
|
||||
continue
|
||||
if event_data.get("timestamp") != click_ts:
|
||||
continue
|
||||
if (event_data.get("pos") or []) != click_pos:
|
||||
continue
|
||||
match_idx = idx
|
||||
break
|
||||
|
||||
if match_idx is None:
|
||||
return None
|
||||
|
||||
for follow_evt in raw_events[match_idx + 1: match_idx + 7]:
|
||||
follow_data = follow_evt.get("event", follow_evt)
|
||||
follow_type = follow_data.get("type", "")
|
||||
if follow_type in {"mouse_click", "text_input", "key_press", "key_combo"}:
|
||||
# Un autre geste utilisateur est intervenu avant le focus_change :
|
||||
# le focus observé n'est plus attribuable avec confiance à CE clic.
|
||||
return None
|
||||
if follow_type != "window_focus_change":
|
||||
continue
|
||||
|
||||
to_info = follow_data.get("to", {})
|
||||
if not isinstance(to_info, dict):
|
||||
continue
|
||||
if str(to_info.get("app_name", "")).strip().lower() != app_name:
|
||||
continue
|
||||
|
||||
to_title = str(to_info.get("title", "")).strip()
|
||||
if not to_title or to_title == from_title:
|
||||
continue
|
||||
if not _looks_like_same_app_tab_switch(from_title, to_title):
|
||||
return None
|
||||
|
||||
follow_ts = follow_data.get("timestamp")
|
||||
if (
|
||||
isinstance(click_ts, (int, float))
|
||||
and isinstance(follow_ts, (int, float))
|
||||
and follow_ts - click_ts > 3.0
|
||||
):
|
||||
break
|
||||
|
||||
tab_label = _title_to_tab_label(to_title)
|
||||
if not tab_label:
|
||||
return None
|
||||
|
||||
return {
|
||||
"by_text": tab_label,
|
||||
"by_role": "tab",
|
||||
"window_title": from_title,
|
||||
"context_hints": {
|
||||
"window_title": from_title,
|
||||
"switch_to_window_title": to_title,
|
||||
"interaction": "switch_tab",
|
||||
},
|
||||
"vlm_description": (
|
||||
f"Dans la fenêtre '{from_title}', l'onglet '{tab_label}' "
|
||||
"dans la barre d'onglets en haut"
|
||||
),
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _infer_close_tab_target(
|
||||
raw_events: list,
|
||||
click_event: Dict[str, Any],
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Détecter un clic sur le bouton fermer de l'onglet actif.
|
||||
|
||||
Pattern ciblé observé sur Bloc-notes moderne :
|
||||
- clic très haut dans la barre d'onglets sur un titre ``*... – Bloc-notes``
|
||||
- un clic suivant dans la même fenêtre
|
||||
- puis focus vers ``Enregistrer sous``
|
||||
|
||||
Cela correspond à la fermeture d'un onglet modifié qui déclenche ensuite
|
||||
le flow de sauvegarde. On enrichit le clic avec un hint sémantique pour
|
||||
viser le vrai bouton ``x`` de l'onglet actif plutôt qu'un simple `yolo`.
|
||||
"""
|
||||
event_type = click_event.get("type", "")
|
||||
if event_type != "mouse_click":
|
||||
return None
|
||||
|
||||
window = click_event.get("window", {})
|
||||
if not isinstance(window, dict):
|
||||
return None
|
||||
|
||||
from_title = str(window.get("title", "")).strip()
|
||||
app_name = str(window.get("app_name", "")).strip().lower()
|
||||
if not from_title or not app_name or not from_title.startswith("*"):
|
||||
return None
|
||||
|
||||
window_capture = click_event.get("window_capture", {})
|
||||
if not isinstance(window_capture, dict):
|
||||
return None
|
||||
click_relative = window_capture.get("click_relative")
|
||||
if not (isinstance(click_relative, list) and len(click_relative) == 2):
|
||||
return None
|
||||
try:
|
||||
rel_y = int(click_relative[1])
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
if rel_y > 90:
|
||||
return None
|
||||
|
||||
click_ts = click_event.get("timestamp")
|
||||
click_pos = click_event.get("pos") or []
|
||||
match_idx = None
|
||||
for idx, raw_evt in enumerate(raw_events):
|
||||
event_data = raw_evt.get("event", raw_evt)
|
||||
if event_data.get("type") != "mouse_click":
|
||||
continue
|
||||
if event_data.get("timestamp") != click_ts:
|
||||
continue
|
||||
if (event_data.get("pos") or []) != click_pos:
|
||||
continue
|
||||
match_idx = idx
|
||||
break
|
||||
|
||||
if match_idx is None:
|
||||
return None
|
||||
|
||||
saw_follow_click_same_window = False
|
||||
for follow_evt in raw_events[match_idx + 1: match_idx + 8]:
|
||||
follow_data = follow_evt.get("event", follow_evt)
|
||||
follow_type = follow_data.get("type", "")
|
||||
|
||||
if follow_type in {"text_input", "key_press", "key_combo"}:
|
||||
return None
|
||||
|
||||
if follow_type == "mouse_click":
|
||||
follow_window = follow_data.get("window", {})
|
||||
if not isinstance(follow_window, dict):
|
||||
return None
|
||||
follow_app = str(follow_window.get("app_name", "")).strip().lower()
|
||||
follow_title = str(follow_window.get("title", "")).strip()
|
||||
if follow_app != app_name:
|
||||
return None
|
||||
if follow_title == from_title:
|
||||
saw_follow_click_same_window = True
|
||||
continue
|
||||
return None
|
||||
|
||||
if follow_type != "window_focus_change" or not saw_follow_click_same_window:
|
||||
continue
|
||||
|
||||
to_info = follow_data.get("to", {})
|
||||
if not isinstance(to_info, dict):
|
||||
continue
|
||||
if str(to_info.get("app_name", "")).strip().lower() != app_name:
|
||||
continue
|
||||
to_title = str(to_info.get("title", "")).strip()
|
||||
if to_title != "Enregistrer sous":
|
||||
continue
|
||||
|
||||
follow_ts = follow_data.get("timestamp")
|
||||
if (
|
||||
isinstance(click_ts, (int, float))
|
||||
and isinstance(follow_ts, (int, float))
|
||||
and follow_ts - click_ts > 5.0
|
||||
):
|
||||
break
|
||||
|
||||
tab_label = _title_to_tab_label(from_title)
|
||||
if not tab_label:
|
||||
return None
|
||||
|
||||
return {
|
||||
"by_text": "",
|
||||
"by_role": "tab_close_button",
|
||||
"window_title": from_title,
|
||||
"context_hints": {
|
||||
"window_title": from_title,
|
||||
"active_tab_label": tab_label,
|
||||
"interaction": "close_tab",
|
||||
},
|
||||
"vlm_description": (
|
||||
f"Dans la fenêtre '{from_title}', le bouton x pour fermer "
|
||||
f"l'onglet actif '{tab_label}' dans la barre d'onglets en haut"
|
||||
),
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _attach_expected_window_before(actions: list, raw_events: list) -> None:
|
||||
"""Attacher la fenêtre attendue AVANT chaque clic en rejouant les
|
||||
raw events et en conservant le dernier ``window_focus_change.to.title``.
|
||||
|
||||
Pourquoi : ``mouse_click.window.title`` capturé pendant
|
||||
l'enregistrement peut être obsolète si une transition de fenêtre
|
||||
se produit juste avant la capture (ex: dialog Windows qui s'ouvre
|
||||
milliseconde avant le clic suivant). Le serveur dispose pourtant
|
||||
des ``window_focus_change`` consécutifs — on s'en sert pour poser
|
||||
explicitement ``expected_window_before`` sur le clic, lu en priorité
|
||||
absolue par la pré-vérif côté agent.
|
||||
|
||||
Idempotent : si une action a déjà ``expected_window_before``, on
|
||||
ne touche pas.
|
||||
"""
|
||||
if not actions or not raw_events:
|
||||
return
|
||||
|
||||
last_focus_title = ""
|
||||
action_idx = 0
|
||||
|
||||
def _next_click_idx(start: int) -> int:
|
||||
i = start
|
||||
while i < len(actions) and actions[i].get("type") != "click":
|
||||
i += 1
|
||||
return i
|
||||
|
||||
for raw_evt in raw_events:
|
||||
ev = raw_evt.get("event", raw_evt) if isinstance(raw_evt, dict) else {}
|
||||
etype = ev.get("type", "")
|
||||
if etype == "window_focus_change":
|
||||
to_info = ev.get("to") or {}
|
||||
title = str(to_info.get("title", "") or "").strip()
|
||||
if title and title != "unknown_window":
|
||||
last_focus_title = title
|
||||
continue
|
||||
if etype != "mouse_click":
|
||||
continue
|
||||
action_idx = _next_click_idx(action_idx)
|
||||
if action_idx >= len(actions):
|
||||
return
|
||||
a = actions[action_idx]
|
||||
if last_focus_title and not a.get("expected_window_before"):
|
||||
a["expected_window_before"] = last_focus_title
|
||||
action_idx += 1
|
||||
|
||||
|
||||
def _attach_expected_screenshots(
|
||||
actions: list, raw_events: list, session_dir: Path,
|
||||
) -> None:
|
||||
@@ -1591,6 +1930,8 @@ def build_replay_from_raw_events(
|
||||
k: v for k, v in enrichment.items()
|
||||
if k != "by_position" # by_position est déjà dans x_pct/y_pct
|
||||
}
|
||||
if action.get("window_title") and not action["target_spec"].get("window_title"):
|
||||
action["target_spec"]["window_title"] = action["window_title"]
|
||||
# Ajouter les métadonnées fenêtre pour le grounding ciblé
|
||||
wc = evt.get("window_capture", {})
|
||||
if wc.get("rect"):
|
||||
@@ -1600,6 +1941,33 @@ def build_replay_from_raw_events(
|
||||
"click_relative": wc.get("click_relative"),
|
||||
}
|
||||
|
||||
tab_switch_target = _infer_tab_switch_target(events, evt)
|
||||
if tab_switch_target:
|
||||
target_spec = action.setdefault("target_spec", {})
|
||||
# Préférer une sémantique explicite d'onglet à un rôle brut
|
||||
# `yolo`/anchor-only quand le flux brut montre une vraie
|
||||
# bascule de focus dans la même application.
|
||||
if not target_spec.get("by_text"):
|
||||
target_spec["by_text"] = tab_switch_target["by_text"]
|
||||
target_spec["by_role"] = tab_switch_target["by_role"]
|
||||
target_spec["window_title"] = tab_switch_target["window_title"]
|
||||
target_spec["vlm_description"] = tab_switch_target["vlm_description"]
|
||||
context_hints = dict(target_spec.get("context_hints") or {})
|
||||
context_hints.update(tab_switch_target["context_hints"])
|
||||
target_spec["context_hints"] = context_hints
|
||||
action["visual_mode"] = True
|
||||
|
||||
close_tab_target = _infer_close_tab_target(events, evt)
|
||||
if close_tab_target:
|
||||
target_spec = action.setdefault("target_spec", {})
|
||||
target_spec["by_role"] = close_tab_target["by_role"]
|
||||
target_spec["window_title"] = close_tab_target["window_title"]
|
||||
target_spec["vlm_description"] = close_tab_target["vlm_description"]
|
||||
context_hints = dict(target_spec.get("context_hints") or {})
|
||||
context_hints.update(close_tab_target["context_hints"])
|
||||
target_spec["context_hints"] = context_hints
|
||||
action["visual_mode"] = True
|
||||
|
||||
elif evt_type == "text_input":
|
||||
text = evt.get("text", "")
|
||||
if not text:
|
||||
@@ -1695,6 +2063,21 @@ def build_replay_from_raw_events(
|
||||
if next_title:
|
||||
result[ci]["expected_window_title"] = next_title
|
||||
|
||||
# ── 9b. Pré-condition fiable : expected_window_before ──
|
||||
# Bug live 2026-05-22 (act_raw_c70976c8) : window.title d'un
|
||||
# mouse_click peut être obsolète quand une transition de fenêtre
|
||||
# (ex: ouverture dialog "Enregistrer sous") se produit juste avant
|
||||
# la capture du click. Sans correction, target_spec.window_title
|
||||
# reste sur l'ancien titre et la pré-vérif côté agent
|
||||
# (executor.py:653) déclenche une pause supervisée à tort.
|
||||
#
|
||||
# On rejoue les raw events en maintenant le dernier titre vu via
|
||||
# window_focus_change.to.title et on le pose comme
|
||||
# expected_window_before sur chaque clic qui n'en a pas déjà un.
|
||||
# Le champ est lu en priorité absolue par la pré-vérif agent, donc
|
||||
# il prime sur target_spec.window_title obsolète.
|
||||
_attach_expected_window_before(result, events)
|
||||
|
||||
# ── 10. Enrichir avec intention + expected_result via gemma4 (Critic) ──
|
||||
# gemma4 analyse chaque action dans son contexte pour produire :
|
||||
# - intention : ce que l'utilisateur veut accomplir
|
||||
|
||||
Reference in New Issue
Block a user