snapshot: WIP 5j replay reliability (B1 watchdog + dialog handlers + grounding drift)

Snapshot avant correction du blocage relance Léa (3 incidents 24h: SSH refusé,
polls morts ×2). Point de rollback stable.

Contenu:
- agent_v1/core/executor.py: 5 patchs dialog handling (saveas drift, close_tab
  hotkey fallback, confirm_save Unicode apostrophe, foreground dialog
  recontextualization, runtime_dialog in-loop) + helpers normalize_window_hint,
  requires_post_verify_window_transition
- agent_v1/core/grounding.py: garde drift template fix (fallback_x/y plumbed)
- server_v1/replay_watchdog.py (NEW): orphan watchdog B1, scan 10s timeout 30s
- server_v1/api_stream.py: dispatched_action plumbing, watchdog lifespan,
  metrics endpoint
- server_v1/replay_engine.py: _schedule_retry préserve original_action +
  dispatched_action
- stream_processor.py: gardes _infer_tab_switch_target (no false switch_tab
  on save_as dialog open) + _attach_expected_window_before
- tests/integration: test_replay_watchdog.py (8 cas), test_stream_processor.py
- tests/unit: test_executor_verify_window_guard.py (start_button, close_tab,
  runtime_dialog, post_verify, transition fallbacks)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-05-24 16:48:37 +02:00
parent 5ea4960e65
commit 7df51d2c79
47 changed files with 9811 additions and 451 deletions

View File

@@ -1025,6 +1025,345 @@ def enrich_click_from_screenshot(
return result
def _title_to_tab_label(window_title: str) -> str:
"""Réduire un titre de fenêtre en libellé d'onglet probable.
Exemples:
- "Sans titre Bloc-notes" -> "Sans titre"
- "*test Bloc-notes" -> "test"
"""
title = str(window_title or "").strip()
if not title:
return ""
for sep in (" ", " - "):
if sep in title:
head = title.split(sep, 1)[0].strip()
if head:
title = head
break
return title.lstrip("*").strip()
def _split_window_title_head_suffix(window_title: str) -> tuple[str, str]:
"""Découper un titre de fenêtre en ``(head, suffix)`` si possible.
Exemples:
- ``Sans titre Bloc-notes`` -> (``Sans titre``, ``Bloc-notes``)
- ``Page 1 - Google Chrome`` -> (``Page 1``, ``Google Chrome``)
- ``Enregistrer sous`` -> ("", "")
"""
title = str(window_title or "").strip()
if not title:
return "", ""
for sep in (" ", " - "):
if sep in title:
head, suffix = title.split(sep, 1)
head = head.strip()
suffix = suffix.strip()
if head and suffix:
return head, suffix
return "", ""
def _looks_like_same_app_tab_switch(from_title: str, to_title: str) -> bool:
    """Tell whether a focus transition looks like a genuine tab change.

    Both titles are decomposed with ``_split_window_title_head_suffix``.
    The transition qualifies only when each title yields a non-empty head
    and suffix and the two suffixes are equal, compared case-insensitively
    via ``casefold``. A title that cannot be decomposed (the helper returns
    empty parts) never qualifies; the intent stated by the original author
    is to keep same-app modal dialogs such as ``Enregistrer sous`` from
    being compiled as ``switch_tab``.

    Args:
        from_title: Title of the window that had focus before the change.
        to_title: Title of the window that received focus.

    Returns:
        ``True`` when both titles share the same application suffix.
    """
    source_parts = _split_window_title_head_suffix(from_title)
    target_parts = _split_window_title_head_suffix(to_title)
    # Every one of the four components must be non-empty.
    if not all(source_parts) or not all(target_parts):
        return False
    source_suffix = source_parts[1]
    target_suffix = target_parts[1]
    return source_suffix.casefold() == target_suffix.casefold()
def _infer_tab_switch_target(
raw_events: list,
click_event: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
"""Détecter un clic d'onglet à partir d'une bascule de focus dans la même app.
Cas réel observé:
- fenêtre active `http...txt Bloc-notes`
- clic dans la barre d'onglets (y relatif ~40 px)
- focus immédiat vers `Sans titre Bloc-notes`
Dans ce cas, l'ancre image seule est trop fragile. On enrichit donc le
target_spec avec un libellé d'onglet explicite (`by_text='Sans titre'`,
`by_role='tab'`).
"""
event_type = click_event.get("type", "")
if event_type != "mouse_click":
return None
window = click_event.get("window", {})
if not isinstance(window, dict):
return None
from_title = str(window.get("title", "")).strip()
app_name = str(window.get("app_name", "")).strip().lower()
if not from_title or not app_name:
return None
# Heuristique: on ne traite que les clics très hauts dans la fenêtre,
# typiques d'une barre d'onglets / bouton de fermeture d'onglet.
window_capture = click_event.get("window_capture", {})
if not isinstance(window_capture, dict):
return None
click_relative = window_capture.get("click_relative")
if not (isinstance(click_relative, list) and len(click_relative) == 2):
return None
try:
rel_y = int(click_relative[1])
except (TypeError, ValueError):
return None
if rel_y > 90:
return None
click_ts = click_event.get("timestamp")
click_pos = click_event.get("pos") or []
match_idx = None
for idx, raw_evt in enumerate(raw_events):
event_data = raw_evt.get("event", raw_evt)
if event_data.get("type") != "mouse_click":
continue
if event_data.get("timestamp") != click_ts:
continue
if (event_data.get("pos") or []) != click_pos:
continue
match_idx = idx
break
if match_idx is None:
return None
for follow_evt in raw_events[match_idx + 1: match_idx + 7]:
follow_data = follow_evt.get("event", follow_evt)
follow_type = follow_data.get("type", "")
if follow_type in {"mouse_click", "text_input", "key_press", "key_combo"}:
# Un autre geste utilisateur est intervenu avant le focus_change :
# le focus observé n'est plus attribuable avec confiance à CE clic.
return None
if follow_type != "window_focus_change":
continue
to_info = follow_data.get("to", {})
if not isinstance(to_info, dict):
continue
if str(to_info.get("app_name", "")).strip().lower() != app_name:
continue
to_title = str(to_info.get("title", "")).strip()
if not to_title or to_title == from_title:
continue
if not _looks_like_same_app_tab_switch(from_title, to_title):
return None
follow_ts = follow_data.get("timestamp")
if (
isinstance(click_ts, (int, float))
and isinstance(follow_ts, (int, float))
and follow_ts - click_ts > 3.0
):
break
tab_label = _title_to_tab_label(to_title)
if not tab_label:
return None
return {
"by_text": tab_label,
"by_role": "tab",
"window_title": from_title,
"context_hints": {
"window_title": from_title,
"switch_to_window_title": to_title,
"interaction": "switch_tab",
},
"vlm_description": (
f"Dans la fenêtre '{from_title}', l'onglet '{tab_label}' "
"dans la barre d'onglets en haut"
),
}
return None
def _infer_close_tab_target(
raw_events: list,
click_event: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
"""Détecter un clic sur le bouton fermer de l'onglet actif.
Pattern ciblé observé sur Bloc-notes moderne :
- clic très haut dans la barre d'onglets sur un titre ``*... Bloc-notes``
- un clic suivant dans la même fenêtre
- puis focus vers ``Enregistrer sous``
Cela correspond à la fermeture d'un onglet modifié qui déclenche ensuite
le flow de sauvegarde. On enrichit le clic avec un hint sémantique pour
viser le vrai bouton ``x`` de l'onglet actif plutôt qu'un simple `yolo`.
"""
event_type = click_event.get("type", "")
if event_type != "mouse_click":
return None
window = click_event.get("window", {})
if not isinstance(window, dict):
return None
from_title = str(window.get("title", "")).strip()
app_name = str(window.get("app_name", "")).strip().lower()
if not from_title or not app_name or not from_title.startswith("*"):
return None
window_capture = click_event.get("window_capture", {})
if not isinstance(window_capture, dict):
return None
click_relative = window_capture.get("click_relative")
if not (isinstance(click_relative, list) and len(click_relative) == 2):
return None
try:
rel_y = int(click_relative[1])
except (TypeError, ValueError):
return None
if rel_y > 90:
return None
click_ts = click_event.get("timestamp")
click_pos = click_event.get("pos") or []
match_idx = None
for idx, raw_evt in enumerate(raw_events):
event_data = raw_evt.get("event", raw_evt)
if event_data.get("type") != "mouse_click":
continue
if event_data.get("timestamp") != click_ts:
continue
if (event_data.get("pos") or []) != click_pos:
continue
match_idx = idx
break
if match_idx is None:
return None
saw_follow_click_same_window = False
for follow_evt in raw_events[match_idx + 1: match_idx + 8]:
follow_data = follow_evt.get("event", follow_evt)
follow_type = follow_data.get("type", "")
if follow_type in {"text_input", "key_press", "key_combo"}:
return None
if follow_type == "mouse_click":
follow_window = follow_data.get("window", {})
if not isinstance(follow_window, dict):
return None
follow_app = str(follow_window.get("app_name", "")).strip().lower()
follow_title = str(follow_window.get("title", "")).strip()
if follow_app != app_name:
return None
if follow_title == from_title:
saw_follow_click_same_window = True
continue
return None
if follow_type != "window_focus_change" or not saw_follow_click_same_window:
continue
to_info = follow_data.get("to", {})
if not isinstance(to_info, dict):
continue
if str(to_info.get("app_name", "")).strip().lower() != app_name:
continue
to_title = str(to_info.get("title", "")).strip()
if to_title != "Enregistrer sous":
continue
follow_ts = follow_data.get("timestamp")
if (
isinstance(click_ts, (int, float))
and isinstance(follow_ts, (int, float))
and follow_ts - click_ts > 5.0
):
break
tab_label = _title_to_tab_label(from_title)
if not tab_label:
return None
return {
"by_text": "",
"by_role": "tab_close_button",
"window_title": from_title,
"context_hints": {
"window_title": from_title,
"active_tab_label": tab_label,
"interaction": "close_tab",
},
"vlm_description": (
f"Dans la fenêtre '{from_title}', le bouton x pour fermer "
f"l'onglet actif '{tab_label}' dans la barre d'onglets en haut"
),
}
return None
def _attach_expected_window_before(actions: list, raw_events: list) -> None:
"""Attacher la fenêtre attendue AVANT chaque clic en rejouant les
raw events et en conservant le dernier ``window_focus_change.to.title``.
Pourquoi : ``mouse_click.window.title`` capturé pendant
l'enregistrement peut être obsolète si une transition de fenêtre
se produit juste avant la capture (ex: dialog Windows qui s'ouvre
milliseconde avant le clic suivant). Le serveur dispose pourtant
des ``window_focus_change`` consécutifs — on s'en sert pour poser
explicitement ``expected_window_before`` sur le clic, lu en priorité
absolue par la pré-vérif côté agent.
Idempotent : si une action a déjà ``expected_window_before``, on
ne touche pas.
"""
if not actions or not raw_events:
return
last_focus_title = ""
action_idx = 0
def _next_click_idx(start: int) -> int:
i = start
while i < len(actions) and actions[i].get("type") != "click":
i += 1
return i
for raw_evt in raw_events:
ev = raw_evt.get("event", raw_evt) if isinstance(raw_evt, dict) else {}
etype = ev.get("type", "")
if etype == "window_focus_change":
to_info = ev.get("to") or {}
title = str(to_info.get("title", "") or "").strip()
if title and title != "unknown_window":
last_focus_title = title
continue
if etype != "mouse_click":
continue
action_idx = _next_click_idx(action_idx)
if action_idx >= len(actions):
return
a = actions[action_idx]
if last_focus_title and not a.get("expected_window_before"):
a["expected_window_before"] = last_focus_title
action_idx += 1
def _attach_expected_screenshots(
actions: list, raw_events: list, session_dir: Path,
) -> None:
@@ -1591,6 +1930,8 @@ def build_replay_from_raw_events(
k: v for k, v in enrichment.items()
if k != "by_position" # by_position est déjà dans x_pct/y_pct
}
if action.get("window_title") and not action["target_spec"].get("window_title"):
action["target_spec"]["window_title"] = action["window_title"]
# Ajouter les métadonnées fenêtre pour le grounding ciblé
wc = evt.get("window_capture", {})
if wc.get("rect"):
@@ -1600,6 +1941,33 @@ def build_replay_from_raw_events(
"click_relative": wc.get("click_relative"),
}
tab_switch_target = _infer_tab_switch_target(events, evt)
if tab_switch_target:
target_spec = action.setdefault("target_spec", {})
# Préférer une sémantique explicite d'onglet à un rôle brut
# `yolo`/anchor-only quand le flux brut montre une vraie
# bascule de focus dans la même application.
if not target_spec.get("by_text"):
target_spec["by_text"] = tab_switch_target["by_text"]
target_spec["by_role"] = tab_switch_target["by_role"]
target_spec["window_title"] = tab_switch_target["window_title"]
target_spec["vlm_description"] = tab_switch_target["vlm_description"]
context_hints = dict(target_spec.get("context_hints") or {})
context_hints.update(tab_switch_target["context_hints"])
target_spec["context_hints"] = context_hints
action["visual_mode"] = True
close_tab_target = _infer_close_tab_target(events, evt)
if close_tab_target:
target_spec = action.setdefault("target_spec", {})
target_spec["by_role"] = close_tab_target["by_role"]
target_spec["window_title"] = close_tab_target["window_title"]
target_spec["vlm_description"] = close_tab_target["vlm_description"]
context_hints = dict(target_spec.get("context_hints") or {})
context_hints.update(close_tab_target["context_hints"])
target_spec["context_hints"] = context_hints
action["visual_mode"] = True
elif evt_type == "text_input":
text = evt.get("text", "")
if not text:
@@ -1695,6 +2063,21 @@ def build_replay_from_raw_events(
if next_title:
result[ci]["expected_window_title"] = next_title
# ── 9b. Pré-condition fiable : expected_window_before ──
# Bug live 2026-05-22 (act_raw_c70976c8) : window.title d'un
# mouse_click peut être obsolète quand une transition de fenêtre
# (ex: ouverture dialog "Enregistrer sous") se produit juste avant
# la capture du click. Sans correction, target_spec.window_title
# reste sur l'ancien titre et la pré-vérif côté agent
# (executor.py:653) déclenche une pause supervisée à tort.
#
# On rejoue les raw events en maintenant le dernier titre vu via
# window_focus_change.to.title et on le pose comme
# expected_window_before sur chaque clic qui n'en a pas déjà un.
# Le champ est lu en priorité absolue par la pré-vérif agent, donc
# il prime sur target_spec.window_title obsolète.
_attach_expected_window_before(result, events)
# ── 10. Enrichir avec intention + expected_result via gemma4 (Critic) ──
# gemma4 analyse chaque action dans son contexte pour produire :
# - intention : ce que l'utilisateur veut accomplir