snapshot: WIP 5j replay reliability (B1 watchdog + dialog handlers + grounding drift)

Snapshot avant correction du blocage relance Léa (3 incidents 24h: SSH refusé,
polls morts ×2). Point de rollback stable.

Contenu:
- agent_v1/core/executor.py: 5 patchs dialog handling (saveas drift, close_tab
  hotkey fallback, confirm_save Unicode apostrophe, foreground dialog
  recontextualization, runtime_dialog in-loop) + helpers normalize_window_hint,
  requires_post_verify_window_transition
- agent_v1/core/grounding.py: garde drift template fix (fallback_x/y plumbed)
- server_v1/replay_watchdog.py (NEW): orphan watchdog B1, scan 10s timeout 30s
- server_v1/api_stream.py: dispatched_action plumbing, watchdog lifespan,
  metrics endpoint
- server_v1/replay_engine.py: _schedule_retry préserve original_action +
  dispatched_action
- stream_processor.py: gardes _infer_tab_switch_target (no false switch_tab
  on save_as dialog open) + _attach_expected_window_before
- tests/integration: test_replay_watchdog.py (8 cas), test_stream_processor.py
- tests/unit: test_executor_verify_window_guard.py (start_button, close_tab,
  runtime_dialog, post_verify, transition fallbacks)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-05-24 16:48:37 +02:00
parent 5ea4960e65
commit 7df51d2c79
47 changed files with 9811 additions and 451 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -74,6 +74,142 @@ class GroundingEngine:
"""
self._executor = executor
@staticmethod
def _should_scope_to_active_window(target_spec: Dict[str, Any]) -> bool:
    """Tell whether grounding should be restricted to the active window.

    Both ``screen_scope`` and ``by_role`` are compared after ``str()``,
    whitespace stripping and lower-casing.

    Returns:
        False when ``screen_scope`` is ``"full_screen"`` or ``by_role`` is
        ``"start_button"``; True in every other case.
    """
    def _normalized(key: str) -> str:
        return str(target_spec.get(key, "")).strip().lower()

    wants_full_screen = _normalized("screen_scope") == "full_screen"
    # `or` short-circuits, so by_role is only read when needed.
    return not (wants_full_screen or _normalized("by_role") == "start_button")
@staticmethod
def _targets_lea_window(target_spec: Dict[str, Any]) -> bool:
    """Tell whether the target explicitly points at Léa's own UI.

    The check is delegated to ``est_fenetre_lea`` from ``..ui.messages``;
    when that helper cannot be imported the answer is always False.

    Returns:
        True if any non-empty hint (window title, nested context window
        title, VLM description or ``by_text``) is recognised by
        ``est_fenetre_lea``.
    """
    try:
        from ..ui.messages import est_fenetre_lea
    except Exception:
        return False

    nested_hints = target_spec.get("context_hints") or {}
    candidates = (
        target_spec.get("window_title", ""),
        nested_hints.get("window_title", ""),
        target_spec.get("vlm_description", ""),
        target_spec.get("by_text", ""),
    )
    for candidate in candidates:
        if candidate and est_fenetre_lea(str(candidate)):
            return True
    return False
@staticmethod
def _is_plausible_window_rect(
    rect: Optional[List[int]],
    title: str,
    screen_width: int,
    screen_height: int,
) -> bool:
    """Check that an active-window rect looks like a real, usable window.

    Rejects missing or malformed rects, rects that are 50 units or less in
    either dimension, windows whose (known) title is flagged as noise by
    ``est_fenetre_bruit``, and "bar-like" zones such as a taskbar or
    systray. Windows whose title contains a known dialog keyword are
    accepted as soon as they are at least 80 high.

    Args:
        rect: ``[left, top, right, bottom]`` of the candidate window.
        title: Title of the candidate window (may be empty).
        screen_width: Screen width used for the "very wide" test.
        screen_height: Screen height used for the "very flat" test.

    Returns:
        True when grounding may safely be constrained to ``rect``.
    """
    if not rect or len(rect) != 4:
        return False

    try:
        from ..ui.messages import est_fenetre_bruit
    except Exception:
        # Local stand-in when the UI helper is unavailable.
        def est_fenetre_bruit(_title: str) -> bool:
            return not _title or _title.strip().lower() == "unknown_window"

    width = rect[2] - rect[0]
    height = rect[3] - rect[1]
    cleaned = str(title or "").strip()
    if width <= 50 or height <= 50:
        return False

    lowered = cleaned.lower()
    has_known_title = bool(cleaned) and lowered != "unknown_window"
    # The noise filter is only consulted for titles that are actually known.
    if has_known_title and est_fenetre_bruit(cleaned):
        return False

    # A very flat zone, or a very wide and short one, is typically a
    # taskbar / systray rather than a window. The 120 height threshold is
    # kept low so that small modals are not rejected.
    if height < 120:
        bar_like = True
    else:
        bar_like = width > 0.9 * screen_width and height < 0.15 * screen_height

    # Known dialog keywords make a small window plausible, provided it is
    # at least 80 high (title + button).
    dialog_keywords = (
        "enregistrer sous",
        "save as",
        "voulez-vous",
        "confirm",
        "attention",
        "error",
        "erreur",
    )
    for keyword in dialog_keywords:
        if keyword in lowered:
            return height >= 80
    return not bar_like
@staticmethod
def _visual_scope_hints(target_spec: Dict[str, Any]) -> List[str]:
    """Build the text hints to look for inside the window crop.

    Hints come from the target window title, the nested
    ``context_hints["window_title"]`` and ``by_text``. Each source text is
    stripped of surrounding whitespace and leading ``*`` markers, then
    expanded with the parts obtained by splitting on each separator.

    Empty separators are skipped: ``"" in text`` is always true and
    ``str.split("")`` raises ``ValueError``, which would make every
    non-empty hint crash. Split parts get the same clean-up as the full
    text (strip, drop leading ``*``, strip again) so no variant keeps a
    stray leading space.

    Returns:
        De-duplicated hints of at least 3 characters, in discovery order.
    """
    def _clean(fragment: str) -> str:
        return fragment.strip().lstrip("*").strip()

    hints: List[str] = []
    raw_hints = [
        target_spec.get("window_title", ""),
        (target_spec.get("context_hints") or {}).get("window_title", ""),
        target_spec.get("by_text", ""),
    ]
    for raw in raw_hints:
        text = _clean(str(raw or ""))
        if not text:
            continue
        variants = [text]
        for sep in (" ", " - ", ""):
            # Guard against the empty separator (see docstring).
            if not sep or sep not in text:
                continue
            variants.extend(_clean(part) for part in text.split(sep))
        for variant in variants:
            if len(variant) >= 3 and variant not in hints:
                hints.append(variant)
    return hints
def _window_crop_matches_target_visually(
    self,
    screenshot_b64: str,
    target_spec: Dict[str, Any],
) -> bool:
    """Visually confirm that a window-constrained crop shows the target.

    The system-reported rect is not trusted on its own: each text hint
    derived from ``target_spec`` is searched in the crop through the
    executor's ``_find_text_on_screen``. The crop is accepted without a
    check when there is no text hint (purely iconic targets) or when the
    executor exposes no callable finder.

    Args:
        screenshot_b64: Base64 screenshot of the constrained crop.
        target_spec: Target description used to derive the text hints.

    Returns:
        True if the crop is accepted, False if no hint was found in it.
    """
    text_hints = self._visual_scope_hints(target_spec)
    if not text_hints:
        return True

    find_text = getattr(self._executor, "_find_text_on_screen", None)
    if not callable(find_text):
        return True

    for text_hint in text_hints:
        try:
            matched = bool(find_text(screenshot_b64, text_hint))
        except Exception as e:
            # A failing lookup for one hint must not block the next ones.
            logger.debug("Validation visuelle du crop échouée pour '%s': %s", text_hint, e)
            continue
        if matched:
            logger.info(
                "Grounding fenêtre validé visuellement via '%s'",
                text_hint,
            )
            return True

    logger.info(
        "Grounding plein écran : crop fenêtre rejeté par validation visuelle "
        "(hints=%s)",
        text_hints,
    )
    return False
def locate(
self,
server_url: str,
@@ -128,35 +264,63 @@ class GroundingEngine:
t_start = time.time()
# ── Capture contrainte à la fenêtre active ──
# Le grounding ne voit QUE la fenêtre attendue — pas la taskbar,
# pas le systray, pas les autres apps. Comme un humain qui regarde
# l'application sur laquelle il travaille.
window_rect = None
try:
from ..window_info_crossplatform import get_active_window_rect
win_info = get_active_window_rect()
if win_info and win_info.get("rect"):
r = win_info["rect"] # [left, top, right, bottom]
# Validation : fenêtre visible et pas minuscule
w = r[2] - r[0]
h = r[3] - r[1]
if w > 50 and h > 50:
window_rect = {
"left": max(0, r[0]),
"top": max(0, r[1]),
"width": min(w, screen_width),
"height": min(h, screen_height),
}
logger.info(
f"Grounding contraint à la fenêtre : "
f"{window_rect['width']}x{window_rect['height']} "
f"à ({window_rect['left']}, {window_rect['top']})"
)
except Exception as e:
logger.debug(f"Pas de window rect disponible : {e}")
active_title = ""
if self._should_scope_to_active_window(target_spec):
# ── Capture contrainte à la fenêtre active ──
# Le grounding ne voit QUE la fenêtre attendue — pas la taskbar,
# pas le systray, pas les autres apps. Comme un humain qui regarde
# l'application sur laquelle il travaille.
try:
from ..window_info_crossplatform import get_active_window_rect
from ..ui.messages import est_fenetre_lea
win_info = get_active_window_rect()
if win_info and win_info.get("rect"):
active_title = str(win_info.get("title", "") or "")
if est_fenetre_lea(active_title) and not self._targets_lea_window(target_spec):
logger.info(
"Grounding plein écran : fenêtre active Léa ignorée pour "
"cible externe (%s)",
target_spec.get("by_text", "") or target_spec.get("by_role", ""),
)
win_info = None
if win_info and win_info.get("rect"):
r = win_info["rect"] # [left, top, right, bottom]
if self._is_plausible_window_rect(r, active_title, screen_width, screen_height):
w = r[2] - r[0]
h = r[3] - r[1]
window_rect = {
"left": max(0, r[0]),
"top": max(0, r[1]),
"width": min(w, screen_width),
"height": min(h, screen_height),
}
logger.info(
f"Grounding contraint à la fenêtre : "
f"{window_rect['width']}x{window_rect['height']} "
f"à ({window_rect['left']}, {window_rect['top']})"
)
else:
logger.info(
"Grounding plein écran : rect actif rejeté "
"(title='%s', rect=%s)",
active_title,
r,
)
except Exception as e:
logger.debug(f"Pas de window rect disponible : {e}")
else:
logger.info(
"Grounding plein écran pour by_role='%s'",
target_spec.get("by_role", ""),
)
screenshot_b64 = self._capture_window_or_screen(window_rect)
if window_rect and screenshot_b64:
if not self._window_crop_matches_target_visually(screenshot_b64, target_spec):
window_rect = None
screenshot_b64 = self._capture_window_or_screen(None)
if not screenshot_b64:
return GroundingResult(
found=False, detail="Capture screenshot échouée",
@@ -186,6 +350,18 @@ class GroundingEngine:
result.elapsed_ms = (time.time() - t_start) * 1000
return result
if target_spec.get("allow_position_fallback"):
if 0.0 <= fallback_x <= 1.0 and 0.0 <= fallback_y <= 1.0:
return GroundingResult(
found=True,
x_pct=fallback_x,
y_pct=fallback_y,
method="position_fallback",
score=0.2,
detail="fallback positionnel explicite",
elapsed_ms=(time.time() - t_start) * 1000,
)
return GroundingResult(
found=False,
detail=f"Toutes les stratégies ont échoué ({', '.join(strategies)})",
@@ -258,7 +434,12 @@ class GroundingEngine:
anchor_b64 = target_spec.get("anchor_image_base64", "")
if anchor_b64:
raw = self._executor._template_match_anchor(
screenshot_b64, anchor_b64, screen_width, screen_height,
screenshot_b64,
anchor_b64,
screen_width,
screen_height,
fallback_x_pct=fallback_x,
fallback_y_pct=fallback_y,
)
if raw and raw.get("resolved"):
return GroundingResult(

View File

@@ -0,0 +1,39 @@
"""Dispatch léger du contrat enrichi de /finalize côté agent."""
from __future__ import annotations
import logging
from typing import Any, Dict
logger = logging.getLogger(__name__)
def dispatch_finalize_result(ui: Any, payload: Dict[str, Any], replay_name: str) -> None:
    """Route the /finalize result to the appropriate agent UI surface.

    Nothing happens when the payload is not a dict, when the server
    reports the replay as already ``"started"``, or when the payload is
    not replay-ready / carries no replay request. Otherwise the UI is
    asked to offer an immediate replay, provided it exposes
    ``offer_finalize_replay``.

    Args:
        ui: Agent UI object, or None.
        payload: JSON payload returned by /finalize.
        replay_name: Display name of the recorded task; a generic label
            is used when empty.
    """
    if not isinstance(payload, dict):
        return

    request = payload.get("replay_request") or {}
    launch_status = (payload.get("replay_launch") or {}).get("status")

    if launch_status == "started":
        logger.info("Replay direct déjà lancé par le serveur après finalize")
        return

    if not (payload.get("replay_ready") and request):
        return

    # A failed server-side auto-replay still falls through to the manual offer.
    if launch_status == "failed":
        logger.warning(
            "Auto-replay serveur échoué après finalize, proposition manuelle"
        )

    if ui is None or not hasattr(ui, "offer_finalize_replay"):
        logger.info("UI indisponible pour proposer un test immédiat")
        return

    label = replay_name or "la tâche que vous venez d'enregistrer"
    ui.offer_finalize_replay(request, label)

View File

@@ -28,6 +28,7 @@ from .ui.chat_window import ChatWindow
from .ui.capture_server import CaptureServer
from .session.storage import SessionStorage
from .vision.capturer import VisionCapturer
from .finalize_contract import dispatch_finalize_result
# Import optionnel du client serveur (pour le chat et les workflows)
# Deux chemins : relatif (depuis agent_v0.agent_v1) ou absolu (depuis C:\rpa_vision\agent_v1)
@@ -80,6 +81,7 @@ class AgentV1:
self._executor = None
# Flag pour indiquer qu'un replay est en cours (eviter les conflits)
self._replay_active = False
self._last_recording_name = ""
# Etat partage entre systray et chat (source de verite unique)
self._state = AgentState()
@@ -210,12 +212,14 @@ class AgentV1:
time.sleep(30) # Vérifier toutes les 30s
def start_session(self, workflow_name):
self._last_recording_name = workflow_name
self.session_id = f"sess_{time.strftime('%Y%m%dT%H%M%S')}_{uuid.uuid4().hex[:6]}"
self.session_dir = self.storage.get_session_dir(self.session_id)
self.vision = VisionCapturer(str(self.session_dir))
self.streamer = TraceStreamer(self.session_id, machine_id=self.machine_id)
self.streamer.set_on_finalize_result(self._on_finalize_result)
self.captor = EventCaptorV1(self._on_event_bridge)
# Initialiser l'executeur partage
@@ -325,6 +329,15 @@ class AgentV1:
# pour enchainer les actions du workflow
time.sleep(0.2)
else:
if getattr(self._executor, "_replay_paused", False):
if not self._replay_active:
self._replay_active = True
self.ui.set_replay_active(True)
self._state.set_replay_active(True)
poll_delay = getattr(self._executor, '_poll_backoff', REPLAY_POLL_INTERVAL)
time.sleep(max(poll_delay, REPLAY_POLL_INTERVAL))
continue
# Pas d'action en attente — utiliser le backoff de l'executor
# (augmente si le serveur est indisponible, reset a 1s sinon)
if self._replay_active:
@@ -429,6 +442,11 @@ class AgentV1:
f"agent_{self.user_id}"
)
def _on_finalize_result(self, payload: dict) -> None:
    """Forward the /finalize payload to ``dispatch_finalize_result``.

    The last recording name is used as the replay label, with a generic
    label when no name was recorded.
    """
    label = self._last_recording_name
    if not label:
        label = "la tâche que vous venez d'enregistrer"
    dispatch_finalize_result(self.ui, payload, label)
_last_heartbeat_hash: str = ""
def _heartbeat_loop(self):

View File

@@ -30,6 +30,7 @@ import os
import queue
import threading
import time
from typing import Callable, Optional
import requests
from PIL import Image
@@ -95,6 +96,11 @@ class TraceStreamer:
# Initialisé paresseusement pour ne pas payer le coût SQLite en dehors
# d'un streaming actif.
self._buffer: PersistentBuffer | None = None
self._on_finalize_result: Optional[Callable[[dict], None]] = None
def set_on_finalize_result(self, callback: Optional[Callable[[dict], None]]) -> None:
    """Register the callback that receives the JSON payload of /finalize.

    Args:
        callback: Callable taking the payload dict, or None to clear it.
    """
    self._on_finalize_result = callback
def _get_buffer(self) -> PersistentBuffer:
"""Retourne le buffer persistant, en l'initialisant au besoin."""
@@ -621,6 +627,14 @@ class TraceStreamer:
if resp.ok:
result = resp.json()
logger.info(f"Session finalisée: {result}")
if self._on_finalize_result is not None:
try:
self._on_finalize_result(result)
except Exception as cb_error:
logger.warning(
"Callback finalize ignoré après erreur: %s",
cb_error,
)
else:
logger.warning(f"Finalisation échouée: {resp.status_code}")
except Exception as e:

View File

@@ -158,14 +158,25 @@ class CaptureHandler(BaseHTTPRequestHandler):
"""Capture l'ecran principal et le renvoie en base64 JPEG."""
t0 = time.perf_counter()
try:
import mss
from PIL import Image
from ..vision.capturer import (
capture_foreground_window_image,
capture_screen_image,
)
with mss.mss() as sct:
monitor = sct.monitors[1] # ecran principal
raw = sct.grab(monitor)
img = Image.frombytes("RGB", raw.size, raw.bgra, "raw", "BGRX")
_monitor, img, meta = capture_screen_image()
if img is None:
img, win_meta = capture_foreground_window_image()
meta.update(win_meta)
if img is None:
elapsed_ms = (time.perf_counter() - t0) * 1000
logger.error("Erreur capture : aucun backend exploitable (%s)", meta)
self._send_json(503, {
"error": "capture_unavailable",
"source": meta.get("backend", "unknown"),
"capture_ms": round(elapsed_ms),
"diagnostics": meta,
})
return
# Floutage des données sensibles (conformité AI Act)
if BLUR_SENSITIVE:
@@ -180,15 +191,22 @@ class CaptureHandler(BaseHTTPRequestHandler):
img_b64 = base64.b64encode(buf.getvalue()).decode()
elapsed_ms = (time.perf_counter() - t0) * 1000
logger.info(f"Capture {img.width}x{img.height} en {elapsed_ms:.0f}ms")
logger.info(
"Capture %sx%s via %s en %.0fms",
img.width,
img.height,
meta.get("backend", "unknown"),
elapsed_ms,
)
self._send_json(200, {
"image": img_b64,
"width": img.width,
"height": img.height,
"format": "jpeg",
"source": "windows_live",
"source": meta.get("backend", "windows_live"),
"capture_ms": round(elapsed_ms),
"diagnostics": meta,
})
except Exception as e:

View File

@@ -894,6 +894,34 @@ class ChatWindow:
except Exception:
logger.debug("clear chat history silenced", exc_info=True)
@staticmethod
def _compute_paused_bubble_height(reason_str: str) -> tuple:
    """Compute the Text widget height and scrollbar need for a paused bubble.

    The needed line count is the larger of two estimates: wrapping at
    roughly 60 characters per line, and the number of explicit newline
    separated lines in the message. The height is clamped to the range
    2..12 lines. A scrollbar is requested as soon as the estimate reaches
    the cap, or when the message is longer than 200 characters (safety
    net for content that overflows despite few raw lines).

    Args:
        reason_str: Pause reason to display; may be empty.

    Returns:
        ``(height_lines, needs_scrollbar)``.
    """
    if not reason_str:
        return 2, False

    content = str(reason_str)
    max_lines = 12
    by_wrapping = len(content) // 60 + 1
    by_newlines = content.count("\n") + 1
    needed = by_wrapping if by_wrapping >= by_newlines else by_newlines

    if needed < 2:
        shown = 2
    elif needed > max_lines:
        shown = max_lines
    else:
        shown = needed

    overflow = needed >= max_lines or len(content) > 200
    return shown, overflow
def _render_paused_bubble(self, payload: Dict[str, Any]) -> None:
tk = self._tk
if getattr(self, "_msg_frame", None) is None:
@@ -923,22 +951,23 @@ class ChatWindow:
# Message scrollable pour les longs reasons (ex: 200+ chars depuis le serveur).
# On utilise un Text en mode read-only avec hauteur calculée selon la longueur.
# Au-delà de 280 chars, scrollbar interne ; sinon Text auto-fitté.
# Patch 22 mai 2026 : prendre en compte les \n explicites (titres
# fenêtre / patterns) et activer la scrollbar dès que le cap de
# hauteur est atteint — sinon les bulles de pause étaient
# tronquées visuellement sans aucun ascenseur visible.
reason_str = str(reason)
# Estimation simple : ~70 chars/ligne avec wraplength
approx_lines = max(2, min(8, (len(reason_str) // 60) + 1))
height_lines, needs_scroll = self._compute_paused_bubble_height(reason_str)
msg_frame = tk.Frame(inner, bg=PAUSED_BG)
msg_frame.pack(fill=tk.X, anchor=tk.W, pady=(6, 0))
reason_text = tk.Text(
msg_frame, bg=PAUSED_BG, fg=PAUSED_FG,
font=FONT_MSG, wrap=tk.WORD, bd=0, height=approx_lines,
font=FONT_MSG, wrap=tk.WORD, bd=0, height=height_lines,
highlightthickness=0, relief=tk.FLAT, cursor="arrow",
)
reason_text.insert("1.0", reason_str)
reason_text.configure(state="disabled")
reason_text.pack(side=tk.LEFT, fill=tk.X, expand=True)
# Scrollbar interne uniquement si le contenu déborde (long messages)
if len(reason_str) > 280:
if needs_scroll:
reason_scroll = tk.Scrollbar(
msg_frame, orient=tk.VERTICAL,
command=reason_text.yview, width=8,
@@ -1019,27 +1048,40 @@ class ChatWindow:
UX fix 8 mai 2026 : on désactive les 2 boutons et on affiche un message
de feedback dès le clic, sans attendre l'ack serveur. Le bus émet en
arrière-plan ; si la connexion est tombée, on log un warning visible.
Fallback HTTP 22 mai 2026 : si le bus SocketIO est déconnecté, on
retombe sur un POST direct ``/replay/{id}/resume`` via
``server_client``. Si les deux échouent, on ré-active les boutons
et on saute l'auto-hide pour permettre à l'utilisateur de
réessayer manuellement (sinon le replay reste figé côté serveur).
"""
if not replay_id:
self._update_paused_feedback("⚠ replay_id manquant — impossible de relancer")
return
emitted = False
if self._bus is not None and self._bus.connected:
emitted = self._bus.resume_replay(replay_id)
# Feedback immédiat : disable boutons + message
emitted, channel = self._dispatch_paused_action(
replay_id,
bus_method="resume_replay",
client_method="resume_replay",
)
self._disable_paused_buttons()
if emitted:
self._update_paused_feedback("→ Reprise demandée…")
logger.info("paused_bubble: lea:replay_resume émis pour %s", replay_id)
else:
self._update_paused_feedback("⚠ Bus indisponible — réessayez dans 5s")
logger.warning("paused_bubble: bus déconnecté, resume non émis")
# UX fix mai 2026 : minimiser la fenêtre vers le systray après 500ms
# (laisse à l'utilisateur le temps de voir "Reprise demandée…").
try:
self._root.after(500, self._do_hide)
except Exception:
logger.debug("auto-hide on resume silenced", exc_info=True)
logger.info(
"paused_bubble: replay_resume émis pour %s via %s",
replay_id, channel,
)
try:
self._root.after(500, self._do_hide)
except Exception:
logger.debug("auto-hide on resume silenced", exc_info=True)
return
# Échec sur les deux canaux : laisser l'utilisateur réessayer.
self._update_paused_feedback("⚠ Serveur injoignable — réessayez")
self._enable_paused_buttons()
logger.warning(
"paused_bubble: bus et HTTP indisponibles, resume non émis "
"pour %s", replay_id,
)
def _on_paused_abort(self, replay_id: str) -> None:
"""Bouton Annuler : émettre lea:replay_abort + fermeture locale immédiate.
@@ -1048,17 +1090,30 @@ class ChatWindow:
n'envoie pas de lea:resumed pour un abort, donc sans cette fermeture
locale la bulle restait coincée — c'était la cause de "Annuler ne
fonctionne pas" rapportée par Dom).
Fallback HTTP 22 mai 2026 : symétrique de ``_on_paused_resume`` —
si le bus est déconnecté, POST direct ``/replay/{id}/cancel``.
L'abort ferme la bulle localement quelle que soit l'issue (l'état
serveur sera réconcilié au prochain poll /replay/next).
"""
emitted = False
if self._bus is not None and self._bus.connected:
emitted = self._bus.abort_replay(replay_id)
emitted, channel = self._dispatch_paused_action(
replay_id,
bus_method="abort_replay",
client_method="abort_replay",
)
self._disable_paused_buttons()
if emitted:
self._update_paused_feedback("✗ Annulé")
logger.info("paused_bubble: lea:replay_abort émis pour %s", replay_id)
logger.info(
"paused_bubble: replay_abort émis pour %s via %s",
replay_id, channel,
)
else:
self._update_paused_feedback("✗ Annulé (bus indisponible)")
logger.warning("paused_bubble: bus déconnecté, abort non émis")
self._update_paused_feedback("✗ Annulé (serveur injoignable)")
logger.warning(
"paused_bubble: bus et HTTP indisponibles, abort non émis "
"pour %s", replay_id,
)
# Fermer la bulle en local (l'abort n'a pas de lea:resumed associé)
self._close_active_paused_bubble(reason="abort_local")
# UX fix mai 2026 : minimiser la fenêtre après 500ms (cohérence
@@ -1068,6 +1123,34 @@ class ChatWindow:
except Exception:
logger.debug("auto-hide on abort silenced", exc_info=True)
def _dispatch_paused_action(
    self,
    replay_id: str,
    bus_method: str,
    client_method: str,
) -> tuple:
    """Send a paused-bubble action through the bus, then fall back to HTTP.

    The bus is tried first when it exists and reports ``connected``. If it
    is unavailable, returns a falsy value or raises, the same action is
    attempted on ``self._server_client`` when that client exposes
    ``client_method``. Exceptions on either channel are logged at debug
    level and treated as a failure of that channel.

    Args:
        replay_id: Identifier of the replay the action applies to.
        bus_method: Name of the method to call on the bus.
        client_method: Name of the method to call on the server client.

    Returns:
        ``(emitted, channel)`` where ``channel`` is ``"bus"``, ``"http"``
        or ``""`` when no channel succeeded.
    """
    bus = self._bus
    if bus is not None and getattr(bus, "connected", False):
        try:
            sent_on_bus = bool(getattr(bus, bus_method)(replay_id))
        except Exception:
            logger.debug("paused_bubble: bus %s silenced", bus_method, exc_info=True)
        else:
            if sent_on_bus:
                return True, "bus"

    client = self._server_client
    if client is not None and hasattr(client, client_method):
        try:
            sent_over_http = bool(getattr(client, client_method)(replay_id))
        except Exception:
            logger.debug(
                "paused_bubble: server_client %s silenced",
                client_method, exc_info=True,
            )
        else:
            if sent_over_http:
                return True, "http"

    return False, ""
def _disable_paused_buttons(self) -> None:
if not self._active_paused_bubble:
return
@@ -1077,6 +1160,19 @@ class ChatWindow:
except Exception:
logger.debug("disable paused buttons silenced", exc_info=True)
def _enable_paused_buttons(self) -> None:
    """Re-enable the resume and abort buttons of the active paused bubble.

    Does nothing when there is no active paused bubble. Any error raised
    while reconfiguring the buttons is logged at debug level and
    swallowed, so a UI glitch never propagates to the caller.
    """
    bubble = self._active_paused_bubble
    if not bubble:
        return
    try:
        for button_key in ("btn_resume", "btn_abort"):
            bubble[button_key].config(state="normal")
    except Exception:
        logger.debug("enable paused buttons silenced", exc_info=True)
def _update_paused_feedback(self, text: str) -> None:
if not self._active_paused_bubble:
return

View File

@@ -504,6 +504,100 @@ class SmartTrayV1:
threading.Thread(target=_replay, daemon=True).start()
def _launch_replay_request(
    self,
    replay_request: Dict[str, Any],
    replay_name: str,
) -> None:
    """Start a direct replay from a server-provided ``replay_request``.

    Only the ``/api/v1/traces/stream/replay-session`` endpoint with a
    non-empty ``session_id`` is supported; anything else is logged and
    reported to the user. The HTTP POST runs on a daemon thread, with the
    replay-active flag raised for its duration.

    Args:
        replay_request: Dict with ``endpoint``, ``session_id`` and an
            optional ``machine_id`` (defaults to ``self.machine_id``).
        replay_name: Task name shown in the user notification.
    """
    request = replay_request or {}
    endpoint = request.get("endpoint", "")
    session_id = request.get("session_id", "")
    machine_id = request.get("machine_id") or self.machine_id

    supported = endpoint == "/api/v1/traces/stream/replay-session" and session_id
    if not supported:
        logger.warning("Replay request non supporté: %s", replay_request)
        self._notifier.notify(
            "Léa",
            "Je ne peux pas lancer ce test automatique pour le moment.",
        )
        return

    def _mark_replay(active: bool) -> None:
        # NOTE(review): the icon refresh is assumed to happen after the
        # lock is released — confirm against the original indentation.
        with self._state_lock:
            self._replay_active = active
        self._update_icon()

    def _replay():
        if self.server_client is None:
            return

        _mark_replay(True)
        self._notifier.notify(
            "Léa",
            f"Le système d'intelligence artificielle exécute la "
            f"tâche '{replay_name}' sur votre écran.",
        )

        try:
            import requests

            if self.server_client is not None:
                auth_headers = self.server_client._auth_headers()
            else:
                auth_headers = {}

            query = {
                "session_id": session_id,
                "machine_id": machine_id,
            }
            resp = requests.post(
                f"{self.server_client._stream_base}{endpoint}",
                params=query,
                headers=auth_headers,
                timeout=30,
                allow_redirects=False,
            )
            if not resp.ok:
                self._notifier.notify(
                    "Léa",
                    "Hmm, le serveur a refusé le test immédiat.",
                )
            else:
                logger.info(
                    "Replay direct démarré pour session %s (machine=%s)",
                    session_id,
                    machine_id,
                )
        except Exception as e:
            logger.error("Erreur lancement replay direct : %s", e)
            self._notifier.notify(
                "Léa",
                f"Oups, un problème : {e}",
            )
        finally:
            _mark_replay(False)

    threading.Thread(target=_replay, daemon=True).start()
def offer_finalize_replay(
    self,
    replay_request: Dict[str, Any],
    replay_name: str,
) -> None:
    """Offer the user an immediate test of the task that was just learned.

    Returns immediately when ``replay_request`` is empty or has no
    ``session_id``. Otherwise a daemon thread notifies the user, asks for
    consent through ``_ask_consent`` and, if granted, launches the replay.

    Args:
        replay_request: Server-provided replay request.
        replay_name: Task name shown in the notification and the prompt.
    """
    has_session = bool(replay_request) and bool(replay_request.get("session_id"))
    if not has_session:
        return

    def _offer():
        self._notifier.notify(
            "Léa",
            f"J'ai compris la tâche '{replay_name}'. Voulez-vous la tester ?",
        )
        accepted = _ask_consent(
            "Léa — Test immédiat",
            f"J'ai compris la tâche '{replay_name}'. "
            "Voulez-vous la tester maintenant ?",
        )
        if accepted:
            self._launch_replay_request(replay_request, replay_name)

    threading.Thread(target=_offer, daemon=True).start()
def _on_emergency_stop(self, _icon=None, _item=None) -> None:
"""Arret d'urgence — stoppe TOUTES les activites de l'agent immediatement.

View File

@@ -15,7 +15,7 @@ import time
import logging
import hashlib
import platform
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple
from PIL import Image, ImageFilter, ImageStat
import mss
from ..config import TARGETED_CROP_SIZE, SCREENSHOT_QUALITY, BLUR_SENSITIVE
@@ -86,6 +86,337 @@ def _enrich_with_monitor_info(payload: dict) -> dict:
payload["monitors_geometry"] = _get_monitors_geometry()
return payload
# Monitor dimension guard (GHT demo, 19 May 2026): mss.monitors[1] can
# intermittently return truncated dimensions (observed case: 2560×60). Using
# those dimensions to normalize coordinates poisons the memory (TargetMemoryStore).
# Minimum width/height for a monitor to be considered plausible.
MIN_MONITOR_WIDTH = 200
MIN_MONITOR_HEIGHT = 200
# Default number of capture attempts and delay between them (seconds).
MONITOR_MAX_ATTEMPTS = 2
MONITOR_RETRY_DELAY_S = 0.05
# Upper bounds on luminance mean / stddev / max under which a frame is
# treated as effectively black.
BLACK_FRAME_MEAN_MAX = 1.0
BLACK_FRAME_STDDEV_MAX = 1.0
BLACK_FRAME_MAX_LUMA = 3
def _is_monitor_sane(monitor) -> bool:
    """Return True when the monitor dimensions reach the plausibility floor.

    Anything that is not a dict is rejected. Missing or falsy ``width`` /
    ``height`` values count as 0.
    """
    if not isinstance(monitor, dict):
        return False
    width = monitor.get("width", 0) or 0
    height = monitor.get("height", 0) or 0
    if width < MIN_MONITOR_WIDTH:
        return False
    return height >= MIN_MONITOR_HEIGHT
def _dim_str(monitor) -> str:
    """Return a short ``WxH`` label for logs; ``?`` marks unknown values.

    A non-dict monitor (including None) yields ``"?x?"``.
    """
    if isinstance(monitor, dict):
        width = monitor.get("width", "?")
        height = monitor.get("height", "?")
        return f"{width}x{height}"
    return "?x?"
def _acquire_safe_grab(max_attempts: int = MONITOR_MAX_ATTEMPTS,
                       retry_delay_s: float = MONITOR_RETRY_DELAY_S,
                       allow_secondary_fallback: bool = True):
    """Open mss and grab a monitor that has plausible dimensions.

    Cascading strategy:
    1. On every attempt, open a fresh `mss.mss()` (which may refresh the
       internal cache) and examine monitors[1..n].
    2. Prefer monitors[1] (physical primary screen). If it is aberrant AND
       `allow_secondary_fallback=True`, take the first sane monitors[2..n]
       with an explicit WARNING.
    3. If `allow_secondary_fallback=False`, ONLY monitors[1] is accepted.
       Useful for methods that receive (x, y) coordinates in the composite
       screen system: capturing a secondary monitor would yield a sane
       image that is offset with respect to those coordinates.
    4. If no dimension is plausible: wait `retry_delay_s` and retry.
    5. After `max_attempts` unsuccessful attempts: log an ERROR and return
       (None, None) so the caller takes an explicit error path.

    Args:
        max_attempts: number of mss attempts before giving up.
        retry_delay_s: delay between attempts.
        allow_secondary_fallback: if False, refuse monitors[2..n]
            (fail-closed for coordinate-bearing methods).

    Returns:
        Tuple (monitor_dict, PIL.Image) when a sane capture succeeded,
        (None, None) otherwise.
    """
    last_aberrant = None
    secondary_seen = False  # a sane secondary monitor was seen but refused
    for attempt in range(max_attempts):
        with mss.mss() as sct:
            monitors = list(sct.monitors) if sct.monitors else []
            chosen = None
            chosen_idx = None
            # Index 0 is skipped: only monitors[1..n] are candidates.
            for idx in range(1, len(monitors)):
                candidate = monitors[idx]
                if not _is_monitor_sane(candidate):
                    last_aberrant = candidate
                    logger.warning(
                        "Monitor[%d] dims aberrantes (%s, seuil %dx%d) "
                        "— attempt %d/%d",
                        idx, _dim_str(candidate),
                        MIN_MONITOR_WIDTH, MIN_MONITOR_HEIGHT,
                        attempt + 1, max_attempts,
                    )
                    continue
                # Sane monitor found
                if idx == 1 or allow_secondary_fallback:
                    chosen = candidate
                    chosen_idx = idx
                    break
                # Otherwise: sane, but secondary monitors are forbidden for this caller
                secondary_seen = True
                logger.warning(
                    "Monitor[%d] sain (%s) mais fallback secondaire refusé "
                    "(allow_secondary_fallback=False) — capture cohérente "
                    "des coords impossible",
                    idx, _dim_str(candidate),
                )
            if chosen is not None:
                # Warn whenever the capture is not a first-try grab of monitors[1].
                if chosen_idx != 1 or attempt > 0:
                    logger.warning(
                        "Capture fallback : monitor[%d] dim=%s, attempt=%d",
                        chosen_idx, _dim_str(chosen), attempt + 1,
                    )
                sct_img = sct.grab(chosen)
                img = Image.frombytes(
                    "RGB", sct_img.size, sct_img.bgra, "raw", "BGRX",
                )
                return chosen, img
        # No sleep after the final attempt.
        if attempt < max_attempts - 1:
            time.sleep(retry_delay_s)
    if secondary_seen and not allow_secondary_fallback:
        logger.error(
            "Capture abandonnée : monitor[1] aberrant après %d tentatives "
            "(dernier vu %s) et fallback secondaire désactivé "
            "pour préserver la cohérence des coordonnées",
            max_attempts, _dim_str(last_aberrant),
        )
    else:
        logger.error(
            "Aucun monitor avec dims plausibles trouvé après %d tentatives "
            "(dernier vu : %s, seuil %dx%d) — capture abandonnée",
            max_attempts, _dim_str(last_aberrant),
            MIN_MONITOR_WIDTH, MIN_MONITOR_HEIGHT,
        )
    return None, None
def _compute_luma_stats(img: Image.Image) -> Dict[str, float | int]:
    """Return simple luminance statistics used to diagnose a black frame.

    The image is converted to mode ``"L"`` before measuring.

    Returns:
        Dict with ``mean`` and ``stddev`` (rounded to 2 decimals) and the
        integer ``min`` / ``max`` luminance extrema.
    """
    luminance = img.convert("L")
    stats = ImageStat.Stat(luminance)
    darkest, brightest = luminance.getextrema()
    mean_luma = float(stats.mean[0]) if stats.mean else 0.0
    spread = float(stats.stddev[0]) if stats.stddev else 0.0
    return {
        "mean": round(mean_luma, 2),
        "stddev": round(spread, 2),
        "min": int(darkest),
        "max": int(brightest),
    }
def _is_effectively_black(img: Image.Image) -> bool:
    """Fail-closed heuristic flagging a screenshot that is practically black.

    The frame is black only when its maximum luminance, mean and standard
    deviation are all at or below their respective ``BLACK_FRAME_*`` bounds.
    """
    luma = _compute_luma_stats(img)
    if luma["max"] > BLACK_FRAME_MAX_LUMA:
        return False
    if luma["mean"] > BLACK_FRAME_MEAN_MAX:
        return False
    return luma["stddev"] <= BLACK_FRAME_STDDEV_MAX
def _capture_via_imagegrab() -> Tuple[Optional[Dict[str, int]], Optional[Image.Image], Dict[str, Any]]:
    """Windows-only capture fallback through Pillow's ``ImageGrab``.

    Intended for the case where `mss` returns a black frame while the
    user's graphical session is still visible.

    Returns:
        ``(monitor, image, meta)``. On success ``monitor`` is a rect
        anchored at (0, 0) with the grabbed image size and ``meta`` carries
        the luminance stats. On failure ``monitor`` and ``image`` are None
        and ``meta["error"]`` explains why.
    """
    backend = "imagegrab"

    if _SYSTEM != "Windows":
        return None, None, {"backend": backend, "error": "unsupported_platform"}

    try:
        from PIL import ImageGrab
    except ImportError as exc:
        return None, None, {"backend": backend, "error": str(exc)}

    try:
        grabbed = ImageGrab.grab(all_screens=True)
    except Exception as exc:
        logger.warning("ImageGrab indisponible pour le fallback capture : %s", exc)
        return None, None, {"backend": backend, "error": str(exc)}

    bounds = {"left": 0, "top": 0, "width": grabbed.width, "height": grabbed.height}
    diagnostics = {
        "backend": backend,
        "luma": _compute_luma_stats(grabbed),
    }
    return bounds, grabbed, diagnostics
def capture_screen_image(
    allow_secondary_fallback: bool = True,
) -> Tuple[Optional[Dict[str, int]], Optional[Image.Image], Dict[str, Any]]:
    """Capture the full screen, diagnosing black frames, with a Windows fallback.

    The primary capture goes through ``_acquire_safe_grab``. When it yields
    nothing or an effectively black frame, ``_capture_via_imagegrab`` is
    tried. A black frame is never returned.

    Args:
        allow_secondary_fallback: Forwarded to ``_acquire_safe_grab``. The
            ImageGrab fallback below is attempted regardless of this flag.

    Returns:
        ``(monitor, image, meta)`` where ``image`` may be ``None`` if no
        full-screen backend produced a usable image. When the ImageGrab
        fallback succeeds, the returned ``meta`` is the fallback's own.
    """
    primary_monitor, primary_img = _acquire_safe_grab(
        allow_secondary_fallback=allow_secondary_fallback
    )
    meta: Dict[str, Any] = {"backend": "mss"}

    if primary_img is None:
        meta["mss_unavailable"] = True
    else:
        meta["luma"] = _compute_luma_stats(primary_img)
        if not _is_effectively_black(primary_img):
            return primary_monitor, primary_img, meta
        logger.warning(
            "Capture mss quasi noire (%s) — tentative de fallback",
            meta["luma"],
        )
        meta["mss_black_frame"] = True

    grab_monitor, grab_img, grab_meta = _capture_via_imagegrab()
    if grab_img is not None:
        if _is_effectively_black(grab_img):
            logger.warning(
                "Capture ImageGrab quasi noire (%s)",
                grab_meta.get("luma"),
            )
            meta["imagegrab_black_frame"] = True
        else:
            logger.warning(
                "Capture fallback via ImageGrab (%sx%s)",
                grab_img.width,
                grab_img.height,
            )
            return grab_monitor, grab_img, grab_meta

    # Both backends failed: keep the primary diagnostics and attach whatever
    # error the fallback reported (None when it merely produced a black frame).
    meta["imagegrab_error"] = grab_meta.get("error")
    return None, None, meta
def _capture_window_image_windows(
    hwnd: int,
    width: int,
    height: int,
) -> Tuple[Optional[Image.Image], Dict[str, Any]]:
    """Capture a single Windows window through ``PrintWindow``.

    Fallback for the case where the full-screen capture is black but the
    active window can still be printed through the Win32 API.

    ``PrintWindow`` is attempted with the flags 3, 2 and 0, in that order.
    The first frame for which the call reported success, or which is not
    effectively black, is returned.

    Args:
        hwnd: Handle of the window to capture.
        width: Width of the bitmap to allocate.
        height: Height of the bitmap to allocate.

    Returns:
        ``(image, meta)``. ``image`` is ``None`` when the platform is not
        Windows, the pywin32 modules cannot be imported, or no attempt
        produced a usable frame; ``meta["error"]`` then holds the reason.
    """
    if _SYSTEM != "Windows":
        return None, {"backend": "printwindow", "error": "unsupported_platform"}
    try:
        import ctypes
        import win32gui
        import win32ui
    except ImportError as exc:
        return None, {"backend": "printwindow", "error": str(exc)}

    def _best_effort(release) -> None:
        # A failing cleanup step must neither mask the capture outcome nor
        # stop the remaining handles from being released.
        try:
            release()
        except Exception:
            pass

    last_error = None
    # NOTE(review): flag values presumably map to PW_RENDERFULLCONTENT /
    # PW_CLIENTONLY combinations — confirm against the Win32 headers.
    for flag in (3, 2, 0):
        window_dc = source_dc = memory_dc = bitmap = None
        try:
            window_dc = win32gui.GetWindowDC(hwnd)
            if not window_dc:
                raise RuntimeError("GetWindowDC a retourné 0")
            source_dc = win32ui.CreateDCFromHandle(window_dc)
            memory_dc = source_dc.CreateCompatibleDC()
            bitmap = win32ui.CreateBitmap()
            bitmap.CreateCompatibleBitmap(source_dc, width, height)
            memory_dc.SelectObject(bitmap)
            succeeded = ctypes.windll.user32.PrintWindow(
                hwnd, memory_dc.GetSafeHdc(), flag
            )
            raw_bits = bitmap.GetBitmapBits(True)
            frame = Image.frombuffer(
                "RGB", (width, height), raw_bits, "raw", "BGRX", 0, 1
            )
            frame_luma = _compute_luma_stats(frame)
            if succeeded or not _is_effectively_black(frame):
                return frame, {
                    "backend": f"printwindow:{flag}",
                    "printwindow_result": int(succeeded),
                    "luma": frame_luma,
                }
        except Exception as exc:
            last_error = str(exc)
        finally:
            # Same release order on every exit path (including the early
            # return above): bitmap, memory DC, source DC, window DC.
            if bitmap is not None:
                _best_effort(lambda: win32gui.DeleteObject(bitmap.GetHandle()))
            if memory_dc is not None:
                _best_effort(lambda: memory_dc.DeleteDC())
            if source_dc is not None:
                _best_effort(lambda: source_dc.DeleteDC())
            if window_dc is not None:
                _best_effort(lambda: win32gui.ReleaseDC(hwnd, window_dc))

    return None, {
        "backend": "printwindow",
        "error": last_error or "no_usable_frame",
    }
def capture_foreground_window_image() -> Tuple[Optional[Image.Image], Dict[str, Any]]:
    """Capture the focused window through the native API when available.

    Returns:
        ``(image, meta)``. ``image`` is ``None`` when the active window
        cannot be resolved, has no handle or a non-positive size, or when
        the native capture itself fails; ``meta["error"]`` explains why.
        On success ``meta`` is enriched with the window title, application
        name, rectangle, size and handle.
    """
    try:
        from ..window_info_crossplatform import get_active_window_rect

        active = get_active_window_rect()
    except Exception as exc:
        return None, {"backend": "printwindow", "error": str(exc)}

    if not active:
        return None, {"backend": "printwindow", "error": "active_window_unavailable"}

    win_w, win_h = active.get("size", [0, 0])
    hwnd = active.get("hwnd")
    if not hwnd or win_w <= 0 or win_h <= 0:
        failure: Dict[str, Any] = {"backend": "printwindow"}
        failure["error"] = "active_window_handle_unavailable"
        failure["title"] = active.get("title", "unknown_window")
        return None, failure

    frame, details = _capture_window_image_windows(hwnd, win_w, win_h)
    if frame is None:
        return None, details

    # Attach window context so callers can log or correlate the capture.
    details["title"] = active.get("title", "unknown_window")
    details["app_name"] = active.get("app_name", "unknown_app")
    details["rect"] = active.get("rect")
    details["window_size"] = active.get("size")
    details["hwnd"] = hwnd
    return frame, details
class VisionCapturer:
def __init__(self, session_dir: str):
self.session_dir = session_dir
@@ -103,25 +434,35 @@ class VisionCapturer:
(utile pour la contextualisation des heartbeats côté serveur).
"""
try:
with mss.mss() as sct:
monitor = sct.monitors[1]
sct_img = sct.grab(monitor)
img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX")
_monitor, img, meta = capture_screen_image()
if img is None:
img, win_meta = capture_foreground_window_image()
if img is None:
logger.error(
"Capture plein contexte indisponible (meta=%s, window=%s)",
meta,
win_meta,
)
return ""
logger.warning(
"Capture plein contexte dégradée via fenêtre active (%s)",
win_meta.get("backend"),
)
# Détection de changement (pour Heartbeat)
if not force:
current_hash = self._compute_quick_hash(img)
if current_hash == self.last_img_hash:
return "" # Pas de changement, on économise la fibre
self.last_img_hash = current_hash
# Détection de changement (pour Heartbeat)
if not force:
current_hash = self._compute_quick_hash(img)
if current_hash == self.last_img_hash:
return "" # Pas de changement, on économise la fibre
self.last_img_hash = current_hash
# Floutage des données sensibles (conformité AI Act)
if BLUR_SENSITIVE:
blur_sensitive_regions(img)
# Floutage des données sensibles (conformité AI Act)
if BLUR_SENSITIVE:
blur_sensitive_regions(img)
path = os.path.join(self.shots_dir, f"context_{int(time.time())}_{name_suffix}.png")
img.save(path, "PNG", quality=SCREENSHOT_QUALITY)
return path
path = os.path.join(self.shots_dir, f"context_{int(time.time())}_{name_suffix}.png")
img.save(path, "PNG", quality=SCREENSHOT_QUALITY)
return path
except Exception as e:
logger.error(f"Erreur Context Capture: {e}")
return ""
@@ -145,46 +486,62 @@ class VisionCapturer:
sont toujours retournés (fallback gracieux).
"""
try:
with mss.mss() as sct:
full_path = os.path.join(self.shots_dir, f"{screenshot_id}_full.png")
monitor = sct.monitors[1]
sct_img = sct.grab(monitor)
img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX")
# Capture du Crop (Cœur de l'apprentissage qwen3-vl)
crop_path = os.path.join(self.shots_dir, f"{screenshot_id}_crop.png")
w, h = TARGETED_CROP_SIZE
left = max(0, x - w // 2)
top = max(0, y - h // 2)
crop_img = img.crop((left, top, left + w, top + h))
if anonymize:
crop_img = crop_img.filter(ImageFilter.GaussianBlur(radius=4))
# Floutage des données sensibles (conformité AI Act)
if BLUR_SENSITIVE:
blur_sensitive_regions(img)
blur_sensitive_regions(crop_img)
img.save(full_path, "PNG", quality=SCREENSHOT_QUALITY)
crop_img.save(crop_path, "PNG", quality=SCREENSHOT_QUALITY)
# Mise à jour du hash pour le prochain heartbeat
self.last_img_hash = self._compute_quick_hash(img)
result = {"full": full_path, "crop": crop_path}
# --- Capture de la fenêtre active ---
# Ajout non-bloquant : enrichit le résultat avec l'image
# de la fenêtre seule + métadonnées (titre, rect, clic relatif)
window_info = self.capture_active_window(x, y, screenshot_id, full_img=img)
# Coords (x, y) sont en système écran composite ; cropper depuis
# un monitor secondaire (offset ≠ 0) produirait une image saine
# mais décalée → fail-closed sur fallback secondaire.
_monitor, img, meta = capture_screen_image(
allow_secondary_fallback=False
)
if img is None:
window_info = self.capture_active_window(
x, y, screenshot_id, full_img=None
)
if window_info:
result["window_capture"] = window_info
result = {"window_capture": window_info}
_enrich_with_monitor_info(result)
logger.warning(
"capture_dual dégradée: fenêtre active seule (%s)",
meta,
)
return result
return {}
# QW1 — enrichissement multi-écrans (additif, fallback gracieux)
_enrich_with_monitor_info(result)
full_path = os.path.join(self.shots_dir, f"{screenshot_id}_full.png")
return result
# Capture du Crop (Cœur de l'apprentissage qwen3-vl)
crop_path = os.path.join(self.shots_dir, f"{screenshot_id}_crop.png")
w, h = TARGETED_CROP_SIZE
left = max(0, x - w // 2)
top = max(0, y - h // 2)
crop_img = img.crop((left, top, left + w, top + h))
if anonymize:
crop_img = crop_img.filter(ImageFilter.GaussianBlur(radius=4))
# Floutage des données sensibles (conformité AI Act)
if BLUR_SENSITIVE:
blur_sensitive_regions(img)
blur_sensitive_regions(crop_img)
img.save(full_path, "PNG", quality=SCREENSHOT_QUALITY)
crop_img.save(crop_path, "PNG", quality=SCREENSHOT_QUALITY)
# Mise à jour du hash pour le prochain heartbeat
self.last_img_hash = self._compute_quick_hash(img)
result = {"full": full_path, "crop": crop_path}
# --- Capture de la fenêtre active ---
# Ajout non-bloquant : enrichit le résultat avec l'image
# de la fenêtre seule + métadonnées (titre, rect, clic relatif)
window_info = self.capture_active_window(x, y, screenshot_id, full_img=img)
if window_info:
result["window_capture"] = window_info
# QW1 — enrichissement multi-écrans (additif, fallback gracieux)
_enrich_with_monitor_info(result)
return result
except Exception as e:
logger.error(f"Erreur Dual Capture: {e}")
return {}
@@ -239,33 +596,54 @@ class VisionCapturer:
# Si le clic est en dehors de la fenêtre, on le signale mais on continue
click_inside = (0 <= click_rel_x <= win_w and 0 <= click_rel_y <= win_h)
window_img = None
# --- Crop de la fenêtre depuis le plein écran ---
if full_img is None:
# Pas de screenshot fourni — en capturer un (cas standalone)
# Pas de screenshot fourni — en capturer un (cas standalone).
# win_rect est en coords globales ; cropper depuis un monitor
# secondaire produirait une image décalée → fail-closed sur
# fallback secondaire.
try:
with mss.mss() as sct:
monitor = sct.monitors[1]
sct_img = sct.grab(monitor)
full_img = Image.frombytes(
"RGB", sct_img.size, sct_img.bgra, "raw", "BGRX"
)
_monitor, full_img, _meta = capture_screen_image(
allow_secondary_fallback=False
)
except Exception as e:
logger.error(f"Erreur capture plein écran pour fenêtre : {e}")
return None
full_img = None
# Borner le crop aux limites de l'image plein écran
img_w, img_h = full_img.size
crop_left = max(0, win_left)
crop_top = max(0, win_top)
crop_right = min(img_w, win_right)
crop_bottom = min(img_h, win_bottom)
if full_img is not None and not _is_effectively_black(full_img):
img_w, img_h = full_img.size
crop_left = max(0, win_left)
crop_top = max(0, win_top)
crop_right = min(img_w, win_right)
crop_bottom = min(img_h, win_bottom)
if crop_right <= crop_left or crop_bottom <= crop_top:
logger.debug("Fenêtre hors écran — skip capture fenêtre")
if crop_right > crop_left and crop_bottom > crop_top:
window_img = full_img.crop(
(crop_left, crop_top, crop_right, crop_bottom)
)
else:
logger.debug("Fenêtre hors écran — fallback natif si possible")
elif full_img is not None:
logger.warning(
"capture_active_window: screenshot plein écran noir, fallback natif"
)
if window_img is None and rect_info.get("hwnd"):
window_img, native_meta = _capture_window_image_windows(
rect_info["hwnd"], win_w, win_h
)
if window_img is not None:
logger.warning(
"capture_active_window via fallback natif (%s)",
native_meta.get("backend"),
)
if window_img is None:
logger.debug("Fenêtre hors écran ou capture native indisponible")
return None
window_img = full_img.crop((crop_left, crop_top, crop_right, crop_bottom))
# Floutage conformité AI Act
if BLUR_SENSITIVE:
blur_sensitive_regions(window_img)

View File

@@ -338,6 +338,50 @@ class LeaServerClient:
except Exception:
return None
def resume_replay(self, replay_id: str) -> bool:
    """Resume a replay held in supervised pause, over direct HTTP.

    Fallback for the SocketIO path (`lea:replay_resume` → agent_chat), used
    when the feedback bus is disconnected at the moment the user clicks
    "Continuer" in the paused bubble.

    Args:
        replay_id: Identifier of the replay to resume; a falsy value
            short-circuits without any network call.

    Returns:
        ``True`` if the streaming server accepted the resume request,
        ``False`` otherwise (including on any exception, which is logged at
        debug level and swallowed).
    """
    accepted = False
    if replay_id:
        try:
            import requests

            endpoint = f"{self._stream_url}/traces/stream/replay/{replay_id}/resume"
            response = requests.post(
                endpoint,
                headers=self._auth_headers(),
                timeout=10,
            )
            accepted = bool(response.ok)
        except Exception:
            # Best-effort fallback: never let a transport error reach the UI.
            logger.debug("resume_replay HTTP silenced", exc_info=True)
    return accepted
def abort_replay(self, replay_id: str) -> bool:
    """Cancel a replay held in supervised pause, over direct HTTP.

    Counterpart of ``resume_replay``: fallback for the SocketIO path
    (`lea:replay_abort`) when the feedback bus is disconnected. POSTs to
    ``/replay/{id}/cancel`` on the streaming server.

    Args:
        replay_id: Identifier of the replay to cancel; a falsy value
            short-circuits without any network call.

    Returns:
        ``True`` if the server answered with an OK status, ``False``
        otherwise (including on any exception, which is logged at debug
        level and swallowed).
    """
    cancelled = False
    if replay_id:
        try:
            import requests

            endpoint = f"{self._stream_url}/traces/stream/replay/{replay_id}/cancel"
            response = requests.post(
                endpoint,
                headers=self._auth_headers(),
                timeout=10,
            )
            cancelled = bool(response.ok)
        except Exception:
            # Best-effort fallback: never let a transport error reach the UI.
            logger.debug("abort_replay HTTP silenced", exc_info=True)
    return cancelled
def report_action_result(
self,
session_id: str,

View File

@@ -61,7 +61,9 @@ MAX_ACTIONS_PER_REPLAY = 500 # Max actions par requête de replay
MAX_REPLAY_STATES = 1000 # Max entrées dans _replay_states
REPLAY_STATE_TTL_SECONDS = 3600 # Nettoyage auto des replays terminés après 1h
# Actions en cours de retry : action_id -> {"action": ..., "retry_count": N, "replay_id": ...}
# Actions in-flight / retry : action_id -> transport + retry metadata.
# `action` remains the semantic/original action for reporting/retry logic,
# while `dispatched_action` tracks the exact payload last sent to Lea.
_retry_pending: Dict[str, Dict[str, Any]] = {}
# Callbacks d'erreur par replay_id : replay_id -> callback_url
@@ -207,12 +209,14 @@ from .replay_engine import (
_MAX_ACTION_TEXT_LENGTH,
_MAX_KEYS_PER_COMBO,
_KNOWN_KEY_NAMES,
_auto_launch_replay_after_finalize,
_validate_replay_action,
_APP_LAUNCH_COMMANDS,
_APP_VISUAL_SEARCH,
_SETUP_IGNORE_APPS,
_extract_required_apps_from_events,
_extract_required_apps_from_workflow,
_trim_redundant_setup_events,
_resolve_launch_command,
_infer_app_from_window_titles,
_get_visual_search_info,
@@ -475,6 +479,19 @@ def _clear_replay_lock():
logger.error(f"Erreur suppression replay lock : {e}")
def _memory_window_title_for_action(action_meta: Dict[str, Any]) -> str:
    """Pick the best available window title for the persistent memory.

    Candidates are tried in priority order and the first truthy one wins:
    ``action_meta["expected_window_before"]``, then
    ``target_spec["window_title"]``, then
    ``target_spec["context_hints"]["window_title"]``, and finally
    ``action_meta["window_title"]`` (returned as-is, even when falsy).

    Args:
        action_meta: Action metadata; a falsy value is treated as ``{}``.

    Returns:
        The selected window title.
    """
    meta = action_meta or {}
    spec = meta.get("target_spec") or {}
    hints = spec.get("context_hints") or {}

    title = meta.get("expected_window_before", "")
    if title:
        return title
    title = spec.get("window_title", "")
    if title:
        return title
    title = hints.get("window_title", "")
    if title:
        return title
    # Last resort: hand back whatever the action itself recorded.
    return meta.get("window_title", "")
def _get_worker_queue_status() -> Dict[str, Any]:
"""Retourne l'état de la queue du worker VLM (pour le monitoring)."""
queue = []
@@ -544,6 +561,34 @@ _machine_replay_target: Dict[str, str] = {}
_replay_states: Dict[str, Dict[str, Any]] = {}
def _remove_queued_action_duplicates(session_id: str, action_id: str) -> int:
    """Drop queued copies of an action that has already been acknowledged.

    The watchdog may push an orphaned action back to the head of the queue.
    If the original report arrives right after, that resent copy must be
    discarded; otherwise Léa would re-execute the same action with the same
    ``action_id`` and could toggle UI state (e.g. a Windows-key press that
    closes the Start menu again).

    Args:
        session_id: Session whose replay queue is inspected.
        action_id: Identifier of the acknowledged action.

    Returns:
        The number of queue entries removed (``0`` when either argument is
        falsy, the queue is missing/empty, or nothing matched).
    """
    if not (session_id and action_id):
        return 0

    pending = _replay_queues.get(session_id, [])
    if not pending:
        return 0

    # Entries may be None or lack an id; both normalise to "" for comparison.
    survivors = [
        entry
        for entry in pending
        if str((entry or {}).get("action_id", "") or "") != action_id
    ]
    dropped = len(pending) - len(survivors)
    if dropped:
        # Only rebind the queue when something was actually removed.
        _replay_queues[session_id] = survivors
    return dropped
class StreamEvent(BaseModel):
session_id: str
timestamp: float
@@ -832,6 +877,16 @@ async def startup():
threading.Thread(target=_preload_easyocr, daemon=True, name="preload_easyocr").start()
from .replay_watchdog import get_or_create_watchdog
app.state.replay_watchdog = get_or_create_watchdog(
retry_pending=_retry_pending,
replay_queues=_replay_queues,
async_lock_factory=_async_replay_lock,
sse_notifier=None,
)
await app.state.replay_watchdog.start()
logger.info(
"API Streaming démarrée — StreamProcessor, Worker et Cleanup prêts. "
"VLM Worker dans un process séparé (run_worker.py)."
@@ -886,6 +941,9 @@ def _load_existing_workflows():
async def shutdown():
global _cleanup_running
_cleanup_running = False
watchdog = getattr(app.state, "replay_watchdog", None)
if watchdog is not None:
await watchdog.stop(timeout_s=3.0)
worker.stop()
# Nettoyer le replay lock au shutdown (sinon le worker VLM resterait bloqué)
_clear_replay_lock()
@@ -1477,17 +1535,24 @@ def _process_screenshot_thread(session_id: str, shot_id: str, path: str):
# =========================================================================
@app.post("/api/v1/traces/stream/finalize")
async def finalize(session_id: str, machine_id: str = "default"):
async def finalize(
session_id: str,
machine_id: str = "default",
launch_replay: bool = False,
):
"""Clôture la session et place le traitement en file d'attente.
Ne bloque plus : marque la session comme finalisée et l'ajoute à la queue
du worker VLM (process séparé) pour analyse + construction workflow.
Le client peut suivre la progression via GET /api/v1/traces/stream/processing/status.
Optionnellement, il peut aussi déclencher immédiatement un replay direct
depuis la session finalisée (chemin Lea-first, sans attendre le workflow VLM).
Args:
session_id: Identifiant de la session à finaliser
machine_id: Identifiant machine (informatif, le machine_id est déjà dans la session)
launch_replay: Si vrai, tente de lancer immédiatement /replay-session
"""
# Vérifier que la session existe
session = processor.session_manager.get_session(session_id)
@@ -1501,6 +1566,10 @@ async def finalize(session_id: str, machine_id: str = "default"):
processor.session_manager.finalize(session_id)
logger.info(f"Session {session_id} finalisée, ajout à la queue du worker VLM")
resolved_machine_id = machine_id
if resolved_machine_id == "default" and getattr(session, "machine_id", ""):
resolved_machine_id = session.machine_id
# Nettoyer les structures d'enrichissement temps réel pour cette session
with _enrichment_lock:
keys_to_remove = [k for k in _pending_click_enrichments if k[0] == session_id]
@@ -1521,17 +1590,70 @@ async def finalize(session_id: str, machine_id: str = "default"):
if shots_dir.exists():
full_shots_count = len(list(shots_dir.glob("shot_*_full.png")))
return {
# Patch 2026-05-23 (brief 0902 deferred-workflow) : par défaut, on
# ne propose plus le replay direct immédiat post-finalize — le chemin
# produit cible est le workflow compilé par le worker VLM. Le client
# attend la disponibilité du workflow nommé pour proposer un test.
# Le replay direct reste accessible (smoke/debug) en activant
# RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE=true côté serveur, OU
# en appelant explicitement POST /api/v1/traces/stream/replay-session
# depuis un outil de test.
_direct_replay_enabled = _auto_launch_replay_after_finalize()
response = {
"status": "queued_for_processing",
"session_id": session_id,
"machine_id": session.machine_id,
"screenshots_to_analyze": full_shots_count,
"replay_ready": _direct_replay_enabled,
"message": (
f"Session finalisée. {full_shots_count} screenshots seront analysés "
"en arrière-plan. Suivez la progression via "
"GET /api/v1/traces/stream/processing/status"
"GET /api/v1/traces/stream/processing/status."
),
}
if _direct_replay_enabled:
response["replay_request"] = {
"endpoint": "/api/v1/traces/stream/replay-session",
"session_id": session_id,
"machine_id": resolved_machine_id,
}
response["message"] += (
" Le replay direct est disponible via "
"POST /api/v1/traces/stream/replay-session"
)
if not launch_replay:
return response
try:
replay_result = await replay_from_session(
session_id=session_id,
machine_id=resolved_machine_id,
)
except HTTPException as exc:
logger.warning(
"Finalize %s : replay direct non lancé (%s)",
session_id,
exc.detail,
)
response["replay_launch"] = {
"status": "failed",
"status_code": exc.status_code,
"detail": exc.detail,
}
response["message"] += (
" Le lancement automatique du replay direct a échoué ; "
"la session reste finalisée et re-jouable manuellement."
)
return response
response["replay_launch"] = {
"status": "started",
"replay": replay_result,
}
response["message"] += " Le replay direct a été lancé immédiatement."
return response
# =========================================================================
@@ -2262,18 +2384,39 @@ async def replay_from_session(
if session_mem and session_mem.events:
_merge_enrichments_into_raw_events(raw_events, session_mem.events)
# ── 3. Construire le replay propre depuis les events bruts ──
# Passer le répertoire de session pour activer le visual replay (crops de référence)
# Répertoire de session utilisé par le visual replay et les anchors setup
session_dir = str(events_file.parent)
# ── 3. Préparer le setup environnement et couper le préambule source ──
setup_actions = []
app_info = _extract_required_apps_from_events(
raw_events,
session_dir=session_dir,
)
replay_raw_events = raw_events
if app_info:
setup_actions = _generate_setup_actions(app_info, setup_id_prefix="setup_sess")
if setup_actions:
replay_raw_events = _trim_redundant_setup_events(raw_events, app_info)
logger.info(
"replay-session %s : %d actions de setup préparées avant le replay "
"(app=%s, cmd=%s, raw_trim=%d%d)",
session_id, len(setup_actions),
app_info.get("primary_app"), app_info.get("primary_launch_cmd"),
len(raw_events), len(replay_raw_events),
)
# ── 4. Construire le replay propre depuis les events bruts ──
# Passer le répertoire de session pour activer le visual replay (crops de référence)
actions = build_replay_from_raw_events(
raw_events, session_id=session_id, session_dir=session_dir,
replay_raw_events, session_id=session_id, session_dir=session_dir,
)
if not actions:
raise HTTPException(
status_code=400,
detail=f"Session '{session_id}' : aucune action exploitable après nettoyage "
f"({len(raw_events)} événements bruts)"
f"({len(replay_raw_events)} événements bruts)"
)
# Limite de sécurité
@@ -2305,23 +2448,10 @@ async def replay_from_session(
if _gesture_catalog and actions:
actions = _gesture_catalog.optimize_replay_actions(actions)
# ── 3b. Setup environnement — ouvrir les applications nécessaires ──
# Analyser les événements bruts pour détecter quelles applications sont requises
# et injecter des actions de setup en tête de la queue de replay.
setup_actions = []
app_info = _extract_required_apps_from_events(raw_events)
if app_info:
setup_actions = _generate_setup_actions(app_info, setup_id_prefix="setup_sess")
if setup_actions:
actions = setup_actions + actions
logger.info(
"replay-session %s : %d actions de setup injectées avant le replay "
"(app=%s, cmd=%s)",
session_id, len(setup_actions),
app_info.get("primary_app"), app_info.get("primary_launch_cmd"),
)
if setup_actions:
actions = setup_actions + actions
# ── 4. Trouver la session de replay cible (Agent V1 actif) ──
# ── 5. Trouver la session de replay cible (Agent V1 actif) ──
# L'agent actif peut avoir une session différente de la session source
target_session_id = _find_active_agent_session(machine_id=machine_id)
if not target_session_id:
@@ -2335,7 +2465,7 @@ async def replay_from_session(
"Lancez l'Agent V1 sur le PC cible."
)
# ── 5. Injecter dans la queue de replay ──
# ── 6. Injecter dans la queue de replay ──
replay_id = f"replay_sess_{uuid.uuid4().hex[:8]}"
async with _async_replay_lock():
@@ -3265,11 +3395,35 @@ async def get_next_action(session_id: str, machine_id: str = "default"):
# NE PAS écraser si _schedule_retry a déjà mis le bon retry_count
action_id_sent = action.get("action_id", "")
if action_id_sent and action_id_sent not in _retry_pending:
now = time.time()
_retry_pending[action_id_sent] = {
"action": dict(action),
"dispatched_action": dict(action),
"retry_count": 0,
"replay_id": "",
"replay_id": owning_replay.get("replay_id", "") if owning_replay else "",
"session_id": session_id,
"machine_id": machine_id,
"dispatched_at": now,
"first_dispatched_at": now,
"resent_count": 0,
"last_resent_at": 0.0,
}
elif action_id_sent:
existing = _retry_pending.get(action_id_sent)
if existing is not None:
now = time.time()
existing.setdefault("action", dict(action))
existing["dispatched_action"] = dict(action)
existing["replay_id"] = existing.get("replay_id") or (
owning_replay.get("replay_id", "") if owning_replay else ""
)
existing["session_id"] = session_id
existing["machine_id"] = machine_id
existing["dispatched_at"] = now
if not existing.get("first_dispatched_at"):
existing["first_dispatched_at"] = now
existing.setdefault("resent_count", 0)
existing.setdefault("last_resent_at", 0.0)
# [REPLAY] log structuré pour suivre une action à travers toute la chaîne
# Grep facile : journalctl --user -u rpa-streaming -f | grep REPLAY
@@ -3400,6 +3554,15 @@ async def report_action_result(report: ReplayResultReport):
)
return {"status": "no_active_replay", "session_id": session_id}
removed_dupes = _remove_queued_action_duplicates(session_id, action_id)
if removed_dupes:
logger.warning(
"[REPLAY] REPORT cleanup session=%s action_id=%s removed_queue_duplicates=%d",
session_id,
action_id,
removed_dupes,
)
# Récupérer l'info de retry pour cette action (si c'est un retry)
retry_info = _retry_pending.pop(action_id, None)
retry_count = retry_info["retry_count"] if retry_info else 0
@@ -3631,10 +3794,7 @@ async def report_action_result(report: ReplayResultReport):
_current = _actions_meta[_idx] or {}
if _current.get("type") == "click":
_mem_target_spec = _current.get("target_spec") or {}
_mem_window_title = (
_mem_target_spec.get("window_title", "")
or _mem_target_spec.get("expected_window_before", "")
)
_mem_window_title = _memory_window_title_for_action(_current)
if _mem_window_title:
_mem_success = (
@@ -3749,6 +3909,7 @@ async def report_action_result(report: ReplayResultReport):
"target_description": f"Dialogue système : {_sys_category}",
"screenshot_b64": screenshot_after or report.screenshot,
"target_spec": _tspec_sys,
"original_action": dict(original_action or {}),
"reason": "system_dialog",
"system_dialog": _sys_info,
"error_detail": _sys_reason or (report.error or ""),
@@ -3814,6 +3975,7 @@ async def report_action_result(report: ReplayResultReport):
"target_description": _target_desc_ww,
"screenshot_b64": screenshot_after or report.screenshot,
"target_spec": _tspec_ww,
"original_action": dict(original_action or {}),
"reason": "wrong_window",
"error_detail": report.error or "",
}
@@ -3888,6 +4050,7 @@ async def report_action_result(report: ReplayResultReport):
"target_description": _target_desc,
"screenshot_b64": screenshot_after or report.screenshot,
"target_spec": _tspec,
"original_action": dict(original_action or {}),
"reason": "no_screen_change_strict",
"resolution_method": report.resolution_method or "",
"resolution_score": report.resolution_score or 0,
@@ -3947,6 +4110,7 @@ async def report_action_result(report: ReplayResultReport):
"target_description": target_desc,
"screenshot_b64": screenshot_after or report.screenshot,
"target_spec": report.target_spec,
"original_action": dict(original_action or {}),
}
replay_state["pause_message"] = f"Je ne vois pas '{target_desc}' à l'écran"
error_entry = {
@@ -3989,6 +4153,7 @@ async def report_action_result(report: ReplayResultReport):
"target_description": target_desc,
"screenshot_b64": screenshot_after or report.screenshot,
"target_spec": report.target_spec,
"original_action": dict(original_action or {}),
}
replay_state["pause_message"] = f"Je ne vois pas '{target_desc}' à l'écran"
error_entry = {
@@ -4341,8 +4506,14 @@ async def resume_replay(
and failed_action.get("reason") != "user_request"):
# Reconstruire l'action a partir du retry_pending ou de l'original
original_action_id = failed_action["action_id"]
original = failed_action.get("original_action")
if isinstance(original, dict) and original:
original = dict(original)
else:
original = None
# Chercher l'action originale dans les retry_pending
original = _retry_pending.pop(original_action_id, {}).get("action")
if not original:
original = _retry_pending.pop(original_action_id, {}).get("action")
if not original:
# Reconstruire un minimum depuis le failed_action context
original = {
@@ -4358,8 +4529,15 @@ async def resume_replay(
# Stocker dans retry_pending pour le suivi
_retry_pending[resume_id] = {
"action": original,
"dispatched_action": dict(resume_action),
"retry_count": 0,
"replay_id": replay_id,
"session_id": session_id,
"machine_id": state.get("machine_id", "default"),
"dispatched_at": 0.0,
"first_dispatched_at": 0.0,
"resent_count": 0,
"last_resent_at": 0.0,
"reason": "resume_after_pause",
}
queue = _replay_queues.get(session_id, [])
@@ -4399,6 +4577,13 @@ async def cancel_replay(replay_id: str):
return {"status": "cancelled", "replay_id": replay_id, "session_id": session_id}
@app.get("/api/v1/traces/stream/replay/watchdog/metrics")
async def watchdog_metrics():
    """Return the replay watchdog metrics snapshot under the ``watchdog`` key."""
    # Imported lazily, at call time, as the other watchdog call sites do.
    from .replay_watchdog import get_metrics_snapshot

    snapshot = get_metrics_snapshot()
    return {"watchdog": snapshot}
# =========================================================================
# Visual Replay — Résolution visuelle des cibles (module resolve_engine)
# =========================================================================
@@ -4545,10 +4730,13 @@ async def resolve_target(request: ResolveTargetRequest):
# Validation qualité en sortie de cascade : seuil de score + garde
# de proximité contre les coords enregistrées. Single point of
# insertion, n'altère pas la cascade existante.
# target_spec propagé pour relaxation contextuelle (switch_tab +
# som_element calibré, cf. resolve_engine.py 2026-05-22).
result = _validate_resolution_quality(
result,
request.fallback_x_pct,
request.fallback_y_pct,
target_spec=request.target_spec,
)
# Pré-check sémantique post-cascade : OCR sur une zone autour de la
@@ -4581,6 +4769,15 @@ async def resolve_target(request: ResolveTargetRequest):
_by_text = (request.target_spec.get("by_text") or "").strip()
if _by_text:
from agent_v0.server_v1.resolve_engine import _validate_text_at_position
# Propager la bbox SoM enregistrée (si présente) au
# pré-check OCR : pour les éléments étroits (onglets
# Notepad moderne, ~30-40px haut), le radius générique
# capture du texte voisin et rejette à tort.
# Patch 2026-05-23 — cf. inbox_codex/…_notepad-tab-ocr-precheck.
_som_bbox = (
(request.target_spec.get("som_element") or {})
.get("bbox_norm")
)
_is_valid, _observed, _ocr_ms = _validate_text_at_position(
tmp_path,
float(result.get("x_pct", 0) or 0),
@@ -4588,6 +4785,7 @@ async def resolve_target(request: ResolveTargetRequest):
_by_text,
effective_w,
effective_h,
som_bbox_norm=_som_bbox,
)
logger.info(
"[REPLAY] Pre-check OCR ACTIF : '%s' attendu @ (%.4f, %.4f) "
@@ -4600,7 +4798,16 @@ async def resolve_target(request: ResolveTargetRequest):
_is_valid,
_ocr_ms,
)
if not _is_valid:
# Patch 2026-05-23 : rejet uniquement si OCR a effectivement
# lu *autre chose* que la cible. Si observed est vide, l'OCR
# n'a rien lu (crop bbox SoM trop petit / contraste faible
# sur onglet Notepad moderne) — ambigu, on garde la
# résolution serveur. La garde drift ANCHOR-TM côté agent
# bloque les vrais faux positifs.
from agent_v0.server_v1.resolve_engine import (
_should_reject_on_text_mismatch,
)
if _should_reject_on_text_mismatch(_is_valid, _observed):
logger.warning(
"[REPLAY] Pre-check OCR REJET : '%s' attendu @ (%.4f, %.4f) "
"via %s mais OCR voit '%s' (%.0fms)",
@@ -4620,6 +4827,15 @@ async def resolve_target(request: ResolveTargetRequest):
"x_pct": None,
"y_pct": None,
}
elif not _is_valid:
# observed vide → on log mais on accepte
logger.info(
"[REPLAY] Pre-check OCR observed='' (crop trop "
"petit/contraste faible) — on garde la résolution "
"via %s (score=%s), garde drift agent protège en aval",
result.get("method", "?"),
result.get("score"),
)
# [REPLAY] log structuré de sortie résolution (après validation)
# Note: x_pct/y_pct peuvent être None quand le pré-check OCR rejette

View File

@@ -17,6 +17,20 @@ from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
def _infer_machine_id_from_session_id(session_id: str, fallback: str = "default") -> str:
    """Derive the machine_id from a special session_id when possible.

    Léa's background heartbeats use ``bg_<machine_id>`` as their session
    identifier. After a server restart these sessions may be restored from
    the JSON persistence with ``machine_id`` still set to ``default``; this
    helper recovers the machine name so that targeted replays can find the
    active background session again.

    Args:
        session_id: Session identifier to inspect.
        fallback: Value returned when no machine name can be extracted.

    Returns:
        The text following the ``bg_`` prefix when it is non-empty,
        otherwise ``fallback``.
    """
    prefix = "bg_"
    # A missing or non-string identifier must not raise: just keep the
    # fallback value.
    if not isinstance(session_id, str):
        return fallback
    if not session_id.startswith(prefix):
        return fallback
    # A bare "bg_" carries no machine name; keep the fallback in that case.
    return session_id[len(prefix):] or fallback
@dataclass
class LiveSessionState:
"""État d'une session active en mémoire."""
@@ -86,11 +100,18 @@ class LiveSessionManager:
def _load_persisted_sessions(self):
"""Charger les sessions sauvegardées au démarrage (JSON state files)."""
count = 0
for session_file in sorted(self._persist_dir.glob("sess_*.json")):
session_files = sorted(self._persist_dir.glob("sess_*.json"))
session_files += sorted(self._persist_dir.glob("bg_*.json"))
for session_file in session_files:
try:
with open(session_file, 'r', encoding='utf-8') as f:
data = json.load(f)
session = LiveSessionState.from_dict(data)
if session.machine_id == "default":
session.machine_id = _infer_machine_id_from_session_id(
session.session_id,
fallback=session.machine_id,
)
self._sessions[session.session_id] = session
count += 1
except Exception as e:
@@ -117,7 +138,7 @@ class LiveSessionManager:
for jsonl_file in sorted(live_dir.glob("**/live_events.jsonl")):
session_dir = jsonl_file.parent
session_id = session_dir.name
if not session_id.startswith("sess_"):
if not (session_id.startswith("sess_") or session_id.startswith("bg_")):
continue
if session_id in self._sessions:
continue
@@ -125,7 +146,7 @@ class LiveSessionManager:
# Déduire le machine_id depuis le chemin parent
parent_name = session_dir.parent.name
if parent_name == live_dir.name:
machine_id = "default"
machine_id = _infer_machine_id_from_session_id(session_id)
else:
machine_id = parent_name

File diff suppressed because it is too large Load Diff

View File

@@ -188,7 +188,12 @@ class ReplayLearner:
"""
target_spec = action.get("target_spec", {})
by_text = target_spec.get("by_text", "")
window_title = target_spec.get("window_title", "")
window_title = (
target_spec.get("window_title", "")
or action.get("window_title", "")
or target_spec.get("expected_window_before", "")
or (target_spec.get("context_hints") or {}).get("window_title", "")
)
x_pct = correction.get("x_pct", 0.0)
y_pct = correction.get("y_pct", 0.0)
@@ -207,20 +212,36 @@ class ReplayLearner:
# Stocker dans target_memory.db pour le lookup futur
try:
from .replay_memory import get_target_memory_store
store = get_target_memory_store()
if store:
store.record_success(
screen_signature="human_correction",
from .replay_memory import memory_record_success
stored = False
if window_title:
stored = memory_record_success(
window_title=window_title,
target_spec=target_spec,
resolved_position={"x_pct": x_pct, "y_pct": y_pct},
x_pct=float(x_pct),
y_pct=float(y_pct),
method="human_supervised",
score=1.0,
confidence=1.0,
)
else:
logger.warning(
"[APPRENTISSAGE] Correction humaine non persistée : "
"window_title absent pour '%s'",
by_text,
)
if stored:
logger.info(
f"[APPRENTISSAGE] Correction stockée dans target_memory : "
f"'{by_text}' → ({x_pct:.4f}, {y_pct:.4f})"
)
elif window_title:
logger.warning(
"[APPRENTISSAGE] Correction humaine non persistée : "
"échec memory_record_success pour '%s' dans '%s'",
by_text,
window_title,
)
except Exception as e:
logger.warning(f"Learning: échec stockage target_memory: {e}")

View File

@@ -103,15 +103,53 @@ def compute_screen_sig(window_title: str) -> str:
return hashlib.sha256(norm.encode("utf-8")).hexdigest()[:16]
def _round_float_list(values: Any, precision: int = 4) -> Optional[tuple[float, ...]]:
    """Normalise a list of float coordinates for the memory hash.

    Args:
        values: Candidate sequence; anything other than a list or tuple is
            rejected.
        precision: Number of decimals kept by ``round``.

    Returns:
        A tuple of rounded floats, or ``None`` when ``values`` is not a
        list/tuple or when any item cannot be converted to ``float``.
    """
    if not isinstance(values, (list, tuple)):
        return None
    try:
        return tuple(round(float(value), precision) for value in values)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: integers too large for a float (e.g. 10**400).
        return None
def _int_pair(values: Any) -> Optional[tuple[int, int]]:
    """Extract a stable integer pair for spatial hints.

    Args:
        values: Candidate sequence; only the first two items are used.

    Returns:
        ``(int(values[0]), int(values[1]))``, or ``None`` when ``values`` is
        not a list/tuple of at least two items or when either item cannot be
        converted to ``int``.
    """
    if not isinstance(values, (list, tuple)) or len(values) < 2:
        return None
    first, second = values[0], values[1]
    try:
        return int(first), int(second)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")); ValueError also covers NaN.
        return None
def _should_reuse_recorded_window_relative_coords(fp: Any) -> bool:
    """Decide whether learned memory should be replaced by the source position.

    The rewrite is only legitimate for weak entries of type
    ``position_fallback`` / ``v4_unknown``, where memory does not hold a
    robust visual localisation but merely a resolution-dependent screen
    click. For learned visual methods (template, SoM, OCR...), re-injecting
    an old source ``click_relative`` creates collisions and drift on
    same-named buttons (``Enregistrer``, ``OK``, etc.).

    Args:
        fp: Memory fingerprint; only its ``etype`` attribute is read.

    Returns:
        ``True`` when the normalised ``etype`` is one of the weak methods.
    """
    weak_methods = ("position_fallback", "v4_unknown")
    raw_method = getattr(fp, "etype", "") or ""
    return str(raw_method).strip().lower() in weak_methods
class _TargetSpecLike:
"""Adaptateur dict → objet pour `TargetMemoryStore._hash_target_spec()`.
Le hash interne de TargetMemoryStore utilise `getattr(spec, "by_role", ...)`
qui ne fonctionne pas avec un dict brut. On expose les attributs nécessaires.
On intègre aussi `resolve_order` et `vlm_description` dans `context_hints`
pour qu'ils entrent dans le hash — deux actions avec le même `by_text`
mais un `resolve_order` différent doivent avoir des hashes distincts.
On intègre aussi `resolve_order`, `vlm_description` et des indices
spatiaux (SoM, click_relative) dans `context_hints` pour qu'ils entrent
dans le hash. Sinon, deux actions `Enregistrer` dans la même fenêtre
mais à des emplacements différents collisionnent.
"""
__slots__ = ("by_role", "by_text", "by_position", "context_hints")
@@ -131,6 +169,21 @@ class _TargetSpecLike:
hints["_vlm_desc"] = str(d["vlm_description"])
if d.get("anchor_hint"):
hints["_anchor_hint"] = str(d["anchor_hint"])
som_element = d.get("som_element") or {}
som_bbox = _round_float_list(som_element.get("bbox_norm"))
if som_bbox:
hints["_som_bbox"] = som_bbox
som_center = _round_float_list(som_element.get("center_norm"), precision=5)
if som_center:
hints["_som_center"] = som_center
window_capture = d.get("window_capture") or {}
click_relative = _int_pair(window_capture.get("click_relative"))
window_size = _int_pair(window_capture.get("window_size"))
if click_relative and window_size:
hints["_window_rel"] = f"{click_relative[0]},{click_relative[1]}@{window_size[0]}x{window_size[1]}"
self.context_hints = hints
@@ -176,6 +229,46 @@ def memory_lookup(
logger.debug("memory_lookup: fingerprint bbox invalide")
return None
# Quand l'entrée mémoire provient d'un simple `position_fallback`, les
# coordonnées stockées reflètent surtout la géométrie écran source. Dans
# ce cas précis, réutiliser la position relative enregistrée dans la
# fenêtre source reste préférable si elle existe.
#
# En revanche, pour une méthode visuelle réellement apprise
# (`anchor_template`, `som_*`, `hybrid_text_direct`, ...), remplacer les
# coords mémorisées par un vieux `click_relative` crée des dérives sur
# des cibles textuelles homonymes. On garde donc les coords apprises.
window_capture = target_spec.get("window_capture") or {}
click_relative = window_capture.get("click_relative")
window_size = window_capture.get("window_size")
if (
_should_reuse_recorded_window_relative_coords(fp)
and (
isinstance(click_relative, (list, tuple))
and len(click_relative) >= 2
and isinstance(window_size, (list, tuple))
and len(window_size) >= 2
)
):
try:
rel_x = float(click_relative[0])
rel_y = float(click_relative[1])
win_w = float(window_size[0])
win_h = float(window_size[1])
if win_w > 1 and win_h > 1:
x_pct = rel_x / win_w
y_pct = rel_y / win_h
logger.info(
"memory_lookup: coords fenêtre source réutilisées "
"(click_relative=%s, window_size=%s) -> (%.4f, %.4f)",
click_relative,
window_size,
x_pct,
y_pct,
)
except (TypeError, ValueError, ZeroDivisionError):
logger.debug("memory_lookup: window_capture invalide, fallback bbox")
# Sanity check : les pourcentages doivent être dans [0, 1]
if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
logger.warning(

View File

@@ -328,10 +328,11 @@ class ReplayVerifier:
),
)
# Cas 4 : Pas de changement (key_combo, wait)
# Pour les raccourcis clavier et attentes, l'absence de changement
# n'est pas forcément un problème (ex: Ctrl+C ne change pas l'écran)
if action_type in ("key_combo", "wait"):
# Cas 4 : Pas de changement (key_combo, wait, verify_screen)
# `verify_screen` côté agent n'est qu'une temporisation de stabilisation.
# Il ne doit pas exiger un NOUVEAU changement visuel sinon le setup
# boucle inutilement une fois l'application déjà ouverte.
if action_type in ("key_combo", "wait", "verify_screen"):
return VerificationResult(
verified=True,
confidence=0.4,

View File

@@ -0,0 +1,329 @@
"""Replay orphan watchdog for in-flight replay actions.
This module watches `_retry_pending` and re-pushes actions that were
dispatched by the server but never acknowledged by the Windows agent.
"""
from __future__ import annotations
import asyncio
import contextlib
import logging
import os
import time
from typing import Any, Callable, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
def _env_bool(name: str, default: str) -> bool:
    """Read environment variable ``name`` as a boolean flag.

    Args:
        name: Environment variable to read.
        default: Raw string used when the variable is unset.

    Returns:
        ``True`` when the stripped, lower-cased value is one of
        ``1``, ``true``, ``yes`` or ``on``; ``False`` otherwise.
    """
    truthy = ("1", "true", "yes", "on")
    raw = os.environ.get(name, default)
    return raw.strip().lower() in truthy
def _env_float(name: str, default: float) -> float:
    """Read environment variable ``name`` as a finite float.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or invalid.

    Returns:
        The parsed value, or ``default`` (after logging a warning) when the
        raw value is not a number or is NaN / infinite.
    """
    raw = os.environ.get(name, str(default))
    try:
        value = float(raw)
    except (TypeError, ValueError):
        value = None
    # float() happily parses "nan" / "inf"; those would poison the timeouts
    # and age comparisons built from this setting. NaN never equals itself.
    if value is None or value != value or value in (float("inf"), float("-inf")):
        logger.warning("Watchdog: invalid env %s, fallback=%s", name, default)
        return default
    return value
def _env_int(name: str, default: int) -> int:
    """Read environment variable ``name`` as an integer.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or not an integer.

    Returns:
        The parsed integer, or ``default`` (after logging a warning) when
        parsing fails.
    """
    raw = os.environ.get(name, str(default))
    try:
        parsed = int(raw)
    except (TypeError, ValueError):
        logger.warning("Watchdog: invalid env %s, fallback=%s", name, default)
        return default
    return parsed
def _env_max_resends(default: int) -> int:
    """Read the maximum number of watchdog re-sends from the environment.

    ``RPA_WATCHDOG_MAX_RESENDS`` takes precedence; when it is unset or blank
    the ``RPA_WATCHDOG_MAX_RETRIES`` variable is consulted instead.

    Args:
        default: Value used when neither variable holds a usable integer.

    Returns:
        The first non-blank value parsed as ``int``. A non-blank value that
        is not an integer logs a warning and yields ``default``; blank or
        unset variables yield ``default`` silently.
    """
    for name in ("RPA_WATCHDOG_MAX_RESENDS", "RPA_WATCHDOG_MAX_RETRIES"):
        raw = (os.environ.get(name) or "").strip()
        if not raw:
            # Blank means "not configured": try the next variable instead of
            # feeding "" to int() and reporting it as invalid.
            continue
        try:
            return int(raw)
        except ValueError:
            logger.warning("Watchdog: invalid max resend env, fallback=%s", default)
            return default
    return default
WATCHDOG_ENABLED = _env_bool("RPA_WATCHDOG_ENABLED", "1")
WATCHDOG_SCAN_INTERVAL_S = _env_float("RPA_WATCHDOG_SCAN_INTERVAL_S", 10.0)
WATCHDOG_ORPHAN_TIMEOUT_S = _env_float("RPA_WATCHDOG_ORPHAN_TIMEOUT_S", 45.0)
WATCHDOG_MAX_RESENDS = _env_max_resends(2)
WATCHDOG_REPUSH_POSITION = (
os.environ.get("RPA_WATCHDOG_REPUSH_POSITION", "head").strip().lower()
)
_metrics_lock = asyncio.Lock()
_metrics: Dict[str, Any] = {
"orphans_detected_total": 0,
"orphans_resent_total": 0,
"orphans_giveup_total": 0,
"scans_total": 0,
"scans_failed_total": 0,
"last_scan_ts": 0.0,
"last_scan_duration_ms": 0.0,
"current_in_flight_count": 0,
"current_orphan_count": 0,
}
async def _bump(key: str, delta: int = 1) -> None:
    """Add ``delta`` to metric ``key`` while holding the metrics lock.

    A key that is not yet present in ``_metrics`` starts from 0.
    """
    async with _metrics_lock:
        current = _metrics.get(key, 0)
        _metrics[key] = current + delta
def get_metrics_snapshot() -> Dict[str, Any]:
    """Return a shallow copy of the watchdog metrics (read without the lock)."""
    snapshot = _metrics.copy()
    return snapshot
SseNotifier = Callable[[str, str], None]
class ReplayWatchdog:
    """Background coroutine that re-pushes orphaned replay actions.

    An entry of ``retry_pending`` is treated as orphaned once its
    ``dispatched_at`` timestamp is older than ``WATCHDOG_ORPHAN_TIMEOUT_S``.
    Orphans are re-inserted into the session queue of ``replay_queues`` up
    to ``WATCHDOG_MAX_RESENDS`` times; after that the entry is dropped from
    ``retry_pending`` and a give-up event is logged.
    """

    def __init__(
        self,
        retry_pending: Dict[str, Dict[str, Any]],
        replay_queues: Dict[str, List[Dict[str, Any]]],
        async_lock_factory: Callable[[], Any],
        sse_notifier: Optional[SseNotifier] = None,
    ) -> None:
        """Store the shared structures the watchdog operates on.

        Args:
            retry_pending: Mapping ``action_id -> info`` of in-flight actions.
            replay_queues: Mapping ``session_id -> list of queued actions``.
            async_lock_factory: Callable returning the async context manager
                that guards both mappings.
            sse_notifier: Optional callback invoked with
                ``(session_id, machine_id)`` after each re-push.
        """
        self._retry_pending = retry_pending
        self._replay_queues = replay_queues
        self._async_lock = async_lock_factory
        self._sse_notifier = sse_notifier
        self._task: Optional[asyncio.Task] = None
        self._stopped = asyncio.Event()

    async def start(self) -> None:
        """Spawn the scan loop unless disabled or already running."""
        if not WATCHDOG_ENABLED:
            logger.info("[WATCHDOG] disabled via RPA_WATCHDOG_ENABLED=0")
            return
        if self._task is not None and not self._task.done():
            logger.warning("[WATCHDOG] already started")
            return
        self._stopped.clear()
        self._task = asyncio.create_task(self._run(), name="replay_watchdog")
        logger.info(
            "[WATCHDOG] started scan=%.1fs orphan_timeout=%.1fs max_resends=%d repush=%s",
            WATCHDOG_SCAN_INTERVAL_S,
            WATCHDOG_ORPHAN_TIMEOUT_S,
            WATCHDOG_MAX_RESENDS,
            WATCHDOG_REPUSH_POSITION,
        )

    async def stop(self, timeout_s: float = 5.0) -> None:
        """Signal the loop to stop, cancel the task and wait up to ``timeout_s``."""
        if self._task is None:
            return
        self._stopped.set()
        self._task.cancel()
        try:
            await asyncio.wait_for(self._task, timeout=timeout_s)
        except asyncio.CancelledError:
            pass
        except asyncio.TimeoutError:
            logger.warning("[WATCHDOG] stop timeout after %.1fs", timeout_s)
        except Exception:
            logger.exception("[WATCHDOG] unexpected stop error")
        self._task = None
        logger.info("[WATCHDOG] stopped")

    async def _run(self) -> None:
        """Wait one scan interval (interruptible by ``stop``), scan, repeat."""
        try:
            while not self._stopped.is_set():
                try:
                    await asyncio.wait_for(
                        self._stopped.wait(),
                        timeout=WATCHDOG_SCAN_INTERVAL_S,
                    )
                    # The stop event fired during the wait: leave the loop.
                    break
                except asyncio.TimeoutError:
                    # Interval elapsed without a stop request: time to scan.
                    pass
                try:
                    await self._scan_once()
                except Exception:
                    await _bump("scans_failed_total")
                    logger.exception("[WATCHDOG] scan failed")
        except asyncio.CancelledError:
            logger.info("[WATCHDOG] cancelled")
            raise
        finally:
            logger.info("[WATCHDOG] loop terminated")

    @staticmethod
    def _dispatched_at(action_id: str, info: Any) -> float:
        """Return the entry's dispatch timestamp, or 0.0 when absent/malformed.

        A malformed entry is logged and reported as "not dispatched" so that
        it cannot abort the scan of the remaining entries.
        """
        if not isinstance(info, dict):
            logger.warning(
                "[WATCHDOG] malformed entry for %s (type=%s); skipped",
                action_id,
                type(info).__name__,
            )
            return 0.0
        raw = info.get("dispatched_at", 0.0) or 0.0
        try:
            return float(raw)
        except (TypeError, ValueError):
            logger.warning(
                "[WATCHDOG] invalid dispatched_at=%r for %s; skipped",
                raw,
                action_id,
            )
            return 0.0

    @staticmethod
    def _resent_count(info: Dict[str, Any]) -> int:
        """Return how many times the entry was re-sent (0 when unparsable)."""
        try:
            return int(info.get("resent_count", 0) or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    @staticmethod
    def _total_age(info: Dict[str, Any], now: float) -> float:
        """Return seconds elapsed since ``first_dispatched_at`` (0 if unknown)."""
        try:
            first = float(info.get("first_dispatched_at", now) or now)
        except (TypeError, ValueError):
            first = now
        return now - first

    async def _scan_once(self) -> Dict[str, int]:
        """Run one scan: detect orphans, re-push or give up, update metrics.

        Returns:
            Counters for this scan: ``orphans``, ``resent``, ``gaveup``,
            ``skipped`` and ``in_flight``.
        """
        t0 = time.time()
        await _bump("scans_total")
        resent = 0
        gaveup = 0
        skipped = 0
        in_flight = 0
        orphans = 0
        orphan_targets: List[Tuple[str, Dict[str, Any]]] = []

        # Phase 1: snapshot the orphaned entries while holding the lock.
        async with self._async_lock():
            for action_id, info in list(self._retry_pending.items()):
                dispatched_at = self._dispatched_at(action_id, info)
                # "not > 0" also treats NaN as "not dispatched".
                if not dispatched_at > 0:
                    skipped += 1
                    continue
                age = t0 - dispatched_at
                in_flight += 1
                if age < WATCHDOG_ORPHAN_TIMEOUT_S:
                    continue
                orphans += 1
                orphan_targets.append((action_id, dict(info)))

        # Phase 2: handle each orphan, re-taking the lock only for mutations.
        for action_id, info in orphan_targets:
            await _bump("orphans_detected_total")
            resent_count = self._resent_count(info)

            if resent_count >= WATCHDOG_MAX_RESENDS:
                async with self._async_lock():
                    self._retry_pending.pop(action_id, None)
                age_total = self._total_age(info, t0)
                logger.error(
                    "[BUS] lea:dispatch_orphan_giveup action_id=%s resent=%d age_total=%.1fs "
                    "session=%s machine=%s replay=%s",
                    action_id,
                    resent_count,
                    age_total,
                    info.get("session_id", "?"),
                    info.get("machine_id", "?"),
                    info.get("replay_id", "?"),
                )
                gaveup += 1
                await _bump("orphans_giveup_total")
                continue

            session_id = info.get("session_id")
            machine_id = info.get("machine_id", "default")
            action = info.get("dispatched_action") or info.get("action")
            if not session_id or not isinstance(action, dict):
                logger.warning(
                    "[WATCHDOG] invalid schema for %s session_id=%r action_type=%s",
                    action_id,
                    session_id,
                    type(action).__name__,
                )
                async with self._async_lock():
                    self._retry_pending.pop(action_id, None)
                continue

            async with self._async_lock():
                existing = self._retry_pending.get(action_id)
                if existing is None:
                    logger.debug(
                        "[WATCHDOG] %s acked between snapshot and resend; skip",
                        action_id,
                    )
                    continue
                queue = self._replay_queues.setdefault(session_id, [])
                if WATCHDOG_REPUSH_POSITION == "tail":
                    queue.append(dict(action))
                else:
                    queue.insert(0, dict(action))
                existing["resent_count"] = resent_count + 1
                existing["last_resent_at"] = time.time()
                # Zeroed timestamp: later scans skip the entry until a
                # positive dispatched_at is written again.
                existing["dispatched_at"] = 0.0

            age_total = self._total_age(info, t0)
            logger.warning(
                "[BUS] lea:dispatch_orphan_resent action_id=%s resent=%d/%d age=%.1fs "
                "session=%s machine=%s replay=%s",
                action_id,
                resent_count + 1,
                WATCHDOG_MAX_RESENDS,
                age_total,
                session_id,
                machine_id,
                info.get("replay_id", "?"),
            )
            resent += 1
            await _bump("orphans_resent_total")

            if self._sse_notifier is not None:
                try:
                    self._sse_notifier(session_id, machine_id)
                except Exception as exc:
                    # Best effort: a failing notifier must not stop the scan.
                    logger.debug("[WATCHDOG] sse notifier failed: %s", exc)

        elapsed_ms = (time.time() - t0) * 1000.0
        async with _metrics_lock:
            _metrics["last_scan_ts"] = t0
            _metrics["last_scan_duration_ms"] = elapsed_ms
            _metrics["current_in_flight_count"] = in_flight
            _metrics["current_orphan_count"] = orphans
            scans_total = _metrics["scans_total"]

        if orphans or gaveup:
            logger.info(
                "[METRIC] watchdog scan=%d orphans=%d resent=%d gaveup=%d "
                "in_flight=%d skipped=%d elapsed_ms=%.1f",
                scans_total,
                orphans,
                resent,
                gaveup,
                in_flight,
                skipped,
                elapsed_ms,
            )

        return {
            "orphans": orphans,
            "resent": resent,
            "gaveup": gaveup,
            "skipped": skipped,
            "in_flight": in_flight,
        }
_singleton: Optional[ReplayWatchdog] = None


def get_or_create_watchdog(
    retry_pending: Dict[str, Dict[str, Any]],
    replay_queues: Dict[str, List[Dict[str, Any]]],
    async_lock_factory: Callable[[], Any],
    sse_notifier: Optional[SseNotifier] = None,
) -> ReplayWatchdog:
    """Return the module-wide watchdog, building it on the first call.

    The arguments are only used to construct the instance; once it exists,
    later calls return it unchanged and their arguments are ignored.
    """
    global _singleton
    if _singleton is not None:
        return _singleton
    created = ReplayWatchdog(
        retry_pending=retry_pending,
        replay_queues=replay_queues,
        async_lock_factory=async_lock_factory,
        sse_notifier=sse_notifier,
    )
    _singleton = created
    return _singleton
@contextlib.asynccontextmanager
async def watchdog_lifespan(
    retry_pending: Dict[str, Dict[str, Any]],
    replay_queues: Dict[str, List[Dict[str, Any]]],
    async_lock_factory: Callable[[], Any],
    sse_notifier: Optional[SseNotifier] = None,
):
    """Async context manager running the shared watchdog for its duration.

    On entry the watchdog is obtained via ``get_or_create_watchdog`` and
    started; it is yielded to the caller and stopped on exit, including when
    the body raises.
    """
    settings = {
        "retry_pending": retry_pending,
        "replay_queues": replay_queues,
        "async_lock_factory": async_lock_factory,
        "sse_notifier": sse_notifier,
    }
    watchdog = get_or_create_watchdog(**settings)
    await watchdog.start()
    try:
        yield watchdog
    finally:
        await watchdog.stop()

View File

@@ -243,6 +243,168 @@ def _validate_match_context(
return True
def _has_meaningful_recorded_coords(
    fallback_x_pct: float,
    fallback_y_pct: float,
) -> bool:
    """Tell whether the fallback coordinates represent a real source position.

    Coordinates are rejected when either one is at or below 0.001, or when
    both lie within 0.001 of 0.5 (the exact screen centre).
    NOTE(review): those excluded values look like placeholder defaults —
    confirm with the recorder.
    """
    epsilon = 0.001
    # Written as "not (a > eps and b > eps)" so that NaN inputs are rejected.
    if not (fallback_x_pct > epsilon and fallback_y_pct > epsilon):
        return False
    at_screen_center = (
        abs(fallback_x_pct - 0.5) < epsilon
        and abs(fallback_y_pct - 0.5) < epsilon
    )
    return not at_screen_center
def _is_close_tab_target(target_spec: Optional[Dict[str, Any]]) -> bool:
    """Detect a ``close_tab`` action emitted by the replay compiler.

    Returns ``True`` only when ``target_spec`` is a dict whose
    ``context_hints["interaction"]`` equals ``close_tab`` after stripping
    and lower-casing.
    """
    if not isinstance(target_spec, dict):
        return False
    hints = target_spec.get("context_hints") or {}
    interaction = hints.get("interaction") or ""
    normalized = str(interaction).strip().lower()
    return normalized == "close_tab"
def _get_expected_close_tab_coords(
    target_spec: Optional[Dict[str, Any]],
    screen_width: int,
    screen_height: int,
    fallback_x_pct: float = 0.0,
    fallback_y_pct: float = 0.0,
) -> Optional[tuple[float, float]]:
    """Find the most reliable expected position for a ``close_tab`` action.

    Sources are tried in this order:
      1. the explicit fallback coordinates of the replay action;
      2. the SoM centre stored in ``som_element.center_norm``;
      3. ``window_capture.rect`` origin plus ``click_relative``, divided by
         the screen size.

    Returns:
        ``(x, y)`` as screen fractions, or ``None`` when no source yields a
        point inside ``[0, 1] x [0, 1]``.
    """

    def _in_unit_square(px: float, py: float) -> bool:
        return 0.0 <= px <= 1.0 and 0.0 <= py <= 1.0

    # 1. Explicit fallback coordinates win when they look like a real click.
    if _has_meaningful_recorded_coords(fallback_x_pct, fallback_y_pct):
        return float(fallback_x_pct), float(fallback_y_pct)

    if not isinstance(target_spec, dict):
        return None

    # 2. SoM centre recorded with the action.
    center = (target_spec.get("som_element") or {}).get("center_norm")
    if isinstance(center, (list, tuple)) and len(center) >= 2:
        try:
            candidate = (float(center[0]), float(center[1]))
        except (TypeError, ValueError):
            candidate = None
        if candidate is not None and _in_unit_square(*candidate):
            return candidate

    # 3. Window rect origin + click offset, converted to screen fractions.
    capture = target_spec.get("window_capture") or {}
    rect = capture.get("rect")
    click = capture.get("click_relative")
    shapes_ok = (
        isinstance(rect, (list, tuple))
        and len(rect) >= 4
        and isinstance(click, (list, tuple))
        and len(click) >= 2
    )
    if not (shapes_ok and screen_width > 0 and screen_height > 0):
        return None
    try:
        abs_x = float(rect[0]) + float(click[0])
        abs_y = float(rect[1]) + float(click[1])
        exp_x = abs_x / float(screen_width)
        exp_y = abs_y / float(screen_height)
    except (TypeError, ValueError, ZeroDivisionError):
        return None
    if _in_unit_square(exp_x, exp_y):
        return exp_x, exp_y
    return None
def _is_close_tab_result_plausible(
    resolved_x: float,
    resolved_y: float,
    target_spec: Optional[Dict[str, Any]],
    screen_width: int,
    screen_height: int,
    fallback_x_pct: float = 0.0,
    fallback_y_pct: float = 0.0,
) -> bool:
    """Filter ``close_tab`` false positives that drift towards the close button.

    The guard only applies to ``close_tab`` targets for which an expected
    position can be derived; any other case is accepted. A result is
    plausible when its horizontal drift is at most 0.18 and its Euclidean
    distance to the expected point is at most 0.20 (screen fractions).
    """
    if not _is_close_tab_target(target_spec):
        return True

    expected = _get_expected_close_tab_coords(
        target_spec,
        screen_width,
        screen_height,
        fallback_x_pct=fallback_x_pct,
        fallback_y_pct=fallback_y_pct,
    )
    if expected is None:
        return True

    exp_x, exp_y = expected
    res_x = float(resolved_x)
    res_y = float(resolved_y)
    dx = abs(res_x - exp_x)
    dy = abs(res_y - exp_y)
    distance = (dx ** 2 + dy ** 2) ** 0.5

    if dx <= 0.18 and distance <= 0.20:
        return True

    logger.warning(
        "close_tab guard : résultat rejeté car trop éloigné de la zone "
        "source (resolved=(%.4f, %.4f), expected=(%.4f, %.4f), "
        "drift=(%.4f, %.4f), dist=%.4f)",
        res_x,
        res_y,
        exp_x,
        exp_y,
        dx,
        dy,
        distance,
    )
    return False
def _is_start_button_vlm_result_plausible(
    result: Dict[str, Any],
    fallback_x_pct: float,
    fallback_y_pct: float,
    target_spec: Dict[str, Any],
    max_distance: float = 0.20,
) -> bool:
    """Filter VLM false positives on the Start button.

    The Start button is a system singleton. When a real recorded click is
    available (``fallback_*``), a VLM localisation far away from that area
    is more likely a false positive than a genuine UI move.

    The guard only applies when ``by_role`` is ``start_button`` and the
    fallback coordinates are meaningful; the distance check itself is
    delegated to ``_validate_match_context``.
    """
    role = str(target_spec.get("by_role", "") or "").strip().lower()
    guard_applies = role == "start_button" and _has_meaningful_recorded_coords(
        fallback_x_pct, fallback_y_pct
    )
    if not guard_applies:
        return True

    context_ok = _validate_match_context(
        result,
        fallback_x_pct,
        fallback_y_pct,
        target_spec,
        max_distance=max_distance,
    )
    if context_ok:
        return True

    logger.warning(
        "Start button guard : résultat VLM rejeté car trop éloigné de la "
        "position enregistrée (resolved=(%.4f, %.4f), expected=(%.4f, %.4f), max=%.2f)",
        float(result.get("x_pct", 0) or 0),
        float(result.get("y_pct", 0) or 0),
        fallback_x_pct,
        fallback_y_pct,
        max_distance,
    )
    return False
# =========================================================================
# YOLO/OmniParser — Résolution par détection d'éléments UI
# =========================================================================
@@ -1109,16 +1271,66 @@ def _resolve_by_som(
# Centre du match
match_cx = max_loc[0] + anc_w // 2
match_cy = max_loc[1] + anc_h // 2
interaction = str(
(target_spec.get("context_hints") or {}).get("interaction", "") or ""
).strip().lower()
if interaction == "close_tab":
elapsed = time.time() - t0
cx_norm = match_cx / screen_width if screen_width > 0 else 0.0
cy_norm = match_cy / screen_height if screen_height > 0 else 0.0
if _is_close_tab_result_plausible(
cx_norm,
cy_norm,
target_spec,
screen_width,
screen_height,
):
logger.info(
"SoM resolve ANCHOR exact close_tab : score=%.3f "
"centre=(%d, %d) → (%.4f, %.4f) en %.1fs",
max_score, match_cx, match_cy, cx_norm, cy_norm, elapsed,
)
return {
"resolved": True,
"method": "som_anchor_match",
"x_pct": round(cx_norm, 6),
"y_pct": round(cy_norm, 6),
"matched_element": {
"label": "close_tab_button",
"type": "visual_anchor",
"role": "som_anchor_exact",
"confidence": max_score,
},
"score": max_score,
"match_box": {
"x": int(max_loc[0]),
"y": int(max_loc[1]),
"width": int(anc_w),
"height": int(anc_h),
},
}
logger.warning(
"SoM resolve ANCHOR exact close_tab rejeté : score=%.3f "
"centre=(%d, %d) → (%.4f, %.4f), passage VLM/fallback",
max_score, match_cx, match_cy, cx_norm, cy_norm,
)
# Ne pas recycler ce faux match vers l'élément SoM le plus
# proche : pour close_tab, cela retombe facilement sur le
# bouton de fermeture de la fenêtre.
best_elem = None
else:
best_elem = None
# Trouver l'élément SomEngine le plus proche du centre du match
best_elem = None
best_dist = float("inf")
for elem in som_result.elements:
cx, cy = elem.center
dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5
if dist < best_dist:
best_dist = dist
best_elem = elem
if best_elem is None and interaction != "close_tab":
for elem in som_result.elements:
cx, cy = elem.center
dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5
if dist < best_dist:
best_dist = dist
best_elem = elem
if best_elem and best_dist < 100: # Max 100px de distance
elapsed = time.time() - t0
@@ -1584,6 +1796,49 @@ def _resolve_target_sync(
"fallback cascade legacy"
)
# ===================================================================
# Cas spécial : boutons de dialogue runtime ("Oui", "Non", "OK", ...)
# ===================================================================
# Ces boutons sont textuels, sans ancre stable, et apparaissent souvent
# au milieu d'une action déjà en cours. Si on les laisse partir dans la
# cascade générique (VLM -> SoM -> ScreenAnalyzer), on peut bloquer
# l'action principale assez longtemps pour déclencher le watchdog.
# Contrat voulu : OCR direct rapide, sinon abandon immédiat pour que le
# client essaie son fallback local par template texte.
dialog_role = str(target_spec.get("by_role", "") or "").strip().lower()
dialog_text = str(target_spec.get("by_text", "") or "").strip()
if dialog_role == "dialog_button" and dialog_text and not anchor_image_b64:
ocr_result = _resolve_by_ocr_text(
screenshot_path=screenshot_path,
target_text=dialog_text,
screen_width=screen_width,
screen_height=screen_height,
)
if ocr_result and ocr_result.get("score", 0) >= 0.80:
ocr_result["method"] = "hybrid_text_direct"
logger.info(
"Resolve dialog_button OCR-DIRECT : OK '%s' → (%.4f, %.4f) score=%.2f",
dialog_text[:40],
ocr_result.get("x_pct", 0),
ocr_result.get("y_pct", 0),
ocr_result.get("score", 0),
)
return ocr_result
logger.info(
"Resolve dialog_button OCR-only : '%s' non trouvé "
"(fenêtre='%s') — skip VLM/SoM/ScreenAnalyzer",
dialog_text[:40],
str(target_spec.get("window_title", "") or "")[:80],
)
return {
"resolved": False,
"method": "dialog_button_ocr_only",
"reason": "ocr_direct_failed_dialog_button_no_vlm",
"x_pct": fallback_x_pct,
"y_pct": fallback_y_pct,
}
# ===================================================================
# MODE STRICT (replay sessions) — Stratégie VLM-FIRST
# ===================================================================
@@ -1656,13 +1911,25 @@ def _resolve_target_sync(
screen_height=screen_height,
)
if grounding_result and grounding_result.get("resolved"):
logger.info(
"Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
grounding_result.get("x_pct", 0),
grounding_result.get("y_pct", 0),
by_text_strict[:50],
if _is_close_tab_result_plausible(
float(grounding_result.get("x_pct", 0) or 0),
float(grounding_result.get("y_pct", 0) or 0),
target_spec,
screen_width,
screen_height,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.info(
"Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
grounding_result.get("x_pct", 0),
grounding_result.get("y_pct", 0),
by_text_strict[:50],
)
return grounding_result
logger.warning(
"Strict resolve GROUNDING : résultat close_tab rejeté, passage template/VLM"
)
return grounding_result
if not by_text_strict or by_text_source not in ("ocr", "vlm"):
# Template matching pour les éléments sans texte (icônes pures)
@@ -1690,11 +1957,23 @@ def _resolve_target_sync(
abs_y = window_rect[1] + y_tm * tm_screen_h
result["x_pct"] = round(abs_x / screen_width, 6)
result["y_pct"] = round(abs_y / screen_height, 6)
logger.info(
"Strict resolve TEMPLATE : icon match (score=%.3f)",
result.get("score", 0),
if _is_close_tab_result_plausible(
float(result.get("x_pct", 0) or 0),
float(result.get("y_pct", 0) or 0),
target_spec,
screen_width,
screen_height,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.info(
"Strict resolve TEMPLATE : icon match (score=%.3f)",
result.get("score", 0),
)
return result
logger.warning(
"Strict resolve TEMPLATE : résultat close_tab rejeté, passage cascade suivante"
)
return result
# ---------------------------------------------------------------
# Étape 0.5 : OCR direct (hybrid_text_direct) — chemin rapide
@@ -1739,6 +2018,27 @@ def _resolve_target_sync(
by_text_strict[:40],
)
# Les boutons de dialogues runtime connus ("Oui", "Non", "OK", etc.)
# ne doivent pas partir dans la cascade lente VLM -> SoM. Si l'OCR
# direct ne les trouve pas immédiatement, on rend la main au client
# pour son fallback local par template texte, sinon on bloque l'action
# principale assez longtemps pour déclencher le watchdog.
dialog_role = str(target_spec.get("by_role", "") or "").strip().lower()
if dialog_role == "dialog_button" and by_text_strict and not anchor_image_b64:
logger.info(
"Strict resolve dialog_button : OCR-direct only pour '%s' "
"(fenêtre='%s') — skip VLM/SoM/template",
by_text_strict[:40],
str(target_spec.get("window_title", "") or "")[:80],
)
return {
"resolved": False,
"method": "dialog_button_ocr_only",
"reason": "ocr_direct_failed_dialog_button_no_vlm",
"x_pct": fallback_x_pct,
"y_pct": fallback_y_pct,
}
# ---------------------------------------------------------------
# Étape 1 : VLM Quick Find (fallback, multi-image)
# ---------------------------------------------------------------
@@ -1750,12 +2050,29 @@ def _resolve_target_sync(
)
if vlm_result and vlm_result.get("resolved"):
if vlm_result.get("score", 0) >= 0.3:
logger.info(
"Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'",
vlm_result.get("score", 0),
vlm_description[:60] if vlm_description else "(anchor)",
if _is_start_button_vlm_result_plausible(
vlm_result,
fallback_x_pct,
fallback_y_pct,
target_spec,
) and _is_close_tab_result_plausible(
float(vlm_result.get("x_pct", 0) or 0),
float(vlm_result.get("y_pct", 0) or 0),
target_spec,
screen_width,
screen_height,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.info(
"Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'",
vlm_result.get("score", 0),
vlm_description[:60] if vlm_description else "(anchor)",
)
return vlm_result
logger.warning(
"Strict resolve VLM-first : résultat VLM rejeté par un garde-fou, passage SoM/template"
)
return vlm_result
else:
logger.info(
"Strict resolve VLM-first : VLM score=%.2f trop bas, passage template",
@@ -1782,12 +2099,24 @@ def _resolve_target_sync(
screen_height=screen_height,
)
if som_result and som_result.get("resolved"):
logger.info(
"Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)",
som_result.get("score", 0),
som_result.get("matched_element", {}).get("som_id", "?"),
if _is_close_tab_result_plausible(
float(som_result.get("x_pct", 0) or 0),
float(som_result.get("y_pct", 0) or 0),
target_spec,
screen_width,
screen_height,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.info(
"Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)",
som_result.get("score", 0),
som_result.get("matched_element", {}).get("som_id", "?"),
)
return som_result
logger.warning(
"Strict resolve SoM+VLM : résultat close_tab rejeté, passage template matching"
)
return som_result
else:
logger.info("Strict resolve SoM+VLM : échoué, passage template matching")
@@ -1805,12 +2134,24 @@ def _resolve_target_sync(
score = result.get("score", 0)
# Score >= 0.95 : match quasi-parfait, pas besoin de valider le contexte
if score >= 0.95:
logger.info(
"Strict resolve VLM-first : template matching fallback OK "
"(score=%.3f >= 0.95, contexte skip — match quasi-parfait)",
score,
if _is_close_tab_result_plausible(
float(result.get("x_pct", 0) or 0),
float(result.get("y_pct", 0) or 0),
target_spec,
screen_width,
screen_height,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.info(
"Strict resolve VLM-first : template matching fallback OK "
"(score=%.3f >= 0.95, contexte skip — match quasi-parfait)",
score,
)
return result
logger.warning(
"Strict resolve TEMPLATE : match close_tab très fort mais hors zone source, rejeté"
)
return result
elif _validate_match_context(result, fallback_x_pct, fallback_y_pct, target_spec):
logger.info(
"Strict resolve VLM-first : template matching fallback OK "
@@ -2189,6 +2530,37 @@ def _text_match_fuzzy(expected: str, observed: str, min_token_ratio: float = 0.6
return matched / len(tokens) >= min_token_ratio
_SOM_BBOX_OCR_PADDING_PX: int = 8
_SOM_BBOX_MIN_DIM_PX: int = 12
def _should_reject_on_text_mismatch(
is_valid: bool,
observed: Optional[str],
) -> bool:
"""Décide si le pré-check OCR doit rejeter la résolution.
Patch 2026-05-23 : on distingue deux cas d'échec du fuzzy match :
- ``observed`` contient du texte (ex: ``'9 ?'``, ``'OBS Studio…'``)
→ mismatch confirmé, la cascade a probablement cliqué ailleurs
→ on rejette.
- ``observed`` est vide ou whitespace
→ l'OCR n'a rien lu (zone trop petite, texte peu contrasté,
modèle EasyOCR sous le seuil de détection). C'est ambigu :
ce n'est PAS la preuve d'un faux positif, on accepte la
résolution serveur. La garde drift ANCHOR-TM côté agent
protège en aval contre les vrais faux positifs.
Si ``is_valid=True`` → jamais de rejet (cas nominal).
"""
if is_valid:
return False
if observed is None:
return False
return bool(str(observed).strip())
def _validate_text_at_position(
screenshot_path: str,
x_pct: float,
@@ -2197,9 +2569,20 @@ def _validate_text_at_position(
screen_width: int,
screen_height: int,
radius_px: int = 280,
som_bbox_norm: Optional[List[float]] = None,
) -> tuple:
"""Pré-check sémantique : OCR sur une zone autour de (x_pct, y_pct) et
vérifie que `expected_text` y est présent (substring ou fuzzy 50%).
"""Pré-check sémantique : OCR sur une zone et vérifie que
`expected_text` y est présent (substring ou fuzzy 50%).
Zone OCR (par priorité) :
1. Si ``som_bbox_norm = [x1, y1, x2, y2]`` (normalisé 0..1) est
fourni et a une largeur/hauteur > _SOM_BBOX_MIN_DIM_PX en
pixels écran : OCR sur cette bbox élargie d'un padding court.
Plus précis pour les éléments étroits (onglets Notepad
moderne, ~30-40px haut) que le radius générique qui capture
le texte voisin (status bar, etc.).
2. Sinon : fallback historique → carré de ``radius_px`` autour
de (x_pct, y_pct).
Retourne (is_valid: bool, observed_text: str, elapsed_ms: float).
@@ -2219,16 +2602,52 @@ def _validate_text_at_position(
t0 = time.time()
img = Image.open(screenshot_path).convert("RGB")
img_w, img_h = img.size
cx = int(x_pct * screen_width)
cy = int(y_pct * screen_height)
# Saturer dans les bornes de l'image (le screenshot peut être plus
# large que la fenêtre logique — utiliser min(img_*, screen_*) en sécurité).
max_x = min(img_w, screen_width)
max_y = min(img_h, screen_height)
x1 = max(0, cx - radius_px)
y1 = max(0, cy - radius_px)
x2 = min(max_x, cx + radius_px)
y2 = min(max_y, cy + radius_px)
# --- Tentative 1 : zone OCR depuis la bbox SoM (préférée) ---
x1 = y1 = x2 = y2 = None
if (
isinstance(som_bbox_norm, (list, tuple))
and len(som_bbox_norm) == 4
):
try:
bx1, by1, bx2, by2 = (float(v) for v in som_bbox_norm)
# Tolérer ordre inversé.
bx1, bx2 = sorted((bx1, bx2))
by1, by2 = sorted((by1, by2))
# Refuser les bboxes dégénérées AVANT padding : si
# l'élément cible fait < _SOM_BBOX_MIN_DIM_PX en
# natif, c'est probablement une bbox d'apparence
# (curseur, séparateur 1px) — pas un label OCRable.
raw_w = (bx2 - bx1) * screen_width
raw_h = (by2 - by1) * screen_height
if (
raw_w >= _SOM_BBOX_MIN_DIM_PX
and raw_h >= _SOM_BBOX_MIN_DIM_PX
):
# Conversion en pixels écran + clipping et padding.
px1 = int(bx1 * screen_width) - _SOM_BBOX_OCR_PADDING_PX
py1 = int(by1 * screen_height) - _SOM_BBOX_OCR_PADDING_PX
px2 = int(bx2 * screen_width) + _SOM_BBOX_OCR_PADDING_PX
py2 = int(by2 * screen_height) + _SOM_BBOX_OCR_PADDING_PX
x1 = max(0, px1)
y1 = max(0, py1)
x2 = min(max_x, px2)
y2 = min(max_y, py2)
except (TypeError, ValueError):
# Bbox malformée : fallback silencieux sur le radius.
x1 = y1 = x2 = y2 = None
# --- Fallback : carré radius_px autour de (x_pct, y_pct) ---
if x1 is None:
cx = int(x_pct * screen_width)
cy = int(y_pct * screen_height)
x1 = max(0, cx - radius_px)
y1 = max(0, cy - radius_px)
x2 = min(max_x, cx + radius_px)
y2 = min(max_y, cy + radius_px)
if x2 - x1 < 10 or y2 - y1 < 10:
return True, "", 0.0
crop = img.crop((x1, y1, x2, y2))
@@ -2246,6 +2665,7 @@ def _validate_resolution_quality(
result: Optional[Dict[str, Any]],
fallback_x_pct: float,
fallback_y_pct: float,
target_spec: Optional[Dict[str, Any]] = None,
) -> Optional[Dict[str, Any]]:
"""Valide un résultat de résolution et le rejette s'il est peu fiable.
@@ -2263,6 +2683,16 @@ def _validate_resolution_quality(
elle n'est PAS appelée par les méthodes internes de la cascade, mais
uniquement depuis le handler HTTP `/resolve_target` après que la
cascade a produit son meilleur candidat.
Argument optionnel `target_spec` : permet d'appliquer des relaxations
contextuelles. Cas couvert (2026-05-22) : pour une cible
`context_hints.interaction == "switch_tab"` qui dispose d'un
`som_element.bbox_norm`, on abaisse le seuil des méthodes ``som_*``
de 0.75 → 0.60. Justification : (1) le focus_change pré-clic
prouve qu'on est dans la bonne fenêtre, (2) la bbox SoM a été
calibrée à l'enregistrement et reste valide, (3) les onglets
Notepad moderne sont visuellement quasi-identiques → score VLM
inévitablement lower.
"""
if not result or not isinstance(result, dict):
return result
@@ -2291,6 +2721,52 @@ def _validate_resolution_quality(
min_score = threshold
break
# Relaxation contextuelle pour switch_tab + SoM calibré (2026-05-22).
# Les onglets Notepad moderne (et apps similaires) sont visuellement
# quasi-identiques : le grounding VLM/SoM produit fréquemment un
# score 0.65-0.75, juste sous le seuil strict. Comme le contexte
# `interaction=switch_tab` + bbox SoM enregistrée + focus_change
# pré-clic confirment déjà la fenêtre et la zone, on relâche le
# seuil des méthodes som_* à 0.60 dans CE cas précis uniquement.
if (
min_score is not None
and target_spec
and method.startswith("som_")
):
context_hints = target_spec.get("context_hints") or {}
is_tab_switch = (
context_hints.get("interaction") == "switch_tab"
and target_spec.get("by_role") == "tab"
)
som_element = target_spec.get("som_element") or {}
has_calibrated_som = bool(som_element.get("bbox_norm"))
if is_tab_switch and has_calibrated_som:
relaxed = 0.60
if relaxed < min_score:
logger.info(
"[REPLAY] switch_tab + som_element calibré → seuil "
"som_* relâché %.2f%.2f (cible='%s')",
min_score, relaxed,
target_spec.get("by_text", ""),
)
min_score = relaxed
is_close_tab = (
method == "som_anchor_match"
and str((context_hints.get("interaction") or "")).strip().lower() == "close_tab"
and not str(target_spec.get("by_text", "") or "").strip()
and bool(target_spec.get("anchor_image_base64"))
)
if is_close_tab:
relaxed = 0.70
if relaxed < min_score:
logger.info(
"[REPLAY] close_tab + anchor-only → seuil som_anchor_match "
"relâché %.2f%.2f",
min_score, relaxed,
)
min_score = relaxed
if min_score is not None and score < min_score:
logger.warning(
"[REPLAY] Resolution REJETÉE (score trop bas) : method=%s score=%.3f < %.2f",
@@ -2306,13 +2782,40 @@ def _validate_resolution_quality(
"y_pct": fallback_y_pct,
}
if _is_close_tab_target(target_spec) and not _is_close_tab_result_plausible(
resolved_x,
resolved_y,
target_spec,
0,
0,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.warning(
"[REPLAY] Resolution REJETÉE (close_tab hors zone source) : "
"method=%s resolved=(%.3f, %.3f) expected=(%.3f, %.3f)",
method,
resolved_x,
resolved_y,
fallback_x_pct,
fallback_y_pct,
)
return {
"resolved": False,
"method": f"rejected_close_tab_zone_{method}",
"reason": "close_tab_out_of_recorded_zone",
"original_method": method,
"original_score": score,
"x_pct": fallback_x_pct,
"y_pct": fallback_y_pct,
}
# --- Check 2 : garde de proximité ---
# On n'applique la garde que si les coordonnées enregistrées ont un
# sens (pas des placeholders 0.5/0.5 des plans V4 ni des 0.0/0.0).
_has_recorded_coords = (
fallback_x_pct > 0.001
and fallback_y_pct > 0.001
and not (abs(fallback_x_pct - 0.5) < 0.001 and abs(fallback_y_pct - 0.5) < 0.001)
_has_recorded_coords = _has_meaningful_recorded_coords(
fallback_x_pct,
fallback_y_pct,
)
if _has_recorded_coords:
dx = abs(resolved_x - fallback_x_pct)

View File

@@ -1025,6 +1025,345 @@ def enrich_click_from_screenshot(
return result
def _title_to_tab_label(window_title: str) -> str:
"""Réduire un titre de fenêtre en libellé d'onglet probable.
Exemples:
- "Sans titre Bloc-notes" -> "Sans titre"
- "*test Bloc-notes" -> "test"
"""
title = str(window_title or "").strip()
if not title:
return ""
for sep in (" ", " - "):
if sep in title:
head = title.split(sep, 1)[0].strip()
if head:
title = head
break
return title.lstrip("*").strip()
def _split_window_title_head_suffix(window_title: str) -> tuple[str, str]:
"""Découper un titre de fenêtre en ``(head, suffix)`` si possible.
Exemples:
- ``Sans titre Bloc-notes`` -> (``Sans titre``, ``Bloc-notes``)
- ``Page 1 - Google Chrome`` -> (``Page 1``, ``Google Chrome``)
- ``Enregistrer sous`` -> ("", "")
"""
title = str(window_title or "").strip()
if not title:
return "", ""
for sep in (" ", " - "):
if sep in title:
head, suffix = title.split(sep, 1)
head = head.strip()
suffix = suffix.strip()
if head and suffix:
return head, suffix
return "", ""
def _looks_like_same_app_tab_switch(from_title: str, to_title: str) -> bool:
    """Return True when a focus transition looks like a real tab change.

    Both titles must split into a head and an application suffix, and the two
    suffixes must be equal ignoring case (e.g. ``Bloc-notes``,
    ``Google Chrome``). Titles that do not split, such as a same-app modal
    dialog named ``Enregistrer sous``, therefore never qualify and are not
    compiled into a ``switch_tab``.
    """
    src_head, src_app = _split_window_title_head_suffix(from_title)
    dst_head, dst_app = _split_window_title_head_suffix(to_title)
    if not all((src_head, src_app, dst_head, dst_app)):
        return False
    return src_app.casefold() == dst_app.casefold()
def _infer_tab_switch_target(
    raw_events: list,
    click_event: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
    """Detect a tab click from a focus change that stays inside the same app.

    Real case observed (per the original author's note):
      - the active window is a Notepad window titled after a ``.txt`` file;
      - the user clicks in the tab bar (relative y around 40 px);
      - focus immediately moves to the ``Sans titre`` Notepad title.

    In that situation the image anchor alone is too fragile, so the
    target_spec is enriched with an explicit tab label
    (``by_text='Sans titre'``, ``by_role='tab'``).

    Args:
        raw_events: Recorded raw events; each item is either the event dict
            itself or a wrapper holding it under the ``"event"`` key.
        click_event: The ``mouse_click`` event being compiled.

    Returns:
        A partial target_spec dict (``by_text``, ``by_role``,
        ``window_title``, ``context_hints``, ``vlm_description``), or
        ``None`` when the click cannot be attributed to a tab switch.
    """
    event_type = click_event.get("type", "")
    if event_type != "mouse_click":
        return None
    window = click_event.get("window", {})
    if not isinstance(window, dict):
        return None
    from_title = str(window.get("title", "")).strip()
    app_name = str(window.get("app_name", "")).strip().lower()
    if not from_title or not app_name:
        return None
    # Heuristic: only clicks very high in the window are considered, which is
    # typical of a tab bar / tab close button.
    window_capture = click_event.get("window_capture", {})
    if not isinstance(window_capture, dict):
        return None
    click_relative = window_capture.get("click_relative")
    if not (isinstance(click_relative, list) and len(click_relative) == 2):
        return None
    try:
        rel_y = int(click_relative[1])
    except (TypeError, ValueError):
        return None
    # NOTE(review): 90 is presumably a pixel offset from the window top
    # (the docstring mentions ~40 px for the tab bar) - confirm the unit.
    if rel_y > 90:
        return None
    click_ts = click_event.get("timestamp")
    click_pos = click_event.get("pos") or []
    # Locate this click inside raw_events by (type, timestamp, pos) so the
    # events recorded right after it can be inspected.
    match_idx = None
    for idx, raw_evt in enumerate(raw_events):
        event_data = raw_evt.get("event", raw_evt)
        if event_data.get("type") != "mouse_click":
            continue
        if event_data.get("timestamp") != click_ts:
            continue
        if (event_data.get("pos") or []) != click_pos:
            continue
        match_idx = idx
        break
    if match_idx is None:
        return None
    # Look at no more than the 6 events that follow the click.
    for follow_evt in raw_events[match_idx + 1: match_idx + 7]:
        follow_data = follow_evt.get("event", follow_evt)
        follow_type = follow_data.get("type", "")
        if follow_type in {"mouse_click", "text_input", "key_press", "key_combo"}:
            # Another user gesture happened before the focus change: the
            # observed focus can no longer be confidently attributed to THIS
            # click.
            return None
        if follow_type != "window_focus_change":
            continue
        to_info = follow_data.get("to", {})
        if not isinstance(to_info, dict):
            continue
        if str(to_info.get("app_name", "")).strip().lower() != app_name:
            continue
        to_title = str(to_info.get("title", "")).strip()
        if not to_title or to_title == from_title:
            continue
        # A same-app focus change whose title does not look like a sibling
        # tab aborts the inference instead of looking further.
        if not _looks_like_same_app_tab_switch(from_title, to_title):
            return None
        follow_ts = follow_data.get("timestamp")
        # A focus change more than 3.0 after the click (timestamp units as
        # recorded) ends the search; the function then returns None.
        if (
            isinstance(click_ts, (int, float))
            and isinstance(follow_ts, (int, float))
            and follow_ts - click_ts > 3.0
        ):
            break
        tab_label = _title_to_tab_label(to_title)
        if not tab_label:
            return None
        return {
            "by_text": tab_label,
            "by_role": "tab",
            "window_title": from_title,
            "context_hints": {
                "window_title": from_title,
                "switch_to_window_title": to_title,
                "interaction": "switch_tab",
            },
            "vlm_description": (
                f"Dans la fenêtre '{from_title}', l'onglet '{tab_label}' "
                "dans la barre d'onglets en haut"
            ),
        }
    return None
def _infer_close_tab_target(
    raw_events: list,
    click_event: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
    """Detect a click on the close button of the active tab.

    Targeted pattern, observed on the modern Notepad (per the original
    author's note):
      - a click very high in the tab bar while the window title starts
        with ``*`` (modified document);
      - a following click in the same window;
      - then focus moving to ``Enregistrer sous``.

    This corresponds to closing a modified tab, which then triggers the save
    flow. The click is enriched with a semantic hint so the replay aims at
    the real ``x`` button of the active tab rather than a plain `yolo` role.

    Args:
        raw_events: Recorded raw events; each item is either the event dict
            itself or a wrapper holding it under the ``"event"`` key.
        click_event: The ``mouse_click`` event being compiled.

    Returns:
        A partial target_spec dict (``by_text``, ``by_role``,
        ``window_title``, ``context_hints``, ``vlm_description``), or
        ``None`` when the pattern is not matched.
    """
    event_type = click_event.get("type", "")
    if event_type != "mouse_click":
        return None
    window = click_event.get("window", {})
    if not isinstance(window, dict):
        return None
    from_title = str(window.get("title", "")).strip()
    app_name = str(window.get("app_name", "")).strip().lower()
    # Only titles carrying the leading "*" marker are considered.
    if not from_title or not app_name or not from_title.startswith("*"):
        return None
    window_capture = click_event.get("window_capture", {})
    if not isinstance(window_capture, dict):
        return None
    click_relative = window_capture.get("click_relative")
    if not (isinstance(click_relative, list) and len(click_relative) == 2):
        return None
    try:
        rel_y = int(click_relative[1])
    except (TypeError, ValueError):
        return None
    # NOTE(review): 90 is presumably a pixel offset from the window top -
    # confirm the unit of click_relative.
    if rel_y > 90:
        return None
    click_ts = click_event.get("timestamp")
    click_pos = click_event.get("pos") or []
    # Locate this click inside raw_events by (type, timestamp, pos).
    match_idx = None
    for idx, raw_evt in enumerate(raw_events):
        event_data = raw_evt.get("event", raw_evt)
        if event_data.get("type") != "mouse_click":
            continue
        if event_data.get("timestamp") != click_ts:
            continue
        if (event_data.get("pos") or []) != click_pos:
            continue
        match_idx = idx
        break
    if match_idx is None:
        return None
    saw_follow_click_same_window = False
    # Look at no more than the 7 events that follow the click.
    for follow_evt in raw_events[match_idx + 1: match_idx + 8]:
        follow_data = follow_evt.get("event", follow_evt)
        follow_type = follow_data.get("type", "")
        # Any keyboard activity breaks the expected pattern.
        if follow_type in {"text_input", "key_press", "key_combo"}:
            return None
        if follow_type == "mouse_click":
            follow_window = follow_data.get("window", {})
            if not isinstance(follow_window, dict):
                return None
            follow_app = str(follow_window.get("app_name", "")).strip().lower()
            follow_title = str(follow_window.get("title", "")).strip()
            if follow_app != app_name:
                return None
            # A follow-up click is only accepted in the exact same window
            # title; any other click aborts the inference.
            if follow_title == from_title:
                saw_follow_click_same_window = True
                continue
            return None
        # Focus changes are only examined once the follow-up click was seen.
        if follow_type != "window_focus_change" or not saw_follow_click_same_window:
            continue
        to_info = follow_data.get("to", {})
        if not isinstance(to_info, dict):
            continue
        if str(to_info.get("app_name", "")).strip().lower() != app_name:
            continue
        to_title = str(to_info.get("title", "")).strip()
        # Exact match on the French "Save as" dialog title.
        if to_title != "Enregistrer sous":
            continue
        follow_ts = follow_data.get("timestamp")
        # A dialog opening more than 5.0 after the click (timestamp units as
        # recorded) ends the search; the function then returns None.
        if (
            isinstance(click_ts, (int, float))
            and isinstance(follow_ts, (int, float))
            and follow_ts - click_ts > 5.0
        ):
            break
        tab_label = _title_to_tab_label(from_title)
        if not tab_label:
            return None
        return {
            "by_text": "",
            "by_role": "tab_close_button",
            "window_title": from_title,
            "context_hints": {
                "window_title": from_title,
                "active_tab_label": tab_label,
                "interaction": "close_tab",
            },
            "vlm_description": (
                f"Dans la fenêtre '{from_title}', le bouton x pour fermer "
                f"l'onglet actif '{tab_label}' dans la barre d'onglets en haut"
            ),
        }
    return None
def _attach_expected_window_before(actions: list, raw_events: list) -> None:
"""Attacher la fenêtre attendue AVANT chaque clic en rejouant les
raw events et en conservant le dernier ``window_focus_change.to.title``.
Pourquoi : ``mouse_click.window.title`` capturé pendant
l'enregistrement peut être obsolète si une transition de fenêtre
se produit juste avant la capture (ex: dialog Windows qui s'ouvre
milliseconde avant le clic suivant). Le serveur dispose pourtant
des ``window_focus_change`` consécutifs — on s'en sert pour poser
explicitement ``expected_window_before`` sur le clic, lu en priorité
absolue par la pré-vérif côté agent.
Idempotent : si une action a déjà ``expected_window_before``, on
ne touche pas.
"""
if not actions or not raw_events:
return
last_focus_title = ""
action_idx = 0
def _next_click_idx(start: int) -> int:
i = start
while i < len(actions) and actions[i].get("type") != "click":
i += 1
return i
for raw_evt in raw_events:
ev = raw_evt.get("event", raw_evt) if isinstance(raw_evt, dict) else {}
etype = ev.get("type", "")
if etype == "window_focus_change":
to_info = ev.get("to") or {}
title = str(to_info.get("title", "") or "").strip()
if title and title != "unknown_window":
last_focus_title = title
continue
if etype != "mouse_click":
continue
action_idx = _next_click_idx(action_idx)
if action_idx >= len(actions):
return
a = actions[action_idx]
if last_focus_title and not a.get("expected_window_before"):
a["expected_window_before"] = last_focus_title
action_idx += 1
def _attach_expected_screenshots(
actions: list, raw_events: list, session_dir: Path,
) -> None:
@@ -1591,6 +1930,8 @@ def build_replay_from_raw_events(
k: v for k, v in enrichment.items()
if k != "by_position" # by_position est déjà dans x_pct/y_pct
}
if action.get("window_title") and not action["target_spec"].get("window_title"):
action["target_spec"]["window_title"] = action["window_title"]
# Ajouter les métadonnées fenêtre pour le grounding ciblé
wc = evt.get("window_capture", {})
if wc.get("rect"):
@@ -1600,6 +1941,33 @@ def build_replay_from_raw_events(
"click_relative": wc.get("click_relative"),
}
tab_switch_target = _infer_tab_switch_target(events, evt)
if tab_switch_target:
target_spec = action.setdefault("target_spec", {})
# Préférer une sémantique explicite d'onglet à un rôle brut
# `yolo`/anchor-only quand le flux brut montre une vraie
# bascule de focus dans la même application.
if not target_spec.get("by_text"):
target_spec["by_text"] = tab_switch_target["by_text"]
target_spec["by_role"] = tab_switch_target["by_role"]
target_spec["window_title"] = tab_switch_target["window_title"]
target_spec["vlm_description"] = tab_switch_target["vlm_description"]
context_hints = dict(target_spec.get("context_hints") or {})
context_hints.update(tab_switch_target["context_hints"])
target_spec["context_hints"] = context_hints
action["visual_mode"] = True
close_tab_target = _infer_close_tab_target(events, evt)
if close_tab_target:
target_spec = action.setdefault("target_spec", {})
target_spec["by_role"] = close_tab_target["by_role"]
target_spec["window_title"] = close_tab_target["window_title"]
target_spec["vlm_description"] = close_tab_target["vlm_description"]
context_hints = dict(target_spec.get("context_hints") or {})
context_hints.update(close_tab_target["context_hints"])
target_spec["context_hints"] = context_hints
action["visual_mode"] = True
elif evt_type == "text_input":
text = evt.get("text", "")
if not text:
@@ -1695,6 +2063,21 @@ def build_replay_from_raw_events(
if next_title:
result[ci]["expected_window_title"] = next_title
# ── 9b. Pré-condition fiable : expected_window_before ──
# Bug live 2026-05-22 (act_raw_c70976c8) : window.title d'un
# mouse_click peut être obsolète quand une transition de fenêtre
# (ex: ouverture dialog "Enregistrer sous") se produit juste avant
# la capture du click. Sans correction, target_spec.window_title
# reste sur l'ancien titre et la pré-vérif côté agent
# (executor.py:653) déclenche une pause supervisée à tort.
#
# On rejoue les raw events en maintenant le dernier titre vu via
# window_focus_change.to.title et on le pose comme
# expected_window_before sur chaque clic qui n'en a pas déjà un.
# Le champ est lu en priorité absolue par la pré-vérif agent, donc
# il prime sur target_spec.window_title obsolète.
_attach_expected_window_before(result, events)
# ── 10. Enrichir avec intention + expected_result via gemma4 (Critic) ──
# gemma4 analyse chaque action dans son contexte pour produire :
# - intention : ce que l'utilisateur veut accomplir

View File

@@ -64,6 +64,31 @@ class TestStreamerEndpoints:
_, kwargs = finalize_calls[0]
assert kwargs["params"]["session_id"] == "sess_test_002"
def test_finalize_callback_receives_server_payload(self):
    """The enriched /finalize payload is forwarded to the client callback."""
    from agent_v0.agent_v1.network.streamer import TraceStreamer

    replay_request = {
        "endpoint": "/api/v1/traces/stream/replay-session",
        "session_id": "sess_test_008",
        "machine_id": "pc-alpha",
    }
    server_payload = {
        "status": "queued_for_processing",
        "replay_ready": True,
        "replay_request": replay_request,
    }
    received = []

    with patch("agent_v0.agent_v1.network.streamer.requests") as requests_mock:
        # Every POST issued by the streamer gets this canned response.
        fake_response = MagicMock(ok=True, json=lambda: server_payload)
        requests_mock.post.return_value = fake_response
        streamer = TraceStreamer("sess_test_008")
        streamer.set_on_finalize_result(received.append)
        streamer._server_available = True
        streamer.running = False
        streamer._finalize_session()

    assert received == [server_payload]
# =========================================================================
# Payload formats

View File

@@ -0,0 +1,134 @@
"""Tests du chainage produit finalize -> replay-session."""
from __future__ import annotations
import sys
from pathlib import Path
import pytest
_ROOT = str(Path(__file__).resolve().parents[2])
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
class TestFinalizeReplayChain:
    """Checks what /finalize reports about the follow-up replay-session call."""

    # Token injected both in the environment and in api_stream.API_TOKEN.
    _TEST_API_TOKEN = "test_finalize_replay_chain_token_0123456789"

    @pytest.fixture(autouse=True)
    def _ensure_api_token(self, monkeypatch):
        """Force the API token for every test of the class."""
        monkeypatch.setenv("RPA_API_TOKEN", self._TEST_API_TOKEN)
        # When api_stream is already imported, its module-level API_TOKEN is
        # patched as well.
        api_stream_mod = sys.modules.get("agent_v0.server_v1.api_stream")
        if api_stream_mod is not None:
            monkeypatch.setattr(api_stream_mod, "API_TOKEN", self._TEST_API_TOKEN)

    @pytest.fixture
    def client(self, tmp_path, monkeypatch):
        """Yield ``(TestClient, api_stream, processor, token)`` on a tmp data dir.

        The module-level ``processor`` and ``worker`` of ``api_stream`` are
        swapped for instances rooted in ``tmp_path`` and restored after the
        test.
        """
        from fastapi.testclient import TestClient
        from agent_v0.server_v1 import api_stream
        from agent_v0.server_v1.stream_processor import StreamProcessor
        from agent_v0.server_v1.worker_stream import StreamWorker

        original_processor = api_stream.processor
        original_worker = api_stream.worker
        test_processor = StreamProcessor(data_dir=str(tmp_path))
        api_stream.processor = test_processor
        api_stream.worker = StreamWorker(
            live_dir=str(tmp_path),
            processor=test_processor,
        )
        # Replace the worker enqueue hook with a no-op.
        monkeypatch.setattr(api_stream, "_enqueue_to_worker", lambda session_id: None)
        client = TestClient(api_stream.app, raise_server_exceptions=False)
        yield client, api_stream, test_processor, api_stream.API_TOKEN
        # Teardown: put the original module globals back.
        api_stream.processor = original_processor
        api_stream.worker = original_worker

    def test_finalize_exposes_replay_request_without_launch(self, client):
        """Without ``launch_replay``, the response only describes the replay request."""
        c, _, proc, token = client
        proc.session_manager.register_session("sess_final_001", machine_id="pc-alpha")
        resp = c.post(
            "/api/v1/traces/stream/finalize",
            params={"session_id": "sess_final_001"},
            headers={"Authorization": f"Bearer {token}"},
        )
        assert resp.status_code == 200
        data = resp.json()
        assert data["status"] == "queued_for_processing"
        assert data["replay_ready"] is True
        assert data["replay_request"] == {
            "endpoint": "/api/v1/traces/stream/replay-session",
            "session_id": "sess_final_001",
            "machine_id": "pc-alpha",
        }
        # No replay was requested, so no launch report is expected.
        assert "replay_launch" not in data

    def test_finalize_can_launch_replay_session(self, client, monkeypatch):
        """With ``launch_replay=true``, the replay is started and reported."""
        c, api_stream, proc, token = client
        proc.session_manager.register_session("sess_final_002", machine_id="pc-beta")
        calls = []

        # Stand-in for api_stream.replay_from_session that records its
        # arguments and returns a canned replay description.
        async def fake_replay_from_session(session_id: str, machine_id: str = "default"):
            calls.append((session_id, machine_id))
            return {
                "replay_id": "replay_sess_1234abcd",
                "status": "running",
                "source_session_id": session_id,
                "target_session_id": "agent_demo",
                "machine_id": machine_id,
                "total_actions": 7,
            }

        monkeypatch.setattr(api_stream, "replay_from_session", fake_replay_from_session)
        resp = c.post(
            "/api/v1/traces/stream/finalize",
            params={
                "session_id": "sess_final_002",
                "launch_replay": "true",
            },
            headers={"Authorization": f"Bearer {token}"},
        )
        assert resp.status_code == 200
        data = resp.json()
        # The replay was called once, with the machine registered for the session.
        assert calls == [("sess_final_002", "pc-beta")]
        assert data["replay_launch"]["status"] == "started"
        assert data["replay_launch"]["replay"]["replay_id"] == "replay_sess_1234abcd"
        assert data["replay_launch"]["replay"]["source_session_id"] == "sess_final_002"
        assert data["replay_launch"]["replay"]["machine_id"] == "pc-beta"

    def test_finalize_remains_successful_if_auto_replay_fails(self, client, monkeypatch):
        """A failing auto-replay is reported in the body; /finalize still returns 200."""
        c, api_stream, proc, token = client
        proc.session_manager.register_session("sess_final_003", machine_id="pc-gamma")

        # Stand-in that always fails with a 404 HTTPException.
        async def fake_replay_from_session(session_id: str, machine_id: str = "default"):
            raise api_stream.HTTPException(
                status_code=404,
                detail=f"Aucune session Agent V1 active sur {machine_id}",
            )

        monkeypatch.setattr(api_stream, "replay_from_session", fake_replay_from_session)
        resp = c.post(
            "/api/v1/traces/stream/finalize",
            params={
                "session_id": "sess_final_003",
                "launch_replay": "true",
            },
            headers={"Authorization": f"Bearer {token}"},
        )
        assert resp.status_code == 200
        data = resp.json()
        assert data["status"] == "queued_for_processing"
        assert data["replay_launch"] == {
            "status": "failed",
            "status_code": 404,
            "detail": "Aucune session Agent V1 active sur pc-gamma",
        }
        # The replay request stays available for a later manual launch.
        assert data["replay_request"]["machine_id"] == "pc-gamma"

View File

@@ -0,0 +1,161 @@
"""Tests intégration : /replay/resume doit réinjecter l'action complète en pause."""
from __future__ import annotations
import sys
from pathlib import Path
import pytest
_ROOT = str(Path(__file__).resolve().parents[2])
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
class TestReplayResumePreservesOriginalAction:
    """Checks that /replay/<id>/resume re-queues the full paused action."""

    # Token injected both in the environment and in api_stream.API_TOKEN.
    _TEST_API_TOKEN = "test_replay_resume_preserves_original_action_token"

    @pytest.fixture(autouse=True)
    def _ensure_api_token(self, monkeypatch):
        """Force the API token for every test of the class."""
        monkeypatch.setenv("RPA_API_TOKEN", self._TEST_API_TOKEN)
        # When api_stream is already imported, its module-level API_TOKEN is
        # patched as well.
        api_stream_mod = sys.modules.get("agent_v0.server_v1.api_stream")
        if api_stream_mod is not None:
            monkeypatch.setattr(api_stream_mod, "API_TOKEN", self._TEST_API_TOKEN)

    @pytest.fixture
    def client(self, monkeypatch):
        """Yield ``(TestClient, api_stream, token)`` with empty replay state.

        The module-level ``_replay_states``, ``_replay_queues`` and
        ``_retry_pending`` dicts are snapshotted, cleared for the test, then
        restored in place afterwards.
        """
        from fastapi.testclient import TestClient
        from agent_v0.server_v1 import api_stream

        monkeypatch.setattr(api_stream, "API_TOKEN", self._TEST_API_TOKEN)
        # Shallow snapshots of the shared dicts before wiping them.
        saved_states = dict(api_stream._replay_states)
        saved_queues = dict(api_stream._replay_queues)
        saved_retry = dict(api_stream._retry_pending)
        api_stream._replay_states.clear()
        api_stream._replay_queues.clear()
        api_stream._retry_pending.clear()
        client = TestClient(api_stream.app, raise_server_exceptions=False)
        yield client, api_stream, self._TEST_API_TOKEN
        # Teardown: clear + update keeps the same dict objects while
        # restoring their previous content.
        api_stream._replay_states.clear()
        api_stream._replay_states.update(saved_states)
        api_stream._replay_queues.clear()
        api_stream._replay_queues.update(saved_queues)
        api_stream._retry_pending.clear()
        api_stream._retry_pending.update(saved_retry)

    def test_resume_reinjects_full_original_action_from_failed_action(self, client):
        """Resume re-queues the action with its coordinates and window fields."""
        http_client, api_stream, token = client
        original_action = {
            "action_id": "act_raw_75272d22",
            "type": "click",
            "visual_mode": True,
            "x_pct": 0.8781,
            "y_pct": 0.9856,
            "expected_window_before": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes",
            "target_spec": {
                "window_title": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes",
                "by_role": "yolo",
            },
        }
        # Paused replay whose failed_action embeds the complete original action.
        api_stream._replay_states["replay_xyz"] = {
            "replay_id": "replay_xyz",
            "session_id": "sess_resume_xyz",
            "machine_id": "pc-alpha",
            "status": "paused_need_help",
            "failed_action": {
                "action_id": "act_raw_75272d22",
                "type": "click",
                "reason": "wrong_window",
                "target_spec": original_action["target_spec"],
                "original_action": original_action,
            },
            "pause_message": "Replay en pause",
            "safety_checks": [],
            "checks_acknowledged": [],
            "params": {},
        }
        api_stream._replay_queues["sess_resume_xyz"] = []
        resp = http_client.post(
            "/api/v1/traces/stream/replay/replay_xyz/resume",
            headers={"Authorization": f"Bearer {token}"},
        )
        assert resp.status_code == 200
        data = resp.json()
        assert data["status"] == "resumed"
        # The first queued action is the re-injected one: "_resume" suffix on
        # the id, original coordinates and window expectations kept.
        reinjected = api_stream._replay_queues["sess_resume_xyz"][0]
        assert reinjected["action_id"] == "act_raw_75272d22_resume"
        assert reinjected["x_pct"] == pytest.approx(0.8781)
        assert reinjected["y_pct"] == pytest.approx(0.9856)
        assert reinjected["expected_window_before"] == (
            "http192.168.1.408765dossier.htmlid=.txt Bloc-notes"
        )
        assert reinjected["target_spec"]["window_title"] == (
            "http192.168.1.408765dossier.htmlid=.txt Bloc-notes"
        )

    def test_resume_dispatch_backfills_retry_pending_for_watchdog(self, client):
        """Dispatching a resumed action records it in ``_retry_pending``."""
        http_client, api_stream, token = client
        original_action = {
            "action_id": "act_resume_01",
            "type": "click",
            "visual_mode": True,
            "x_pct": 0.41,
            "y_pct": 0.52,
            "target_spec": {"window_title": "test - Bloc-notes"},
        }
        api_stream._replay_states["replay_resume_watchdog"] = {
            "replay_id": "replay_resume_watchdog",
            "session_id": "sess_resume_watchdog",
            "machine_id": "pc-watchdog",
            "status": "paused_need_help",
            "failed_action": {
                "action_id": "act_resume_01",
                "type": "click",
                "reason": "wrong_window",
                "target_spec": original_action["target_spec"],
                "original_action": original_action,
            },
            "pause_message": "Replay en pause",
            "safety_checks": [],
            "checks_acknowledged": [],
            "params": {},
        }
        api_stream._replay_queues["sess_resume_watchdog"] = []
        resume_resp = http_client.post(
            "/api/v1/traces/stream/replay/replay_resume_watchdog/resume",
            headers={"Authorization": f"Bearer {token}"},
        )
        assert resume_resp.status_code == 200
        # Polling for the next action is what dispatches the resumed action.
        next_resp = http_client.get(
            "/api/v1/traces/stream/replay/next",
            params={"session_id": "sess_resume_watchdog", "machine_id": "pc-watchdog"},
        )
        assert next_resp.status_code == 200
        payload = next_resp.json()
        dispatched = payload["action"]
        assert dispatched["action_id"] == "act_resume_01_resume"
        # The retry entry is keyed by the dispatched id; it keeps both the
        # original action and the dispatched variant, plus routing data and
        # dispatch timestamps.
        retry_info = api_stream._retry_pending["act_resume_01_resume"]
        assert retry_info["action"]["action_id"] == "act_resume_01"
        assert retry_info["dispatched_action"]["action_id"] == "act_resume_01_resume"
        assert retry_info["session_id"] == "sess_resume_watchdog"
        assert retry_info["machine_id"] == "pc-watchdog"
        assert retry_info["replay_id"] == "replay_resume_watchdog"
        assert retry_info["first_dispatched_at"] > 0
        assert retry_info["dispatched_at"] >= retry_info["first_dispatched_at"]

View File

@@ -0,0 +1,151 @@
"""Non-régression — trim du préambule redondant pour /replay-session.
Bug fixé le 2026-05-20 (cf. ``docs/AUDIT_FINALIZE_CONTRACT_INTEGRATION_2026-05-20.md``
et ``CR_AUDIT_PAUSED_RESUME_BUS_2026-05-22.md``) : sur la session source
``sess_20260520T102916_066851``, le premier event raw rejoué après le
setup auto Windows était un clic intra-Notepad sur la barre d'onglets
qui basculait de ``http...txt Bloc-notes`` vers ``Sans titre Bloc-notes``.
Comme le setup amène déjà Notepad dans ``Sans titre``, ce clic ne
modifiait rien à l'écran → `retry_threshold`.
Ce test reproduit la chaîne complète d'``api_stream.replay-session``
côté serveur (sans HTTP) sur une fixture synthétique correspondante,
et vérifie que la première action utile post-setup est bien la
saisie de texte ``test`` — pas un clic de bascule d'onglet ``Sans titre``.
"""
from __future__ import annotations
import sys
from pathlib import Path
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
import pytest
from agent_v0.server_v1.replay_engine import ( # noqa: E402
_extract_required_apps_from_events,
_generate_setup_actions,
_trim_redundant_setup_events,
)
from agent_v0.server_v1.stream_processor import ( # noqa: E402
build_replay_from_raw_events,
)
def _make_session_events() -> list:
"""Reproduit le pattern de ``sess_20260520T102916_066851`` :
Démarrer → Rechercher → Notepad ouvre un fichier .txt → l'utilisateur
clique sur l'onglet ``Sans titre`` → tape ``test`` → Ctrl+S.
L'enregistrement initial passe par un titre non-neutre puis bascule
sur un titre neutre — c'est le scénario qui piégeait le trim."""
return [
# Démarrer
{"event": {
"type": "window_focus_change",
"to": {"app_name": "explorer.exe", "title": "Explorateur"},
}},
{"event": {
"type": "mouse_click", "pos": [50, 1430], "timestamp": 1.0,
"window": {"app_name": "explorer.exe", "title": "Explorateur"},
}},
# SearchHost
{"event": {
"type": "window_focus_change",
"to": {"app_name": "SearchHost.exe", "title": "Rechercher"},
}},
{"event": {
"type": "text_input", "text": "bloc", "timestamp": 2.0,
"window": {"app_name": "SearchHost.exe", "title": "Rechercher"},
}},
{"event": {
"type": "mouse_click", "pos": [681, 448], "timestamp": 2.5,
"window": {"app_name": "SearchHost.exe", "title": "Rechercher"},
}},
# Notepad ouvre un fichier .txt existant (non-neutre)
{"event": {
"type": "window_focus_change",
"to": {
"app_name": "Notepad.exe",
"title": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes",
},
}},
# Clic dans la barre d'onglets (y=40) → bascule vers Sans titre
{"event": {
"type": "mouse_click", "pos": [1191, 40], "timestamp": 4.0,
"window": {
"app_name": "Notepad.exe",
"title": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes",
},
"window_capture": {"click_relative": [1191, 40]},
}},
{"event": {
"type": "window_focus_change",
"to": {"app_name": "Notepad.exe", "title": "Sans titre Bloc-notes"},
}},
# Saisie réelle de l'utilisateur — c'est la première action utile
{"event": {
"type": "text_input", "text": "test", "timestamp": 5.0,
"window": {"app_name": "Notepad.exe",
"title": "Sans titre Bloc-notes"},
}},
]
def test_replay_session_pipeline_skips_redundant_tab_switch(tmp_path):
    """Run setup generation, trim and replay build on the synthetic session.

    The first actionable step of the resulting replay must be the ``test``
    text input, not a click switching to the ``Sans titre`` tab.
    """
    raw_events = _make_session_events()
    app_info = _extract_required_apps_from_events(raw_events)

    # 1) The automatic setup recognises Notepad and generates its actions.
    assert app_info.get("primary_app") == "Notepad.exe"
    setup_actions = _generate_setup_actions(app_info, setup_id_prefix="setup_sess")
    assert setup_actions, "le setup auto doit injecter des actions Notepad"
    action_ids = {a.get("action_id", "") for a in setup_actions}
    assert any("click_start" in aid for aid in action_ids)
    assert any("click_result" in aid for aid in action_ids)

    # 2) Trim: the redundant click inside Notepad must be removed.
    trimmed = _trim_redundant_setup_events(raw_events, app_info)
    click_titles = []
    for ev in trimmed:
        payload = ev.get("event") or ev
        if payload.get("type") == "mouse_click":
            click_titles.append(payload.get("window", {}).get("title", ""))
    leftover_titles = [t for t in click_titles if "http192.168.1.40" in t]
    assert not leftover_titles, "le clic intra-Notepad redondant doit être coupé par le trim"

    # 3) Replay build: the first useful action after the trim is the 'test'
    #    input, not a "Sans titre" click coming from _infer_tab_switch_target.
    actions = build_replay_from_raw_events(
        trimmed, session_id="sess_synthetic", session_dir=str(tmp_path),
    )
    actionable_types = ("click", "type", "key_combo")
    actionable = [a for a in actions if a.get("type") in actionable_types]
    assert actionable, "le replay doit contenir au moins une action utile"
    first = actionable[0]
    assert first.get("type") == "type", (
        f"première action utile doit être 'type', pas '{first.get('type')}' "
        f"(target_spec={first.get('target_spec')})"
    )
    assert first.get("text") == "test"

    # Sanity check: no click action in the cleaned replay may target
    # "Sans titre" (the tab switch inferred by _infer_tab_switch_target).
    sans_titre_clicks = []
    for a in actions:
        if a.get("type") != "click":
            continue
        label = a.get("target_spec", {}).get("by_text", "").strip().lower()
        if label == "sans titre":
            sans_titre_clicks.append(a)
    assert not sans_titre_clicks, (
        "le replay ne doit plus contenir de click ciblant 'Sans titre' "
        f"(trouvés : {sans_titre_clicks})"
    )

View File

@@ -0,0 +1,352 @@
"""Integration tests for the replay orphan watchdog."""
from __future__ import annotations
import asyncio
import contextlib
import importlib
import time
from typing import Any, Dict, List
import pytest
@contextlib.asynccontextmanager
async def fake_lock():
    """No-op async context manager used in place of a real lock factory."""
    yield None
@pytest.fixture(autouse=True)
def reset_watchdog_singleton():
    """Clear the watchdog module singleton and zero its numeric metrics before each test."""
    import agent_v0.server_v1.replay_watchdog as wd_mod

    wd_mod._singleton = None
    metrics = wd_mod._metrics
    numeric_keys = [
        name for name, value in metrics.items()
        if isinstance(value, (int, float))
    ]
    for name in numeric_keys:
        metrics[name] = 0
    yield
@pytest.fixture
def env_short_timeout(monkeypatch):
    """Reload the watchdog module with short scan/orphan timeouts.

    The watchdog reads its configuration from environment variables when the
    module is executed, so the module is reloaded once the variables are
    patched.  On teardown the environment patches are undone and the module is
    reloaded again; otherwise the short-timeout constants would stay in the
    module and leak into every test that runs afterwards.
    """
    monkeypatch.setenv("RPA_WATCHDOG_ENABLED", "1")
    monkeypatch.setenv("RPA_WATCHDOG_SCAN_INTERVAL_S", "0.1")
    monkeypatch.setenv("RPA_WATCHDOG_ORPHAN_TIMEOUT_S", "0.2")
    monkeypatch.setenv("RPA_WATCHDOG_MAX_RESENDS", "2")
    import agent_v0.server_v1.replay_watchdog as wd_mod

    importlib.reload(wd_mod)
    try:
        yield
    finally:
        # monkeypatch's own finalizer runs after this fixture's teardown, so
        # the environment has to be restored explicitly before reloading.
        monkeypatch.undo()
        importlib.reload(wd_mod)
@pytest.mark.asyncio
async def test_no_orphan_below_timeout(env_short_timeout):
    """An action dispatched just now is reported as in flight and left untouched."""
    from agent_v0.server_v1.replay_watchdog import ReplayWatchdog

    now = time.time()
    entry: Dict[str, Any] = {
        "action": {"action_id": "act1", "type": "click"},
        "dispatched_action": {"action_id": "act1", "type": "click"},
        "session_id": "sess1",
        "machine_id": "m1",
        "dispatched_at": now,
        "first_dispatched_at": now,
        "resent_count": 0,
    }
    retry_pending: Dict[str, Dict[str, Any]] = {"act1": entry}
    replay_queues: Dict[str, List[Dict[str, Any]]] = {"sess1": []}

    watchdog = ReplayWatchdog(retry_pending, replay_queues, fake_lock)
    summary = await watchdog._scan_once()

    expected = dict(orphans=0, resent=0, gaveup=0, skipped=0, in_flight=1)
    assert summary == expected
    assert replay_queues["sess1"] == []
    assert retry_pending["act1"]["resent_count"] == 0
@pytest.mark.asyncio
async def test_orphan_above_timeout_resent_in_head(env_short_timeout):
    """A stale entry has its dispatched action re-queued ahead of the pending one."""
    from agent_v0.server_v1.replay_watchdog import ReplayWatchdog

    action = {"action_id": "act1", "type": "click"}
    other = {"action_id": "act_next", "type": "click"}
    entry = {
        "action": {"action_id": "original", "type": "click"},
        "dispatched_action": action,
        "session_id": "sess1",
        "machine_id": "m1",
        "dispatched_at": time.time() - 5.0,
        "first_dispatched_at": time.time() - 5.0,
        "resent_count": 0,
    }
    retry_pending = {"act1": entry}
    replay_queues = {"sess1": [other]}

    summary = await ReplayWatchdog(retry_pending, replay_queues, fake_lock)._scan_once()

    assert summary["resent"] == 1
    # The dispatched action (not the original one) goes back at the head.
    assert replay_queues["sess1"] == [action, other]
    pending = retry_pending["act1"]
    assert pending["resent_count"] == 1
    assert pending["dispatched_at"] == 0.0
@pytest.mark.asyncio
async def test_giveup_after_max_resends(env_short_timeout):
    """An entry already resent twice (the configured maximum) is dropped, not re-queued."""
    from agent_v0.server_v1.replay_watchdog import ReplayWatchdog

    entry = {
        "action": {"action_id": "act1", "type": "click"},
        "dispatched_action": {"action_id": "act1", "type": "click"},
        "session_id": "sess1",
        "machine_id": "m1",
        "dispatched_at": time.time() - 5.0,
        "first_dispatched_at": time.time() - 90.0,
        "resent_count": 2,
    }
    retry_pending = {"act1": entry}
    replay_queues = {"sess1": []}

    summary = await ReplayWatchdog(retry_pending, replay_queues, fake_lock)._scan_once()

    assert summary["gaveup"] == 1
    assert summary["resent"] == 0
    assert "act1" not in retry_pending
    assert replay_queues["sess1"] == []
@pytest.mark.asyncio
async def test_race_report_arrives_during_scan(env_short_timeout):
    """If the entry disappears on the second lock acquisition, nothing is re-queued.

    The fake lock removes ``act1`` from ``retry_pending`` the second time it is
    entered, simulating a report that lands mid-scan.
    """
    from agent_v0.server_v1.replay_watchdog import ReplayWatchdog

    entry = {
        "action": {"action_id": "act1", "type": "click"},
        "dispatched_action": {"action_id": "act1", "type": "click"},
        "session_id": "sess1",
        "machine_id": "m1",
        "dispatched_at": time.time() - 5.0,
        "first_dispatched_at": time.time() - 5.0,
        "resent_count": 0,
    }
    retry_pending = {"act1": entry}
    replay_queues = {"sess1": []}
    lock_entries = {"count": 0}

    @contextlib.asynccontextmanager
    async def lock_that_pops_before_resend():
        lock_entries["count"] += 1
        if lock_entries["count"] == 2:
            retry_pending.pop("act1", None)
        yield

    watchdog = ReplayWatchdog(retry_pending, replay_queues, lock_that_pops_before_resend)
    summary = await watchdog._scan_once()

    assert summary["orphans"] == 1
    assert summary["resent"] == 0
    assert replay_queues["sess1"] == []
@pytest.mark.asyncio
async def test_disabled_via_env(monkeypatch):
    """With ``RPA_WATCHDOG_ENABLED=0``, ``start()`` must not create a task.

    The module is reloaded so it picks up the patched variable.  It is reloaded
    again on the way out, after the environment patch is undone, so the
    disabled configuration does not remain in the module for later tests.
    """
    monkeypatch.setenv("RPA_WATCHDOG_ENABLED", "0")
    import agent_v0.server_v1.replay_watchdog as wd_mod

    importlib.reload(wd_mod)
    try:
        watchdog = wd_mod.ReplayWatchdog({}, {}, fake_lock)
        await watchdog.start()
        assert watchdog._task is None
        await watchdog.stop()
    finally:
        # Restore the environment first: monkeypatch's own finalizer only runs
        # after the test body has returned.
        monkeypatch.undo()
        importlib.reload(wd_mod)
@pytest.mark.asyncio
async def test_lifecycle_start_stop_clean(env_short_timeout):
    """``start()`` creates a running task and ``stop()`` clears it."""
    from agent_v0.server_v1.replay_watchdog import ReplayWatchdog

    watchdog = ReplayWatchdog({}, {}, fake_lock)

    await watchdog.start()
    task = watchdog._task
    assert task is not None
    assert not task.done()

    # Let the loop run for longer than the 0.1 s scan interval before stopping.
    await asyncio.sleep(0.25)
    await watchdog.stop(timeout_s=2.0)
    assert watchdog._task is None
@pytest.mark.asyncio
async def test_orphan_with_repush_tail(monkeypatch, env_short_timeout):
    """With ``RPA_WATCHDOG_REPUSH_POSITION=tail`` the orphan is appended after queued actions."""
    monkeypatch.setenv("RPA_WATCHDOG_REPUSH_POSITION", "tail")
    import agent_v0.server_v1.replay_watchdog as wd_mod

    # Reload so the module picks up the repush position set above.
    importlib.reload(wd_mod)
    from agent_v0.server_v1.replay_watchdog import ReplayWatchdog

    action = {"action_id": "act1", "type": "click"}
    other = {"action_id": "act_next", "type": "click"}
    entry = {
        "action": {"action_id": "original", "type": "click"},
        "dispatched_action": action,
        "session_id": "sess1",
        "machine_id": "m1",
        "dispatched_at": time.time() - 5.0,
        "first_dispatched_at": time.time() - 5.0,
        "resent_count": 0,
    }
    retry_pending = {"act1": entry}
    replay_queues = {"sess1": [other]}

    await ReplayWatchdog(retry_pending, replay_queues, fake_lock)._scan_once()

    assert replay_queues["sess1"] == [other, action]
@pytest.mark.asyncio
async def test_metrics_snapshot(env_short_timeout):
    """One scan over a stale entry is reflected in the scan/detected/resent counters."""
    from agent_v0.server_v1.replay_watchdog import ReplayWatchdog, get_metrics_snapshot

    entry = {
        "action": {"action_id": "act1", "type": "click"},
        "dispatched_action": {"action_id": "act1", "type": "click"},
        "session_id": "sess1",
        "machine_id": "m1",
        "dispatched_at": time.time() - 5.0,
        "first_dispatched_at": time.time() - 5.0,
        "resent_count": 0,
    }
    retry_pending = {"act1": entry}

    await ReplayWatchdog(retry_pending, {"sess1": []}, fake_lock)._scan_once()

    snapshot = get_metrics_snapshot()
    for counter in ("scans_total", "orphans_detected_total", "orphans_resent_total"):
        assert snapshot[counter] >= 1
def test_default_orphan_timeout_matches_spec(monkeypatch):
    """Without the env override, the reloaded module exposes a 45.0 s orphan timeout."""
    monkeypatch.delenv("RPA_WATCHDOG_ORPHAN_TIMEOUT_S", raising=False)
    import agent_v0.server_v1.replay_watchdog as wd_mod

    reloaded = importlib.reload(wd_mod)
    default_timeout = reloaded.WATCHDOG_ORPHAN_TIMEOUT_S
    assert default_timeout == 45.0
@pytest.mark.asyncio
async def test_late_report_clears_resent_duplicate_from_queue(monkeypatch):
    """A success report for an already re-queued action removes the queued duplicate.

    The queue starts with the action and its successor while ``_retry_pending``
    records one resend.  After the report only the successor may remain, the
    pending entry must be gone and the replay state must advance by one.
    The three module-level stores are snapshotted and restored around the test.
    """
    monkeypatch.setenv("RPA_API_TOKEN", "test_replay_watchdog_token")
    from agent_v0.server_v1 import api_stream

    monkeypatch.setattr(api_stream, "API_TOKEN", "test_replay_watchdog_token")

    store_names = ("_replay_states", "_replay_queues", "_retry_pending")
    snapshots = {name: dict(getattr(api_stream, name)) for name in store_names}
    for name in store_names:
        getattr(api_stream, name).clear()

    try:
        replay_id = "replay_watchdog_dup"
        session_id = "sess_watchdog_dup"
        action = {
            "action_id": "act_setup_sess_click_start",
            "type": "click",
            "visual_mode": True,
            "x_pct": 0.387891,
            "y_pct": 0.974375,
            "_setup_phase": True,
            "target_spec": {"by_role": "start_button"},
        }
        next_action = {"action_id": "act_setup_sess_wait_start", "type": "wait"}
        now = time.time()

        replay_state = {
            "replay_id": replay_id,
            "workflow_id": "session_replay:test",
            "session_id": session_id,
            "machine_id": "pc-watchdog",
            "status": "running",
            "total_actions": 2,
            "completed_actions": 0,
            "failed_actions": 0,
            "current_action_index": 0,
            "params": {},
            "results": [],
            "actions": [action, next_action],
            "retried_actions": 0,
            "unverified_actions": 0,
            "error_log": [],
            "last_screenshot": None,
            "failed_action": None,
            "pause_message": None,
            "variables": {},
            "safety_checks": [],
            "checks_acknowledged": [],
            "pause_reason": "",
            "pause_payload": None,
        }
        api_stream._replay_states[replay_id] = replay_state
        api_stream._replay_queues[session_id] = [dict(action), dict(next_action)]
        api_stream._retry_pending[action["action_id"]] = {
            "action": dict(action),
            "dispatched_action": dict(action),
            "retry_count": 0,
            "replay_id": replay_id,
            "session_id": session_id,
            "machine_id": "pc-watchdog",
            "dispatched_at": now,
            "first_dispatched_at": now - 5.0,
            "resent_count": 1,
            "last_resent_at": now - 1.0,
        }

        report = api_stream.ReplayResultReport(
            session_id=session_id,
            action_id=action["action_id"],
            success=True,
            warning="start_button_hotkey_fallback",
            resolution_method="semantic_start_button_hotkey",
            resolution_score=1.0,
        )
        result = await api_stream.report_action_result(report)

        assert result["status"] == "recorded"
        queued_ids = [a["action_id"] for a in api_stream._replay_queues[session_id]]
        assert queued_ids == ["act_setup_sess_wait_start"]
        assert action["action_id"] not in api_stream._retry_pending
        final_state = api_stream._replay_states[replay_id]
        assert final_state["completed_actions"] == 1
        assert final_state["current_action_index"] == 1
    finally:
        # Put the module-level stores back exactly as they were found.
        for name in store_names:
            store = getattr(api_stream, name)
            store.clear()
            store.update(snapshots[name])

View File

@@ -112,6 +112,58 @@ class TestLiveSessionManager:
assert len(raw["screenshots"]) == 1
assert raw["screenshots"][0]["screenshot_id"] == "shot_full_001"
def test_discovers_bg_session_machine_id_from_root_folder(self, tmp_path):
    """A ``bg_<machine>`` folder under live_sessions yields a session carrying that machine id."""
    from agent_v0.server_v1.live_session_manager import LiveSessionManager

    session_id = "bg_DESKTOP-58D5CAC_windows"
    live_root = tmp_path / "live_sessions"
    session_folder = live_root / session_id
    session_folder.mkdir(parents=True)
    (session_folder / "live_events.jsonl").write_text("{}", encoding="utf-8")

    manager = LiveSessionManager(
        persist_dir=str(tmp_path / "persist"),
        live_sessions_dir=str(live_root),
    )
    session = manager.get_session(session_id)

    assert session is not None
    assert session.machine_id == "DESKTOP-58D5CAC_windows"
def test_loads_persisted_bg_session_with_machine_id_inferred(self, tmp_path):
    """A persisted ``bg_`` session stored with machine_id ``default`` is loaded with the inferred id."""
    from agent_v0.server_v1.live_session_manager import LiveSessionManager

    session_id = "bg_DESKTOP-58D5CAC_windows"
    persisted_json = (
        '{"session_id":"bg_DESKTOP-58D5CAC_windows","machine_id":"default",'
        '"events":[],"shot_paths":{},"last_window_info":{"title":"Unknown","app_name":"unknown"},'
        '"created_at":"2026-05-20T14:00:00","last_activity":"2026-05-20T14:00:00",'
        '"finalized":false,"window_titles_seen":{},"app_names_seen":{}}'
    )
    persist_dir = tmp_path / "persist"
    persist_dir.mkdir()
    (persist_dir / "bg_DESKTOP-58D5CAC_windows.json").write_text(
        persisted_json, encoding="utf-8",
    )

    manager = LiveSessionManager(persist_dir=str(persist_dir))
    session = manager.get_session(session_id)

    assert session is not None
    assert session.machine_id == "DESKTOP-58D5CAC_windows"
def test_find_active_agent_session_falls_back_to_bg_machine_session(self, tmp_path):
    """With the recorded session finalized, the machine's ``bg_`` session is returned as active."""
    from agent_v0.server_v1.live_session_manager import LiveSessionManager
    from agent_v0.server_v1.replay_engine import _find_active_agent_session

    machine_id = "DESKTOP-58D5CAC_windows"
    recorded_id = "sess_20260520T102916_066851"
    background_id = "bg_DESKTOP-58D5CAC_windows"

    manager = LiveSessionManager(persist_dir=str(tmp_path / "persist"))
    manager.register_session(recorded_id, machine_id=machine_id)
    manager.finalize(recorded_id)
    manager.register_session(background_id)

    active = _find_active_agent_session(manager, machine_id=machine_id)
    assert active == background_id
# =========================================================================
# StreamProcessor
@@ -195,6 +247,238 @@ class TestStreamProcessor:
assert stats["total_workflows"] == 0
assert stats["initialized"] is False
def test_build_replay_does_not_compile_save_dialog_open_as_switch_tab(
    self, tmp_path, monkeypatch,
):
    """A same-app ``Enregistrer sous`` dialog is not a tab.

    Live regression of 2026-05-23: a menu click in Notepad was recompiled into
    a bogus ``switch_tab``, which injected a stray click before the dialog was
    actually opened.
    """
    from agent_v0.server_v1 import stream_processor as sp

    session_dir = tmp_path / "sess"
    (session_dir / "shots").mkdir(parents=True)

    def _fake_enrich(*args, **kwargs):
        return {"anchor_image_base64": "abc123", "by_role": "yolo"}

    # Stub the screenshot / enrichment helpers of the stream processor.
    monkeypatch.setattr(sp, "enrich_click_from_screenshot", _fake_enrich)
    for hook in (
        "_load_crop_for_event",
        "_attach_expected_screenshots",
        "_enrich_actions_with_intentions",
        "_unload_gemma4",
    ):
        monkeypatch.setattr(sp, hook, lambda *args, **kwargs: None)

    def _notepad(title):
        return {"title": title, "app_name": "Notepad.exe"}

    def _click(timestamp, pos, shot, title, click_relative=None):
        payload = {
            "type": "mouse_click",
            "timestamp": timestamp,
            "pos": pos,
            "button": "left",
            "screenshot_id": shot,
            "window": _notepad(title),
        }
        if click_relative is not None:
            payload["window_capture"] = {
                "rect": [320, 520, 2240, 1636],
                "click_relative": click_relative,
                "window_size": [1920, 1116],
            }
        return {"event": payload}

    events = [
        _click(1.0, [820, 630], "shot_001", "*test Bloc-notes", [500, 110]),
        _click(1.2, [860, 562], "shot_002", "*test Bloc-notes", [540, 40]),
        {"event": {
            "type": "window_focus_change",
            "timestamp": 1.35,
            "from": _notepad("*test Bloc-notes"),
            "to": _notepad("Enregistrer sous"),
        }},
        _click(1.6, [997, 743], "shot_003", "Enregistrer sous"),
    ]

    actions = sp.build_replay_from_raw_events(
        events, session_id="sess_save_dialog", session_dir=str(session_dir),
    )

    clicks = [a for a in actions if a.get("type") == "click"]
    assert len(clicks) == 3
    interactions = [
        (c.get("target_spec", {}).get("context_hints") or {}).get("interaction")
        for c in clicks
    ]
    assert all(kind != "switch_tab" for kind in interactions)
    assert clicks[1].get("expected_window_title") == "Enregistrer sous"
    assert clicks[2].get("expected_window_before") == "Enregistrer sous"
def test_build_replay_tab_switch_focus_belongs_to_latest_click_only(
    self, tmp_path, monkeypatch,
):
    """The tab focus change must be attached to the most recent click only."""
    from agent_v0.server_v1 import stream_processor as sp

    session_dir = tmp_path / "sess"
    (session_dir / "shots").mkdir(parents=True)

    def _fake_enrich(*args, **kwargs):
        return {"anchor_image_base64": "abc123", "by_role": "yolo"}

    # Stub the screenshot / enrichment helpers of the stream processor.
    monkeypatch.setattr(sp, "enrich_click_from_screenshot", _fake_enrich)
    for hook in (
        "_load_crop_for_event",
        "_attach_expected_screenshots",
        "_enrich_actions_with_intentions",
        "_unload_gemma4",
    ):
        monkeypatch.setattr(sp, hook, lambda *args, **kwargs: None)

    def _notepad(title):
        return {"title": title, "app_name": "Notepad.exe"}

    def _file_tab():
        return _notepad("http192.168.1.408765dossier.htmlid=.txt Bloc-notes")

    def _tab_bar_click(timestamp, pos, shot, click_relative):
        return {"event": {
            "type": "mouse_click",
            "timestamp": timestamp,
            "pos": pos,
            "button": "left",
            "screenshot_id": shot,
            "window": _file_tab(),
            "window_capture": {
                "rect": [323, 522, 2243, 1638],
                "click_relative": click_relative,
                "window_size": [1920, 1116],
            },
        }}

    events = [
        _tab_bar_click(1.0, [1410, 562], "shot_001", [1087, 40]),
        _tab_bar_click(1.1, [1514, 562], "shot_002", [1191, 40]),
        {"event": {
            "type": "window_focus_change",
            "timestamp": 1.2,
            "from": _file_tab(),
            "to": _notepad("Sans titre Bloc-notes"),
        }},
    ]

    actions = sp.build_replay_from_raw_events(
        events,
        session_id="sess_intervening_click",
        session_dir=str(session_dir),
    )

    assert len(actions) == 2
    earlier, latest = actions
    first_hints = earlier.get("target_spec", {}).get("context_hints") or {}
    second_hints = latest.get("target_spec", {}).get("context_hints") or {}
    assert first_hints.get("interaction") != "switch_tab"
    assert latest["target_spec"]["by_text"] == "Sans titre"
    assert latest["target_spec"]["by_role"] == "tab"
    assert second_hints.get("interaction") == "switch_tab"
def test_build_replay_infers_close_tab_before_save_dialog(
    self, tmp_path, monkeypatch,
):
    """A click on the active tab's close cross must be labelled ``close_tab``."""
    from agent_v0.server_v1 import stream_processor as sp

    session_dir = tmp_path / "sess"
    (session_dir / "shots").mkdir(parents=True)

    def _fake_enrich(*args, **kwargs):
        return {"anchor_image_base64": "abc123", "by_role": "yolo"}

    # Stub the screenshot / enrichment helpers of the stream processor.
    monkeypatch.setattr(sp, "enrich_click_from_screenshot", _fake_enrich)
    for hook in (
        "_load_crop_for_event",
        "_attach_expected_screenshots",
        "_enrich_actions_with_intentions",
        "_unload_gemma4",
    ):
        monkeypatch.setattr(sp, hook, lambda *args, **kwargs: None)

    def _notepad(title):
        return {"title": title, "app_name": "Notepad.exe"}

    def _editor_click(timestamp, pos, shot, click_relative):
        return {"event": {
            "type": "mouse_click",
            "timestamp": timestamp,
            "pos": pos,
            "button": "left",
            "screenshot_id": shot,
            "window": _notepad("*test Bloc-notes"),
            "window_capture": {
                "rect": [323, 522, 2243, 1638],
                "click_relative": click_relative,
                "window_size": [1920, 1116],
            },
        }}

    events = [
        _editor_click(1.0, [1814, 560], "shot_001", [1491, 38]),
        _editor_click(1.3, [1183, 1156], "shot_002", [860, 634]),
        {"event": {
            "type": "window_focus_change",
            "timestamp": 1.5,
            "from": _notepad("*test Bloc-notes"),
            "to": _notepad("Enregistrer sous"),
        }},
    ]

    actions = sp.build_replay_from_raw_events(
        events,
        session_id="sess_close_tab",
        session_dir=str(session_dir),
    )

    clicks = [a for a in actions if a.get("type") == "click"]
    assert len(clicks) == 2
    first_spec = clicks[0].get("target_spec", {})
    first_hints = first_spec.get("context_hints") or {}
    assert first_spec.get("by_role") == "tab_close_button"
    assert first_spec.get("by_text", "") == ""
    assert first_hints.get("interaction") == "close_tab"
    assert first_hints.get("active_tab_label") == "test"
    description = first_spec.get("vlm_description", "")
    assert "fermer l'onglet actif 'test'" in description
# =========================================================================
# StreamWorker

View File

@@ -0,0 +1,184 @@
"""Tests ciblés sur l'intégration agent du contrat finalize enrichi."""
from __future__ import annotations
import sys
import types
from pathlib import Path
from unittest.mock import MagicMock, patch
_ROOT = str(Path(__file__).resolve().parents[2])
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
class _ImmediateThread:
def __init__(self, target=None, args=(), kwargs=None, daemon=None):
self._target = target
self._args = args
self._kwargs = kwargs or {}
def start(self):
if self._target is not None:
self._target(*self._args, **self._kwargs)
class _DummyServerClient:
_stream_base = "http://server.test:5005"
def __init__(self):
self.on_connection_change = None
def set_on_connection_change(self, callback):
self.on_connection_change = callback
def _auth_headers(self):
return {"Authorization": "Bearer test-token"}
def _install_pystray_stub():
pystray_stub = types.ModuleType("pystray")
class _DummyMenu:
SEPARATOR = object()
def __init__(self, *args, **kwargs):
self.args = args
self.kwargs = kwargs
class _DummyIcon:
def __init__(self, *args, **kwargs):
self.args = args
self.kwargs = kwargs
def run(self):
return None
def stop(self):
return None
def update_menu(self):
return None
pystray_stub.MenuItem = lambda *args, **kwargs: (args, kwargs)
pystray_stub.Menu = _DummyMenu
pystray_stub.Icon = _DummyIcon
sys.modules["pystray"] = pystray_stub
def _build_tray():
    """Create a ``SmartTrayV1`` wired to no-op callbacks, the dummy client and a mock notifier."""
    # The stub must be in sys.modules before smart_tray is imported.
    _install_pystray_stub()
    from agent_v0.agent_v1.ui.smart_tray import SmartTrayV1

    def _on_start(_name):
        return None

    def _on_stop():
        return None

    tray = SmartTrayV1(
        on_start_callback=_on_start,
        on_stop_callback=_on_stop,
        server_client=_DummyServerClient(),
    )
    tray._notifier = MagicMock()
    return tray
def test_offer_finalize_replay_requires_user_consent():
    """When consent is refused, the user is notified once and no replay request is launched."""
    _install_pystray_stub()
    from agent_v0.agent_v1.ui import smart_tray as smart_tray_mod

    tray = _build_tray()
    tray._launch_replay_request = MagicMock()
    replay_request = {
        "endpoint": "/api/v1/traces/stream/replay-session",
        "session_id": "sess_offer_001",
        "machine_id": "pc-offer",
    }

    # Threads run inline and the consent prompt answers "no".
    with patch.object(smart_tray_mod.threading, "Thread", _ImmediateThread):
        with patch.object(smart_tray_mod, "_ask_consent", return_value=False):
            tray.offer_finalize_replay(replay_request, "Bloc-notes")

    tray._notifier.notify.assert_called_once()
    tray._launch_replay_request.assert_not_called()
def test_launch_replay_request_calls_replay_session_endpoint():
    """Launching a replay issues one POST carrying the ids, the auth header and no redirects."""
    _install_pystray_stub()
    from agent_v0.agent_v1.ui import smart_tray as smart_tray_mod

    tray = _build_tray()
    replay_request = {
        "endpoint": "/api/v1/traces/stream/replay-session",
        "session_id": "sess_offer_002",
        "machine_id": "pc-replay",
    }

    with patch.object(smart_tray_mod.threading, "Thread", _ImmediateThread):
        with patch("requests.post") as mock_post:
            mock_post.return_value = MagicMock(ok=True)
            tray._launch_replay_request(replay_request, "Bloc-notes")

    mock_post.assert_called_once()
    _, kwargs = mock_post.call_args
    expected_params = {
        "session_id": "sess_offer_002",
        "machine_id": "pc-replay",
    }
    assert kwargs["params"] == expected_params
    assert kwargs["headers"] == {"Authorization": "Bearer test-token"}
    assert kwargs["allow_redirects"] is False
def test_agent_finalize_result_delegates_to_tray_offer():
    """A replay-ready finalize result is forwarded to ``ui.offer_finalize_replay``."""
    from agent_v0.agent_v1.finalize_contract import dispatch_finalize_result

    def _replay_request():
        # Fresh dict per call so the expectation is independent of the input.
        return {
            "endpoint": "/api/v1/traces/stream/replay-session",
            "session_id": "sess_offer_003",
            "machine_id": "pc-main",
        }

    ui = MagicMock()
    finalize_result = {
        "replay_ready": True,
        "replay_request": _replay_request(),
    }

    dispatch_finalize_result(ui, finalize_result, "Saisie dossier")

    ui.offer_finalize_replay.assert_called_once_with(
        _replay_request(),
        "Saisie dossier",
    )
def test_agent_finalize_result_ignores_already_started_replay():
    """No offer is made when the result carries a ``replay_launch`` with status ``started``."""
    from agent_v0.agent_v1.finalize_contract import dispatch_finalize_result

    replay_request = {
        "endpoint": "/api/v1/traces/stream/replay-session",
        "session_id": "sess_offer_004",
        "machine_id": "pc-main",
    }
    replay_launch = {
        "status": "started",
        "replay": {"replay_id": "replay_sess_1234"},
    }
    finalize_result = {
        "replay_ready": True,
        "replay_request": replay_request,
        "replay_launch": replay_launch,
    }
    ui = MagicMock()

    dispatch_finalize_result(ui, finalize_result, "Saisie dossier")

    ui.offer_finalize_replay.assert_not_called()

View File

@@ -0,0 +1,78 @@
"""Tests ciblés sur l'état replay côté AgentV1 pendant pause supervisée."""
import sys
import threading
from types import SimpleNamespace
from unittest.mock import MagicMock
def _make_agent():
    """Build an ``AgentV1`` without running ``__init__``, in replay mode, with recording doubles.

    ``pynput`` (and its ``mouse`` / ``keyboard`` submodules) and ``pystray``
    get ``MagicMock`` placeholders in ``sys.modules`` when they are not already
    present, before the agent module is imported.  ``agent._state`` and
    ``agent.ui`` record every ``set_replay_active`` argument in ``.calls``.
    """
    for module_name in ("pynput", "pynput.mouse", "pynput.keyboard", "pystray"):
        sys.modules.setdefault(module_name, MagicMock())

    from agent_v0.agent_v1.main import AgentV1

    agent = AgentV1.__new__(AgentV1)
    agent.user_id = "demo_user"
    agent.machine_id = "machine_test"
    agent.running = True
    agent._replay_active = True

    def _record_state(active):
        agent._state.calls.append(active)

    def _record_ui(active):
        agent.ui.calls.append(active)

    agent._state = SimpleNamespace(calls=[], set_replay_active=_record_state)
    agent.ui = SimpleNamespace(calls=[], set_replay_active=_record_ui)
    return agent
def test_replay_pause_does_not_mark_replay_finished(monkeypatch):
    """When the executor reports a paused replay, AgentV1 must stay in replay mode.

    The poll loop runs in a daemon thread and is stopped by the patched
    ``time.sleep``, which clears ``agent.running``.  The test asserts that the
    thread really finished: otherwise the state checks below would race with a
    loop that is still running, and a non-daemon thread would keep the
    interpreter from exiting.
    """
    agent = _make_agent()

    class _Executor:
        _poll_backoff = 1.0
        _replay_paused = True

        def poll_and_execute(self, session_id: str, server_url: str, machine_id: str = "default") -> bool:
            return False

    agent._executor = _Executor()

    def _fake_sleep(_delay):
        # The first sleep ends the loop.
        agent.running = False

    monkeypatch.setattr("agent_v0.agent_v1.main.time.sleep", _fake_sleep)

    t = threading.Thread(target=agent._replay_poll_loop, daemon=True)
    t.start()
    t.join(timeout=1)
    # Ask the loop to stop even if it never reached the patched sleep.
    agent.running = False
    assert not t.is_alive(), "replay poll loop did not stop within 1 second"

    assert agent._replay_active is True
    assert agent.ui.calls == []
    assert agent._state.calls == []
def test_replay_without_action_and_without_pause_marks_replay_finished(monkeypatch):
    """With no action and no pause, AgentV1 must leave replay mode.

    The poll loop runs in a daemon thread and is stopped by the patched
    ``time.sleep``, which clears ``agent.running``.  The test asserts that the
    thread really finished: otherwise the state checks below would race with a
    loop that is still running, and a non-daemon thread would keep the
    interpreter from exiting.
    """
    agent = _make_agent()

    class _Executor:
        _poll_backoff = 1.0
        _replay_paused = False

        def poll_and_execute(self, session_id: str, server_url: str, machine_id: str = "default") -> bool:
            return False

    agent._executor = _Executor()

    def _fake_sleep(_delay):
        # The first sleep ends the loop.
        agent.running = False

    monkeypatch.setattr("agent_v0.agent_v1.main.time.sleep", _fake_sleep)

    t = threading.Thread(target=agent._replay_poll_loop, daemon=True)
    t.start()
    t.join(timeout=1)
    # Ask the loop to stop even if it never reached the patched sleep.
    agent.running = False
    assert not t.is_alive(), "replay poll loop did not stop within 1 second"

    assert agent._replay_active is False
    assert agent.ui.calls == [False]
    assert agent._state.calls == [False]

View File

@@ -0,0 +1,485 @@
"""Garde dimensions monitor — agent_v0/agent_v1/vision/capturer.py
Contexte (démo GHT 19 mai 2026) : `mss.monitors[1]` peut retourner
intermittemment des dimensions tronquées (cas observé : 2560×60 au lieu
de 2560×1600). Toute capture utilisant ces dims pour normaliser des
coordonnées empoisonne ensuite la mémoire persistante (`TargetMemoryStore`).
Ce module teste la garde qui doit :
- détecter une dimension aberrante avant capture
- retenter (mss peut avoir un cache stale)
- tomber en fallback sur un autre monitor physique si dispo
- abandonner explicitement (logs WARNING/ERROR) sans empoisonner
Périmètre : capturer.py uniquement (pas executor, pas replay).
"""
from __future__ import annotations
import logging
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from PIL import Image
def _make_mock_mss(monitors_sequence):
"""Construit un mock `mss.mss()` qui renvoie successivement les listes
`monitors` fournies. Permet de simuler retry / changement de dims
entre deux appels.
Args:
monitors_sequence: liste de listes-de-monitors. Chaque entrée
représente l'état renvoyé par `sct.monitors` à un appel
successif de `mss.mss()`. La dernière entrée est réutilisée
si plus d'appels ont lieu.
Returns:
Un mock utilisable comme `patch(..., side_effect=mock)` côté `mss.mss`.
"""
call_counter = {"n": 0}
instances = []
def factory():
idx = min(call_counter["n"], len(monitors_sequence) - 1)
call_counter["n"] += 1
instance = MagicMock(name=f"mss_instance_{idx}")
instance.monitors = monitors_sequence[idx]
# grab() renvoie un objet avec size + bgra pour passer dans PIL
grab_result = MagicMock()
# On simule un buffer cohérent avec les dims du monitor sain
m = monitors_sequence[idx][1] if len(monitors_sequence[idx]) > 1 else {}
w = m.get("width", 100)
h = m.get("height", 100)
grab_result.size = (w, h)
# Une image saine ne doit pas être entièrement noire, sinon le nouveau
# fail-closed black-frame la rejetterait.
grab_result.bgra = b"\x80\x80\x80\x00" * (w * h)
instance.grab = MagicMock(return_value=grab_result)
# context manager
cm = MagicMock(name=f"mss_cm_{idx}")
cm.__enter__ = MagicMock(return_value=instance)
cm.__exit__ = MagicMock(return_value=False)
instances.append((cm, instance))
return cm
factory.instances = instances
return factory
def _vision_capturer(tmp_path):
    """Import ``VisionCapturer`` lazily, so patches are active first, and build one on *tmp_path*."""
    from agent_v0.agent_v1.vision import capturer as capturer_mod

    output_dir = str(tmp_path)
    return capturer_mod.VisionCapturer(output_dir)
def _solid_img(color: tuple[int, int, int], size=(320, 240)) -> Image.Image:
    """Return a single-colour RGB image, used to drive the black-frame fallback tests."""
    image = Image.new(mode="RGB", size=size, color=color)
    return image
# ============================================================================
# Test 1 — Dim aberrante (height=60) refusée : capture_full_context renvoie ""
# ============================================================================
def test_capture_full_context_returns_empty_when_monitor_height_aberrant(
    tmp_path: Path, caplog: pytest.LogCaptureFixture
):
    """GHT demo case: mss.monitors[1] reports 2560x60 instead of 2560x1600.

    The capture must refuse to produce a PNG from these dimensions, otherwise
    every normalized coordinate derived from it would be off by a factor of
    roughly 27x. Expected return value: an empty string, matching the
    existing error contract.
    """
    broken_layout = [
        {"left": 0, "top": 0, "width": 2560, "height": 1660},  # composite
        {"left": 0, "top": 0, "width": 2560, "height": 60},  # aberrant PRIMARY
    ]
    factory = _make_mock_mss([broken_layout])

    with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory):
        with patch("agent_v0.agent_v1.vision.capturer.time.sleep"):
            caplog.set_level(
                logging.WARNING, logger="agent_v0.agent_v1.vision.capturer"
            )
            capturer_under_test = _vision_capturer(tmp_path)
            result = capturer_under_test.capture_full_context("test_aberrant")

    assert result == "", (
        f"Capture devrait retourner '' sur dim aberrante, got {result!r}"
    )

    # Sanity check: grab() must never have been called on any of the mss
    # instances created while the monitor was aberrant.
    for _context_manager, mss_instance in factory.instances:
        mss_instance.grab.assert_not_called()
# ============================================================================
# Test 2 — Le log WARNING doit citer la dim observée (debuggabilité)
# ============================================================================
def test_aberrant_monitor_logs_warning_with_observed_dimensions(
    tmp_path: Path, caplog: pytest.LogCaptureFixture
):
    """The WARNING must quote the aberrant dimensions that were observed.

    An operator has to be able to diagnose the cause from the logs alone,
    without replaying the session.
    """
    # Local import keeps this test self-contained; the module is stdlib.
    import re

    aberrant_monitors = [
        {"left": 0, "top": 0, "width": 2560, "height": 1660},
        {"left": 0, "top": 0, "width": 2560, "height": 60},
    ]
    factory = _make_mock_mss([aberrant_monitors])

    with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory), \
            patch("agent_v0.agent_v1.vision.capturer.time.sleep"):
        caplog.set_level(logging.WARNING, logger="agent_v0.agent_v1.vision.capturer")
        cap = _vision_capturer(tmp_path)
        cap.capture_full_context("test")

    warnings = [r for r in caplog.records if r.levelno == logging.WARNING]
    assert warnings, "Au moins un WARNING attendu sur dim aberrante"

    msg = " ".join(r.getMessage() for r in warnings)
    assert "2560" in msg, f"Largeur observée doit apparaître dans le WARNING : {msg!r}"
    # A plain `"60" in msg` is vacuous: "2560" (and "1660") already contain
    # "60". Require 60 as a standalone number so the height is really checked.
    assert re.search(r"(?<!\d)60(?!\d)", msg), (
        f"Hauteur observée doit apparaître dans le WARNING : {msg!r}"
    )
# ============================================================================
# Test 3 — Retry : un 1er appel aberrant suivi d'un appel sain produit la capture
# ============================================================================
def test_capture_retries_when_first_monitor_query_is_aberrant(
    tmp_path: Path, caplog: pytest.LogCaptureFixture
):
    """An aberrant first query followed by a sane one must yield a capture.

    The observed bug is intermittent (mss may hold a stale cache); an
    immediate retry often returns the real dimensions. The capture must
    therefore retry and succeed when the second query is sane.
    """
    first_query = [  # aberrant
        {"left": 0, "top": 0, "width": 2560, "height": 1660},
        {"left": 0, "top": 0, "width": 2560, "height": 60},
    ]
    second_query = [  # sane
        {"left": 0, "top": 0, "width": 2560, "height": 1660},
        {"left": 0, "top": 0, "width": 2560, "height": 1600},
    ]
    factory = _make_mock_mss([first_query, second_query])

    with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory):
        with patch("agent_v0.agent_v1.vision.capturer.time.sleep"):
            caplog.set_level(
                logging.WARNING, logger="agent_v0.agent_v1.vision.capturer"
            )
            capturer_under_test = _vision_capturer(tmp_path)
            result = capturer_under_test.capture_full_context(
                "test_retry", force=True
            )

    assert result, (
        f"Capture doit réussir après retry sur dims saines, got {result!r}"
    )
    png_path = Path(result)
    assert png_path.exists(), "Le PNG doit être physiquement créé"

    # At least two mss.mss() calls: the aberrant one plus the retry.
    assert len(factory.instances) >= 2, (
        f"Au moins 2 appels mss.mss() attendus (retry), vu {len(factory.instances)}"
    )
# ============================================================================
# Test 4 — Fallback : monitors[1] aberrant mais monitors[2] sain → capture OK
# ============================================================================
def test_capture_falls_back_to_secondary_monitor_when_primary_aberrant(
    tmp_path: Path, caplog: pytest.LogCaptureFixture
):
    """Multi-screen case: monitors[1] permanently broken, monitors[2] sane.

    The capture must use monitors[2] and log a fallback message.
    """
    layout = [
        {"left": 0, "top": 0, "width": 2560, "height": 1660},  # composite
        {"left": 0, "top": 0, "width": 2560, "height": 60},  # broken primary
        {"left": 2560, "top": 0, "width": 1920, "height": 1080},  # sane secondary
    ]
    # The same state is returned on every call (stationary, not intermittent).
    factory = _make_mock_mss([layout])

    with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory):
        with patch("agent_v0.agent_v1.vision.capturer.time.sleep"):
            caplog.set_level(
                logging.WARNING, logger="agent_v0.agent_v1.vision.capturer"
            )
            capturer_under_test = _vision_capturer(tmp_path)
            result = capturer_under_test.capture_full_context(
                "test_fallback", force=True
            )

    assert result, f"Capture doit réussir via monitor[2], got {result!r}"

    logged = [record.getMessage() for record in caplog.records]
    msg = " ".join(logged)
    assert "fallback" in msg.lower(), (
        f"Un log doit signaler le fallback monitor : {msg!r}"
    )
# ============================================================================
# Test 5 — capture_dual bénéficie aussi de la garde
# ============================================================================
def test_capture_dual_returns_empty_dict_when_monitor_aberrant(tmp_path: Path):
    """capture_dual must not produce any PNG on aberrant dimensions either.

    It performs three simultaneous captures and is exposed to the same
    source of poisoning as capture_full_context.
    """
    broken_layout = [
        {"left": 0, "top": 0, "width": 2560, "height": 1660},
        {"left": 0, "top": 0, "width": 2560, "height": 60},
    ]
    factory = _make_mock_mss([broken_layout])

    with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory):
        with patch("agent_v0.agent_v1.vision.capturer.time.sleep"):
            capturer_under_test = _vision_capturer(tmp_path)
            result = capturer_under_test.capture_dual(
                x=100, y=200, screenshot_id="shot_dual"
            )

    assert result == {}, (
        f"capture_dual doit retourner {{}} sur dim aberrante, got {result!r}"
    )
# ============================================================================
# Test 6 — capture_active_window bénéficie aussi de la garde
# ============================================================================
def test_capture_active_window_returns_none_when_monitor_aberrant(tmp_path: Path):
    """capture_active_window (standalone, no full_img supplied) must also
    refuse to capture on an aberrant monitor.
    """
    broken_layout = [
        {"left": 0, "top": 0, "width": 2560, "height": 1660},
        {"left": 0, "top": 0, "width": 2560, "height": 60},
    ]
    factory = _make_mock_mss([broken_layout])

    # get_active_window_rect is mocked to return a valid window; otherwise
    # the call would exit early, before ever reaching the grab.
    fake_rect = {
        "rect": [100, 100, 800, 600],
        "size": [700, 500],
        "title": "Test Window",
        "app_name": "test_app",
    }
    rect_patch = patch(
        "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect",
        return_value=fake_rect,
    )

    with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory):
        with patch("agent_v0.agent_v1.vision.capturer.time.sleep"):
            with rect_patch:
                capturer_under_test = _vision_capturer(tmp_path)
                result = capturer_under_test.capture_active_window(
                    x=200, y=300, screenshot_id="shot_win"
                )

    assert result is None, (
        f"capture_active_window doit retourner None sur dim aberrante, got {result!r}"
    )
# ============================================================================
# Test 7 — Non-régression : dim normale produit toujours un PNG
# ============================================================================
def test_capture_full_context_succeeds_on_normal_dimensions(tmp_path: Path):
    """Sanity check: the guard does not break the nominal path."""
    healthy_layout = [
        {"left": 0, "top": 0, "width": 2560, "height": 1660},
        {"left": 0, "top": 0, "width": 2560, "height": 1600},
    ]
    factory = _make_mock_mss([healthy_layout])

    with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory):
        with patch("agent_v0.agent_v1.vision.capturer.time.sleep"):
            capturer_under_test = _vision_capturer(tmp_path)
            result = capturer_under_test.capture_full_context(
                "test_normal", force=True
            )

    assert result, f"Capture nominale doit produire un PNG, got {result!r}"
    png_path = Path(result)
    assert png_path.exists(), "PNG doit exister sur disque"

    # A single mss.mss() call is expected in the normal case (no retry).
    assert len(factory.instances) == 1, (
        f"Un seul appel mss.mss() attendu sur dims saines, vu {len(factory.instances)}"
    )
# ============================================================================
# Test 8 — fail-closed : capture_dual refuse le fallback monitor secondaire
# ============================================================================
def test_capture_dual_fails_closed_when_only_secondary_monitor_sane(
    tmp_path: Path, caplog: pytest.LogCaptureFixture
):
    """capture_dual must refuse the secondary-monitor fallback.

    It receives (x, y) in composite-screen coordinates. If monitors[2]
    (offset 2560, 0) were captured, the crop computed through
    img.crop((x, y, ...)) would point at the wrong area because the
    coordinates are not translated. Rather than silently produce a shifted
    image, the fallback is refused for this method.
    """
    layout = [
        {"left": 0, "top": 0, "width": 2560, "height": 1660},
        {"left": 0, "top": 0, "width": 2560, "height": 60},  # broken primary
        {"left": 2560, "top": 0, "width": 1920, "height": 1080},  # sane secondary
    ]
    factory = _make_mock_mss([layout])

    with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory):
        with patch("agent_v0.agent_v1.vision.capturer.time.sleep"):
            caplog.set_level(
                logging.WARNING, logger="agent_v0.agent_v1.vision.capturer"
            )
            capturer_under_test = _vision_capturer(tmp_path)
            result = capturer_under_test.capture_dual(
                x=300, y=400, screenshot_id="shot_dual_fb"
            )

    assert result == {}, (
        f"capture_dual doit fail-closed sur fallback secondaire, got {result!r}"
    )

    msg = " ".join(record.getMessage() for record in caplog.records).lower()
    explains_refusal = any(
        keyword in msg for keyword in ("fallback", "secondaire", "refus")
    )
    assert explains_refusal, (
        f"Un log doit expliquer le refus du fallback pour coords : {msg!r}"
    )
# ============================================================================
# Test 9 — fail-closed : capture_active_window refuse le fallback secondaire
# ============================================================================
def test_capture_active_window_fails_closed_when_only_secondary_monitor_sane(
    tmp_path: Path,
):
    """Same rationale as test 8: capture_active_window would crop from the
    monitors[2] image using a win_rect in global coordinates, i.e. the wrong
    area.
    """
    layout = [
        {"left": 0, "top": 0, "width": 2560, "height": 1660},
        {"left": 0, "top": 0, "width": 2560, "height": 60},
        {"left": 2560, "top": 0, "width": 1920, "height": 1080},
    ]
    factory = _make_mock_mss([layout])

    fake_rect = {
        "rect": [100, 100, 800, 600],  # global coordinates inside monitors[1]
        "size": [700, 500],
        "title": "Test Window",
        "app_name": "test_app",
    }
    rect_patch = patch(
        "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect",
        return_value=fake_rect,
    )

    with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory):
        with patch("agent_v0.agent_v1.vision.capturer.time.sleep"):
            with rect_patch:
                capturer_under_test = _vision_capturer(tmp_path)
                result = capturer_under_test.capture_active_window(
                    x=200, y=300, screenshot_id="shot_win_fb"
                )

    assert result is None, (
        f"capture_active_window doit fail-closed sur fallback secondaire, got {result!r}"
    )
# ============================================================================
# Test 10 — mss noir : fallback ImageGrab
# ============================================================================
def test_capture_screen_image_falls_back_to_imagegrab_when_mss_is_black():
    """A black mss frame must no longer be accepted silently.

    When ImageGrab provides a usable image, that image must be the one
    returned.
    """
    from agent_v0.agent_v1.vision import capturer

    monitor = {"left": 0, "top": 0, "width": 320, "height": 240}
    black_img = _solid_img((0, 0, 0))
    fallback_img = _solid_img((210, 180, 90))
    imagegrab_meta = {
        "backend": "imagegrab",
        "luma": {"mean": 180.0, "stddev": 0.0, "min": 180, "max": 180},
    }

    mss_patch = patch.object(
        capturer, "_acquire_safe_grab", return_value=(monitor, black_img)
    )
    imagegrab_patch = patch.object(
        capturer,
        "_capture_via_imagegrab",
        return_value=(monitor, fallback_img, imagegrab_meta),
    )
    with mss_patch:
        with imagegrab_patch:
            out_monitor, out_img, meta = capturer.capture_screen_image()

    assert out_monitor == monitor
    assert out_img is fallback_img
    assert meta["backend"] == "imagegrab"
# ============================================================================
# Test 11 — capture_dual dégradé : conserver window_capture
# ============================================================================
def test_capture_dual_keeps_window_capture_when_fullscreen_is_unavailable(
    tmp_path: Path,
):
    """Even without full/crop, the window capture must survive.

    This lets the server keep a useful context rather than work on a black
    screen.
    """
    window_png = tmp_path / "window_only.png"
    fake_window = {
        "window_image": str(window_png),
        "window_title": "Bloc-notes",
        "app_name": "notepad.exe",
        "window_rect": [100, 100, 800, 600],
        "window_size": [700, 500],
        "click_in_window": [42, 24],
        "click_inside_window": True,
    }
    cap = _vision_capturer(tmp_path)

    screen_patch = patch(
        "agent_v0.agent_v1.vision.capturer.capture_screen_image",
        return_value=(None, None, {"backend": "mss_black"}),
    )
    window_patch = patch.object(
        cap, "capture_active_window", return_value=fake_window
    )
    with screen_patch:
        with window_patch:
            result = cap.capture_dual(x=200, y=300, screenshot_id="shot_dual")

    assert "full" not in result
    assert "crop" not in result
    assert result["window_capture"] == fake_window
# ============================================================================
# Test 12 — non-régression : capture_full_context PEUT utiliser le fallback
# ============================================================================
def test_capture_full_context_still_uses_secondary_fallback(
    tmp_path: Path, caplog: pytest.LogCaptureFixture
):
    """capture_full_context (heartbeat) MAY use the secondary fallback.

    It carries no client coordinates, so any sane screen is good enough and
    the secondary fallback stays allowed. Otherwise the heartbeat would drop
    as soon as one monitor is permanently broken.
    """
    layout = [
        {"left": 0, "top": 0, "width": 2560, "height": 1660},
        {"left": 0, "top": 0, "width": 2560, "height": 60},
        {"left": 2560, "top": 0, "width": 1920, "height": 1080},
    ]
    factory = _make_mock_mss([layout])

    with patch("agent_v0.agent_v1.vision.capturer.mss.mss", side_effect=factory):
        with patch("agent_v0.agent_v1.vision.capturer.time.sleep"):
            caplog.set_level(
                logging.WARNING, logger="agent_v0.agent_v1.vision.capturer"
            )
            capturer_under_test = _vision_capturer(tmp_path)
            result = capturer_under_test.capture_full_context(
                "test_heartbeat_fb", force=True
            )

    assert result, (
        f"capture_full_context doit accepter fallback (heartbeat sans coords), got {result!r}"
    )
    assert Path(result).exists()

View File

@@ -0,0 +1,165 @@
"""Tests pour ChatWindow._dispatch_paused_action.
Couvre le routage bus SocketIO → fallback HTTP de la bulle paused.
Le bug d'origine ``paused_bubble: bus déconnecté, resume non émis``
était causé par l'absence de ce fallback (cf.
``docs/CR_AUDIT_PAUSED_RESUME_BUS_2026-05-22.md``).
Les tests appellent ``ChatWindow._dispatch_paused_action`` en tant
que fonction unbound avec un faux ``self`` (``SimpleNamespace``) pour
éviter de démarrer Tkinter pendant les tests unitaires.
"""
from __future__ import annotations
import sys
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import MagicMock
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
from agent_v0.agent_v1.ui.chat_window import ChatWindow # noqa: E402
def _make_self(bus=None, server_client=None):
return SimpleNamespace(_bus=bus, _server_client=server_client)
def _call(mock_self, replay_id="replay_xyz",
          bus_method="resume_replay", client_method="resume_replay"):
    """Invoke ``ChatWindow._dispatch_paused_action`` unbound on *mock_self*.

    Returns whatever the dispatch returns; the tests unpack it as
    ``(emitted, channel)``.
    """
    dispatch = ChatWindow._dispatch_paused_action
    routing = {"bus_method": bus_method, "client_method": client_method}
    return dispatch(mock_self, replay_id, **routing)
class TestDispatchPausedAction:
    """Routing of the paused-bubble action: SocketIO bus first, HTTP fallback."""

    def test_bus_connected_and_emits_uses_bus(self):
        bus = MagicMock(connected=True)
        bus.resume_replay = MagicMock(return_value=True)
        client = MagicMock()
        client.resume_replay.return_value = True

        fake_self = _make_self(bus=bus, server_client=client)
        emitted, channel = _call(fake_self)

        assert emitted is True
        assert channel == "bus"
        bus.resume_replay.assert_called_once_with("replay_xyz")
        client.resume_replay.assert_not_called()

    def test_bus_disconnected_falls_back_to_http(self):
        bus = MagicMock(connected=False)
        client = MagicMock()
        client.resume_replay.return_value = True

        fake_self = _make_self(bus=bus, server_client=client)
        emitted, channel = _call(fake_self)

        assert emitted is True
        assert channel == "http"
        bus.resume_replay.assert_not_called()
        client.resume_replay.assert_called_once_with("replay_xyz")

    def test_bus_emit_returns_false_falls_back_to_http(self):
        """Bus flagged as connected but the emit returns False (socket broke
        between connect() and send): switch over to HTTP."""
        bus = MagicMock(connected=True)
        bus.resume_replay = MagicMock(return_value=False)
        client = MagicMock()
        client.resume_replay.return_value = True

        fake_self = _make_self(bus=bus, server_client=client)
        emitted, channel = _call(fake_self)

        assert emitted is True
        assert channel == "http"

    def test_bus_emit_raises_falls_back_to_http(self):
        bus = MagicMock(connected=True)
        bus.resume_replay = MagicMock(side_effect=RuntimeError("socket broken"))
        client = MagicMock()
        client.resume_replay.return_value = True

        fake_self = _make_self(bus=bus, server_client=client)
        emitted, channel = _call(fake_self)

        assert emitted is True
        assert channel == "http"

    def test_no_bus_uses_http_directly(self):
        client = MagicMock()
        client.resume_replay.return_value = True

        fake_self = _make_self(bus=None, server_client=client)
        emitted, channel = _call(fake_self)

        assert emitted is True
        assert channel == "http"

    def test_all_channels_fail_returns_false(self):
        """Critical case: bus disconnected AND HTTP unreachable. The caller
        is expected to re-enable the UI buttons; here we only check that the
        dispatch returns (False, '')."""
        bus = MagicMock(connected=False)
        client = MagicMock()
        client.resume_replay.return_value = False

        fake_self = _make_self(bus=bus, server_client=client)
        emitted, channel = _call(fake_self)

        assert emitted is False
        assert channel == ""

    def test_neither_bus_nor_client_returns_false(self):
        fake_self = _make_self(bus=None, server_client=None)
        emitted, channel = _call(fake_self)

        assert emitted is False
        assert channel == ""

    def test_client_method_missing_falls_through(self):
        """If server_client is an old client without resume_replay, nothing
        crashes: the dispatch returns (False, '')."""
        bus = MagicMock(connected=False)
        legacy_client = SimpleNamespace()  # no resume_replay attribute

        fake_self = _make_self(bus=bus, server_client=legacy_client)
        emitted, channel = _call(fake_self)

        assert emitted is False
        assert channel == ""

    def test_abort_routing_symmetric(self):
        """The same mechanism covers abort: check that the method requested
        by the caller is the one actually used."""
        bus = MagicMock(connected=False)
        client = MagicMock()
        client.abort_replay.return_value = True

        fake_self = _make_self(bus=bus, server_client=client)
        emitted, channel = _call(
            fake_self,
            bus_method="abort_replay",
            client_method="abort_replay",
        )

        assert emitted is True
        assert channel == "http"
        client.abort_replay.assert_called_once_with("replay_xyz")
class TestPausedBubbleHeight:
    """Covers _compute_paused_bubble_height (truncation patch, 22 May 2026)."""

    def test_empty_message_uses_minimum_height(self):
        height, needs_scrollbar = ChatWindow._compute_paused_bubble_height("")
        assert height == 2
        assert needs_scrollbar is False

    def test_short_message_no_scrollbar(self):
        text = "Court message."
        height, needs_scrollbar = ChatWindow._compute_paused_bubble_height(text)
        assert height == 2
        assert needs_scrollbar is False

    def test_long_single_line_triggers_scrollbar(self):
        # ~600 chars without any newline: wrapped_lines = 600 // 60 + 1 = 11
        text = "x" * 600
        height, needs_scrollbar = ChatWindow._compute_paused_bubble_height(text)
        assert height == 11
        assert needs_scrollbar is True

    def test_message_with_many_newlines_uses_explicit_count(self):
        """Bug scenario: a server reason listing 6 candidates on 6 short
        lines, so wrapped_lines is low but explicit_lines is high."""
        text = "\n".join(f"option {i}" for i in range(6))
        height, needs_scrollbar = ChatWindow._compute_paused_bubble_height(text)
        # 6 explicit lines > 2 wrapped lines, so the height is 6
        assert height == 6
        # Below the cap and short content: no scrollbar yet
        assert needs_scrollbar is False

    def test_cap_reached_triggers_scrollbar_even_if_short(self):
        """Once the cap (12 lines) is exceeded, the scrollbar MUST show up
        whatever the length in characters."""
        text = "\n".join(f"l{i}" for i in range(20))
        height, needs_scrollbar = ChatWindow._compute_paused_bubble_height(text)
        assert height == 12  # ceiling
        assert needs_scrollbar is True

    def test_long_content_triggers_scrollbar_at_200_chars(self):
        """Text safety threshold: >= 200 chars means a scrollbar even with
        few lines (visual anti-truncation safety net)."""
        text = "x" * 220
        _height, needs_scrollbar = ChatWindow._compute_paused_bubble_height(text)
        assert needs_scrollbar is True

View File

@@ -16,6 +16,7 @@ sys.path.insert(0, str(ROOT))
from agent_v0.server_v1.api_stream import (
_extract_required_apps_from_events,
_extract_required_apps_from_workflow,
_trim_redundant_setup_events,
_resolve_launch_command,
_infer_app_from_window_titles,
_generate_setup_actions,
@@ -220,6 +221,139 @@ class TestExtractRequiredAppsFromEvents:
# Le premier app hors ignorées est Notepad
assert result["first_window_title"] == "Bloc-notes"
def test_extracts_searchhost_launch_result_target(self):
    """Recover the real SearchHost click that launches the app."""

    def window(app_name, title):
        # A fresh dict per call, so no two events share a window object.
        return {"app_name": app_name, "title": title}

    def focus_change(src, dst, **extra):
        return {"event": {
            "type": "window_focus_change", "from": src, "to": dst, **extra,
        }}

    search_click = {"event": {
        "type": "mouse_click", "button": "left", "pos": [1449, 641],
        "timestamp": 10.0,
        "screen_metadata": {"screen_resolution": [2560, 1600]},
        "window": window("SearchHost.exe", "Rechercher"),
        "window_capture": {
            "click_relative": [681, 448],
            "window_size": [1287, 1407],
        },
    }}
    events = [
        focus_change(None, window("explorer.exe", "Explorateur")),
        focus_change(
            window("explorer.exe", "Explorateur"),
            window("SearchHost.exe", "Rechercher"),
        ),
        {"event": {"type": "text_input", "text": "bloc",
                   "window": window("SearchHost.exe", "Rechercher")}},
        search_click,
        focus_change(
            window("SearchHost.exe", "Rechercher"),
            window("explorer.exe", "unknown_window"),
            timestamp=10.4,
        ),
        focus_change(
            window("explorer.exe", "unknown_window"),
            window("Notepad.exe", "Sans titre Bloc-notes"),
            timestamp=11.1,
        ),
    ]

    result = _extract_required_apps_from_events(events)
    target = result["launch_result_target"]
    original_position = target["original_position"]

    assert result["primary_app"] == "Notepad.exe"
    assert target["window_title"] == "Rechercher"
    assert target["expected_window_before"] == "Rechercher"
    assert target["x_pct"] == pytest.approx(1449 / 2560, rel=0, abs=1e-6)
    assert target["y_pct"] == pytest.approx(641 / 1600, rel=0, abs=1e-6)
    assert original_position["x_relative"] == "au centre"
    assert original_position["y_relative"] == "au milieu"
    assert target["window_capture"]["click_relative"] == [681, 448]
def test_extracts_start_menu_target(self):
    """Recover the real Start-button click that opens SearchHost."""

    def window(app_name, title):
        # A fresh dict per call, so no two events share a window object.
        return {"app_name": app_name, "title": title}

    def focus_change(src, dst, **extra):
        return {"event": {
            "type": "window_focus_change", "from": src, "to": dst, **extra,
        }}

    def left_click(pos, timestamp, win):
        return {"event": {
            "type": "mouse_click", "button": "left", "pos": pos,
            "timestamp": timestamp,
            "screen_metadata": {"screen_resolution": [2560, 1600]},
            "window": win,
        }}

    events = [
        focus_change(None, window("explorer.exe", "Explorateur")),
        left_click([993, 1559], 1.0, window("explorer.exe", "Explorateur")),
        focus_change(
            window("explorer.exe", "Explorateur"),
            window("SearchHost.exe", "Rechercher"),
            timestamp=1.2,
        ),
        left_click([1449, 641], 4.0, window("SearchHost.exe", "Rechercher")),
        focus_change(
            window("SearchHost.exe", "Rechercher"),
            window("Notepad.exe", "Sans titre Bloc-notes"),
            timestamp=4.4,
        ),
    ]

    result = _extract_required_apps_from_events(events)
    target = result["start_menu_target"]
    original_position = target["original_position"]

    assert target["x_pct"] == pytest.approx(993 / 2560, rel=0, abs=1e-6)
    assert target["y_pct"] == pytest.approx(1559 / 1600, rel=0, abs=1e-6)
    assert original_position["x_relative"] == "au centre"
    assert original_position["y_relative"] == "en bas"
    assert "en bas" in target["position_desc"]
def test_extracts_start_menu_target_anchor_from_session_shot(self, tmp_path):
    """The Start click also gets an image anchor taken from the source shot."""
    from PIL import Image

    def window(app_name, title):
        # A fresh dict per call, so no two events share a window object.
        return {"app_name": app_name, "title": title}

    def focus_change(src, dst, **extra):
        return {"event": {
            "type": "window_focus_change", "from": src, "to": dst, **extra,
        }}

    session_dir = tmp_path / "sess"
    shots_dir = session_dir / "shots"
    shots_dir.mkdir(parents=True)
    blank_shot = Image.new("RGB", (2560, 1600), color="white")
    blank_shot.save(shots_dir / "shot_start_full.png")

    start_click = {"event": {
        "type": "mouse_click", "button": "left", "pos": [993, 1559],
        "timestamp": 1.0,
        "screenshot_id": "shot_start",
        "screen_metadata": {"screen_resolution": [2560, 1600]},
        "window": window("explorer.exe", "Explorateur"),
    }}
    events = [
        focus_change(None, window("explorer.exe", "Explorateur")),
        start_click,
        focus_change(
            window("explorer.exe", "Explorateur"),
            window("SearchHost.exe", "Rechercher"),
            timestamp=1.2,
        ),
        focus_change(
            window("SearchHost.exe", "Rechercher"),
            window("Notepad.exe", "Sans titre Bloc-notes"),
            timestamp=2.0,
        ),
    ]

    result = _extract_required_apps_from_events(
        events,
        session_dir=str(session_dir),
    )

    target = result["start_menu_target"]
    assert target["anchor_image_base64"]
def test_extracts_direct_typing_search_interaction(self):
    """Detect that no SearchHost click is required before typing."""

    def window(app_name, title):
        # A fresh dict per call, so no two events share a window object.
        return {"app_name": app_name, "title": title}

    def focus_change(src, dst, **extra):
        return {"event": {
            "type": "window_focus_change", "from": src, "to": dst, **extra,
        }}

    start_click = {"event": {
        "type": "mouse_click", "button": "left", "pos": [993, 1559],
        "timestamp": 1.0,
        "screen_metadata": {"screen_resolution": [2560, 1600]},
        "window": window("explorer.exe", "Explorateur"),
    }}
    typed_query = {"event": {
        "type": "text_input", "text": "bloc",
        "window": window("SearchHost.exe", "Rechercher"),
        "timestamp": 2.0,
    }}
    events = [
        focus_change(None, window("explorer.exe", "Explorateur")),
        start_click,
        focus_change(
            window("explorer.exe", "Explorateur"),
            window("SearchHost.exe", "Rechercher"),
            timestamp=1.2,
        ),
        typed_query,
        focus_change(
            window("SearchHost.exe", "Rechercher"),
            window("Notepad.exe", "Sans titre Bloc-notes"),
            timestamp=2.4,
        ),
    ]

    result = _extract_required_apps_from_events(events)
    interaction = result["search_box_interaction"]

    assert interaction["mode"] == "direct_typing"
    assert interaction["window_title"] == "Rechercher"
def test_empty_events(self):
    """No events yields an empty dict."""
    no_events = []
    extracted = _extract_required_apps_from_events(no_events)
    assert extracted == {}
@@ -245,6 +379,187 @@ class TestExtractRequiredAppsFromEvents:
assert result["primary_launch_cmd"] == "calc"
class TestTrimRedundantSetupEvents:
"""Tests pour la coupe du préambule déjà couvert par le setup."""
def test_trims_until_first_primary_app_focus(self):
raw_events = [
{"event": {"type": "window_focus_change", "to": {
"app_name": "explorer.exe", "title": "Explorateur"}}},
{"event": {"type": "mouse_click", "pos": [993, 1559], "window": {
"app_name": "explorer.exe", "title": "Explorateur"}}},
{"event": {"type": "window_focus_change", "to": {
"app_name": "SearchHost.exe", "title": "Rechercher"}}},
{"event": {"type": "text_input", "text": "bloc", "window": {
"app_name": "SearchHost.exe", "title": "Rechercher"}}},
{"event": {"type": "mouse_click", "pos": [1449, 641], "window": {
"app_name": "SearchHost.exe", "title": "Rechercher"}}},
{"event": {"type": "window_focus_change", "to": {
"app_name": "Notepad.exe",
"title": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes",
}}},
{"event": {"type": "mouse_click", "pos": [1514, 562], "window": {
"app_name": "Notepad.exe", "title": "*test Bloc-notes"}}},
{"event": {"type": "text_input", "text": "test", "window": {
"app_name": "Notepad.exe", "title": "*test Bloc-notes"}}},
]
app_info = {
"primary_app": "Notepad.exe",
"first_window_title": "Bloc-notes",
}
trimmed = _trim_redundant_setup_events(raw_events, app_info)
assert len(trimmed) == 2
assert trimmed[0]["event"]["type"] == "mouse_click"
assert trimmed[0]["event"]["pos"] == [1514, 562]
assert trimmed[1]["event"]["type"] == "text_input"
def test_keeps_events_when_no_matching_focus_found(self):
raw_events = [
{"event": {"type": "mouse_click", "pos": [10, 10], "window": {
"app_name": "explorer.exe", "title": "Explorateur"}}},
{"event": {"type": "text_input", "text": "abc", "window": {
"app_name": "explorer.exe", "title": "Explorateur"}}},
]
app_info = {
"primary_app": "Notepad.exe",
"first_window_title": "Bloc-notes",
}
trimmed = _trim_redundant_setup_events(raw_events, app_info)
assert trimmed == raw_events
def test_prefers_neutral_title_focus_after_non_neutral_first_focus(self):
"""Cas observé sess_20260520T102916_066851 : premier focus Notepad
a un titre non-neutre (http...txt), suivi d'un clic intra-Notepad
et d'un focus vers 'Sans titre' (= état initial neutre que le setup
auto produit). Le trim doit couper jusqu'au focus neutre pour
éliminer le clic intra-Notepad redondant.
"""
raw_events = [
{"event": {"type": "window_focus_change", "to": {
"app_name": "SearchHost.exe", "title": "Rechercher"}}},
{"event": {"type": "mouse_click", "pos": [681, 448], "window": {
"app_name": "SearchHost.exe", "title": "Rechercher"}}},
{"event": {"type": "window_focus_change", "to": {
"app_name": "Notepad.exe",
"title": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes",
}}},
{"event": {"type": "mouse_click", "pos": [1191, 40], "window": {
"app_name": "Notepad.exe",
"title": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes",
}}},
{"event": {"type": "window_focus_change", "to": {
"app_name": "Notepad.exe", "title": "Sans titre Bloc-notes"}}},
{"event": {"type": "text_input", "text": "test", "window": {
"app_name": "Notepad.exe", "title": "*test Bloc-notes"}}},
]
app_info = {
"primary_app": "Notepad.exe",
"first_window_title": (
"http192.168.1.408765dossier.htmlid=.txt Bloc-notes"
),
}
trimmed = _trim_redundant_setup_events(raw_events, app_info)
# Le clic intra-Notepad (event idx 3) doit être supprimé : il
# bascule vers 'Sans titre' qui est déjà l'état setup, donc
# rejoué il n'a aucun effet visuel et déclenche retry_threshold.
assert len(trimmed) == 1
assert trimmed[0]["event"]["type"] == "text_input"
assert trimmed[0]["event"]["text"] == "test"
def test_neutral_focus_outside_lookahead_window_is_ignored(self):
"""Filet de sécurité : un focus 'Sans titre' qui arrive trop loin
après le premier focus primary_app n'est pas considéré comme
l'état de bootstrap. Évite de couper un workflow qui re-visite
'Sans titre' bien après le démarrage."""
# 30 events séparent le premier focus du focus neutre
raw_events = [
{"event": {"type": "window_focus_change", "to": {
"app_name": "Notepad.exe",
"title": "rapport_final.txt Bloc-notes"}}},
]
# Bourrer avec des events utiles intra-Notepad
for i in range(30):
raw_events.append({"event": {
"type": "mouse_click", "pos": [100 + i, 200],
"window": {"app_name": "Notepad.exe",
"title": "rapport_final.txt Bloc-notes"},
}})
raw_events.append({"event": {"type": "window_focus_change", "to": {
"app_name": "Notepad.exe", "title": "Sans titre Bloc-notes"}}})
raw_events.append({"event": {"type": "text_input", "text": "x",
"window": {"app_name": "Notepad.exe",
"title": "Sans titre Bloc-notes"}}})
app_info = {
"primary_app": "Notepad.exe",
"first_window_title": "rapport_final.txt Bloc-notes",
}
trimmed = _trim_redundant_setup_events(raw_events, app_info)
# Doit garder les 30 clicks + focus tardif + text_input = 32 events
# (cut uniquement au premier focus primary_app, comportement legacy)
assert len(trimmed) == 32
assert trimmed[0]["event"]["type"] == "mouse_click"
assert trimmed[0]["event"]["pos"] == [100, 200]
def test_keeps_legacy_behavior_when_first_focus_already_neutral(self):
    """Non-regression: cut at the first focus when it is already neutral.

    When the first primary_app focus already carries a neutral title (the
    normal case), trimming cuts at that first focus exactly as before.
    """
    neutral_title = "Sans titre Bloc-notes"
    search_focus = {"event": {"type": "window_focus_change", "to": {
        "app_name": "SearchHost.exe", "title": "Rechercher"}}}
    notepad_focus = {"event": {"type": "window_focus_change", "to": {
        "app_name": "Notepad.exe", "title": neutral_title}}}
    typed_text = {"event": {"type": "text_input", "text": "hello",
                            "window": {"app_name": "Notepad.exe",
                                       "title": neutral_title}}}
    raw_events = [search_focus, notepad_focus, typed_text]
    app_info = {
        "primary_app": "Notepad.exe",
        "first_window_title": neutral_title,
    }

    trimmed = _trim_redundant_setup_events(raw_events, app_info)

    assert len(trimmed) == 1
    assert trimmed[0]["event"]["type"] == "text_input"
def test_neutral_detection_recognizes_office_default_titles(self):
    """Office default titles count as neutral bootstrap titles.

    Word, Excel and PowerPoint use their own default titles (Document1,
    Classeur1, ...) which the automatic setup also leads to.
    """
    app = "winword.exe"
    opened_doc_title = "rapport.docx - Word"
    default_doc_title = "Document1 - Word"

    raw_events = [
        {"event": {"type": "window_focus_change",
                   "to": {"app_name": app, "title": opened_doc_title}}},
        {"event": {"type": "mouse_click", "pos": [100, 40],
                   "window": {"app_name": app, "title": opened_doc_title}}},
        {"event": {"type": "window_focus_change",
                   "to": {"app_name": app, "title": default_doc_title}}},
        {"event": {"type": "text_input", "text": "abc",
                   "window": {"app_name": app, "title": default_doc_title}}},
    ]
    app_info = {"primary_app": app, "first_window_title": opened_doc_title}

    trimmed = _trim_redundant_setup_events(raw_events, app_info)

    assert len(trimmed) == 1
    assert trimmed[0]["event"]["type"] == "text_input"
# =========================================================================
# Tests pour _extract_required_apps_from_workflow
# =========================================================================
@@ -304,10 +619,10 @@ class TestExtractRequiredAppsFromWorkflow:
# =========================================================================
class TestGenerateSetupActions:
"""Tests pour la génération des actions de setup 100% visuelles."""
"""Tests pour la génération des actions de setup."""
def test_notepad_setup_visual(self):
"""Génère les bonnes actions visuelles pour lancer Notepad."""
def test_notepad_setup_uses_run_dialog(self):
"""Bloc-notes utilise désormais le setup sémantique Win+R."""
app_info = {
"primary_app": "Notepad.exe",
"primary_launch_cmd": "notepad",
@@ -315,74 +630,52 @@ class TestGenerateSetupActions:
}
actions = _generate_setup_actions(app_info)
# 9 actions : click_start, wait, click_search, wait, type, wait, click_result, wait, verify
assert len(actions) == 9
assert len(actions) == 7
# Étape 1 : clic visuel sur le bouton Démarrer
assert actions[0]["type"] == "click"
assert actions[0]["visual_mode"] is True
assert actions[0]["target_spec"]["by_role"] == "start_button"
assert actions[0]["target_spec"]["by_text"] == "Démarrer"
assert actions[0]["type"] == "key_combo"
assert actions[0]["keys"] == ["win", "r"]
assert actions[0]["_setup_step"] == "open_run_dialog"
# Étape 2 : attente menu Démarrer
assert actions[1]["type"] == "wait"
assert actions[1]["duration_ms"] == 1000
assert actions[1]["duration_ms"] == 500
# Étape 3 : clic visuel sur la barre de recherche
assert actions[2]["type"] == "click"
assert actions[2]["visual_mode"] is True
assert actions[2]["target_spec"]["by_role"] == "search_box"
assert actions[2]["type"] == "type"
assert actions[2]["text"] == "notepad"
# Étape 4 : attente barre de recherche active
assert actions[3]["type"] == "wait"
assert actions[3]["duration_ms"] == 500
assert actions[3]["duration_ms"] == 300
# Étape 5 : taper le nom visuel français
assert actions[4]["type"] == "type"
assert actions[4]["text"] == "Bloc-notes"
assert actions[4]["type"] == "key_combo"
assert actions[4]["keys"] == ["enter"]
# Étape 6 : attente résultats
assert actions[5]["type"] == "wait"
assert actions[5]["duration_ms"] == 1200
assert actions[5]["duration_ms"] == 2000
# Étape 7 : clic visuel sur le résultat
assert actions[6]["type"] == "click"
assert actions[6]["visual_mode"] is True
assert actions[6]["target_spec"]["by_text"] == "Bloc-notes"
assert actions[6]["target_spec"]["by_role"] == "app_icon"
# Étape 8 : attente lancement (app légère = 2000ms)
assert actions[7]["type"] == "wait"
assert actions[7]["duration_ms"] == 2000
# Étape 9 : vérification visuelle
assert actions[8]["type"] == "verify_screen"
assert actions[8]["_expected_title"] == "Sans titre Bloc-notes"
assert actions[6]["type"] == "verify_screen"
assert actions[6]["expected_window_title_contains"] == ["Bloc-notes", "notepad"]
# Toutes les actions sont marquées comme phase setup
for action in actions:
assert action.get("_setup_phase") is True
assert action.get("_setup_strategy") == "run_dialog"
def test_no_key_combo_in_setup(self):
"""AUCUNE action key_combo ne doit être générée dans le setup."""
def test_visual_setup_keeps_no_key_combo_for_word(self):
"""Le setup visuel classique ne doit pas introduire de key_combo."""
app_info = {
"primary_app": "Notepad.exe",
"primary_launch_cmd": "notepad",
"first_window_title": "Bloc-notes",
"primary_app": "winword.exe",
"primary_launch_cmd": "winword",
"first_window_title": "Document1 - Word",
}
actions = _generate_setup_actions(app_info)
key_combos = [a for a in actions if a["type"] == "key_combo"]
assert key_combos == [], (
"Le setup 100% visuel ne doit JAMAIS contenir de key_combo. "
f"Trouvé : {key_combos}"
)
assert key_combos == []
def test_all_clicks_are_visual(self):
"""Tous les clics du setup doivent avoir visual_mode=True et un target_spec."""
def test_all_clicks_are_visual_for_visual_setup(self):
"""Tous les clics du setup visuel doivent avoir visual_mode=True."""
app_info = {
"primary_app": "Notepad.exe",
"primary_launch_cmd": "notepad",
"first_window_title": "Bloc-notes",
"primary_app": "winword.exe",
"primary_launch_cmd": "winword",
"first_window_title": "Document1 - Word",
}
actions = _generate_setup_actions(app_info)
clicks = [a for a in actions if a["type"] == "click"]
@@ -402,11 +695,11 @@ class TestGenerateSetupActions:
assert "vlm_description" in spec, f"target_spec sans vlm_description : {spec}"
def test_clicks_have_fallback_coordinates(self):
"""Tous les clics visuels ont des coordonnées de fallback (x_pct, y_pct)."""
"""Tous les clics visuels ont des coordonnées de fallback."""
app_info = {
"primary_app": "Notepad.exe",
"primary_launch_cmd": "notepad",
"first_window_title": "Bloc-notes",
"primary_app": "winword.exe",
"primary_launch_cmd": "winword",
"first_window_title": "Document1 - Word",
}
actions = _generate_setup_actions(app_info)
clicks = [a for a in actions if a["type"] == "click"]
@@ -456,28 +749,130 @@ class TestGenerateSetupActions:
click_result = [a for a in actions if a.get("_setup_step") == "click_app_result"][0]
assert click_result["target_spec"]["by_text"] == "Microsoft Word"
def test_verify_screen_present_with_title(self):
"""Un verify_screen est ajouté quand un titre de fenêtre est connu."""
def test_prefers_recorded_searchhost_click_target(self):
    """The setup reuses the recorded SearchHost click target when present."""
    recorded_target = {
        "x_pct": 0.566016,
        "y_pct": 0.400625,
        "window_title": "Rechercher",
        "expected_window_before": "Rechercher",
        "original_position": {
            "x_relative": "au centre",
            "y_relative": "au milieu",
        },
        "window_capture": {
            "click_relative": [681, 448],
            "window_size": [1287, 1407],
        },
        "position_desc": "au milieu au centre",
    }
    app_info = {
        "primary_app": "winword.exe",
        "primary_launch_cmd": "winword",
        "first_window_title": "Document1 - Word",
        "launch_result_target": recorded_target,
    }

    actions = _generate_setup_actions(app_info)
    result_clicks = [
        a for a in actions if a.get("_setup_step") == "click_app_result"
    ]
    click_result = result_clicks[0]

    # Action-level fields copied from the recording.
    assert click_result["x_pct"] == pytest.approx(0.566016)
    assert click_result["y_pct"] == pytest.approx(0.400625)
    assert click_result["expected_window_before"] == "Rechercher"

    # Target spec built for the search result.
    spec = click_result["target_spec"]
    assert spec["by_text"] == "Microsoft Word"
    assert spec["by_role"] == "search_result"
    assert spec["allow_position_fallback"] is True
    assert spec["window_title"] == "Rechercher"
    assert spec["original_position"]["x_relative"] == "au centre"
    assert spec["window_capture"]["window_size"] == [1287, 1407]
    assert "résultat de recherche" in spec["vlm_description"]
def test_prefers_recorded_start_button_target(self):
    """The visual setup reuses the recorded Start-button click when present."""
    recorded_start = {
        "x_pct": 0.387891,
        "y_pct": 0.974375,
        "anchor_image_base64": "abc123",
        "original_position": {
            "x_relative": "au centre",
            "y_relative": "en bas",
        },
        "position_desc": "en bas au centre",
    }
    app_info = {
        "primary_app": "winword.exe",
        "primary_launch_cmd": "winword",
        "first_window_title": "Document1 - Word",
        "start_menu_target": recorded_start,
    }

    actions = _generate_setup_actions(app_info)
    start_clicks = [
        a for a in actions if a.get("_setup_step") == "click_start_menu"
    ]
    click_start = start_clicks[0]

    assert click_start["x_pct"] == pytest.approx(0.387891)
    assert click_start["y_pct"] == pytest.approx(0.974375)

    spec = click_start["target_spec"]
    assert spec["by_text"] == ""
    assert spec["by_role"] == "start_button"
    assert spec["screen_scope"] == "full_screen"
    assert spec["allow_position_fallback"] is True
    assert spec["anchor_image_base64"] == "abc123"
    assert spec["original_position"]["y_relative"] == "en bas"
    assert "icône Windows" in spec["vlm_description"]
def test_skips_search_click_for_direct_typing(self):
    """Direct typing skips the search-box click and its dedicated wait/verify.

    When the session types straight into SearchHost, click_search and its
    wait/verify steps are omitted. The verify_start_menu_open guard stays
    mandatory and precedes the typing step.
    """
    app_info = {
        "primary_app": "winword.exe",
        "primary_launch_cmd": "winword",
        "first_window_title": "Document1 - Word",
        "search_box_interaction": {
            "mode": "direct_typing",
            "window_title": "Rechercher",
        },
    }

    actions = _generate_setup_actions(app_info)
    setup_steps = [action.get("_setup_step") for action in actions]

    assert "click_search_box" not in setup_steps
    assert "wait_search_ready" not in setup_steps
    assert "verify_search_box_active" not in setup_steps
    # The generic guard is kept: it is what secures the typing step.
    assert "verify_start_menu_open" in setup_steps

    type_action = actions[setup_steps.index("type_app_name")]
    assert type_action["type"] == "type"
    assert type_action["text"] == "Word"
def test_verify_screen_final_present_with_title(self):
"""Le setup run_dialog termine par une vérification souple sur le titre app."""
app_info = {
"primary_app": "Notepad.exe",
"primary_launch_cmd": "notepad",
"first_window_title": "Sans titre Bloc-notes",
}
actions = _generate_setup_actions(app_info)
verify = [a for a in actions if a.get("type") == "verify_screen"]
assert len(verify) == 1
assert verify[0]["_expected_title"] == "Sans titre Bloc-notes"
final_verifies = [
a for a in actions
if a.get("type") == "verify_screen"
and a.get("_setup_step") == "verify_app_ready"
]
assert len(final_verifies) == 1
assert "Bloc-notes" in final_verifies[0]["expected_window_title_contains"]
def test_no_verify_without_title(self):
"""Pas de verify_screen si aucun titre de fenêtre n'est connu."""
def test_run_dialog_keeps_final_verify_even_without_exact_title(self):
"""Le setup run_dialog garde une vérification finale générique."""
app_info = {
"primary_app": "Notepad.exe",
"primary_launch_cmd": "notepad",
"first_window_title": "",
}
actions = _generate_setup_actions(app_info)
verify = [a for a in actions if a.get("type") == "verify_screen"]
assert len(verify) == 0
# Aucun verify_screen ne doit porter _expected_title.
final_verifies = [
a for a in actions
if a.get("type") == "verify_screen"
and a.get("_setup_step") == "verify_app_ready"
]
assert len(final_verifies) == 1
assert "notepad" in [p.lower() for p in final_verifies[0]["expected_window_title_contains"]]
def test_empty_app_info(self):
"""Dict vide → pas d'actions."""
@@ -537,12 +932,184 @@ class TestGenerateSetupActions:
assert type_action["text"] == "MonAppMedical"
# =========================================================================
# Tests des gardes visuelles du setup (verify_screen titre fenêtre)
# =========================================================================
class TestSetupVisualGuards:
    """Visual guards inserted between the steps of the automatic Windows setup.

    Added after the live `position_fallback` blockage of 22 May 2026.
    Without these guards, a Start click that actually hit the systray
    overflow popup let the setup type « bloc » into the wrong window, and
    only the final `click_result` surfaced the error — too late. The
    window-title `verify_screen` checks stop the run right after each
    critical step.
    """

    @staticmethod
    def _word_app_info(search_box_interaction=None):
        """Build the Word app_info used by every test of this class."""
        info = {
            "primary_app": "winword.exe",
            "primary_launch_cmd": "winword",
            "first_window_title": "Document1 - Word",
        }
        if search_box_interaction is not None:
            info["search_box_interaction"] = search_box_interaction
        return info

    def test_verify_start_menu_open_inserted_after_wait_start(self):
        """A verify_screen guard is inserted right after wait_start_menu."""
        actions = _generate_setup_actions(self._word_app_info())
        steps = [action.get("_setup_step") for action in actions]

        # Order: click_start_menu -> wait_start_menu -> verify_start_menu_open
        assert "verify_start_menu_open" in steps
        wait_pos = steps.index("wait_start_menu")
        guard_pos = steps.index("verify_start_menu_open")
        assert guard_pos == wait_pos + 1

        guard = actions[guard_pos]
        assert guard["type"] == "verify_screen"
        assert guard.get("_setup_phase") is True

        patterns = guard.get("expected_window_title_contains") or []
        assert isinstance(patterns, list) and patterns
        lowered = [pattern.lower() for pattern in patterns]
        # Must cover at least the FR and EN spellings of the search surface.
        assert any("recherch" in pattern for pattern in lowered), patterns
        assert any("search" in pattern for pattern in lowered), patterns

    def test_verify_search_box_active_inserted_when_click_then_type(self):
        """A guard follows the wait after clicking the search box.

        When the setup clicks the Rechercher box and then waits, a
        verify_screen guard comes after the wait so typing is blocked if
        focus is not actually in the box.
        """
        interaction = {
            "mode": "click_then_type",
            "window_title": "Rechercher",
            "x_pct": 0.10, "y_pct": 0.95,
        }
        actions = _generate_setup_actions(self._word_app_info(interaction))
        steps = [action.get("_setup_step") for action in actions]

        assert "verify_search_box_active" in steps
        wait_ready_pos = steps.index("wait_search_ready")
        guard_pos = steps.index("verify_search_box_active")
        type_pos = steps.index("type_app_name")
        # Order: wait_search_ready -> verify_search_box_active -> type_app_name
        assert guard_pos == wait_ready_pos + 1
        assert type_pos == guard_pos + 1

        guard = actions[guard_pos]
        assert guard["type"] == "verify_screen"
        patterns = guard.get("expected_window_title_contains") or []
        assert "Rechercher" in patterns or any(
            pattern.lower() == "rechercher" for pattern in patterns
        )

    def test_no_verify_search_box_when_direct_typing(self):
        """direct_typing has no search-box click, hence no dedicated guard.

        The verify_start_menu_open guard is enough: typing happens directly
        after it.
        """
        interaction = {
            "mode": "direct_typing",
            "window_title": "Rechercher",
        }
        actions = _generate_setup_actions(self._word_app_info(interaction))
        steps = [action.get("_setup_step") for action in actions]

        assert "verify_search_box_active" not in steps
        # verify_start_menu_open is still present (it covers the typing).
        assert "verify_start_menu_open" in steps
        guard_pos = steps.index("verify_start_menu_open")
        type_pos = steps.index("type_app_name")
        assert type_pos > guard_pos, (
            "type_app_name doit suivre verify_start_menu_open en direct_typing"
        )

    def test_verify_search_results_visible_inserted_before_click_result(self):
        """Last safety net before `click_app_result`.

        The Rechercher surface (and its results) must still be active just
        before `click_app_result`. Without this final guard, focus lost
        during `wait_search_results` can make `click_app_result` land on
        the wrong surface (observed live on 2026-05-22 — window seen:
        ``Fenêtre de dépassement de capacité de la barre d'état
        système.``).
        """
        actions = _generate_setup_actions(self._word_app_info())
        steps = [action.get("_setup_step") for action in actions]

        assert "verify_search_results_visible" in steps
        wait_results_pos = steps.index("wait_search_results")
        guard_pos = steps.index("verify_search_results_visible")
        click_result_pos = steps.index("click_app_result")
        # Order: wait_search_results -> verify_search_results_visible
        #        -> click_app_result
        assert guard_pos == wait_results_pos + 1
        assert click_result_pos == guard_pos + 1

        guard = actions[guard_pos]
        assert guard["type"] == "verify_screen"
        patterns = guard.get("expected_window_title_contains") or []
        assert isinstance(patterns, list) and patterns
        lowered = [pattern.lower() for pattern in patterns]
        assert any("recherch" in pattern for pattern in lowered), patterns
        assert any("search" in pattern for pattern in lowered), patterns

    def test_verify_search_results_visible_present_in_direct_typing(self):
        """The final guard before click_app_result is mandatory in every mode."""
        interaction = {
            "mode": "direct_typing",
            "window_title": "Rechercher",
        }
        actions = _generate_setup_actions(self._word_app_info(interaction))
        steps = [action.get("_setup_step") for action in actions]
        assert "verify_search_results_visible" in steps

    def test_setup_guards_have_short_timeout(self):
        """verify_screen guards use a short timeout (<= 2 s).

        They are title checks, not long waits.
        """
        interaction = {
            "mode": "click_then_type",
            "window_title": "Rechercher",
        }
        actions = _generate_setup_actions(self._word_app_info(interaction))

        guard_steps = (
            "verify_start_menu_open",
            "verify_search_box_active",
            "verify_search_results_visible",
        )
        guards = [
            action for action in actions
            if action.get("_setup_step") in guard_steps
        ]
        assert guards, "il doit exister au moins une garde verify_screen"
        for guard in guards:
            # A missing timeout_ms falls back to 5000 and therefore fails.
            assert guard.get("timeout_ms", 5000) <= 2000
# =========================================================================
# Tests d'intégration : pipeline complet events → setup visuel
# =========================================================================
class TestSetupPipeline:
"""Tests du pipeline complet : extraction + génération visuelle."""
"""Tests du pipeline complet : extraction + génération du setup."""
def test_full_pipeline_from_events(self):
"""Pipeline complet depuis des événements bruts de type Notepad."""
@@ -561,24 +1128,25 @@ class TestSetupPipeline:
assert app_info["primary_app"] == "Notepad.exe"
actions = _generate_setup_actions(app_info)
assert len(actions) >= 8 # Au minimum 8 actions visuelles (sans verify si pas de titre)
assert len(actions) == 7
# Vérifier l'ordre logique 100% visuel
types = [a["type"] for a in actions]
assert types[0] == "click" # Clic Démarrer
assert types[1] == "wait" # Attente menu
assert types[2] == "click" # Clic barre de recherche
assert types[3] == "wait" # Attente barre active
assert types[4] == "type" # Taper le nom
assert types[5] == "wait" # Attente résultats
assert types[6] == "click" # Clic sur le résultat
assert types[7] == "wait" # Attente lancement
steps = [a.get("_setup_step") for a in actions]
expected_step_order = [
"open_run_dialog",
"wait_run_dialog",
"type_launch_command",
"wait_launch_command",
"submit_run_dialog",
"wait_app_launch",
"verify_app_ready",
]
assert steps == expected_step_order, steps
# AUCUN key_combo dans le pipeline
assert "key_combo" not in types, "Le pipeline ne doit contenir aucun key_combo"
assert types.count("key_combo") == 2
# Le texte tapé est le nom visuel français
assert actions[4]["text"] == "Bloc-notes"
idx_type = steps.index("type_launch_command")
assert actions[idx_type]["text"] == "notepad"
def test_full_pipeline_from_workflow(self):
"""Pipeline complet depuis un workflow structuré."""
@@ -599,12 +1167,12 @@ class TestSetupPipeline:
assert app_info["primary_app"] == "Notepad.exe"
actions = _generate_setup_actions(app_info)
assert len(actions) >= 8
assert len(actions) == 7
# Le texte tapé doit être le nom visuel, pas la commande shell
# Le texte tapé doit être la commande shell pour le setup Win+R.
type_action = [a for a in actions if a["type"] == "type"][0]
assert type_action["text"] == "Bloc-notes"
assert type_action["text"] == "notepad"
# Aucun key_combo
# Le setup Notepad s'appuie maintenant sur deux key_combo.
key_combos = [a for a in actions if a["type"] == "key_combo"]
assert key_combos == []
assert len(key_combos) == 2

View File

@@ -0,0 +1,79 @@
"""Tests pour la garde drift de `_template_match_anchor`.
Brief Codex 2026-05-23 07:56 : faux succès live `act_raw_77db702f` où
ANCHOR-TM matche un crop dans OBS Studio à (0.205, 0.170) score 0.842
alors que la position enregistrée est ~(0.706, 0.348) dans Bloc-notes.
La cascade serveur avait rejeté (`rejected_text_mismatch`) mais l'agent
fallback ANCHOR-TM côté client sans aucune garde de position acceptait
n'importe quel match au-dessus du seuil score.
Le helper statique `_anchor_match_within_drift` rejette les matchs
loin de la position fallback enregistrée.
"""
from __future__ import annotations
import sys
from pathlib import Path
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
from agent_v0.agent_v1.core.executor import ActionExecutorV1 # noqa: E402
class TestAnchorMatchDriftGuard:
    """Checks for the static helper `_anchor_match_within_drift`."""

    def test_match_close_to_fallback_accepted(self):
        # x drift of 0.004 and y drift of 0.002: accepted.
        accepted = ActionExecutorV1._anchor_match_within_drift(
            matched_x_pct=0.71,
            matched_y_pct=0.35,
            fallback_x_pct=0.706,
            fallback_y_pct=0.348,
        )
        assert accepted

    def test_match_far_from_fallback_rejected(self):
        # Exact values of the live incident.
        accepted = ActionExecutorV1._anchor_match_within_drift(
            matched_x_pct=0.205,
            matched_y_pct=0.170,
            fallback_x_pct=0.706,
            fallback_y_pct=0.348,
        )
        assert not accepted

    def test_drift_at_threshold_accepted(self):
        # Drift of exactly 0.25 (the boundary).
        accepted = ActionExecutorV1._anchor_match_within_drift(
            matched_x_pct=0.5,
            matched_y_pct=0.5,
            fallback_x_pct=0.25,
            fallback_y_pct=0.5,
        )
        assert accepted

    def test_drift_just_above_threshold_rejected(self):
        # Drift of 0.26 on x, just past the boundary.
        accepted = ActionExecutorV1._anchor_match_within_drift(
            matched_x_pct=0.5,
            matched_y_pct=0.5,
            fallback_x_pct=0.24,
            fallback_y_pct=0.5,
        )
        assert not accepted

    def test_no_recorded_fallback_keeps_legacy_behavior(self):
        """Without a recorded fallback (0, 0) no guard can be applied."""
        accepted = ActionExecutorV1._anchor_match_within_drift(
            matched_x_pct=0.5,
            matched_y_pct=0.5,
            fallback_x_pct=0.0,
            fallback_y_pct=0.0,
        )
        assert accepted

    def test_custom_max_drift(self):
        """The threshold can be configured by the caller."""
        coords = dict(
            matched_x_pct=0.65,
            matched_y_pct=0.50,
            fallback_x_pct=0.50,
            fallback_y_pct=0.50,
        )
        # With max_drift=0.10, a drift of 0.15 is rejected...
        assert not ActionExecutorV1._anchor_match_within_drift(
            max_drift=0.10, **coords
        )
        # ...but it is accepted with the default threshold.
        assert ActionExecutorV1._anchor_match_within_drift(**coords)

    def test_drift_y_axis(self):
        """A y drift above the threshold is rejected even when x is in range."""
        accepted = ActionExecutorV1._anchor_match_within_drift(
            matched_x_pct=0.50,
            matched_y_pct=0.95,
            fallback_x_pct=0.50,
            fallback_y_pct=0.50,
        )
        assert not accepted

View File

@@ -0,0 +1,744 @@
"""Tests pour la garde verify_screen.expected_window_title_contains.
Cette garde protège les étapes du setup auto Windows contre les
configurations où ``click_start_menu`` se trompe de cible (systray
overflow popup, par exemple) et laisse la frappe partir dans la
mauvaise fenêtre. Ajoutée le 22 mai 2026 — cf.
``docs/CR_AUDIT_SETUP_VISUAL_GUARDS_2026-05-22.md``.
On teste deux choses :
1. Le helper statique ``_window_title_matches_any`` (substring + case).
2. Le routage de la garde dans ``verify_screen`` : succès si titre
matche, bascule en mode apprentissage / pause sinon.
"""
from __future__ import annotations
import sys
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import patch, MagicMock
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
from agent_v0.agent_v1.core.executor import ActionExecutorV1 # noqa: E402
# =========================================================================
# Helper substring matching
# =========================================================================
class TestWindowTitleMatchesAny:
    """Checks for the static helper `_window_title_matches_any`."""

    def test_substring_match(self):
        matched = ActionExecutorV1._window_title_matches_any(
            "Rechercher", ["Rechercher"]
        )
        assert matched

    def test_case_insensitive(self):
        matched = ActionExecutorV1._window_title_matches_any(
            "RECHERCHER - Cortana", ["rechercher"]
        )
        assert matched

    def test_partial_match_first_pattern(self):
        patterns = ["search", "rechercher", "cortana"]
        matched = ActionExecutorV1._window_title_matches_any(
            "Cortana - Rechercher", patterns
        )
        assert matched

    def test_no_match_returns_false(self):
        title = "Fenêtre de dépassement de capacité de la barre d'état système"
        matched = ActionExecutorV1._window_title_matches_any(
            title,
            ["Rechercher", "Search", "Cortana"],
        )
        assert not matched

    def test_empty_patterns_returns_true(self):
        """No patterns requested: the guard is neutral."""
        for no_patterns in ([], None):
            assert ActionExecutorV1._window_title_matches_any("X", no_patterns)

    def test_empty_title_with_patterns_returns_false(self):
        matched = ActionExecutorV1._window_title_matches_any("", ["X"])
        assert not matched

    def test_ignore_empty_pattern_entries(self):
        """Empty entries in the list must not match the whole title."""
        matched = ActionExecutorV1._window_title_matches_any(
            "rien à voir", ["", None, ""]
        )
        assert not matched
class TestKnownRuntimeDialogs:
    """Checks for the static helper `_match_known_runtime_dialog`."""

    def test_match_confirm_save_overwrite_dialog(self):
        """The ASCII-apostrophe title maps to the overwrite-confirm dialog."""
        spec = ActionExecutorV1._match_known_runtime_dialog(
            "Confirmer l'enregistrement"
        )
        assert spec is not None
        assert spec["id"] == "confirm_save_overwrite"
        assert spec["button_texts"][0] == "Oui"

    def test_match_confirm_save_overwrite_dialog_with_typographic_apostrophe(self):
        """The same title written with U+2019 must match as well."""
        # The right single quotation mark is spelled as an escape so that it
        # survives editors, copy/paste and re-encoding; a literal without any
        # apostrophe would not exercise the case this test is named after.
        spec = ActionExecutorV1._match_known_runtime_dialog(
            "Confirmer l\u2019enregistrement"
        )
        assert spec is not None
        assert spec["id"] == "confirm_save_overwrite"

    def test_unknown_title_returns_none(self):
        """A title that is not a known dialog yields None."""
        spec = ActionExecutorV1._match_known_runtime_dialog("Bloc-notes")
        assert spec is None
class TestContextualRuntimeDialogs:
    """Checks for `_maybe_contextualize_action_to_foreground_dialog`."""

    @staticmethod
    def _only_finds_dont_save(_shot, text):
        """Fake text finder: only 'Ne pas enregistrer' is found on screen."""
        if text == "Ne pas enregistrer":
            return (100, 100)
        return None

    def test_contextual_notepad_unsaved_dialog_is_detected_via_visual_evidence(self):
        exe = _make_executor_skeleton()
        exe._capture_screenshot_b64 = MagicMock(return_value="shot")
        exe._find_text_on_screen = MagicMock(
            side_effect=self._only_finds_dont_save
        )

        action = {
            "action_id": "act_save_from_dialog",
            "type": "click",
            "visual_mode": True,
            "target_spec": {
                "window_title": "*test Bloc-notes",
                "by_text": "Enregistrer",
            },
            "expected_window_before": "*test Bloc-notes",
        }
        target_spec = dict(action["target_spec"])

        foreground_info = {"title": "Bloc-notes", "app_name": "Notepad.exe"}
        foreground_rect = {
            "title": "Bloc-notes",
            "app_name": "Notepad.exe",
            "rect": [500, 300, 1400, 900],
        }
        with patch(
            "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info",
            return_value=foreground_info,
        ), patch(
            "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect",
            return_value=foreground_rect,
        ):
            adapted = exe._maybe_contextualize_action_to_foreground_dialog(
                action,
                target_spec,
            )

        assert adapted is not None
        assert adapted["dialog_spec"]["id"] == "notepad_unsaved_changes"
        assert adapted["action"]["expected_window_before"] == "Bloc-notes"

        adapted_spec = adapted["target_spec"]
        assert adapted_spec["window_title"] == "Bloc-notes"
        assert adapted_spec["context_hints"]["foreground_dialog_id"] == (
            "notepad_unsaved_changes"
        )
        assert adapted_spec["window_capture"]["rect"] == [500, 300, 1400, 900]

    def test_contextual_notepad_dialog_is_ignored_without_matching_action(self):
        exe = _make_executor_skeleton()
        exe._capture_screenshot_b64 = MagicMock(return_value="shot")
        exe._find_text_on_screen = MagicMock(
            side_effect=self._only_finds_dont_save
        )

        action = {
            "action_id": "act_other_button",
            "type": "click",
            "visual_mode": True,
            "target_spec": {
                "window_title": "*test Bloc-notes",
                "by_text": "Annuler",
            },
        }

        foreground_info = {"title": "Bloc-notes", "app_name": "Notepad.exe"}
        with patch(
            "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info",
            return_value=foreground_info,
        ):
            adapted = exe._maybe_contextualize_action_to_foreground_dialog(
                action,
                dict(action["target_spec"]),
            )

        assert adapted is None
class TestPostVerifyWindowTransition:
    """Checks for `_requires_post_verify_window_transition`."""

    def test_requires_transition_when_expected_after_differs_from_source_window(self):
        source_action = {"expected_window_before": "*test Bloc-notes"}
        needs_transition = ActionExecutorV1._requires_post_verify_window_transition(
            action=source_action,
            target_spec=None,
            expected_after="Enregistrer sous",
        )
        assert needs_transition

    def test_same_window_title_does_not_require_transition(self):
        source_action = {"expected_window_before": "*test Bloc-notes"}
        needs_transition = ActionExecutorV1._requires_post_verify_window_transition(
            action=source_action,
            target_spec=None,
            expected_after="test Bloc-notes",
        )
        assert not needs_transition
# =========================================================================
# Routage de la garde dans verify_screen
# =========================================================================
def _make_executor_skeleton():
    """Build an ActionExecutorV1 without running its heavy __init__.

    The constructor (MouseController/KeyboardController/mss) is bypassed
    through ``__new__``; only the attributes strictly needed by the tested
    branches are wired by hand.
    """
    exe = ActionExecutorV1.__new__(ActionExecutorV1)

    plain_attributes = {
        "_notification_manager": None,
        "_system_dialog_pause": None,
        "_chat_window_ref": None,
        "_api_token": "",
        "_poll_backoff": 1.0,
        "_poll_backoff_min": 1.0,
        "_poll_backoff_max": 30.0,
        "_poll_backoff_factor": 1.5,
    }
    for attribute, value in plain_attributes.items():
        setattr(exe, attribute, value)

    # Fake mss handle exposing a single 1920x1080 monitor.
    fake_sct = MagicMock()
    fake_sct.monitors = [None, {"width": 1920, "height": 1080}]
    exe._sct = fake_sct

    # Stub the agent-side I/O helpers.
    exe._check_and_pause_on_system_dialog = MagicMock(return_value=False)
    exe._capture_screenshot_b64 = MagicMock(return_value=None)
    return exe
def _verify_action(patterns, timeout_ms=200):
return {
"action_id": "act_test_verify",
"type": "verify_screen",
"expected_node": "",
"timeout_ms": timeout_ms,
"expected_window_title_contains": patterns,
}
class TestVerifyScreenWindowGuard:
    """Routing of the window-title guard through `execute_replay_action`."""

    # Patch target providing the foreground window title.
    _ACTIVE_WINDOW_INFO = (
        "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info"
    )

    def test_matching_title_returns_success(self):
        exe = _make_executor_skeleton()
        verify = _verify_action(["Rechercher", "Search"])

        with patch(
            self._ACTIVE_WINDOW_INFO,
            return_value={"title": "Rechercher"},
        ):
            res = exe.execute_replay_action(verify)

        assert res["success"] is True
        assert res.get("warning") != "setup_guard_window_mismatch"

    def test_mismatch_with_human_correction_returns_success_supervised(self):
        exe = _make_executor_skeleton()
        # The user performs a corrective click once learning mode kicks in;
        # the captured sequence is returned and control goes back to the
        # server.
        exe._capture_human_correction = MagicMock(return_value=[
            {"type": "click", "x_pct": 0.10, "y_pct": 0.95},
        ])
        wrong_window = {
            "title": "Fenêtre de dépassement de capacité de la barre d'état système",
        }
        verify = _verify_action(["Rechercher", "Search"])

        with patch(self._ACTIVE_WINDOW_INFO, return_value=wrong_window):
            res = exe.execute_replay_action(verify)

        assert res["success"] is True
        assert res["warning"] == "setup_guard_window_mismatch"
        assert res["resolution_method"] == "human_supervised"
        correction = res["correction"]
        assert correction["trigger"] == "setup_guard_window_mismatch"
        assert correction["expected_patterns"] == ["Rechercher", "Search"]

    def test_mismatch_without_human_pauses_replay(self):
        exe = _make_executor_skeleton()
        exe._capture_human_correction = MagicMock(return_value=[])
        verify = _verify_action(["Rechercher"])

        with patch(
            self._ACTIVE_WINDOW_INFO,
            return_value={"title": "Notepad - Sans titre"},
        ):
            res = exe.execute_replay_action(verify)

        assert res["success"] is False
        assert res["warning"] == "setup_guard_window_mismatch"
        assert res.get("needs_human") is True
        assert "Rechercher" in res["error"]

    def test_verify_without_patterns_is_neutral_wait(self):
        """Without expected_window_title_contains, verify_screen is a plain wait.

        No window check and no learning mode are triggered.
        """
        exe = _make_executor_skeleton()
        exe._capture_human_correction = MagicMock()
        neutral_verify = {
            "action_id": "act_test_verify_neutral",
            "type": "verify_screen",
            "expected_node": "node_x",
            "timeout_ms": 200,
        }

        res = exe.execute_replay_action(neutral_verify)

        assert res["success"] is True
        exe._capture_human_correction.assert_not_called()

    def test_known_runtime_dialog_is_auto_handled_before_pause(self):
        exe = _make_executor_skeleton()
        exe._capture_human_correction = MagicMock(return_value=[])
        handled_result = {
            "action_id": "act_test_click",
            "success": True,
            "warning": "runtime_dialog_handled_skip",
            "resolution_method": "runtime_dialog:confirm_save_overwrite",
            "screenshot": None,
            "visual_resolved": False,
        }
        exe._maybe_handle_runtime_dialog_before_pause = MagicMock(
            return_value=handled_result
        )
        click_action = {
            "action_id": "act_test_click",
            "type": "click",
            "visual_mode": True,
            "x_pct": 0.5,
            "y_pct": 0.5,
            "target_spec": {
                "window_title": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes",
                "by_text": "",
            },
        }

        with patch(
            self._ACTIVE_WINDOW_INFO,
            return_value={"title": "Confirmer l'enregistrement"},
        ):
            res = exe.execute_replay_action(click_action)

        assert res["success"] is True
        assert res["warning"] == "runtime_dialog_handled_skip"
        exe._maybe_handle_runtime_dialog_before_pause.assert_called_once()
        exe._capture_human_correction.assert_not_called()
# =========================================================================
# Skip pixel-change validation for _setup_phase actions
# =========================================================================
def _make_executor_with_mouse_skeleton():
    """Like `_make_executor_skeleton`, with mocked mouse and keyboard too,
    so the click branch of execute_replay_action can be traversed without
    touching the desktop."""
    skeleton = _make_executor_skeleton()
    fake_mouse = MagicMock()
    fake_mouse.position = (0, 0)
    skeleton.mouse = fake_mouse
    skeleton.keyboard = MagicMock()
    # _quick_screenshot_hash returns a non-empty string → pixel check active
    # (per the original author's note; the executor side is not visible here).
    skeleton._quick_screenshot_hash = MagicMock(return_value="hash_before")
    return skeleton
class TestSetupActionsSkipPixelChange:
    """For automatic-setup actions (`_setup_phase=True`), validation by a
    mere pixel change is neutralised. The verify_screen guard that follows
    decides instead — otherwise a click_start opening the systray overflow
    popup would be validated just because the screen changed.
    """

    def test_setup_click_skips_screen_change_check(self):
        """A setup click succeeds without calling the screen-change wait."""
        executor = _make_executor_with_mouse_skeleton()
        executor._wait_for_screen_change = MagicMock(return_value=False)
        executor._capture_human_correction = MagicMock()
        # No visual_mode, so real visual resolution is avoided.
        setup_click = {
            "action_id": "act_setup_click_start",
            "type": "click",
            "x_pct": 0.02,
            "y_pct": 0.98,
            "_setup_phase": True,
            "_setup_step": "click_start_menu",
        }
        outcome = executor.execute_replay_action(setup_click)
        assert outcome["success"] is True
        # _wait_for_screen_change must NOT be called for setup actions.
        executor._wait_for_screen_change.assert_not_called()
        # Learning mode must not be triggered either.
        executor._capture_human_correction.assert_not_called()

    def test_non_setup_click_still_runs_screen_change_check(self):
        """Non-regression: a click outside setup keeps the pixel-change
        validation, which reports a failure when the screen does not change."""
        executor = _make_executor_with_mouse_skeleton()
        executor._wait_for_screen_change = MagicMock(return_value=False)
        executor._capture_human_correction = MagicMock(return_value=[])
        # Deliberately no _setup_phase key.
        user_click = {
            "action_id": "act_user_click",
            "type": "click",
            "x_pct": 0.5,
            "y_pct": 0.5,
        }
        outcome = executor.execute_replay_action(user_click)
        executor._wait_for_screen_change.assert_called_once()
        # No visual_mode → plain failure branch, success=False
        assert outcome.get("warning") == "no_screen_change"
        assert outcome["success"] is False
class TestRuntimeDialogHandling:
    """Tests for runtime dialogs met during replay.

    Covers the direct dialog handler, the hook invoked before pausing, and
    the post-click window verification path of ``execute_replay_action``.
    """

    def test_handle_confirm_save_dialog_clicks_oui_via_server(self):
        """A server-resolved button position is clicked for the save dialog."""
        exe = _make_executor_skeleton()
        exe._capture_screenshot_b64 = MagicMock(return_value="abc")
        exe._server_resolve_target = MagicMock(
            return_value={
                "resolved": True,
                "x_pct": 0.25,
                "y_pct": 0.75,
                "method": "hybrid_text_direct",
                "score": 0.91,
            }
        )
        exe._find_text_on_screen = MagicMock(return_value=None)
        exe._click = MagicMock()
        spec = ActionExecutorV1._match_known_runtime_dialog(
            "Confirmer l'enregistrement"
        )
        with patch("agent_v0.agent_v1.config.SERVER_URL", "http://srv"):
            handled = exe._handle_known_runtime_dialog(
                spec, "Confirmer l'enregistrement", 1920, 1080
            )
        assert handled["handled"] is True
        assert handled["button_text"] == "Oui"
        exe._server_resolve_target.assert_called_once()
        # (0.25, 0.75) on a 1920x1080 screen is pixel (480, 810).
        exe._click.assert_called_once_with((480, 810), "left")

    def test_runtime_dialog_before_pause_returns_skip_result(self):
        """A handled dialog yields a success result flagged as a skip."""
        exe = _make_executor_skeleton()
        exe._check_and_pause_on_system_dialog = MagicMock(return_value=False)
        exe._handle_known_runtime_dialog = MagicMock(
            return_value={
                "handled": True,
                "button_text": "Oui",
                "x_pct": 0.33,
                "y_pct": 0.66,
                "resolution_score": 0.9,
            }
        )
        exe._capture_screenshot_b64 = MagicMock(return_value="after")
        res = exe._maybe_handle_runtime_dialog_before_pause(
            action={"action_id": "act_final_click", "type": "click"},
            target_spec={},
            expected_title="http192.168.1.408765dossier.htmlid=.txt Bloc-notes",
            current_title="Confirmer l'enregistrement",
            screen_width=1920,
            screen_height=1080,
        )
        assert res["success"] is True
        assert res["warning"] == "runtime_dialog_handled_skip"
        assert res["correction"]["button_text"] == "Oui"
        assert res["actual_position"] == {"x_pct": 0.33, "y_pct": 0.66}

    def test_post_verify_handles_runtime_dialog_and_recovers_expected_window(self):
        """One dialog handling is enough once the expected window reappears."""
        exe = _make_executor_skeleton()
        exe._click = MagicMock()
        exe._quick_screenshot_hash = MagicMock(return_value="hash_before")
        exe._wait_for_screen_change = MagicMock(return_value=True)
        # Shared flag: flipped by the fake handler, read by the fake window lookup.
        handled_state = {"done": False}

        def _fake_handle(dialog_spec, current_title, screen_width, screen_height):
            handled_state["done"] = True
            return {
                "handled": True,
                "button_text": "Oui",
                "x_pct": 0.33,
                "y_pct": 0.66,
                "resolution_score": 0.9,
            }

        exe._handle_known_runtime_dialog = MagicMock(side_effect=_fake_handle)
        action = {
            "action_id": "act_save_dialog",
            "type": "click",
            "x_pct": 0.5,
            "y_pct": 0.5,
            "expected_window_title": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes",
        }

        def _window_info():
            # Reports the dialog title until the handler has run, then the
            # expected window title.
            if handled_state["done"]:
                return {"title": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes"}
            return {"title": "Confirmer lenregistrement"}

        # time.sleep is replaced by a no-op so the test does not wait.
        with patch("agent_v0.agent_v1.core.executor.time.sleep", lambda *_a, **_k: None):
            with patch(
                "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info",
                side_effect=_window_info,
            ):
                res = exe.execute_replay_action(action)
        assert res["success"] is True
        assert res["warning"] == "runtime_dialog_handled_post_verify"
        assert res["actual_position"] == {"x_pct": 0.5, "y_pct": 0.5}
        exe._handle_known_runtime_dialog.assert_called_once()

    def test_post_verify_can_retry_same_runtime_dialog_before_recovery(self):
        """The dialog is handled twice when it is still shown after the first pass."""
        exe = _make_executor_skeleton()
        exe._click = MagicMock()
        exe._quick_screenshot_hash = MagicMock(return_value="hash_before")
        exe._wait_for_screen_change = MagicMock(return_value=True)
        # Counts handler invocations; the fake window lookup switches title at 2.
        handled_state = {"count": 0}

        def _fake_handle(dialog_spec, current_title, screen_width, screen_height):
            handled_state["count"] += 1
            return {
                "handled": True,
                "button_text": "Oui",
                "x_pct": 0.33,
                "y_pct": 0.66,
                "resolution_score": 0.9,
            }

        exe._handle_known_runtime_dialog = MagicMock(side_effect=_fake_handle)
        action = {
            "action_id": "act_save_dialog_retry",
            "type": "click",
            "x_pct": 0.5,
            "y_pct": 0.5,
            "expected_window_title": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes",
        }

        def _window_info():
            # The dialog title persists until the handler has run twice.
            if handled_state["count"] >= 2:
                return {"title": "http192.168.1.408765dossier.htmlid=.txt Bloc-notes"}
            return {"title": "Confirmer lenregistrement"}

        with patch("agent_v0.agent_v1.core.executor.time.sleep", lambda *_a, **_k: None):
            with patch(
                "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info",
                side_effect=_window_info,
            ):
                res = exe.execute_replay_action(action)
        assert res["success"] is True
        assert res["warning"] == "runtime_dialog_handled_post_verify"
        assert handled_state["count"] == 2
        assert res["runtime_dialog"]["dialog_id"] == "confirm_save_overwrite"

    def test_post_verify_wrong_window_fails_when_dialog_transition_was_expected(self):
        """An expected window transition ending in an unrelated window is a failure."""
        exe = _make_executor_skeleton()
        exe._click = MagicMock()
        exe._quick_screenshot_hash = MagicMock(return_value="hash_before")
        exe._wait_for_screen_change = MagicMock(return_value=True)
        exe._capture_screenshot_b64 = MagicMock(return_value="after")
        exe._notification_manager = MagicMock()
        # expected_window_before differs from expected_window_title: the click
        # is meant to move from the editor to the "Enregistrer sous" dialog.
        action = {
            "action_id": "act_open_save_dialog",
            "type": "click",
            "x_pct": 0.5,
            "y_pct": 0.5,
            "expected_window_before": "*test Bloc-notes",
            "expected_window_title": "Enregistrer sous",
        }
        with patch("agent_v0.agent_v1.core.executor.time.sleep", lambda *_a, **_k: None):
            with patch(
                "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info",
                return_value={"title": "rpa_vision : Explorateur de fichiers"},
            ):
                res = exe.execute_replay_action(action)
        assert res["success"] is False
        assert res["warning"] == "wrong_window"
        assert "Enregistrer sous" in res["error"]
        assert "rpa_vision : Explorateur de fichiers" in res["error"]
        assert res["needs_human"] is True
        exe._notification_manager.replay_wrong_window.assert_called_once()

    def test_post_verify_same_window_mismatch_stays_legacy_warning(self):
        """With near-identical before/after titles, a mismatch only warns."""
        exe = _make_executor_skeleton()
        exe._click = MagicMock()
        exe._quick_screenshot_hash = MagicMock(return_value="hash_before")
        exe._wait_for_screen_change = MagicMock(return_value=True)
        exe._capture_screenshot_b64 = MagicMock(return_value="after")
        # The two expected titles differ only by the leading "*".
        action = {
            "action_id": "act_same_window_click",
            "type": "click",
            "x_pct": 0.5,
            "y_pct": 0.5,
            "expected_window_before": "*test Bloc-notes",
            "expected_window_title": "test Bloc-notes",
        }
        with patch("agent_v0.agent_v1.core.executor.time.sleep", lambda *_a, **_k: None):
            with patch(
                "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info",
                return_value={"title": "rpa_vision : Explorateur de fichiers"},
            ):
                res = exe.execute_replay_action(action)
        assert res["success"] is True
        assert res["warning"] == "post_verif_timeout:rpa_vision : Explorateur de fichiers"
class TestCloseTabHotkeyFallback:
    """Close-tab clicks that fall back to the Ctrl+W hotkey."""

    def test_visual_close_tab_uses_ctrl_w_when_tab_x_is_hidden(self):
        """With no screen observation, the close-tab click becomes Ctrl+W."""
        executor = _make_executor_with_mouse_skeleton()
        executor._observe_screen = MagicMock(return_value=None)
        executor._capture_human_correction = MagicMock(return_value=[])
        executor._execute_key_combo = MagicMock()
        executor._click = MagicMock()
        executor._wait_for_screen_change = MagicMock(return_value=True)
        executor._capture_screenshot_b64 = MagicMock(return_value="after")
        close_tab_spec = {
            "window_title": "*test Bloc-notes",
            "by_role": "tab_close_button",
            "context_hints": {
                "interaction": "close_tab",
                "active_tab_label": "test",
            },
        }
        close_tab_click = {
            "action_id": "act_close_tab",
            "type": "click",
            "visual_mode": True,
            "x_pct": 0.88,
            "y_pct": 0.04,
            "target_spec": close_tab_spec,
        }
        # Patchers are entered in the same order as the original nesting.
        no_sleep = patch(
            "agent_v0.agent_v1.core.executor.time.sleep",
            lambda *_a, **_k: None,
        )
        active_window = patch(
            "agent_v0.agent_v1.window_info_crossplatform.get_active_window_info",
            return_value={"title": "*test Bloc-notes"},
        )
        with no_sleep, active_window:
            outcome = executor.execute_replay_action(close_tab_click)
        assert outcome["success"] is True
        assert outcome["warning"] == "close_tab_hotkey_fallback"
        assert outcome["resolution_method"] == "semantic_close_tab_hotkey"
        executor._execute_key_combo.assert_called_once_with(["ctrl", "w"])
        executor._click.assert_not_called()
        executor._capture_human_correction.assert_not_called()
class TestStartButtonHotkeyFallback:
    """Start-button setup click: hotkey fallback versus real mouse click.

    Both tests replay the same setup action with ``GroundingEngine.locate``
    patched; only the grounding result differs.
    """

    @staticmethod
    def _prepared_executor():
        """Return a mouse skeleton with the collaborators both tests inspect mocked."""
        executor = _make_executor_with_mouse_skeleton()
        executor._observe_screen = MagicMock(return_value=None)
        executor._capture_human_correction = MagicMock(return_value=[])
        executor._execute_key_combo = MagicMock()
        executor._click = MagicMock()
        executor._capture_screenshot_b64 = MagicMock(return_value="after")
        executor._wait_for_screen_change = MagicMock(return_value=True)
        return executor

    @staticmethod
    def _start_button_action():
        """Return a fresh copy of the setup click targeting the start button."""
        return {
            "action_id": "act_setup_click_start",
            "type": "click",
            "visual_mode": True,
            "x_pct": 0.387891,
            "y_pct": 0.974375,
            "_setup_phase": True,
            "_setup_step": "click_start_menu",
            "target_spec": {
                "by_role": "start_button",
                "by_text": "",
                "anchor_image_base64": "abc123",
                "allow_position_fallback": True,
                "screen_scope": "full_screen",
            },
        }

    @staticmethod
    def _replay(executor, action, grounding_result):
        """Run the action with grounding forced to ``grounding_result`` and sleeps disabled."""
        forced_locate = patch(
            "agent_v0.agent_v1.core.grounding.GroundingEngine.locate",
            return_value=grounding_result,
        )
        no_sleep = patch(
            "agent_v0.agent_v1.core.executor.time.sleep",
            lambda *_a, **_k: None,
        )
        with forced_locate, no_sleep:
            return executor.execute_replay_action(action, server_url="http://srv")

    def test_setup_start_button_position_fallback_uses_windows_key(self):
        """A ``position_fallback`` grounding result is replaced by the Win key."""
        executor = self._prepared_executor()
        positional_only = SimpleNamespace(
            found=True,
            x_pct=0.387891,
            y_pct=0.974375,
            method="position_fallback",
            score=0.2,
            detail="fallback positionnel explicite",
            elapsed_ms=12.0,
        )
        outcome = self._replay(
            executor, self._start_button_action(), positional_only
        )
        assert outcome["success"] is True
        assert outcome["warning"] == "start_button_hotkey_fallback"
        assert outcome["resolution_method"] == "semantic_start_button_hotkey"
        executor._execute_key_combo.assert_called_once_with(["win"])
        executor._click.assert_not_called()
        executor._wait_for_screen_change.assert_not_called()
        executor._capture_human_correction.assert_not_called()

    def test_real_visual_start_button_match_keeps_mouse_click(self):
        """A ``vlm_quick_find`` grounding result keeps the mouse click."""
        executor = self._prepared_executor()
        visual_match = SimpleNamespace(
            found=True,
            x_pct=0.389,
            y_pct=0.973,
            method="vlm_quick_find",
            score=0.93,
            detail="match VLM plausible",
            elapsed_ms=35.0,
        )
        outcome = self._replay(
            executor, self._start_button_action(), visual_match
        )
        assert outcome["success"] is True
        assert outcome["resolution_method"] == "vlm_quick_find"
        executor._execute_key_combo.assert_not_called()
        executor._click.assert_called_once()
        executor._wait_for_screen_change.assert_not_called()
        executor._capture_human_correction.assert_not_called()

View File

@@ -0,0 +1,58 @@
"""Tests pour le flag RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE.
Brief Codex 2026-05-23 09:02 : le chemin produit cible est le workflow
compilé (post worker VLM), pas le replay direct depuis raw events.
Le flag env désactive la proposition automatique de replay direct par
défaut. Le chemin direct reste accessible (smoke/debug) via RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE=true.
"""
from __future__ import annotations
import sys
from pathlib import Path
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
from agent_v0.server_v1.replay_engine import ( # noqa: E402
_auto_launch_replay_after_finalize,
)
class TestAutoLaunchReplayFlag:
    """How `_auto_launch_replay_after_finalize` reads its environment flag."""

    # Environment variable read by the function under test.
    _ENV_NAME = "RPA_AUTO_LAUNCH_REPLAY_AFTER_FINALIZE"

    def test_default_is_false(self, monkeypatch):
        """Without the environment variable, product mode is active → no
        automatic proposal of direct replay."""
        monkeypatch.delenv(self._ENV_NAME, raising=False)
        enabled = _auto_launch_replay_after_finalize()
        assert enabled is False

    def test_true_value_activates(self, monkeypatch):
        """The value ``true`` enables the flag."""
        monkeypatch.setenv(self._ENV_NAME, "true")
        enabled = _auto_launch_replay_after_finalize()
        assert enabled is True

    def test_1_value_activates(self, monkeypatch):
        """The value ``1`` enables the flag."""
        monkeypatch.setenv(self._ENV_NAME, "1")
        enabled = _auto_launch_replay_after_finalize()
        assert enabled is True

    def test_yes_value_activates(self, monkeypatch):
        """The value ``yes`` enables the flag."""
        monkeypatch.setenv(self._ENV_NAME, "yes")
        enabled = _auto_launch_replay_after_finalize()
        assert enabled is True

    def test_false_value_deactivates(self, monkeypatch):
        """The value ``false`` leaves the flag disabled."""
        monkeypatch.setenv(self._ENV_NAME, "false")
        enabled = _auto_launch_replay_after_finalize()
        assert enabled is False

    def test_empty_value_deactivates(self, monkeypatch):
        """An empty value leaves the flag disabled."""
        monkeypatch.setenv(self._ENV_NAME, "")
        enabled = _auto_launch_replay_after_finalize()
        assert enabled is False

    def test_arbitrary_value_deactivates(self, monkeypatch):
        """Any non-truthy value returns False (default-deny)."""
        monkeypatch.setenv(self._ENV_NAME, "maybe")
        enabled = _auto_launch_replay_after_finalize()
        assert enabled is False

    def test_case_insensitive(self, monkeypatch):
        """Upper-case and mixed-case truthy values enable the flag."""
        monkeypatch.setenv(self._ENV_NAME, "TRUE")
        upper_enabled = _auto_launch_replay_after_finalize()
        assert upper_enabled is True
        monkeypatch.setenv(self._ENV_NAME, "Yes")
        mixed_enabled = _auto_launch_replay_after_finalize()
        assert mixed_enabled is True

View File

@@ -0,0 +1,46 @@
from __future__ import annotations
import sys
from pathlib import Path
from unittest.mock import MagicMock
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
from agent_v0.agent_v1.core.grounding import GroundingEngine # noqa: E402
def test_template_strategy_passes_fallback_coords_to_anchor_drift_guard():
    """The template strategy forwards the fallback coordinates to
    ``_template_match_anchor`` as ``fallback_x_pct`` / ``fallback_y_pct``."""
    anchor_hit = {
        "resolved": True,
        "x_pct": 0.7,
        "y_pct": 0.35,
        "score": 0.95,
    }
    executor = MagicMock()
    executor._template_match_anchor = MagicMock(return_value=anchor_hit)
    engine = GroundingEngine(executor)

    outcome = engine._try_strategy(
        "template",
        server_url="",
        screenshot_b64="shot",
        target_spec={"anchor_image_base64": "abc123"},
        fallback_x=0.708594,
        fallback_y=0.35,
        screen_width=2560,
        screen_height=1600,
    )

    assert outcome.found is True
    executor._template_match_anchor.assert_called_once_with(
        "shot",
        "abc123",
        2560,
        1600,
        fallback_x_pct=0.708594,
        fallback_y_pct=0.35,
    )

View File

@@ -111,6 +111,310 @@ class TestGroundingEngine:
assert d["x_pct"] == 0.5
assert d["method"] == "som"
def test_start_button_uses_full_screen_instead_of_active_window(self):
    """The Start button must be resolved against the whole screen."""
    grounding, exe_mock = self._make_engine()
    server_hit = {
        "resolved": True,
        "x_pct": 0.02,
        "y_pct": 0.98,
        "method": "som_text",
        "score": 0.9,
        "matched_element": {"label": "Démarrer"},
    }
    exe_mock._server_resolve_target.return_value = server_hit
    grounding._capture_window_or_screen = MagicMock(return_value="fake_b64_data")
    active_rect = patch(
        "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect",
        return_value={"rect": [100, 100, 1100, 900]},
    )
    with active_rect:
        located = grounding.locate(
            "http://server",
            {"by_text": "Démarrer", "by_role": "start_button"},
            0.02, 0.98, 1920, 1080,
        )
    assert located.found is True
    # Capture was requested with no region.
    grounding._capture_window_or_screen.assert_called_once_with(None)
def test_regular_targets_stay_scoped_to_active_window(self):
    """Ordinary application targets stay bounded to the active window."""
    grounding, exe_mock = self._make_engine()
    server_hit = {
        "resolved": True,
        "x_pct": 0.5,
        "y_pct": 0.25,
        "method": "som_text",
        "score": 0.9,
        "matched_element": {"label": "Enregistrer"},
    }
    exe_mock._server_resolve_target.return_value = server_hit
    grounding._capture_window_or_screen = MagicMock(return_value="fake_b64_data")
    active_rect = patch(
        "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect",
        return_value={"rect": [100, 200, 1100, 1000]},
    )
    with active_rect:
        located = grounding.locate(
            "http://server",
            {"by_text": "Enregistrer", "by_role": "button"},
            0.5, 0.3, 1920, 1080,
        )
    assert located.found is True
    # rect [100, 200, 1100, 1000] is expected as a left/top/width/height region.
    expected_region = {"left": 100, "top": 200, "width": 1000, "height": 800}
    grounding._capture_window_or_screen.assert_called_once_with(expected_region)
def test_unknown_window_rect_falls_back_to_full_screen_on_visual_mismatch(self):
    """An unknown title is only accepted if the crop is validated visually."""
    grounding, exe_mock = self._make_engine()
    server_hit = {
        "resolved": True,
        "x_pct": 0.5,
        "y_pct": 0.25,
        "method": "som_text",
        "score": 0.9,
        "matched_element": {"label": "Enregistrer"},
    }
    exe_mock._server_resolve_target.return_value = server_hit
    exe_mock._find_text_on_screen.return_value = None
    # First capture result stands for the window crop, second for the full screen.
    grounding._capture_window_or_screen = MagicMock(
        side_effect=["fake_window_b64", "fake_screen_b64"]
    )
    active_rect = patch(
        "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect",
        return_value={
            "title": "unknown_window",
            "rect": [100, 200, 1100, 1000],
        },
    )
    with active_rect:
        located = grounding.locate(
            "http://server",
            {"by_text": "Enregistrer", "by_role": "button"},
            0.5, 0.3, 1920, 1080,
        )
    assert located.found is True
    requested_regions = [
        call.args[0]
        for call in grounding._capture_window_or_screen.call_args_list
    ]
    assert requested_regions == [
        {"left": 100, "top": 200, "width": 1000, "height": 800},
        None,
    ]
def test_taskbar_like_rect_falls_back_to_full_screen(self):
    """A taskbar/systray must never be used as the active window."""
    grounding, exe_mock = self._make_engine()
    server_hit = {
        "resolved": True,
        "x_pct": 0.5,
        "y_pct": 0.25,
        "method": "som_text",
        "score": 0.9,
        "matched_element": {"label": "Enregistrer"},
    }
    exe_mock._server_resolve_target.return_value = server_hit
    grounding._capture_window_or_screen = MagicMock(return_value="fake_b64_data")
    # Full-width strip at the bottom of a 2560x1600 screen.
    systray_window = {
        "title": "Fenêtre de dépassement de capacité de la barre d'état système",
        "rect": [0, 1492, 2560, 1600],
    }
    active_rect = patch(
        "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect",
        return_value=systray_window,
    )
    with active_rect:
        located = grounding.locate(
            "http://server",
            {"by_text": "Enregistrer", "by_role": "button"},
            0.5, 0.3, 2560, 1600,
        )
    assert located.found is True
    grounding._capture_window_or_screen.assert_called_once_with(None)
def test_visually_mismatched_window_crop_falls_back_to_full_screen(self):
    """A plausible but visually wrong window crop is rejected."""
    grounding, exe_mock = self._make_engine()
    server_hit = {
        "resolved": True,
        "x_pct": 0.5,
        "y_pct": 0.25,
        "method": "som_text",
        "score": 0.9,
        "matched_element": {"label": "Enregistrer"},
    }
    exe_mock._server_resolve_target.return_value = server_hit
    exe_mock._find_text_on_screen.return_value = None
    grounding._capture_window_or_screen = MagicMock(
        side_effect=["fake_window_b64", "fake_screen_b64"]
    )
    expected_spec = {
        "by_text": "Enregistrer",
        "by_role": "button",
        "window_title": "Enregistrer sous",
    }
    active_rect = patch(
        "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect",
        return_value={
            "title": "Enregistrer sous",
            "rect": [100, 200, 1100, 1000],
        },
    )
    with active_rect:
        # A copy is passed so the assertion below still compares against an
        # untouched spec, as the original separate literal did.
        located = grounding.locate(
            "http://server",
            dict(expected_spec),
            0.5, 0.3, 1920, 1080,
        )
    assert located.found is True
    requested_regions = [
        call.args[0]
        for call in grounding._capture_window_or_screen.call_args_list
    ]
    assert requested_regions == [
        {"left": 100, "top": 200, "width": 1000, "height": 800},
        None,
    ]
    # The server is queried once, with the second (full-screen) capture.
    exe_mock._server_resolve_target.assert_called_once_with(
        "http://server",
        "fake_screen_b64",
        expected_spec,
        0.5,
        0.3,
        1920,
        1080,
    )
def test_visually_validated_window_crop_stays_scoped(self):
    """A plausible, visually validated window crop remains allowed."""
    grounding, exe_mock = self._make_engine()
    server_hit = {
        "resolved": True,
        "x_pct": 0.5,
        "y_pct": 0.25,
        "method": "som_text",
        "score": 0.9,
        "matched_element": {"label": "Enregistrer"},
    }
    exe_mock._server_resolve_target.return_value = server_hit
    # The text lookup reports a hit this time.
    exe_mock._find_text_on_screen.return_value = (321, 222)
    grounding._capture_window_or_screen = MagicMock(return_value="fake_window_b64")
    active_rect = patch(
        "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect",
        return_value={
            "title": "Enregistrer sous",
            "rect": [100, 200, 1100, 1000],
        },
    )
    save_button_spec = {
        "by_text": "Enregistrer",
        "by_role": "button",
        "window_title": "Enregistrer sous",
    }
    with active_rect:
        located = grounding.locate(
            "http://server",
            save_button_spec,
            0.5, 0.3, 1920, 1080,
        )
    assert located.found is True
    expected_region = {"left": 100, "top": 200, "width": 1000, "height": 800}
    grounding._capture_window_or_screen.assert_called_once_with(expected_region)
def test_lea_active_window_does_not_scope_external_target(self):
    """A foreground Léa window must never constrain an external target."""
    grounding, exe_mock = self._make_engine()
    server_hit = {
        "resolved": True,
        "x_pct": 0.5,
        "y_pct": 0.25,
        "method": "som_text",
        "score": 0.9,
        "matched_element": {"label": "Bloc-notes"},
    }
    exe_mock._server_resolve_target.return_value = server_hit
    grounding._capture_window_or_screen = MagicMock(return_value="fake_b64_data")
    lea_window = {
        "title": "Léa — Assistante",
        "app_name": "pythonw.exe",
        "rect": [1948, 750, 2570, 1606],
    }
    expected_spec = {"by_text": "Bloc-notes", "by_role": "search_result"}
    active_rect = patch(
        "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect",
        return_value=lea_window,
    )
    with active_rect:
        # A copy is passed so the later assertion compares against an
        # untouched spec, as the original separate literal did.
        located = grounding.locate(
            "http://server",
            dict(expected_spec),
            0.2, 0.5, 2560, 1600,
        )
    assert located.found is True
    grounding._capture_window_or_screen.assert_called_once_with(None)
    exe_mock._server_resolve_target.assert_called_once_with(
        "http://server",
        "fake_b64_data",
        expected_spec,
        0.2,
        0.5,
        2560,
        1600,
    )
def test_lea_active_window_stays_scoped_for_explicit_lea_target(self):
    """If the target explicitly mentions Léa, scoping to its window stays allowed."""
    grounding, exe_mock = self._make_engine()
    server_hit = {
        "resolved": True,
        "x_pct": 0.5,
        "y_pct": 0.25,
        "method": "som_text",
        "score": 0.9,
        "matched_element": {"label": "Continuer"},
    }
    exe_mock._server_resolve_target.return_value = server_hit
    grounding._capture_window_or_screen = MagicMock(return_value="fake_b64_data")
    lea_window = {
        "title": "Léa — Assistante",
        "app_name": "pythonw.exe",
        "rect": [1948, 750, 2570, 1606],
    }
    lea_button_spec = {
        "by_text": "Continuer",
        "by_role": "button",
        "window_title": "Léa — Assistante",
    }
    active_rect = patch(
        "agent_v0.agent_v1.window_info_crossplatform.get_active_window_rect",
        return_value=lea_window,
    )
    with active_rect:
        located = grounding.locate(
            "http://server",
            lea_button_spec,
            0.5, 0.3, 3000, 2000,
        )
    assert located.found is True
    # 2570 - 1948 = 622 and 1606 - 750 = 856.
    expected_region = {"left": 1948, "top": 750, "width": 622, "height": 856}
    grounding._capture_window_or_screen.assert_called_once_with(expected_region)
def test_allow_position_fallback_returns_recorded_coords(self):
    """When allowed, grounding may fall back to the recorded position."""
    grounding, exe_mock = self._make_engine()
    # Every mocked resolver reports nothing.
    exe_mock._server_resolve_target.return_value = None
    exe_mock._template_match_anchor.return_value = None
    exe_mock._hybrid_vlm_resolve.return_value = None
    recorded_x, recorded_y = 0.387891, 0.974375
    start_spec = {
        "by_role": "start_button",
        "vlm_description": "icône Windows",
        "screen_scope": "full_screen",
        "allow_position_fallback": True,
    }
    located = grounding.locate(
        "http://server",
        start_spec,
        recorded_x, recorded_y, 1920, 1080,
    )
    assert located.found is True
    assert located.method == "position_fallback"
    assert located.x_pct == pytest.approx(0.387891)
    assert located.y_pct == pytest.approx(0.974375)
# =========================================================================
# P2 : Policy — décisions quand grounding échoue
@@ -407,6 +711,65 @@ class TestReplayLearner:
assert "action_id" in data
assert "success" in data
def test_record_human_correction_persists_to_memory_helper(self, learner, monkeypatch):
    """A human correction must feed persistent memory through replay_memory."""
    recorded = {}

    def fake_memory_record_success(**kwargs):
        # Keep the keyword arguments for the assertions below.
        recorded.update(kwargs)
        return True

    monkeypatch.setattr(
        "agent_v0.server_v1.replay_memory.memory_record_success",
        fake_memory_record_success,
    )
    corrected_action = {
        "action_id": "a_corr",
        "target_spec": {"by_text": "Valider", "window_title": "Bloc-notes"},
    }
    learner.record_human_correction(
        session_id="s_corr",
        action=corrected_action,
        correction={"x_pct": 0.42, "y_pct": 0.84},
    )

    entries = learner.load_session("s_corr")
    assert len(entries) == 1
    first_entry = entries[0]
    assert first_entry.resolution_method == "human_supervised"
    assert first_entry.window_title == "Bloc-notes"
    assert recorded["window_title"] == "Bloc-notes"
    assert recorded["target_spec"]["by_text"] == "Valider"
    assert recorded["x_pct"] == 0.42
    assert recorded["y_pct"] == 0.84
    assert recorded["method"] == "human_supervised"
    assert recorded["confidence"] == 1.0
def test_record_human_correction_fallback_window_title_from_action(self, learner, monkeypatch):
    """If target_spec.window_title is absent, action.window_title is used."""
    recorded = {}

    def fake_memory_record_success(**kwargs):
        # Keep the keyword arguments for the assertion below.
        recorded.update(kwargs)
        return True

    monkeypatch.setattr(
        "agent_v0.server_v1.replay_memory.memory_record_success",
        fake_memory_record_success,
    )
    # The window title sits on the action, not inside target_spec.
    corrected_action = {
        "action_id": "a_corr2",
        "window_title": "Fenêtre fallback",
        "target_spec": {"by_text": "Enregistrer"},
    }
    learner.record_human_correction(
        session_id="s_corr2",
        action=corrected_action,
        correction={"x_pct": 0.1, "y_pct": 0.2},
    )

    assert recorded["window_title"] == "Fenêtre fallback"
# =========================================================================
# Boucle d'apprentissage : consolidation cross-workflow

View File

@@ -145,6 +145,20 @@ class TestVerifyWithCritic:
assert result.suggestion == "retry"
assert result.semantic_verified is None # VLM non appelé
def test_verify_screen_identique_ne_declenche_pas_retry(
    self, verifier, screenshot_gray,
):
    """verify_screen is a stabilisation step, not an action that must change the screen again."""
    stabilisation_step = {"type": "verify_screen", "action_id": "verify_setup"}
    # The same screenshot is used before and after: nothing changed on screen.
    verdict = verifier.verify_action(
        action=stabilisation_step,
        result={"success": True},
        screenshot_before=screenshot_gray,
        screenshot_after=screenshot_gray,
    )
    assert verdict.verified is True
    assert verdict.suggestion == "continue"
    assert verdict.changes_detected is False
@patch("agent_v0.server_v1.replay_verifier.ReplayVerifier._verify_semantic")
def test_pixel_ok_semantic_ok(
self, mock_semantic, verifier, screenshot_gray, screenshot_white,

View File

@@ -0,0 +1,118 @@
from types import SimpleNamespace
from agent_v0.server_v1 import replay_memory
from core.learning.target_memory_store import TargetMemoryStore
class _DummyStore:
def __init__(self, fp):
self._fp = fp
def lookup(self, screen_sig, spec_shim):
return self._fp
def test_memory_lookup_uses_window_relative_coords_when_available(monkeypatch):
    """A ``position_fallback`` memory hit is re-expressed from ``window_capture``."""
    fingerprint = SimpleNamespace(
        bbox=(0.566016, 0.400625, 0.0, 0.0),
        etype="position_fallback",
        confidence=0.2,
    )
    monkeypatch.setattr(
        replay_memory, "get_memory_store", lambda: _DummyStore(fingerprint)
    )
    spec_with_capture = {
        "by_text": "Bloc-notes",
        "window_capture": {
            "click_relative": [681, 448],
            "window_size": [1287, 1407],
        },
    }

    hit = replay_memory.memory_lookup(
        window_title="Rechercher",
        target_spec=spec_with_capture,
    )

    assert hit is not None
    assert hit["method"] == "memory_position_fallback"
    # Expected coordinates are click_relative divided by window_size.
    assert hit["x_pct"] == 681 / 1287
    assert hit["y_pct"] == 448 / 1407
def test_memory_lookup_keeps_bbox_coords_without_window_capture(monkeypatch):
    """Without ``window_capture``, the stored bbox coordinates are returned."""
    fingerprint = SimpleNamespace(
        bbox=(0.566016, 0.400625, 0.0, 0.0),
        etype="position_fallback",
        confidence=0.2,
    )
    monkeypatch.setattr(
        replay_memory, "get_memory_store", lambda: _DummyStore(fingerprint)
    )

    hit = replay_memory.memory_lookup(
        window_title="Rechercher",
        target_spec={"by_text": "Bloc-notes"},
    )

    assert hit is not None
    assert hit["x_pct"] == 0.566016
    assert hit["y_pct"] == 0.400625
def test_memory_lookup_keeps_learned_visual_coords_with_window_capture(monkeypatch):
    """An ``anchor_template`` memory hit keeps its bbox even with ``window_capture``."""
    fingerprint = SimpleNamespace(
        bbox=(0.402734375, 0.578125, 0.0, 0.0),
        etype="anchor_template",
        confidence=0.99,
    )
    monkeypatch.setattr(
        replay_memory, "get_memory_store", lambda: _DummyStore(fingerprint)
    )
    spec_with_capture = {
        "by_text": "Enregistrer",
        "by_role": "yolo",
        "window_capture": {
            "click_relative": [860, 634],
            "window_size": [1920, 1116],
        },
    }

    hit = replay_memory.memory_lookup(
        window_title="*test Bloc-notes",
        target_spec=spec_with_capture,
    )

    assert hit is not None
    assert hit["method"] == "memory_anchor_template"
    assert hit["x_pct"] == 0.402734375
    assert hit["y_pct"] == 0.578125
def test_target_spec_hash_distinguishes_same_text_with_different_spatial_hints(tmp_path):
    """Specs sharing text, role and description but differing spatially hash apart."""

    def _spec(click_relative, bbox_norm, center_norm):
        # Only the spatial hints vary between the two specs.
        return replay_memory._TargetSpecLike(
            {
                "by_text": "Enregistrer",
                "by_role": "yolo",
                "vlm_description": "Dans la fenêtre '*test Bloc-notes', l'élément cliqué se trouve au milieu au centre de l'écran",
                "window_capture": {
                    "click_relative": click_relative,
                    "window_size": [1920, 1116],
                },
                "som_element": {
                    "bbox_norm": bbox_norm,
                    "center_norm": center_norm,
                },
            }
        )

    store = TargetMemoryStore(base_path=str(tmp_path / "learning"))
    spec_left = _spec(
        [860, 634],
        [0.40234375, 0.701875, 0.46640625, 0.74125],
        [0.434375, 0.72125],
    )
    spec_right = _spec(
        [1491, 38],
        [0.697265625, 0.335625, 0.715625, 0.3625],
        [0.70625, 0.34875],
    )

    left_hash = store._hash_target_spec(spec_left)
    right_hash = store._hash_target_spec(spec_right)
    assert left_hash != right_hash

View File

@@ -0,0 +1,152 @@
from __future__ import annotations
import base64
import io
import sys
from pathlib import Path
from PIL import Image, ImageDraw
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
from agent_v0.server_v1 import resolve_engine # noqa: E402
class _FakeElem:
    """Minimal stand-in for a SoM element; stores its constructor arguments."""

    def __init__(self, elem_id, label, source, center, center_norm, confidence=0.9):
        # Identity and provenance of the detected element.
        self.id, self.label, self.source = elem_id, label, source
        # Position given both as ``center`` and ``center_norm``.
        self.center, self.center_norm = center, center_norm
        self.confidence = confidence
class _FakeSomResult:
    """Fake SoM analysis result: a list of elements and no annotated image."""

    def __init__(self, elements):
        self.som_image = None
        self.elements = elements
class _FakeSomEngine:
    """Fake SoM engine that always reports the elements given at construction."""

    def __init__(self, elements):
        self._elements = elements

    def analyze(self, _img):
        """Ignore the image and wrap the preset elements in a fake result."""
        preset = self._elements
        return _FakeSomResult(preset)
def _make_close_button_image(tmp_path: Path) -> tuple[str, str]:
    """Draw a synthetic screenshot containing a close button.

    Returns ``(screenshot_path, anchor_b64)``: the path of the 200x100 PNG
    saved under ``tmp_path``, and the base64-encoded PNG crop of the button
    region, usable as ``anchor_image_base64``.
    """
    button_box = (130, 4, 170, 36)

    canvas = Image.new("RGB", (200, 100), "white")
    pen = ImageDraw.Draw(canvas)
    pen.rounded_rectangle(button_box, radius=8, fill=(242, 244, 247))
    # The two diagonals of the "X" glyph.
    for stroke in ((144, 12, 156, 24), (156, 12, 144, 24)):
        pen.line(stroke, fill="black", width=2)

    saved_to = tmp_path / "screen.png"
    canvas.save(saved_to)

    png_bytes = io.BytesIO()
    canvas.crop(button_box).save(png_bytes, format="PNG")
    encoded_anchor = base64.b64encode(png_bytes.getvalue()).decode("utf-8")
    return str(saved_to), encoded_anchor
def test_close_tab_uses_exact_anchor_coords(tmp_path, monkeypatch):
    """For a ``close_tab`` interaction the exact anchor position is returned.

    The only fake SoM element sits at (0.60, 0.20); the expected result is
    (0.75, 0.20) with the ``som_anchor_exact`` role.
    """
    screenshot_path, anchor_b64 = _make_close_button_image(tmp_path)
    som_element = _FakeElem(
        elem_id=47,
        label="test",
        source="yolo",
        center=(120, 20),
        center_norm=(0.60, 0.20),
    )
    fake_engine = _FakeSomEngine([som_element])
    monkeypatch.setattr(resolve_engine, "_get_som_engine_api", lambda: fake_engine)
    monkeypatch.setattr(resolve_engine, "_get_vlm_client", lambda: object())

    close_tab_spec = {
        "anchor_image_base64": anchor_b64,
        "by_text": "",
        "vlm_description": "fermer l'onglet actif",
        "context_hints": {"interaction": "close_tab"},
        "window_capture": {
            "rect": [0, 0, 200, 100],
            "click_relative": [150, 20],
            "window_size": [200, 100],
        },
    }
    result = resolve_engine._resolve_by_som(
        screenshot_path=screenshot_path,
        target_spec=close_tab_spec,
        screen_width=200,
        screen_height=100,
    )

    assert result is not None
    assert result["method"] == "som_anchor_match"
    assert result["matched_element"]["role"] == "som_anchor_exact"
    assert result["x_pct"] == 0.75
    assert result["y_pct"] == 0.20
def test_close_tab_rejects_exact_anchor_far_from_recorded_click(tmp_path, monkeypatch):
    """A ``close_tab`` anchor far from the recorded click yields no result.

    The button is drawn around x=150 while the recorded click is at x=50,
    and the fake SoM engine reports no element.
    """
    screenshot_path, anchor_b64 = _make_close_button_image(tmp_path)
    empty_engine = _FakeSomEngine([])
    monkeypatch.setattr(resolve_engine, "_get_som_engine_api", lambda: empty_engine)
    monkeypatch.setattr(resolve_engine, "_get_vlm_client", lambda: object())

    close_tab_spec = {
        "anchor_image_base64": anchor_b64,
        "by_text": "",
        "vlm_description": "fermer l'onglet actif",
        "context_hints": {"interaction": "close_tab"},
        "window_capture": {
            "rect": [0, 0, 200, 100],
            "click_relative": [50, 20],
            "window_size": [200, 100],
        },
    }
    result = resolve_engine._resolve_by_som(
        screenshot_path=screenshot_path,
        target_spec=close_tab_spec,
        screen_width=200,
        screen_height=100,
    )

    assert result is None
def test_non_close_tab_keeps_nearest_som_center(tmp_path, monkeypatch):
    """Without a ``close_tab`` hint the SoM element center is returned.

    The expected coordinates are the fake element's ``center_norm``
    (0.60, 0.20), with the ``som_anchor_match`` role.
    """
    screenshot_path, anchor_b64 = _make_close_button_image(tmp_path)
    som_element = _FakeElem(
        elem_id=47,
        label="test",
        source="yolo",
        center=(120, 20),
        center_norm=(0.60, 0.20),
    )
    fake_engine = _FakeSomEngine([som_element])
    monkeypatch.setattr(resolve_engine, "_get_som_engine_api", lambda: fake_engine)
    monkeypatch.setattr(resolve_engine, "_get_vlm_client", lambda: object())

    generic_spec = {
        "anchor_image_base64": anchor_b64,
        "by_text": "",
        "vlm_description": "icône en haut",
    }
    result = resolve_engine._resolve_by_som(
        screenshot_path=screenshot_path,
        target_spec=generic_spec,
        screen_width=200,
        screen_height=100,
    )

    assert result is not None
    assert result["method"] == "som_anchor_match"
    assert result["matched_element"]["role"] == "som_anchor_match"
    assert result["x_pct"] == 0.60
    assert result["y_pct"] == 0.20

View File

@@ -0,0 +1,51 @@
import pytest
from agent_v0.server_v1 import resolve_engine
@pytest.fixture(autouse=True)
def _disable_memory_lookup(monkeypatch):
    """Replace ``replay_memory.memory_lookup`` with a stub returning ``None``."""

    def _no_memory(**kwargs):
        return None

    monkeypatch.setattr(
        "agent_v0.server_v1.replay_memory.memory_lookup", _no_memory
    )
def test_dialog_button_skips_vlm_cascade_when_ocr_misses(tmp_path, monkeypatch):
    """A ``dialog_button`` target stops after an OCR miss.

    OCR resolution is stubbed to return ``None``; the VLM and SoM resolvers
    are replaced by stubs that raise if called.
    """

    def _fail(message):
        def _stub(*args, **kwargs):
            raise AssertionError(message)
        return _stub

    shot = tmp_path / "screen.jpg"
    shot.write_bytes(b"fake")

    monkeypatch.setattr(
        resolve_engine, "_resolve_by_ocr_text", lambda *args, **kwargs: None
    )
    monkeypatch.setattr(
        resolve_engine,
        "_vlm_quick_find",
        _fail("VLM ne doit pas être appelé pour dialog_button"),
    )
    monkeypatch.setattr(
        resolve_engine,
        "_resolve_by_som",
        _fail("SoM ne doit pas être appelé pour dialog_button"),
    )

    dialog_spec = {
        "by_role": "dialog_button",
        "by_text": "Oui",
        "window_title": "Confirmer lenregistrement",
        "vlm_description": "Dans la fenêtre 'Confirmer lenregistrement', le bouton 'Oui'",
    }
    result = resolve_engine._resolve_target_sync(
        str(shot),
        dialog_spec,
        2560,
        1600,
        0.5,
        0.5,
        True,
        processor=None,
    )

    assert result["resolved"] is False
    assert result["method"] == "dialog_button_ocr_only"
    assert result["reason"] == "ocr_direct_failed_dialog_button_no_vlm"

View File

@@ -0,0 +1,139 @@
import pytest
from agent_v0.server_v1 import resolve_engine
@pytest.fixture(autouse=True)
def _disable_memory_lookup(monkeypatch):
    """Replace ``replay_memory.memory_lookup`` with a stub returning ``None``."""

    def _no_memory(**kwargs):
        return None

    monkeypatch.setattr(
        "agent_v0.server_v1.replay_memory.memory_lookup", _no_memory
    )
@pytest.fixture
def _patched_resolvers(monkeypatch):
    """Stub template matching and SoM resolution so both return ``None``."""
    for resolver_name in ("_resolve_by_template_matching", "_resolve_by_som"):
        monkeypatch.setattr(
            resolve_engine,
            resolver_name,
            lambda *args, **kwargs: None,
        )
def _start_button_spec():
    """Return a fresh full-screen target spec for the Start button."""
    spec = dict(
        by_role="start_button",
        by_text="",
        anchor_image_base64="abc123",
        vlm_description="Le bouton Démarrer (icône Windows) dans la barre des tâches, en bas",
        screen_scope="full_screen",
    )
    return spec
def _generic_button_spec():
    """Return a fresh target spec for an ordinary button (no screen scope)."""
    spec = dict(
        by_role="button",
        by_text="",
        anchor_image_base64="abc123",
        vlm_description="Le bouton principal",
    )
    return spec
def _vlm_result(x_pct: float, y_pct: float, score: float = 0.95):
    """Build a resolved ``vlm_quick_find`` result at the given position.

    ``score`` is reported both as the result score and as the matched
    element's confidence.
    """
    matched = {
        "label": "target",
        "type": "vlm_located",
        "role": "vlm_quick_find",
        "confidence": score,
    }
    payload = {"resolved": True, "method": "vlm_quick_find"}
    payload["x_pct"] = x_pct
    payload["y_pct"] = y_pct
    payload["score"] = score
    payload["matched_element"] = matched
    return payload
def test_start_button_rejects_far_vlm_false_positive(tmp_path, monkeypatch, _patched_resolvers):
    """A VLM hit far from the fallback position is rejected for ``start_button``.

    The VLM stub answers (0.01, 0.95) while the fallback coordinates passed
    to the resolver are (0.387891, 0.974375).
    """
    shot = tmp_path / "screen.jpg"
    shot.write_bytes(b"fake")
    monkeypatch.setattr(
        resolve_engine,
        "_vlm_quick_find",
        lambda *args, **kwargs: _vlm_result(0.01, 0.95),
    )

    spec = _start_button_spec()
    result = resolve_engine._resolve_target_sync(
        str(shot),
        spec,
        1920,
        1080,
        0.387891,
        0.974375,
        True,
        processor=None,
    )

    assert result["resolved"] is False
    assert result["method"] == "strict_vlm_template_failed"
def test_start_button_accepts_plausible_vlm_result(tmp_path, monkeypatch, _patched_resolvers):
    """A VLM hit close to the fallback position is kept for ``start_button``.

    The VLM stub answers (0.395, 0.972), next to the fallback coordinates
    (0.387891, 0.974375); the result must carry the VLM coordinates.
    """
    shot = tmp_path / "screen.jpg"
    shot.write_bytes(b"fake")
    monkeypatch.setattr(
        resolve_engine,
        "_vlm_quick_find",
        lambda *args, **kwargs: _vlm_result(0.395, 0.972),
    )

    spec = _start_button_spec()
    result = resolve_engine._resolve_target_sync(
        str(shot),
        spec,
        1920,
        1080,
        0.387891,
        0.974375,
        True,
        processor=None,
    )

    assert result["resolved"] is True
    assert result["method"] == "vlm_quick_find"
    assert result["x_pct"] == pytest.approx(0.395)
    assert result["y_pct"] == pytest.approx(0.972)
def test_non_start_button_keeps_vlm_result_even_if_far(tmp_path, monkeypatch, _patched_resolvers):
    """For a generic button, a far-away VLM hit is still accepted.

    Same far VLM answer (0.01, 0.95) as the rejection test, but with a
    ``button`` role spec: the VLM coordinates must be returned unchanged.
    """
    shot = tmp_path / "screen.jpg"
    shot.write_bytes(b"fake")
    monkeypatch.setattr(
        resolve_engine,
        "_vlm_quick_find",
        lambda *args, **kwargs: _vlm_result(0.01, 0.95),
    )

    spec = _generic_button_spec()
    result = resolve_engine._resolve_target_sync(
        str(shot),
        spec,
        1920,
        1080,
        0.387891,
        0.974375,
        True,
        processor=None,
    )

    assert result["resolved"] is True
    assert result["method"] == "vlm_quick_find"
    assert result["x_pct"] == pytest.approx(0.01)
    assert result["y_pct"] == pytest.approx(0.95)

View File

@@ -0,0 +1,103 @@
"""Tests pour les contrôles HTTP de replay paused (resume/abort).
Ces appels sont le fallback du chemin SocketIO `lea:replay_resume`
/ `lea:replay_abort` quand le bus feedback est déconnecté au moment
où l'utilisateur clique dans la bulle paused (cf.
`docs/CR_AUDIT_PAUSED_RESUME_BUS_2026-05-22.md`).
"""
from __future__ import annotations
import sys
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
from agent_v0.lea_ui.server_client import LeaServerClient # noqa: E402
# Préfixe partagé pour comparer les URLs sans coller à la valeur de
# RPA_STREAMING_URL côté env d'exécution des tests.
RESUME_PATH = "/traces/stream/replay/replay_xyz/resume"
CANCEL_PATH = "/traces/stream/replay/replay_xyz/cancel"
@pytest.fixture
def client(monkeypatch):
    """Provide a ``LeaServerClient`` built with a known ``RPA_API_TOKEN``."""
    # The token is set before construction; the auth-header test expects it.
    monkeypatch.setenv("RPA_API_TOKEN", "tok-test-1234")
    return LeaServerClient()
# =========================================================================
# resume_replay
# =========================================================================
class TestResumeReplay:
    """``LeaServerClient.resume_replay`` over HTTP, with ``requests.post`` mocked."""

    def test_returns_true_when_server_accepts(self, client):
        """An ``ok`` response gives ``True`` after exactly one POST."""
        accepted = MagicMock(ok=True)
        with patch("requests.post", return_value=accepted) as post:
            outcome = client.resume_replay("replay_xyz")
            assert outcome is True
        assert post.call_count == 1

    def test_returns_false_when_server_rejects(self, client):
        """A non-``ok`` response gives ``False``."""
        rejected = MagicMock(ok=False)
        with patch("requests.post", return_value=rejected):
            outcome = client.resume_replay("replay_xyz")
            assert outcome is False

    def test_returns_false_on_empty_replay_id(self, client):
        """An empty replay id gives ``False`` without any POST."""
        with patch("requests.post") as post:
            outcome = client.resume_replay("")
            assert outcome is False
        post.assert_not_called()

    def test_returns_false_on_exception(self, client):
        """A ``ConnectionError`` raised by the POST gives ``False``."""
        with patch("requests.post", side_effect=ConnectionError("network down")):
            outcome = client.resume_replay("replay_xyz")
            assert outcome is False

    def test_posts_to_resume_endpoint_with_auth_header(self, client):
        """The POST targets the resume path and carries the bearer token."""
        accepted = MagicMock(ok=True)
        with patch("requests.post", return_value=accepted) as post:
            client.resume_replay("replay_xyz")
        recorded = post.call_args
        # The URL may have been passed positionally or as ``url=``.
        if recorded.args:
            url = recorded.args[0]
        else:
            url = recorded.kwargs.get("url", "")
        assert url.endswith(RESUME_PATH)
        sent_headers = recorded.kwargs.get("headers", {})
        assert sent_headers.get("Authorization") == "Bearer tok-test-1234"
# =========================================================================
# abort_replay
# =========================================================================
class TestAbortReplay:
    """``LeaServerClient.abort_replay`` over HTTP, with ``requests.post`` mocked."""

    def test_returns_true_when_server_accepts(self, client):
        """An ``ok`` response gives ``True``."""
        resp = MagicMock(ok=True)
        with patch("requests.post", return_value=resp):
            assert client.abort_replay("replay_xyz") is True

    def test_returns_false_when_server_rejects(self, client):
        """A non-``ok`` response gives ``False``."""
        resp = MagicMock(ok=False)
        with patch("requests.post", return_value=resp):
            assert client.abort_replay("replay_xyz") is False

    def test_returns_false_on_empty_replay_id(self, client):
        """An empty replay id gives ``False`` without any POST."""
        with patch("requests.post") as post:
            assert client.abort_replay("") is False
        post.assert_not_called()

    def test_returns_false_on_exception(self, client):
        """A ``TimeoutError`` raised by the POST gives ``False``."""
        with patch("requests.post", side_effect=TimeoutError("timeout")):
            assert client.abort_replay("replay_xyz") is False

    def test_posts_to_cancel_endpoint(self, client):
        """The POST targets the cancel path."""
        resp = MagicMock(ok=True)
        with patch("requests.post", return_value=resp) as post:
            client.abort_replay("replay_xyz")
        call = post.call_args
        # Accept the URL positionally or as ``url=`` (same extraction as the
        # resume test) so a keyword-style call fails on the endpoint
        # assertion rather than on an IndexError.
        url = call.args[0] if call.args else call.kwargs.get("url", "")
        assert url.endswith(CANCEL_PATH)

View File

@@ -0,0 +1,83 @@
"""Tests pour `_should_reject_on_text_mismatch` — patch 2026-05-23 :
distinguer `observed=''` (OCR n'a rien lu, ambigu) de `observed='X'`
(autre texte lu = mismatch confirmé) dans le pré-check OCR.
Brief Codex 2026-05-23 08:55 : le crop bbox SoM précis (50 × 48 px)
sur un onglet Notepad moderne donne `observed=''` car EasyOCR n'a pas
suffisamment de signal (texte peu contrasté, zone trop petite). Le
patch précédent rejetait ce cas comme mismatch — alors qu'aucune
preuve d'un mauvais clic n'existe. On ne rejette plus que quand l'OCR
a effectivement lu autre chose que la cible attendue.
Le faux succès OBS Studio reste bloqué : (1) son OCR retournait
`'ue audio disponible GUI OBS Studio…'` = non-vide → rejet conservé ;
(2) la garde drift agent posée sur ANCHOR-TM bloque déjà ce match.
"""
from __future__ import annotations
import sys
from pathlib import Path
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
from agent_v0.server_v1.resolve_engine import ( # noqa: E402
_should_reject_on_text_mismatch,
)
class TestShouldRejectOnTextMismatch:
    """Decision table for ``_should_reject_on_text_mismatch``."""

    def test_valid_passes(self):
        """Nominal case: the OCR saw the target, so nothing is rejected."""
        rejected = _should_reject_on_text_mismatch(
            is_valid=True, observed="Enregistrer sous",
        )
        assert not rejected

    def test_invalid_with_text_rejects(self):
        """Historical 0745 case: the OCR reads '9 ?' instead of the expected
        'Enregistrer sous', which is a confirmed mismatch."""
        rejected = _should_reject_on_text_mismatch(
            is_valid=False, observed="9 ?",
        )
        assert rejected

    def test_invalid_with_obs_studio_rejects(self):
        """0756 case: the OCR reads OBS Studio text, a confirmed mismatch."""
        rejected = _should_reject_on_text_mismatch(
            is_valid=False, observed="ue audio disponible GUI OBS Studio",
        )
        assert rejected

    def test_invalid_with_empty_observed_does_not_reject(self):
        """0855 case: the OCR read nothing (area too small or low contrast).

        This is ambiguous rather than a confirmed mismatch, so the server
        resolution is kept; the agent-side drift guard protects downstream.
        """
        rejected = _should_reject_on_text_mismatch(
            is_valid=False, observed="",
        )
        assert not rejected

    def test_invalid_with_whitespace_only_does_not_reject(self):
        """A whitespace-only reading is treated like an empty one."""
        rejected = _should_reject_on_text_mismatch(
            is_valid=False, observed=" ",
        )
        assert not rejected

    def test_invalid_with_newline_only_does_not_reject(self):
        """A newline/tab-only reading is treated like an empty one."""
        rejected = _should_reject_on_text_mismatch(
            is_valid=False, observed="\n\t",
        )
        assert not rejected

    def test_invalid_with_none_observed_does_not_reject(self):
        """Robustness: ``observed=None`` (degenerate case, OCR library
        missing) must neither crash nor reject."""
        rejected = _should_reject_on_text_mismatch(
            is_valid=False, observed=None,
        )
        assert not rejected

    def test_valid_with_empty_passes(self):
        """``is_valid=True`` with an empty reading is never rejected.

        Per the original note this should not normally happen through
        ``_text_match_fuzzy``, but the rule stays consistent: a valid match
        is not rejected whatever ``observed`` is.
        """
        rejected = _should_reject_on_text_mismatch(
            is_valid=True, observed="",
        )
        assert not rejected

View File

@@ -0,0 +1,62 @@
from __future__ import annotations
import sys
from pathlib import Path
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
from agent_v0.server_v1.resolve_engine import _validate_resolution_quality # noqa: E402
def _result(score: float) -> dict:
    """Build a resolved ``som_anchor_match`` result at (0.75, 0.20)."""
    payload = dict(resolved=True, method="som_anchor_match", score=score)
    payload["x_pct"] = 0.75
    payload["y_pct"] = 0.20
    return payload
def _close_tab_spec() -> dict:
    """Return a fresh target spec for a ``close_tab`` interaction."""
    hints = {"interaction": "close_tab", "active_tab_label": "test"}
    spec = dict(
        by_text="",
        by_role="tab_close_button",
        anchor_image_base64="abc123",
        context_hints=hints,
    )
    return spec
def test_close_tab_relaxes_threshold_for_near_match():
    """A 0.744 score is accepted for a ``close_tab`` target, score unchanged."""
    candidate = _result(0.744)
    spec = _close_tab_spec()
    out = _validate_resolution_quality(candidate, 0.708594, 0.35, target_spec=spec)
    assert out["resolved"] is True
    assert out["score"] == 0.744
def test_close_tab_still_rejects_low_score():
    """A 0.65 score is still rejected as below threshold for ``close_tab``."""
    candidate = _result(0.65)
    spec = _close_tab_spec()
    out = _validate_resolution_quality(candidate, 0.708594, 0.35, target_spec=spec)
    assert out["resolved"] is False
    assert "below_threshold" in out["reason"]
def test_close_tab_rejects_far_zone_even_with_good_score():
    """A good score does not save a ``close_tab`` hit far from the fallback.

    The result sits at (0.75, 0.20) while the fallback coordinates are
    (0.30, 0.20); the rejection reason and method name are checked.
    """
    candidate = _result(0.80)
    spec = _close_tab_spec()
    out = _validate_resolution_quality(candidate, 0.30, 0.20, target_spec=spec)
    assert out["resolved"] is False
    assert out["reason"] == "close_tab_out_of_recorded_zone"
    assert out["method"] == "rejected_close_tab_zone_som_anchor_match"

View File

@@ -0,0 +1,134 @@
"""Tests pour `_validate_resolution_quality` — relâchement contextuel
du seuil de score pour les cibles `interaction = switch_tab` avec un
`som_element` calibré.
Cas live 2026-05-22 (act_raw_2f7e316c) :
- Onglet Notepad moderne `Enregistrer sous`
- Score som_text_match = 0.745 (juste sous seuil 0.75)
- Cible bien localisée par SoM (bbox_norm) + focus_change pré-clic
confirmant déjà la bonne fenêtre
- Rejeté à tort → pause supervisée
Le patch abaisse le seuil à 0.60 UNIQUEMENT pour
`context_hints.interaction == "switch_tab"` + `som_element` présent
+ méthode `som_*`. Pas de baisse globale.
"""
from __future__ import annotations
import sys
from pathlib import Path
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
from agent_v0.server_v1.resolve_engine import ( # noqa: E402
_validate_resolution_quality,
)
def _result(method: str, score: float, x: float = 0.5, y: float = 0.5) -> dict:
    """Build a resolved result for ``method`` with the given score and position."""
    payload = dict(resolved=True, method=method, score=score)
    payload["x_pct"] = x
    payload["y_pct"] = y
    return payload
def _switch_tab_spec(with_som: bool = True) -> dict:
    """Return a ``switch_tab`` target spec, optionally with a SoM element.

    When ``with_som`` is falsy the ``som_element`` key is omitted entirely.
    """
    hints = {
        "interaction": "switch_tab",
        "switch_to_window_title": "Enregistrer sous",
    }
    spec = dict(
        by_text="Enregistrer sous",
        by_role="tab",
        window_title="*test Bloc-notes",
        context_hints=hints,
    )
    if not with_som:
        return spec
    spec["som_element"] = dict(
        bbox_norm=[0.697, 0.335, 0.715, 0.362],
        center_norm=[0.706, 0.348],
    )
    return spec
class TestSwitchTabThresholdRelaxation:
    """Score-threshold behaviour of ``_validate_resolution_quality`` for
    ``switch_tab`` targets."""

    def test_baseline_no_target_spec_keeps_strict_threshold(self):
        """Without a target spec the legacy rule applies: 0.745 is rejected."""
        candidate = _result("som_text_match", score=0.745)
        out = _validate_resolution_quality(candidate, 0.5, 0.5)
        assert out is not None
        assert out["resolved"] is False
        assert "below_threshold" in out["reason"]

    def test_switch_tab_with_som_accepts_score_above_relaxed_threshold(self):
        """switch_tab + som_element + ``som_*`` method + 0.745 is accepted."""
        candidate = _result("som_text_match", score=0.745, x=0.706, y=0.348)
        spec = _switch_tab_spec(with_som=True)
        out = _validate_resolution_quality(
            candidate, 0.706, 0.348, target_spec=spec,
        )
        assert out is not None
        assert out["resolved"] is True
        assert out["method"] == "som_text_match"
        assert out["score"] == 0.745

    def test_switch_tab_with_som_still_rejects_very_low_score(self):
        """Safety net: even for switch_tab a 0.50 score stays rejected
        (the original note gives 0.60 as the relaxed threshold)."""
        candidate = _result("som_text_match", score=0.50)
        spec = _switch_tab_spec(with_som=True)
        out = _validate_resolution_quality(
            candidate, 0.5, 0.5, target_spec=spec,
        )
        assert out["resolved"] is False
        assert "below_threshold" in out["reason"]

    def test_switch_tab_without_som_keeps_strict_threshold(self):
        """Without a calibrated som_element the strict threshold is kept:
        a lower score is not trusted without a spatial anchor."""
        candidate = _result("som_text_match", score=0.745)
        spec = _switch_tab_spec(with_som=False)
        out = _validate_resolution_quality(
            candidate, 0.5, 0.5, target_spec=spec,
        )
        assert out["resolved"] is False

    def test_non_switch_tab_keeps_strict_threshold(self):
        """Non-tab target: no relaxation, 0.745 stays rejected."""
        button_spec = {
            "by_text": "Submit",
            "by_role": "button",
            "som_element": {"bbox_norm": [0.4, 0.4, 0.5, 0.5]},
        }
        candidate = _result("som_text_match", score=0.745)
        out = _validate_resolution_quality(
            candidate, 0.5, 0.5, target_spec=button_spec,
        )
        assert out["resolved"] is False

    def test_switch_tab_with_non_som_method_keeps_strict_threshold(self):
        """The relaxation only concerns ``som_*`` methods.

        A ``vlm_quick_find`` result at 0.745 on a switch_tab target stays
        governed by its own legacy threshold and is accepted.
        """
        # Per the original note, vlm_quick_find already has a 0.60 threshold
        # (see _RESOLUTION_MIN_SCORES), so 0.745 is well above it. This only
        # checks that the case has not regressed.
        candidate = _result("vlm_quick_find", score=0.745)
        spec = _switch_tab_spec(with_som=True)
        out = _validate_resolution_quality(
            candidate, 0.5, 0.5, target_spec=spec,
        )
        assert out["resolved"] is True

    def test_unresolved_result_passes_through(self):
        """Non-regression: an unresolved result is returned as the same object."""
        unresolved = {"resolved": False, "method": "no_target_criteria"}
        out = _validate_resolution_quality(
            unresolved, 0.5, 0.5, target_spec=_switch_tab_spec(),
        )
        assert out is unresolved

    def test_target_spec_parameter_is_optional_for_legacy_callers(self):
        """Backward compatibility: calling without target_spec does not crash
        and a 0.80 ``som_anchor_match`` is accepted."""
        candidate = _result("som_anchor_match", score=0.80)
        out = _validate_resolution_quality(candidate, 0.5, 0.5)
        assert out["resolved"] is True

View File

@@ -0,0 +1,158 @@
"""Tests pour `_validate_text_at_position` — patch 2026-05-23 :
utilisation prioritaire de la bbox SoM enregistrée quand disponible.
Cas live (brief Codex 2026-05-23 07:45) : pré-check OCR rejette à tort
`expected='Enregistrer sous' observed='9 ?'` car le crop fait
``radius_px=280`` autour de la coord résolue capture du texte voisin
(numéro de ligne « 9 » de la status bar Notepad) au lieu du label
étroit de l'onglet. La bbox SoM ``[0.697, 0.335, 0.715, 0.362]``
localise précisément l'onglet — l'utiliser comme zone OCR donne
l'OCR exact du label.
"""
from __future__ import annotations
import sys
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(ROOT))
@pytest.fixture
def fake_screenshot(tmp_path):
    """Write an all-black 1920x1200 screenshot and return its path as ``str``."""
    from PIL import Image

    target = tmp_path / "shot.png"
    Image.new("RGB", (1920, 1200), (0, 0, 0)).save(target)
    return str(target)
@pytest.fixture
def patched_reader():
    """Yield a mock OCR reader whose output depends on the crop size.

    Tests fill ``reader._observed_by_size`` with ``(width, height) -> text``
    entries; any other crop size yields ``"fallback text"``. While the
    fixture is active, ``resolve_engine._get_validation_ocr_reader`` returns
    this reader.
    """
    reader = MagicMock()
    # Lookup table: crop size (w, h) -> OCR text to return.
    reader._observed_by_size = {}

    def _readtext_by_crop_size(arr):
        height, width = arr.shape[:2]
        seen = reader._observed_by_size.get((width, height), "fallback text")
        return [(None, seen, 0.95)]

    reader.readtext.side_effect = _readtext_by_crop_size

    reader_factory = "agent_v0.server_v1.resolve_engine._get_validation_ocr_reader"
    with patch(reader_factory, return_value=reader):
        yield reader
def _spec_with_som_bbox():
    """Return a fresh target spec carrying a recorded SoM ``bbox_norm``."""
    som_element = {"bbox_norm": [0.697, 0.335, 0.715, 0.362]}
    return dict(by_text="Enregistrer sous", som_element=som_element)
class TestValidateTextWithSomBbox:
    """OCR crop selection in ``_validate_text_at_position``: SoM bbox when
    usable, radius around the resolved point otherwise."""

    def test_uses_som_bbox_when_present(self, fake_screenshot, patched_reader):
        """With ``som_bbox_norm`` the OCR zone comes from that bbox, not from
        the radius around ``x_pct`` / ``y_pct``."""
        from agent_v0.server_v1.resolve_engine import _validate_text_at_position

        som_bbox = _spec_with_som_bbox()["som_element"]["bbox_norm"]
        # Expected crop: width (0.715-0.697)*1920 ≈ 34 plus 2*padding, height
        # (0.362-0.335)*1200 ≈ 32 plus 2*padding (padding=8), i.e. about
        # (50, 48) px.
        patched_reader._observed_by_size[(50, 48)] = "Enregistrer sous"

        is_valid, observed, _ms = _validate_text_at_position(
            fake_screenshot,
            x_pct=0.706, y_pct=0.348,
            expected_text="Enregistrer sous",
            screen_width=1920, screen_height=1200,
            som_bbox_norm=som_bbox,
        )

        assert observed == "Enregistrer sous"
        assert is_valid is True

    def test_falls_back_to_radius_when_no_bbox(self, fake_screenshot, patched_reader):
        """Without ``som_bbox_norm`` the legacy crop applies (radius_px=280
        around the resolved point)."""
        from agent_v0.server_v1.resolve_engine import _validate_text_at_position

        # No bbox: the crop is about 560x560.
        patched_reader._observed_by_size[(560, 560)] = "Enregistrer sous"

        is_valid, observed, _ms = _validate_text_at_position(
            fake_screenshot,
            x_pct=0.5, y_pct=0.5,
            expected_text="Enregistrer sous",
            screen_width=1920, screen_height=1200,
        )

        assert is_valid is True
        assert observed == "Enregistrer sous"

    def test_invalid_bbox_falls_back_gracefully(self, fake_screenshot, patched_reader):
        """A malformed bbox must not crash; the radius crop is used."""
        from agent_v0.server_v1.resolve_engine import _validate_text_at_position

        patched_reader._observed_by_size[(560, 560)] = "OK"
        malformed_bbox = [0.5]

        is_valid, observed, _ms = _validate_text_at_position(
            fake_screenshot,
            x_pct=0.5, y_pct=0.5,
            expected_text="OK",
            screen_width=1920, screen_height=1200,
            som_bbox_norm=malformed_bbox,
        )

        # No crash; only the radius-sized crop is mapped to "OK".
        assert observed == "OK"

    def test_bbox_too_small_falls_back_to_radius(self, fake_screenshot, patched_reader):
        """A degenerate bbox (a couple of pixels wide/high) falls back to the
        radius crop rather than attempting an unusable tiny crop."""
        from agent_v0.server_v1.resolve_engine import _validate_text_at_position

        patched_reader._observed_by_size[(560, 560)] = "OK"
        tiny_bbox = [0.500, 0.500, 0.501, 0.501]

        is_valid, observed, _ms = _validate_text_at_position(
            fake_screenshot,
            x_pct=0.5, y_pct=0.5,
            expected_text="OK",
            screen_width=1920, screen_height=1200,
            som_bbox_norm=tiny_bbox,
        )

        # The bbox spans roughly 2x1 px, so the radius crop is expected.
        assert observed == "OK"

    def test_bbox_normalized_values_outside_unit_clipped(self, fake_screenshot, patched_reader):
        """A bbox exceeding [0, 1] is clipped to the screen without crashing."""
        from agent_v0.server_v1.resolve_engine import _validate_text_at_position

        # Overflowing bbox, clipped to the screen (assuming the 8 px padding
        # noted in test_uses_som_bbox_when_present):
        #   x1 = -0.05*1920 - 8 < 0 -> 0 ; x2 = 1.05*1920 + 8 = 2024 -> 1920
        #   y1 =  0.0*1200 - 8 < 0 -> 0 ; y2 = 1.05*1200 + 8 = 1268 -> 1200
        # so the crop covers the full 1920x1200 screen.
        patched_reader._observed_by_size[(1920, 1200)] = "déborde"
        overflowing_bbox = [-0.05, 0.0, 1.05, 1.05]

        is_valid, observed, _ms = _validate_text_at_position(
            fake_screenshot,
            x_pct=0.5, y_pct=0.5,
            expected_text="déborde",
            screen_width=1920, screen_height=1200,
            som_bbox_norm=overflowing_bbox,
        )

        assert observed == "déborde"

View File

@@ -0,0 +1,296 @@
"""Tests ciblés sur le contrat window_title -> mémoire persistante."""
from __future__ import annotations
import importlib
import sys
from pathlib import Path
import pytest
_ROOT = str(Path(__file__).resolve().parents[2])
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
def _reload_api_stream():
    """Import ``agent_v0.server_v1.api_stream`` afresh and return the module.

    Any cached entry is dropped from ``sys.modules`` first, so the module
    body runs again on import.
    """
    mod_name = "agent_v0.server_v1.api_stream"
    sys.modules.pop(mod_name, None)
    return importlib.import_module(mod_name)
def test_build_replay_from_raw_events_propagates_window_title_into_target_spec(
    tmp_path, monkeypatch,
):
    """The Lea-first flow must copy a known window title into ``target_spec``."""
    from agent_v0.server_v1 import stream_processor as sp

    session_dir = tmp_path / "sess"
    (session_dir / "shots").mkdir(parents=True)

    # Stub out screenshot loading/enrichment and the post-processing hooks.
    monkeypatch.setattr(sp, "_load_crop_for_event", lambda *args, **kwargs: None)
    monkeypatch.setattr(
        sp,
        "enrich_click_from_screenshot",
        lambda *args, **kwargs: {"anchor_image_base64": "abc123"},
    )
    for hook in (
        "_attach_expected_screenshots",
        "_enrich_actions_with_intentions",
        "_unload_gemma4",
    ):
        monkeypatch.setattr(sp, hook, lambda *args, **kwargs: None)

    click_event = {
        "type": "mouse_click",
        "timestamp": 1.0,
        "pos": [100, 200],
        "button": "left",
        "screenshot_id": "shot_001",
        "window": {"title": "Bloc-notes", "app_name": "notepad"},
    }
    actions = sp.build_replay_from_raw_events(
        [{"event": click_event}],
        session_id="sess_test",
        session_dir=str(session_dir),
    )

    assert len(actions) == 1
    only_action = actions[0]
    assert only_action["window_title"] == "Bloc-notes"
    assert only_action["target_spec"]["window_title"] == "Bloc-notes"
def test_build_replay_from_raw_events_infers_notepad_tab_switch_target(
    tmp_path, monkeypatch,
):
    """A click near the window top followed by a same-app focus change must
    become a tab target."""
    from agent_v0.server_v1 import stream_processor as sp

    session_dir = tmp_path / "sess"
    (session_dir / "shots").mkdir(parents=True)

    # Stub out screenshot loading/enrichment and the post-processing hooks.
    monkeypatch.setattr(sp, "_load_crop_for_event", lambda *args, **kwargs: None)
    monkeypatch.setattr(
        sp,
        "enrich_click_from_screenshot",
        lambda *args, **kwargs: {"anchor_image_base64": "abc123", "by_role": "yolo"},
    )
    for hook in (
        "_attach_expected_screenshots",
        "_enrich_actions_with_intentions",
        "_unload_gemma4",
    ):
        monkeypatch.setattr(sp, hook, lambda *args, **kwargs: None)

    source_title = "http192.168.1.408765dossier.htmlid=.txt Bloc-notes"
    click_event = {
        "type": "mouse_click",
        "timestamp": 1.0,
        "pos": [1514, 562],
        "button": "left",
        "screenshot_id": "shot_003",
        "window": {
            "title": source_title,
            "app_name": "Notepad.exe",
        },
        "window_capture": {
            "rect": [323, 522, 2243, 1638],
            "click_relative": [1191, 40],
            "window_size": [1920, 1116],
        },
    }
    focus_event = {
        "type": "window_focus_change",
        "timestamp": 1.2,
        "from": {
            "title": source_title,
            "app_name": "Notepad.exe",
        },
        "to": {
            "title": "Sans titre Bloc-notes",
            "app_name": "Notepad.exe",
        },
    }

    actions = sp.build_replay_from_raw_events(
        [{"event": click_event}, {"event": focus_event}],
        session_id="sess_tab_switch",
        session_dir=str(session_dir),
    )

    assert len(actions) == 1
    target_spec = actions[0]["target_spec"]
    assert target_spec["by_text"] == "Sans titre"
    assert target_spec["by_role"] == "tab"
    assert target_spec["window_title"] == source_title
    assert target_spec["context_hints"]["interaction"] == "switch_tab"
def test_build_replay_propagates_focus_change_into_expected_window_before(
    tmp_path, monkeypatch,
):
    """The last focus change before a click sets ``expected_window_before``.

    Live case ``act_raw_c70976c8`` (2026-05-22): a focus change to
    ``Enregistrer sous`` occurs between two consecutive clicks, yet the
    following mouse_click still records the pre-transition title
    (``*test Bloc-notes``) in its ``window.title``. Per the original note,
    without a server-side correction the agent pre-check falls back on the
    stale target_spec.window_title and wrongly triggers a supervised pause.

    The server must therefore set ``expected_window_before`` to the last
    ``window_focus_change.to.title`` seen before the click.
    """
    from agent_v0.server_v1 import stream_processor as sp

    session_dir = tmp_path / "sess"
    (session_dir / "shots").mkdir(parents=True)

    # Stub out screenshot loading/enrichment and the post-processing hooks.
    monkeypatch.setattr(sp, "_load_crop_for_event", lambda *args, **kwargs: None)
    monkeypatch.setattr(
        sp,
        "enrich_click_from_screenshot",
        lambda *args, **kwargs: {"anchor_image_base64": "abc123"},
    )
    for hook in (
        "_attach_expected_screenshots",
        "_enrich_actions_with_intentions",
        "_unload_gemma4",
    ):
        monkeypatch.setattr(sp, hook, lambda *args, **kwargs: None)

    # Click 1: inside Notepad, opens the dialog.
    first_click = {
        "type": "mouse_click",
        "timestamp": 1.0,
        "pos": [860, 634],
        "button": "left",
        "screenshot_id": "shot_001",
        "window": {"title": "*test Bloc-notes", "app_name": "Notepad.exe"},
    }
    # Focus moves to the "Enregistrer sous" dialog.
    focus_to_dialog = {
        "type": "window_focus_change",
        "timestamp": 1.2,
        "from": {"title": "*test Bloc-notes", "app_name": "Notepad.exe"},
        "to": {"title": "Enregistrer sous", "app_name": "Notepad.exe"},
    }
    # Click 2: "Enregistrer" button in the dialog, but the captured
    # window.title is stale (still the Notepad title).
    second_click = {
        "type": "mouse_click",
        "timestamp": 1.5,
        "pos": [997, 743],
        "button": "left",
        "screenshot_id": "shot_002",
        "window": {"title": "*test Bloc-notes", "app_name": "Notepad.exe"},
    }
    events = [
        {"event": first_click},
        {"event": focus_to_dialog},
        {"event": second_click},
    ]

    actions = sp.build_replay_from_raw_events(
        events, session_id="sess_save_dialog", session_dir=str(session_dir),
    )

    clicks = [action for action in actions if action.get("type") == "click"]
    assert len(clicks) == 2

    # Click 2 must expect "Enregistrer sous" (from the preceding focus
    # change), not the stale "*test Bloc-notes" captured in the raw event.
    assert clicks[1].get("expected_window_before") == "Enregistrer sous", (
        f"clic 2 doit pointer sur la dialog ouverte par le focus_change, "
        f"trouvé: {clicks[1].get('expected_window_before')!r} "
        f"(target_spec.window_title={clicks[1].get('target_spec', {}).get('window_title')!r})"
    )
    # Click 1 had no focus change before it, so the field is absent or empty.
    assert not clicks[0].get("expected_window_before"), (
        f"clic 1 ne doit pas avoir d'expected_window_before, "
        f"trouvé: {clicks[0].get('expected_window_before')!r}"
    )
def test_build_replay_does_not_overwrite_existing_expected_window_before(
    tmp_path, monkeypatch,
):
    """Focus-change propagation must not clobber an upstream value.

    The raw click event already carries ``expected_window_before`` (set by
    some upstream component). After the build, the click's value must stay
    coherent: absent/empty, the last focus-change title, or the upstream
    value itself.
    """
    from agent_v0.server_v1 import stream_processor as sp

    session_dir = tmp_path / "sess"
    (session_dir / "shots").mkdir(parents=True)

    monkeypatch.setattr(sp, "_load_crop_for_event", lambda *args, **kwargs: None)
    monkeypatch.setattr(
        sp, "enrich_click_from_screenshot",
        # Per the original note, enrichment does not create
        # expected_window_before; the upstream value is simulated on the raw
        # event below instead.
        lambda *args, **kwargs: {"anchor_image_base64": "abc"},
    )
    monkeypatch.setattr(sp, "_attach_expected_screenshots", lambda *a, **k: None)
    monkeypatch.setattr(sp, "_enrich_actions_with_intentions", lambda *a, **k: None)
    monkeypatch.setattr(sp, "_unload_gemma4", lambda *a, **k: None)

    events = [
        {"event": {
            "type": "window_focus_change",
            "timestamp": 0.5,
            "to": {"title": "Fenetre A", "app_name": "test.exe"},
        }},
        {"event": {
            "type": "mouse_click",
            "timestamp": 1.0,
            "pos": [10, 20],
            "screenshot_id": "shot_001",
            "window": {"title": "Fenetre A", "app_name": "test.exe"},
            "expected_window_before": "Pre-existant",
        }},
    ]

    actions = sp.build_replay_from_raw_events(
        events, session_id="sess_x", session_dir=str(session_dir),
    )

    clicks = [a for a in actions if a.get("type") == "click"]
    assert clicks

    # Accepted outcomes: absent/empty (nothing propagated to this click),
    # "Fenetre A" (last focus change), or "Pre-existant" (upstream value
    # preserved, which is what the test name calls for and which the
    # previous tuple wrongly excluded).
    # NOTE(review): tighten to the single expected value once it is
    # confirmed whether the raw event's field is carried onto the action.
    pre_existing = clicks[0].get("expected_window_before")
    assert pre_existing in (None, "", "Fenetre A", "Pre-existant"), (
        f"valeur inattendue: {pre_existing!r}"
    )
def test_memory_window_title_for_action_reads_top_level_and_target_spec(monkeypatch):
    """The memory reader must see both the top-level and ``target_spec`` variants.

    Each case pairs an action dict with the window title that
    ``_memory_window_title_for_action`` is expected to return for it.
    """
    monkeypatch.setenv("RPA_API_TOKEN", "deadbeef" * 4)
    monkeypatch.delenv("RPA_AUTH_DISABLED", raising=False)
    resolve_title = _reload_api_stream()._memory_window_title_for_action

    cases = (
        # expected_window_before is returned even though target_spec and the
        # top-level window_title are also present.
        (
            {
                "expected_window_before": "Fenêtre attendue",
                "target_spec": {"window_title": "Fenêtre cible"},
                "window_title": "Fenêtre action",
            },
            "Fenêtre attendue",
        ),
        # Title nested under target_spec.context_hints.
        (
            {
                "target_spec": {"context_hints": {"window_title": "Depuis context_hints"}},
            },
            "Depuis context_hints",
        ),
        # Only the top-level window_title is available.
        (
            {
                "window_title": "Top-level uniquement",
                "target_spec": {},
            },
            "Top-level uniquement",
        ),
    )
    for action, expected_title in cases:
        assert resolve_title(action) == expected_title