snapshot: WIP 5j replay reliability (B1 watchdog + dialog handlers + grounding drift)

Snapshot avant correction du blocage relance Léa (3 incidents 24h: SSH refusé,
polls morts ×2). Point de rollback stable.

Contenu:
- agent_v1/core/executor.py: 5 patchs dialog handling (saveas drift, close_tab
  hotkey fallback, confirm_save Unicode apostrophe, foreground dialog
  recontextualization, runtime_dialog in-loop) + helpers normalize_window_hint,
  requires_post_verify_window_transition
- agent_v1/core/grounding.py: garde drift template fix (fallback_x/y plumbed)
- server_v1/replay_watchdog.py (NEW): orphan watchdog B1, scan 10s timeout 30s
- server_v1/api_stream.py: dispatched_action plumbing, watchdog lifespan,
  metrics endpoint
- server_v1/replay_engine.py: _schedule_retry préserve original_action +
  dispatched_action
- stream_processor.py: gardes _infer_tab_switch_target (no false switch_tab
  on save_as dialog open) + _attach_expected_window_before
- tests/integration: test_replay_watchdog.py (8 cas), test_stream_processor.py
- tests/unit: test_executor_verify_window_guard.py (start_button, close_tab,
  runtime_dialog, post_verify, transition fallbacks)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-05-24 16:48:37 +02:00
parent 5ea4960e65
commit 7df51d2c79
47 changed files with 9811 additions and 451 deletions

View File

@@ -243,6 +243,168 @@ def _validate_match_context(
return True
def _has_meaningful_recorded_coords(
fallback_x_pct: float,
fallback_y_pct: float,
) -> bool:
"""Indiquer si les coordonnées fallback représentent une vraie position source."""
return (
fallback_x_pct > 0.001
and fallback_y_pct > 0.001
and not (
abs(fallback_x_pct - 0.5) < 0.001
and abs(fallback_y_pct - 0.5) < 0.001
)
)
def _is_close_tab_target(target_spec: Optional[Dict[str, Any]]) -> bool:
"""Détecter une action close_tab issue du compilateur replay."""
if not isinstance(target_spec, dict):
return False
context_hints = target_spec.get("context_hints") or {}
return str((context_hints.get("interaction") or "")).strip().lower() == "close_tab"
def _get_expected_close_tab_coords(
    target_spec: Optional[Dict[str, Any]],
    screen_width: int,
    screen_height: int,
    fallback_x_pct: float = 0.0,
    fallback_y_pct: float = 0.0,
) -> Optional[tuple[float, float]]:
    """Find the most reliable expected position for a close_tab click.

    Order of preference:
        1. Explicit fallback coordinates of the replay action.
        2. SoM centre recorded in ``som_element["center_norm"]``.
        3. ``window_capture["rect"]`` origin plus
           ``window_capture["click_relative"]``, divided by the screen size.

    Args:
        target_spec: Target specification of the action, or None.
        screen_width: Screen width used to normalise option 3; option 3 is
            skipped when it is not positive.
        screen_height: Screen height used to normalise option 3; option 3
            is skipped when it is not positive.
        fallback_x_pct: Recorded X coordinate of the action.
        fallback_y_pct: Recorded Y coordinate of the action.

    Returns:
        An ``(x, y)`` pair, or None when no source yields a usable position.
        Options 2 and 3 are only returned when both values lie in
        ``[0.0, 1.0]``. Malformed sub-structures (non-dict ``som_element`` or
        ``window_capture``, non-numeric entries) are skipped, never raised.
    """
    if _has_meaningful_recorded_coords(fallback_x_pct, fallback_y_pct):
        return float(fallback_x_pct), float(fallback_y_pct)
    if not isinstance(target_spec, dict):
        return None

    # --- Option 2: SoM centre ---
    som_element = target_spec.get("som_element")
    # Guard against a truthy non-dict value, which would break ``.get``.
    som_center = (
        som_element.get("center_norm") if isinstance(som_element, dict) else None
    )
    if isinstance(som_center, (list, tuple)) and len(som_center) >= 2:
        try:
            exp_x = float(som_center[0])
            exp_y = float(som_center[1])
            if 0.0 <= exp_x <= 1.0 and 0.0 <= exp_y <= 1.0:
                return exp_x, exp_y
        except (TypeError, ValueError, OverflowError):
            # Non-numeric or out-of-float-range entry: try the next source.
            pass

    # --- Option 3: window rect origin + relative click offset ---
    window_capture = target_spec.get("window_capture")
    if not isinstance(window_capture, dict):
        return None
    rect = window_capture.get("rect")
    click_relative = window_capture.get("click_relative")
    if (
        isinstance(rect, (list, tuple))
        and len(rect) >= 4
        and isinstance(click_relative, (list, tuple))
        and len(click_relative) >= 2
        and screen_width > 0
        and screen_height > 0
    ):
        try:
            # Only rect[0]/rect[1] are read (presumably the window's
            # top-left corner in screen pixels — confirm with the recorder).
            abs_x = float(rect[0]) + float(click_relative[0])
            abs_y = float(rect[1]) + float(click_relative[1])
            exp_x = abs_x / float(screen_width)
            exp_y = abs_y / float(screen_height)
            if 0.0 <= exp_x <= 1.0 and 0.0 <= exp_y <= 1.0:
                return exp_x, exp_y
        except (TypeError, ValueError, OverflowError, ZeroDivisionError):
            pass

    return None
def _is_close_tab_result_plausible(
    resolved_x: float,
    resolved_y: float,
    target_spec: Optional[Dict[str, Any]],
    screen_width: int,
    screen_height: int,
    fallback_x_pct: float = 0.0,
    fallback_y_pct: float = 0.0,
) -> bool:
    """Filter close_tab false positives that drift toward the close button.

    The guard only applies to close_tab targets for which an expected
    position can be recovered; in every other case the result is accepted.

    Args:
        resolved_x: Resolved X coordinate to check.
        resolved_y: Resolved Y coordinate to check.
        target_spec: Target specification of the action, or None.
        screen_width: Forwarded to ``_get_expected_close_tab_coords``.
        screen_height: Forwarded to ``_get_expected_close_tab_coords``.
        fallback_x_pct: Recorded X coordinate of the action.
        fallback_y_pct: Recorded Y coordinate of the action.

    Returns:
        False when the horizontal drift exceeds 0.18 or the Euclidean
        distance to the expected position exceeds 0.20; True otherwise.
    """
    if not _is_close_tab_target(target_spec):
        return True

    expected_xy = _get_expected_close_tab_coords(
        target_spec,
        screen_width,
        screen_height,
        fallback_x_pct=fallback_x_pct,
        fallback_y_pct=fallback_y_pct,
    )
    if expected_xy is None:
        # Nothing to compare against: do not block the resolution.
        return True

    ref_x, ref_y = expected_xy
    got_x = float(resolved_x)
    got_y = float(resolved_y)
    drift_x = abs(got_x - ref_x)
    drift_y = abs(got_y - ref_y)
    radial = (drift_x ** 2 + drift_y ** 2) ** 0.5

    # The horizontal axis gets its own, tighter bound; the vertical drift
    # is only constrained through the overall distance.
    if drift_x <= 0.18 and radial <= 0.20:
        return True

    logger.warning(
        "close_tab guard : résultat rejeté car trop éloigné de la zone "
        "source (resolved=(%.4f, %.4f), expected=(%.4f, %.4f), "
        "drift=(%.4f, %.4f), dist=%.4f)",
        got_x,
        got_y,
        ref_x,
        ref_y,
        drift_x,
        drift_y,
        radial,
    )
    return False
def _is_start_button_vlm_result_plausible(
result: Dict[str, Any],
fallback_x_pct: float,
fallback_y_pct: float,
target_spec: Dict[str, Any],
max_distance: float = 0.20,
) -> bool:
"""Filtrer les faux positifs VLM sur le bouton Démarrer.
Le bouton Démarrer est un singleton système. Quand on dispose d'un vrai clic
enregistré (`fallback_*`), une localisation VLM très éloignée de cette zone
est plus probablement un faux positif qu'un vrai déplacement UI.
"""
by_role = str(target_spec.get("by_role", "") or "").strip().lower()
if by_role != "start_button":
return True
if not _has_meaningful_recorded_coords(fallback_x_pct, fallback_y_pct):
return True
if _validate_match_context(
result,
fallback_x_pct,
fallback_y_pct,
target_spec,
max_distance=max_distance,
):
return True
logger.warning(
"Start button guard : résultat VLM rejeté car trop éloigné de la "
"position enregistrée (resolved=(%.4f, %.4f), expected=(%.4f, %.4f), max=%.2f)",
float(result.get("x_pct", 0) or 0),
float(result.get("y_pct", 0) or 0),
fallback_x_pct,
fallback_y_pct,
max_distance,
)
return False
# =========================================================================
# YOLO/OmniParser — Résolution par détection d'éléments UI
# =========================================================================
@@ -1109,16 +1271,66 @@ def _resolve_by_som(
# Centre du match
match_cx = max_loc[0] + anc_w // 2
match_cy = max_loc[1] + anc_h // 2
interaction = str(
(target_spec.get("context_hints") or {}).get("interaction", "") or ""
).strip().lower()
if interaction == "close_tab":
elapsed = time.time() - t0
cx_norm = match_cx / screen_width if screen_width > 0 else 0.0
cy_norm = match_cy / screen_height if screen_height > 0 else 0.0
if _is_close_tab_result_plausible(
cx_norm,
cy_norm,
target_spec,
screen_width,
screen_height,
):
logger.info(
"SoM resolve ANCHOR exact close_tab : score=%.3f "
"centre=(%d, %d) → (%.4f, %.4f) en %.1fs",
max_score, match_cx, match_cy, cx_norm, cy_norm, elapsed,
)
return {
"resolved": True,
"method": "som_anchor_match",
"x_pct": round(cx_norm, 6),
"y_pct": round(cy_norm, 6),
"matched_element": {
"label": "close_tab_button",
"type": "visual_anchor",
"role": "som_anchor_exact",
"confidence": max_score,
},
"score": max_score,
"match_box": {
"x": int(max_loc[0]),
"y": int(max_loc[1]),
"width": int(anc_w),
"height": int(anc_h),
},
}
logger.warning(
"SoM resolve ANCHOR exact close_tab rejeté : score=%.3f "
"centre=(%d, %d) → (%.4f, %.4f), passage VLM/fallback",
max_score, match_cx, match_cy, cx_norm, cy_norm,
)
# Ne pas recycler ce faux match vers l'élément SoM le plus
# proche : pour close_tab, cela retombe facilement sur le
# bouton de fermeture de la fenêtre.
best_elem = None
else:
best_elem = None
# Trouver l'élément SomEngine le plus proche du centre du match
best_elem = None
best_dist = float("inf")
for elem in som_result.elements:
cx, cy = elem.center
dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5
if dist < best_dist:
best_dist = dist
best_elem = elem
if best_elem is None and interaction != "close_tab":
for elem in som_result.elements:
cx, cy = elem.center
dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5
if dist < best_dist:
best_dist = dist
best_elem = elem
if best_elem and best_dist < 100: # Max 100px de distance
elapsed = time.time() - t0
@@ -1584,6 +1796,49 @@ def _resolve_target_sync(
"fallback cascade legacy"
)
# ===================================================================
# Cas spécial : boutons de dialogue runtime ("Oui", "Non", "OK", ...)
# ===================================================================
# Ces boutons sont textuels, sans ancre stable, et apparaissent souvent
# au milieu d'une action déjà en cours. Si on les laisse partir dans la
# cascade générique (VLM -> SoM -> ScreenAnalyzer), on peut bloquer
# l'action principale assez longtemps pour déclencher le watchdog.
# Contrat voulu : OCR direct rapide, sinon abandon immédiat pour que le
# client essaie son fallback local par template texte.
dialog_role = str(target_spec.get("by_role", "") or "").strip().lower()
dialog_text = str(target_spec.get("by_text", "") or "").strip()
if dialog_role == "dialog_button" and dialog_text and not anchor_image_b64:
ocr_result = _resolve_by_ocr_text(
screenshot_path=screenshot_path,
target_text=dialog_text,
screen_width=screen_width,
screen_height=screen_height,
)
if ocr_result and ocr_result.get("score", 0) >= 0.80:
ocr_result["method"] = "hybrid_text_direct"
logger.info(
"Resolve dialog_button OCR-DIRECT : OK '%s' → (%.4f, %.4f) score=%.2f",
dialog_text[:40],
ocr_result.get("x_pct", 0),
ocr_result.get("y_pct", 0),
ocr_result.get("score", 0),
)
return ocr_result
logger.info(
"Resolve dialog_button OCR-only : '%s' non trouvé "
"(fenêtre='%s') — skip VLM/SoM/ScreenAnalyzer",
dialog_text[:40],
str(target_spec.get("window_title", "") or "")[:80],
)
return {
"resolved": False,
"method": "dialog_button_ocr_only",
"reason": "ocr_direct_failed_dialog_button_no_vlm",
"x_pct": fallback_x_pct,
"y_pct": fallback_y_pct,
}
# ===================================================================
# MODE STRICT (replay sessions) — Stratégie VLM-FIRST
# ===================================================================
@@ -1656,13 +1911,25 @@ def _resolve_target_sync(
screen_height=screen_height,
)
if grounding_result and grounding_result.get("resolved"):
logger.info(
"Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
grounding_result.get("x_pct", 0),
grounding_result.get("y_pct", 0),
by_text_strict[:50],
if _is_close_tab_result_plausible(
float(grounding_result.get("x_pct", 0) or 0),
float(grounding_result.get("y_pct", 0) or 0),
target_spec,
screen_width,
screen_height,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.info(
"Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
grounding_result.get("x_pct", 0),
grounding_result.get("y_pct", 0),
by_text_strict[:50],
)
return grounding_result
logger.warning(
"Strict resolve GROUNDING : résultat close_tab rejeté, passage template/VLM"
)
return grounding_result
if not by_text_strict or by_text_source not in ("ocr", "vlm"):
# Template matching pour les éléments sans texte (icônes pures)
@@ -1690,11 +1957,23 @@ def _resolve_target_sync(
abs_y = window_rect[1] + y_tm * tm_screen_h
result["x_pct"] = round(abs_x / screen_width, 6)
result["y_pct"] = round(abs_y / screen_height, 6)
logger.info(
"Strict resolve TEMPLATE : icon match (score=%.3f)",
result.get("score", 0),
if _is_close_tab_result_plausible(
float(result.get("x_pct", 0) or 0),
float(result.get("y_pct", 0) or 0),
target_spec,
screen_width,
screen_height,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.info(
"Strict resolve TEMPLATE : icon match (score=%.3f)",
result.get("score", 0),
)
return result
logger.warning(
"Strict resolve TEMPLATE : résultat close_tab rejeté, passage cascade suivante"
)
return result
# ---------------------------------------------------------------
# Étape 0.5 : OCR direct (hybrid_text_direct) — chemin rapide
@@ -1739,6 +2018,27 @@ def _resolve_target_sync(
by_text_strict[:40],
)
# Les boutons de dialogues runtime connus ("Oui", "Non", "OK", etc.)
# ne doivent pas partir dans la cascade lente VLM -> SoM. Si l'OCR
# direct ne les trouve pas immédiatement, on rend la main au client
# pour son fallback local par template texte, sinon on bloque l'action
# principale assez longtemps pour déclencher le watchdog.
dialog_role = str(target_spec.get("by_role", "") or "").strip().lower()
if dialog_role == "dialog_button" and by_text_strict and not anchor_image_b64:
logger.info(
"Strict resolve dialog_button : OCR-direct only pour '%s' "
"(fenêtre='%s') — skip VLM/SoM/template",
by_text_strict[:40],
str(target_spec.get("window_title", "") or "")[:80],
)
return {
"resolved": False,
"method": "dialog_button_ocr_only",
"reason": "ocr_direct_failed_dialog_button_no_vlm",
"x_pct": fallback_x_pct,
"y_pct": fallback_y_pct,
}
# ---------------------------------------------------------------
# Étape 1 : VLM Quick Find (fallback, multi-image)
# ---------------------------------------------------------------
@@ -1750,12 +2050,29 @@ def _resolve_target_sync(
)
if vlm_result and vlm_result.get("resolved"):
if vlm_result.get("score", 0) >= 0.3:
logger.info(
"Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'",
vlm_result.get("score", 0),
vlm_description[:60] if vlm_description else "(anchor)",
if _is_start_button_vlm_result_plausible(
vlm_result,
fallback_x_pct,
fallback_y_pct,
target_spec,
) and _is_close_tab_result_plausible(
float(vlm_result.get("x_pct", 0) or 0),
float(vlm_result.get("y_pct", 0) or 0),
target_spec,
screen_width,
screen_height,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.info(
"Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'",
vlm_result.get("score", 0),
vlm_description[:60] if vlm_description else "(anchor)",
)
return vlm_result
logger.warning(
"Strict resolve VLM-first : résultat VLM rejeté par un garde-fou, passage SoM/template"
)
return vlm_result
else:
logger.info(
"Strict resolve VLM-first : VLM score=%.2f trop bas, passage template",
@@ -1782,12 +2099,24 @@ def _resolve_target_sync(
screen_height=screen_height,
)
if som_result and som_result.get("resolved"):
logger.info(
"Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)",
som_result.get("score", 0),
som_result.get("matched_element", {}).get("som_id", "?"),
if _is_close_tab_result_plausible(
float(som_result.get("x_pct", 0) or 0),
float(som_result.get("y_pct", 0) or 0),
target_spec,
screen_width,
screen_height,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.info(
"Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)",
som_result.get("score", 0),
som_result.get("matched_element", {}).get("som_id", "?"),
)
return som_result
logger.warning(
"Strict resolve SoM+VLM : résultat close_tab rejeté, passage template matching"
)
return som_result
else:
logger.info("Strict resolve SoM+VLM : échoué, passage template matching")
@@ -1805,12 +2134,24 @@ def _resolve_target_sync(
score = result.get("score", 0)
# Score >= 0.95 : match quasi-parfait, pas besoin de valider le contexte
if score >= 0.95:
logger.info(
"Strict resolve VLM-first : template matching fallback OK "
"(score=%.3f >= 0.95, contexte skip — match quasi-parfait)",
score,
if _is_close_tab_result_plausible(
float(result.get("x_pct", 0) or 0),
float(result.get("y_pct", 0) or 0),
target_spec,
screen_width,
screen_height,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.info(
"Strict resolve VLM-first : template matching fallback OK "
"(score=%.3f >= 0.95, contexte skip — match quasi-parfait)",
score,
)
return result
logger.warning(
"Strict resolve TEMPLATE : match close_tab très fort mais hors zone source, rejeté"
)
return result
elif _validate_match_context(result, fallback_x_pct, fallback_y_pct, target_spec):
logger.info(
"Strict resolve VLM-first : template matching fallback OK "
@@ -2189,6 +2530,37 @@ def _text_match_fuzzy(expected: str, observed: str, min_token_ratio: float = 0.6
return matched / len(tokens) >= min_token_ratio
# Padding, in screen pixels, applied on every side of a SoM bounding box
# before it is used as the OCR zone in `_validate_text_at_position`.
_SOM_BBOX_OCR_PADDING_PX: int = 8
# Minimum raw (pre-padding) width and height, in screen pixels, a SoM
# bounding box must have to be used as the OCR zone; smaller boxes fall
# back to the radius-based crop.
_SOM_BBOX_MIN_DIM_PX: int = 12
def _should_reject_on_text_mismatch(
is_valid: bool,
observed: Optional[str],
) -> bool:
"""Décide si le pré-check OCR doit rejeter la résolution.
Patch 2026-05-23 : on distingue deux cas d'échec du fuzzy match :
- ``observed`` contient du texte (ex: ``'9 ?'``, ``'OBS Studio…'``)
→ mismatch confirmé, la cascade a probablement cliqué ailleurs
→ on rejette.
- ``observed`` est vide ou whitespace
→ l'OCR n'a rien lu (zone trop petite, texte peu contrasté,
modèle EasyOCR sous le seuil de détection). C'est ambigu :
ce n'est PAS la preuve d'un faux positif, on accepte la
résolution serveur. La garde drift ANCHOR-TM côté agent
protège en aval contre les vrais faux positifs.
Si ``is_valid=True`` → jamais de rejet (cas nominal).
"""
if is_valid:
return False
if observed is None:
return False
return bool(str(observed).strip())
def _validate_text_at_position(
screenshot_path: str,
x_pct: float,
@@ -2197,9 +2569,20 @@ def _validate_text_at_position(
screen_width: int,
screen_height: int,
radius_px: int = 280,
som_bbox_norm: Optional[List[float]] = None,
) -> tuple:
"""Pré-check sémantique : OCR sur une zone autour de (x_pct, y_pct) et
vérifie que `expected_text` y est présent (substring ou fuzzy 50%).
"""Pré-check sémantique : OCR sur une zone et vérifie que
`expected_text` y est présent (substring ou fuzzy 50%).
Zone OCR (par priorité) :
1. Si ``som_bbox_norm = [x1, y1, x2, y2]`` (normalisé 0..1) est
fourni et a une largeur/hauteur > _SOM_BBOX_MIN_DIM_PX en
pixels écran : OCR sur cette bbox élargie d'un padding court.
Plus précis pour les éléments étroits (onglets Notepad
moderne, ~30-40px haut) que le radius générique qui capture
le texte voisin (status bar, etc.).
2. Sinon : fallback historique → carré de ``radius_px`` autour
de (x_pct, y_pct).
Retourne (is_valid: bool, observed_text: str, elapsed_ms: float).
@@ -2219,16 +2602,52 @@ def _validate_text_at_position(
t0 = time.time()
img = Image.open(screenshot_path).convert("RGB")
img_w, img_h = img.size
cx = int(x_pct * screen_width)
cy = int(y_pct * screen_height)
# Saturer dans les bornes de l'image (le screenshot peut être plus
# large que la fenêtre logique — utiliser min(img_*, screen_*) en sécurité).
max_x = min(img_w, screen_width)
max_y = min(img_h, screen_height)
x1 = max(0, cx - radius_px)
y1 = max(0, cy - radius_px)
x2 = min(max_x, cx + radius_px)
y2 = min(max_y, cy + radius_px)
# --- Tentative 1 : zone OCR depuis la bbox SoM (préférée) ---
x1 = y1 = x2 = y2 = None
if (
isinstance(som_bbox_norm, (list, tuple))
and len(som_bbox_norm) == 4
):
try:
bx1, by1, bx2, by2 = (float(v) for v in som_bbox_norm)
# Tolérer ordre inversé.
bx1, bx2 = sorted((bx1, bx2))
by1, by2 = sorted((by1, by2))
# Refuser les bboxes dégénérées AVANT padding : si
# l'élément cible fait < _SOM_BBOX_MIN_DIM_PX en
# natif, c'est probablement une bbox d'apparence
# (curseur, séparateur 1px) — pas un label OCRable.
raw_w = (bx2 - bx1) * screen_width
raw_h = (by2 - by1) * screen_height
if (
raw_w >= _SOM_BBOX_MIN_DIM_PX
and raw_h >= _SOM_BBOX_MIN_DIM_PX
):
# Conversion en pixels écran + clipping et padding.
px1 = int(bx1 * screen_width) - _SOM_BBOX_OCR_PADDING_PX
py1 = int(by1 * screen_height) - _SOM_BBOX_OCR_PADDING_PX
px2 = int(bx2 * screen_width) + _SOM_BBOX_OCR_PADDING_PX
py2 = int(by2 * screen_height) + _SOM_BBOX_OCR_PADDING_PX
x1 = max(0, px1)
y1 = max(0, py1)
x2 = min(max_x, px2)
y2 = min(max_y, py2)
except (TypeError, ValueError):
# Bbox malformée : fallback silencieux sur le radius.
x1 = y1 = x2 = y2 = None
# --- Fallback : carré radius_px autour de (x_pct, y_pct) ---
if x1 is None:
cx = int(x_pct * screen_width)
cy = int(y_pct * screen_height)
x1 = max(0, cx - radius_px)
y1 = max(0, cy - radius_px)
x2 = min(max_x, cx + radius_px)
y2 = min(max_y, cy + radius_px)
if x2 - x1 < 10 or y2 - y1 < 10:
return True, "", 0.0
crop = img.crop((x1, y1, x2, y2))
@@ -2246,6 +2665,7 @@ def _validate_resolution_quality(
result: Optional[Dict[str, Any]],
fallback_x_pct: float,
fallback_y_pct: float,
target_spec: Optional[Dict[str, Any]] = None,
) -> Optional[Dict[str, Any]]:
"""Valide un résultat de résolution et le rejette s'il est peu fiable.
@@ -2263,6 +2683,16 @@ def _validate_resolution_quality(
elle n'est PAS appelée par les méthodes internes de la cascade, mais
uniquement depuis le handler HTTP `/resolve_target` après que la
cascade a produit son meilleur candidat.
Argument optionnel `target_spec` : permet d'appliquer des relaxations
contextuelles. Cas couvert (2026-05-22) : pour une cible
`context_hints.interaction == "switch_tab"` qui dispose d'un
`som_element.bbox_norm`, on abaisse le seuil des méthodes ``som_*``
de 0.75 → 0.60. Justification : (1) le focus_change pré-clic
prouve qu'on est dans la bonne fenêtre, (2) la bbox SoM a été
calibrée à l'enregistrement et reste valide, (3) les onglets
Notepad moderne sont visuellement quasi-identiques → score VLM
inévitablement lower.
"""
if not result or not isinstance(result, dict):
return result
@@ -2291,6 +2721,52 @@ def _validate_resolution_quality(
min_score = threshold
break
# Relaxation contextuelle pour switch_tab + SoM calibré (2026-05-22).
# Les onglets Notepad moderne (et apps similaires) sont visuellement
# quasi-identiques : le grounding VLM/SoM produit fréquemment un
# score 0.65-0.75, juste sous le seuil strict. Comme le contexte
# `interaction=switch_tab` + bbox SoM enregistrée + focus_change
# pré-clic confirment déjà la fenêtre et la zone, on relâche le
# seuil des méthodes som_* à 0.60 dans CE cas précis uniquement.
if (
min_score is not None
and target_spec
and method.startswith("som_")
):
context_hints = target_spec.get("context_hints") or {}
is_tab_switch = (
context_hints.get("interaction") == "switch_tab"
and target_spec.get("by_role") == "tab"
)
som_element = target_spec.get("som_element") or {}
has_calibrated_som = bool(som_element.get("bbox_norm"))
if is_tab_switch and has_calibrated_som:
relaxed = 0.60
if relaxed < min_score:
logger.info(
"[REPLAY] switch_tab + som_element calibré → seuil "
"som_* relâché %.2f%.2f (cible='%s')",
min_score, relaxed,
target_spec.get("by_text", ""),
)
min_score = relaxed
is_close_tab = (
method == "som_anchor_match"
and str((context_hints.get("interaction") or "")).strip().lower() == "close_tab"
and not str(target_spec.get("by_text", "") or "").strip()
and bool(target_spec.get("anchor_image_base64"))
)
if is_close_tab:
relaxed = 0.70
if relaxed < min_score:
logger.info(
"[REPLAY] close_tab + anchor-only → seuil som_anchor_match "
"relâché %.2f%.2f",
min_score, relaxed,
)
min_score = relaxed
if min_score is not None and score < min_score:
logger.warning(
"[REPLAY] Resolution REJETÉE (score trop bas) : method=%s score=%.3f < %.2f",
@@ -2306,13 +2782,40 @@ def _validate_resolution_quality(
"y_pct": fallback_y_pct,
}
if _is_close_tab_target(target_spec) and not _is_close_tab_result_plausible(
resolved_x,
resolved_y,
target_spec,
0,
0,
fallback_x_pct=fallback_x_pct,
fallback_y_pct=fallback_y_pct,
):
logger.warning(
"[REPLAY] Resolution REJETÉE (close_tab hors zone source) : "
"method=%s resolved=(%.3f, %.3f) expected=(%.3f, %.3f)",
method,
resolved_x,
resolved_y,
fallback_x_pct,
fallback_y_pct,
)
return {
"resolved": False,
"method": f"rejected_close_tab_zone_{method}",
"reason": "close_tab_out_of_recorded_zone",
"original_method": method,
"original_score": score,
"x_pct": fallback_x_pct,
"y_pct": fallback_y_pct,
}
# --- Check 2 : garde de proximité ---
# On n'applique la garde que si les coordonnées enregistrées ont un
# sens (pas des placeholders 0.5/0.5 des plans V4 ni des 0.0/0.0).
_has_recorded_coords = (
fallback_x_pct > 0.001
and fallback_y_pct > 0.001
and not (abs(fallback_x_pct - 0.5) < 0.001 and abs(fallback_y_pct - 0.5) < 0.001)
_has_recorded_coords = _has_meaningful_recorded_coords(
fallback_x_pct,
fallback_y_pct,
)
if _has_recorded_coords:
dx = abs(resolved_x - fallback_x_pct)