snapshot: WIP 5j replay reliability (B1 watchdog + dialog handlers + grounding drift)
Snapshot avant correction du blocage de la relance Léa (3 incidents en 24 h : SSH refusé, polls morts ×2). Point de rollback stable.

Contenu :
- agent_v1/core/executor.py : 5 patchs dialog handling (saveas drift, close_tab hotkey fallback, confirm_save Unicode apostrophe, foreground dialog recontextualization, runtime_dialog in-loop) + helpers normalize_window_hint, requires_post_verify_window_transition
- agent_v1/core/grounding.py : garde drift template fix (fallback_x/y plumbed)
- server_v1/replay_watchdog.py (NEW) : orphan watchdog B1, scan 10 s, timeout 30 s
- server_v1/api_stream.py : dispatched_action plumbing, watchdog lifespan, metrics endpoint
- server_v1/replay_engine.py : _schedule_retry préserve original_action + dispatched_action
- stream_processor.py : gardes _infer_tab_switch_target (no false switch_tab on save_as dialog open) + _attach_expected_window_before
- tests/integration : test_replay_watchdog.py (8 cas), test_stream_processor.py
- tests/unit : test_executor_verify_window_guard.py (start_button, close_tab, runtime_dialog, post_verify, transition fallbacks)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -243,6 +243,168 @@ def _validate_match_context(
|
||||
return True
|
||||
|
||||
|
||||
def _has_meaningful_recorded_coords(
|
||||
fallback_x_pct: float,
|
||||
fallback_y_pct: float,
|
||||
) -> bool:
|
||||
"""Indiquer si les coordonnées fallback représentent une vraie position source."""
|
||||
return (
|
||||
fallback_x_pct > 0.001
|
||||
and fallback_y_pct > 0.001
|
||||
and not (
|
||||
abs(fallback_x_pct - 0.5) < 0.001
|
||||
and abs(fallback_y_pct - 0.5) < 0.001
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def _is_close_tab_target(target_spec: Optional[Dict[str, Any]]) -> bool:
|
||||
"""Détecter une action close_tab issue du compilateur replay."""
|
||||
if not isinstance(target_spec, dict):
|
||||
return False
|
||||
context_hints = target_spec.get("context_hints") or {}
|
||||
return str((context_hints.get("interaction") or "")).strip().lower() == "close_tab"
|
||||
|
||||
|
||||
def _get_expected_close_tab_coords(
    target_spec: Optional[Dict[str, Any]],
    screen_width: int,
    screen_height: int,
    fallback_x_pct: float = 0.0,
    fallback_y_pct: float = 0.0,
) -> Optional[tuple[float, float]]:
    """Find the most trustworthy expected position for a close_tab action.

    Sources are tried in this order and the first usable one wins:

    1. the explicit fallback coordinates, when
       ``_has_meaningful_recorded_coords`` accepts them;
    2. ``target_spec["som_element"]["center_norm"]``;
    3. ``window_capture["rect"]`` origin plus ``window_capture["click_relative"]``,
       divided by the screen size.

    Sources 2 and 3 are only accepted when both resulting values lie in
    ``[0.0, 1.0]``; source 1 is returned as-is.

    Args:
        target_spec: Target description; must be a ``dict`` for sources 2/3.
        screen_width: Screen width used to normalise source 3 (must be > 0).
        screen_height: Screen height used to normalise source 3 (must be > 0).
        fallback_x_pct: Recorded horizontal fallback coordinate.
        fallback_y_pct: Recorded vertical fallback coordinate.

    Returns:
        An ``(x, y)`` pair of floats, or ``None`` when no source is usable.
    """
    # Source 1: explicit coordinates carried by the action.
    if _has_meaningful_recorded_coords(fallback_x_pct, fallback_y_pct):
        return float(fallback_x_pct), float(fallback_y_pct)

    if not isinstance(target_spec, dict):
        return None

    # Source 2: centre stored under som_element.
    som_element = target_spec.get("som_element") or {}
    center = som_element.get("center_norm")
    if isinstance(center, (list, tuple)) and len(center) >= 2:
        try:
            som_x = float(center[0])
            som_y = float(center[1])
        except (TypeError, ValueError):
            # Non-numeric entries: fall through to the next source.
            pass
        else:
            if 0.0 <= som_x <= 1.0 and 0.0 <= som_y <= 1.0:
                return som_x, som_y

    # Source 3: rect origin + click offset, normalised by the screen size.
    # NOTE(review): presumably rect[0:2] is the window's top-left corner in
    # screen pixels and click_relative an offset from it -- confirm upstream.
    capture = target_spec.get("window_capture") or {}
    rect = capture.get("rect")
    offset = capture.get("click_relative")
    geometry_ok = (
        isinstance(rect, (list, tuple))
        and len(rect) >= 4
        and isinstance(offset, (list, tuple))
        and len(offset) >= 2
    )
    if not (geometry_ok and screen_width > 0 and screen_height > 0):
        return None

    try:
        pixel_x = float(rect[0]) + float(offset[0])
        pixel_y = float(rect[1]) + float(offset[1])
        norm_x = pixel_x / float(screen_width)
        norm_y = pixel_y / float(screen_height)
    except (TypeError, ValueError, ZeroDivisionError):
        return None

    if 0.0 <= norm_x <= 1.0 and 0.0 <= norm_y <= 1.0:
        return norm_x, norm_y
    return None
|
||||
|
||||
|
||||
def _is_close_tab_result_plausible(
    resolved_x: float,
    resolved_y: float,
    target_spec: Optional[Dict[str, Any]],
    screen_width: int,
    screen_height: int,
    fallback_x_pct: float = 0.0,
    fallback_y_pct: float = 0.0,
) -> bool:
    """Filter out close_tab false positives that drift toward the close button.

    The guard only applies to targets recognised by ``_is_close_tab_target``
    and only when ``_get_expected_close_tab_coords`` yields a reference
    position. In every other case the result is accepted unchanged.

    A resolved position is accepted when its horizontal drift from the
    reference is at most 0.18 and its Euclidean distance is at most 0.20.
    Rejections are logged at WARNING level.

    Args:
        resolved_x: Horizontal coordinate proposed by the resolver.
        resolved_y: Vertical coordinate proposed by the resolver.
        target_spec: Target description of the action being resolved.
        screen_width: Forwarded to ``_get_expected_close_tab_coords``.
        screen_height: Forwarded to ``_get_expected_close_tab_coords``.
        fallback_x_pct: Recorded horizontal fallback coordinate.
        fallback_y_pct: Recorded vertical fallback coordinate.

    Returns:
        True when the result is accepted (or the guard does not apply),
        False when it is rejected.
    """
    if not _is_close_tab_target(target_spec):
        return True

    reference = _get_expected_close_tab_coords(
        target_spec,
        screen_width,
        screen_height,
        fallback_x_pct=fallback_x_pct,
        fallback_y_pct=fallback_y_pct,
    )
    if reference is None:
        # Nothing to compare against: do not block the result.
        return True

    ref_x, ref_y = reference
    got_x = float(resolved_x)
    got_y = float(resolved_y)
    drift_x = abs(got_x - ref_x)
    drift_y = abs(got_y - ref_y)
    euclid = (drift_x ** 2 + drift_y ** 2) ** 0.5

    if drift_x <= 0.18 and euclid <= 0.20:
        return True

    logger.warning(
        "close_tab guard : résultat rejeté car trop éloigné de la zone "
        "source (resolved=(%.4f, %.4f), expected=(%.4f, %.4f), "
        "drift=(%.4f, %.4f), dist=%.4f)",
        got_x,
        got_y,
        ref_x,
        ref_y,
        drift_x,
        drift_y,
        euclid,
    )
    return False
|
||||
|
||||
|
||||
def _is_start_button_vlm_result_plausible(
|
||||
result: Dict[str, Any],
|
||||
fallback_x_pct: float,
|
||||
fallback_y_pct: float,
|
||||
target_spec: Dict[str, Any],
|
||||
max_distance: float = 0.20,
|
||||
) -> bool:
|
||||
"""Filtrer les faux positifs VLM sur le bouton Démarrer.
|
||||
|
||||
Le bouton Démarrer est un singleton système. Quand on dispose d'un vrai clic
|
||||
enregistré (`fallback_*`), une localisation VLM très éloignée de cette zone
|
||||
est plus probablement un faux positif qu'un vrai déplacement UI.
|
||||
"""
|
||||
by_role = str(target_spec.get("by_role", "") or "").strip().lower()
|
||||
if by_role != "start_button":
|
||||
return True
|
||||
|
||||
if not _has_meaningful_recorded_coords(fallback_x_pct, fallback_y_pct):
|
||||
return True
|
||||
|
||||
if _validate_match_context(
|
||||
result,
|
||||
fallback_x_pct,
|
||||
fallback_y_pct,
|
||||
target_spec,
|
||||
max_distance=max_distance,
|
||||
):
|
||||
return True
|
||||
|
||||
logger.warning(
|
||||
"Start button guard : résultat VLM rejeté car trop éloigné de la "
|
||||
"position enregistrée (resolved=(%.4f, %.4f), expected=(%.4f, %.4f), max=%.2f)",
|
||||
float(result.get("x_pct", 0) or 0),
|
||||
float(result.get("y_pct", 0) or 0),
|
||||
fallback_x_pct,
|
||||
fallback_y_pct,
|
||||
max_distance,
|
||||
)
|
||||
return False
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# YOLO/OmniParser — Résolution par détection d'éléments UI
|
||||
# =========================================================================
|
||||
@@ -1109,16 +1271,66 @@ def _resolve_by_som(
|
||||
# Centre du match
|
||||
match_cx = max_loc[0] + anc_w // 2
|
||||
match_cy = max_loc[1] + anc_h // 2
|
||||
interaction = str(
|
||||
(target_spec.get("context_hints") or {}).get("interaction", "") or ""
|
||||
).strip().lower()
|
||||
|
||||
if interaction == "close_tab":
|
||||
elapsed = time.time() - t0
|
||||
cx_norm = match_cx / screen_width if screen_width > 0 else 0.0
|
||||
cy_norm = match_cy / screen_height if screen_height > 0 else 0.0
|
||||
if _is_close_tab_result_plausible(
|
||||
cx_norm,
|
||||
cy_norm,
|
||||
target_spec,
|
||||
screen_width,
|
||||
screen_height,
|
||||
):
|
||||
logger.info(
|
||||
"SoM resolve ANCHOR exact close_tab : score=%.3f "
|
||||
"centre=(%d, %d) → (%.4f, %.4f) en %.1fs",
|
||||
max_score, match_cx, match_cy, cx_norm, cy_norm, elapsed,
|
||||
)
|
||||
return {
|
||||
"resolved": True,
|
||||
"method": "som_anchor_match",
|
||||
"x_pct": round(cx_norm, 6),
|
||||
"y_pct": round(cy_norm, 6),
|
||||
"matched_element": {
|
||||
"label": "close_tab_button",
|
||||
"type": "visual_anchor",
|
||||
"role": "som_anchor_exact",
|
||||
"confidence": max_score,
|
||||
},
|
||||
"score": max_score,
|
||||
"match_box": {
|
||||
"x": int(max_loc[0]),
|
||||
"y": int(max_loc[1]),
|
||||
"width": int(anc_w),
|
||||
"height": int(anc_h),
|
||||
},
|
||||
}
|
||||
logger.warning(
|
||||
"SoM resolve ANCHOR exact close_tab rejeté : score=%.3f "
|
||||
"centre=(%d, %d) → (%.4f, %.4f), passage VLM/fallback",
|
||||
max_score, match_cx, match_cy, cx_norm, cy_norm,
|
||||
)
|
||||
# Ne pas recycler ce faux match vers l'élément SoM le plus
|
||||
# proche : pour close_tab, cela retombe facilement sur le
|
||||
# bouton de fermeture de la fenêtre.
|
||||
best_elem = None
|
||||
else:
|
||||
best_elem = None
|
||||
|
||||
# Trouver l'élément SomEngine le plus proche du centre du match
|
||||
best_elem = None
|
||||
best_dist = float("inf")
|
||||
for elem in som_result.elements:
|
||||
cx, cy = elem.center
|
||||
dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5
|
||||
if dist < best_dist:
|
||||
best_dist = dist
|
||||
best_elem = elem
|
||||
if best_elem is None and interaction != "close_tab":
|
||||
for elem in som_result.elements:
|
||||
cx, cy = elem.center
|
||||
dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5
|
||||
if dist < best_dist:
|
||||
best_dist = dist
|
||||
best_elem = elem
|
||||
|
||||
if best_elem and best_dist < 100: # Max 100px de distance
|
||||
elapsed = time.time() - t0
|
||||
@@ -1584,6 +1796,49 @@ def _resolve_target_sync(
|
||||
"fallback cascade legacy"
|
||||
)
|
||||
|
||||
# ===================================================================
|
||||
# Cas spécial : boutons de dialogue runtime ("Oui", "Non", "OK", ...)
|
||||
# ===================================================================
|
||||
# Ces boutons sont textuels, sans ancre stable, et apparaissent souvent
|
||||
# au milieu d'une action déjà en cours. Si on les laisse partir dans la
|
||||
# cascade générique (VLM -> SoM -> ScreenAnalyzer), on peut bloquer
|
||||
# l'action principale assez longtemps pour déclencher le watchdog.
|
||||
# Contrat voulu : OCR direct rapide, sinon abandon immédiat pour que le
|
||||
# client essaie son fallback local par template texte.
|
||||
dialog_role = str(target_spec.get("by_role", "") or "").strip().lower()
|
||||
dialog_text = str(target_spec.get("by_text", "") or "").strip()
|
||||
if dialog_role == "dialog_button" and dialog_text and not anchor_image_b64:
|
||||
ocr_result = _resolve_by_ocr_text(
|
||||
screenshot_path=screenshot_path,
|
||||
target_text=dialog_text,
|
||||
screen_width=screen_width,
|
||||
screen_height=screen_height,
|
||||
)
|
||||
if ocr_result and ocr_result.get("score", 0) >= 0.80:
|
||||
ocr_result["method"] = "hybrid_text_direct"
|
||||
logger.info(
|
||||
"Resolve dialog_button OCR-DIRECT : OK '%s' → (%.4f, %.4f) score=%.2f",
|
||||
dialog_text[:40],
|
||||
ocr_result.get("x_pct", 0),
|
||||
ocr_result.get("y_pct", 0),
|
||||
ocr_result.get("score", 0),
|
||||
)
|
||||
return ocr_result
|
||||
|
||||
logger.info(
|
||||
"Resolve dialog_button OCR-only : '%s' non trouvé "
|
||||
"(fenêtre='%s') — skip VLM/SoM/ScreenAnalyzer",
|
||||
dialog_text[:40],
|
||||
str(target_spec.get("window_title", "") or "")[:80],
|
||||
)
|
||||
return {
|
||||
"resolved": False,
|
||||
"method": "dialog_button_ocr_only",
|
||||
"reason": "ocr_direct_failed_dialog_button_no_vlm",
|
||||
"x_pct": fallback_x_pct,
|
||||
"y_pct": fallback_y_pct,
|
||||
}
|
||||
|
||||
# ===================================================================
|
||||
# MODE STRICT (replay sessions) — Stratégie VLM-FIRST
|
||||
# ===================================================================
|
||||
@@ -1656,13 +1911,25 @@ def _resolve_target_sync(
|
||||
screen_height=screen_height,
|
||||
)
|
||||
if grounding_result and grounding_result.get("resolved"):
|
||||
logger.info(
|
||||
"Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
|
||||
grounding_result.get("x_pct", 0),
|
||||
grounding_result.get("y_pct", 0),
|
||||
by_text_strict[:50],
|
||||
if _is_close_tab_result_plausible(
|
||||
float(grounding_result.get("x_pct", 0) or 0),
|
||||
float(grounding_result.get("y_pct", 0) or 0),
|
||||
target_spec,
|
||||
screen_width,
|
||||
screen_height,
|
||||
fallback_x_pct=fallback_x_pct,
|
||||
fallback_y_pct=fallback_y_pct,
|
||||
):
|
||||
logger.info(
|
||||
"Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
|
||||
grounding_result.get("x_pct", 0),
|
||||
grounding_result.get("y_pct", 0),
|
||||
by_text_strict[:50],
|
||||
)
|
||||
return grounding_result
|
||||
logger.warning(
|
||||
"Strict resolve GROUNDING : résultat close_tab rejeté, passage template/VLM"
|
||||
)
|
||||
return grounding_result
|
||||
|
||||
if not by_text_strict or by_text_source not in ("ocr", "vlm"):
|
||||
# Template matching pour les éléments sans texte (icônes pures)
|
||||
@@ -1690,11 +1957,23 @@ def _resolve_target_sync(
|
||||
abs_y = window_rect[1] + y_tm * tm_screen_h
|
||||
result["x_pct"] = round(abs_x / screen_width, 6)
|
||||
result["y_pct"] = round(abs_y / screen_height, 6)
|
||||
logger.info(
|
||||
"Strict resolve TEMPLATE : icon match (score=%.3f)",
|
||||
result.get("score", 0),
|
||||
if _is_close_tab_result_plausible(
|
||||
float(result.get("x_pct", 0) or 0),
|
||||
float(result.get("y_pct", 0) or 0),
|
||||
target_spec,
|
||||
screen_width,
|
||||
screen_height,
|
||||
fallback_x_pct=fallback_x_pct,
|
||||
fallback_y_pct=fallback_y_pct,
|
||||
):
|
||||
logger.info(
|
||||
"Strict resolve TEMPLATE : icon match (score=%.3f)",
|
||||
result.get("score", 0),
|
||||
)
|
||||
return result
|
||||
logger.warning(
|
||||
"Strict resolve TEMPLATE : résultat close_tab rejeté, passage cascade suivante"
|
||||
)
|
||||
return result
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Étape 0.5 : OCR direct (hybrid_text_direct) — chemin rapide
|
||||
@@ -1739,6 +2018,27 @@ def _resolve_target_sync(
|
||||
by_text_strict[:40],
|
||||
)
|
||||
|
||||
# Les boutons de dialogues runtime connus ("Oui", "Non", "OK", etc.)
|
||||
# ne doivent pas partir dans la cascade lente VLM -> SoM. Si l'OCR
|
||||
# direct ne les trouve pas immédiatement, on rend la main au client
|
||||
# pour son fallback local par template texte, sinon on bloque l'action
|
||||
# principale assez longtemps pour déclencher le watchdog.
|
||||
dialog_role = str(target_spec.get("by_role", "") or "").strip().lower()
|
||||
if dialog_role == "dialog_button" and by_text_strict and not anchor_image_b64:
|
||||
logger.info(
|
||||
"Strict resolve dialog_button : OCR-direct only pour '%s' "
|
||||
"(fenêtre='%s') — skip VLM/SoM/template",
|
||||
by_text_strict[:40],
|
||||
str(target_spec.get("window_title", "") or "")[:80],
|
||||
)
|
||||
return {
|
||||
"resolved": False,
|
||||
"method": "dialog_button_ocr_only",
|
||||
"reason": "ocr_direct_failed_dialog_button_no_vlm",
|
||||
"x_pct": fallback_x_pct,
|
||||
"y_pct": fallback_y_pct,
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Étape 1 : VLM Quick Find (fallback, multi-image)
|
||||
# ---------------------------------------------------------------
|
||||
@@ -1750,12 +2050,29 @@ def _resolve_target_sync(
|
||||
)
|
||||
if vlm_result and vlm_result.get("resolved"):
|
||||
if vlm_result.get("score", 0) >= 0.3:
|
||||
logger.info(
|
||||
"Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'",
|
||||
vlm_result.get("score", 0),
|
||||
vlm_description[:60] if vlm_description else "(anchor)",
|
||||
if _is_start_button_vlm_result_plausible(
|
||||
vlm_result,
|
||||
fallback_x_pct,
|
||||
fallback_y_pct,
|
||||
target_spec,
|
||||
) and _is_close_tab_result_plausible(
|
||||
float(vlm_result.get("x_pct", 0) or 0),
|
||||
float(vlm_result.get("y_pct", 0) or 0),
|
||||
target_spec,
|
||||
screen_width,
|
||||
screen_height,
|
||||
fallback_x_pct=fallback_x_pct,
|
||||
fallback_y_pct=fallback_y_pct,
|
||||
):
|
||||
logger.info(
|
||||
"Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'",
|
||||
vlm_result.get("score", 0),
|
||||
vlm_description[:60] if vlm_description else "(anchor)",
|
||||
)
|
||||
return vlm_result
|
||||
logger.warning(
|
||||
"Strict resolve VLM-first : résultat VLM rejeté par un garde-fou, passage SoM/template"
|
||||
)
|
||||
return vlm_result
|
||||
else:
|
||||
logger.info(
|
||||
"Strict resolve VLM-first : VLM score=%.2f trop bas, passage template",
|
||||
@@ -1782,12 +2099,24 @@ def _resolve_target_sync(
|
||||
screen_height=screen_height,
|
||||
)
|
||||
if som_result and som_result.get("resolved"):
|
||||
logger.info(
|
||||
"Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)",
|
||||
som_result.get("score", 0),
|
||||
som_result.get("matched_element", {}).get("som_id", "?"),
|
||||
if _is_close_tab_result_plausible(
|
||||
float(som_result.get("x_pct", 0) or 0),
|
||||
float(som_result.get("y_pct", 0) or 0),
|
||||
target_spec,
|
||||
screen_width,
|
||||
screen_height,
|
||||
fallback_x_pct=fallback_x_pct,
|
||||
fallback_y_pct=fallback_y_pct,
|
||||
):
|
||||
logger.info(
|
||||
"Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)",
|
||||
som_result.get("score", 0),
|
||||
som_result.get("matched_element", {}).get("som_id", "?"),
|
||||
)
|
||||
return som_result
|
||||
logger.warning(
|
||||
"Strict resolve SoM+VLM : résultat close_tab rejeté, passage template matching"
|
||||
)
|
||||
return som_result
|
||||
else:
|
||||
logger.info("Strict resolve SoM+VLM : échoué, passage template matching")
|
||||
|
||||
@@ -1805,12 +2134,24 @@ def _resolve_target_sync(
|
||||
score = result.get("score", 0)
|
||||
# Score >= 0.95 : match quasi-parfait, pas besoin de valider le contexte
|
||||
if score >= 0.95:
|
||||
logger.info(
|
||||
"Strict resolve VLM-first : template matching fallback OK "
|
||||
"(score=%.3f >= 0.95, contexte skip — match quasi-parfait)",
|
||||
score,
|
||||
if _is_close_tab_result_plausible(
|
||||
float(result.get("x_pct", 0) or 0),
|
||||
float(result.get("y_pct", 0) or 0),
|
||||
target_spec,
|
||||
screen_width,
|
||||
screen_height,
|
||||
fallback_x_pct=fallback_x_pct,
|
||||
fallback_y_pct=fallback_y_pct,
|
||||
):
|
||||
logger.info(
|
||||
"Strict resolve VLM-first : template matching fallback OK "
|
||||
"(score=%.3f >= 0.95, contexte skip — match quasi-parfait)",
|
||||
score,
|
||||
)
|
||||
return result
|
||||
logger.warning(
|
||||
"Strict resolve TEMPLATE : match close_tab très fort mais hors zone source, rejeté"
|
||||
)
|
||||
return result
|
||||
elif _validate_match_context(result, fallback_x_pct, fallback_y_pct, target_spec):
|
||||
logger.info(
|
||||
"Strict resolve VLM-first : template matching fallback OK "
|
||||
@@ -2189,6 +2530,37 @@ def _text_match_fuzzy(expected: str, observed: str, min_token_ratio: float = 0.6
|
||||
return matched / len(tokens) >= min_token_ratio
|
||||
|
||||
|
||||
_SOM_BBOX_OCR_PADDING_PX: int = 8
|
||||
_SOM_BBOX_MIN_DIM_PX: int = 12
|
||||
|
||||
|
||||
def _should_reject_on_text_mismatch(
|
||||
is_valid: bool,
|
||||
observed: Optional[str],
|
||||
) -> bool:
|
||||
"""Décide si le pré-check OCR doit rejeter la résolution.
|
||||
|
||||
Patch 2026-05-23 : on distingue deux cas d'échec du fuzzy match :
|
||||
|
||||
- ``observed`` contient du texte (ex: ``'9 ?'``, ``'OBS Studio…'``)
|
||||
→ mismatch confirmé, la cascade a probablement cliqué ailleurs
|
||||
→ on rejette.
|
||||
- ``observed`` est vide ou whitespace
|
||||
→ l'OCR n'a rien lu (zone trop petite, texte peu contrasté,
|
||||
modèle EasyOCR sous le seuil de détection). C'est ambigu :
|
||||
ce n'est PAS la preuve d'un faux positif, on accepte la
|
||||
résolution serveur. La garde drift ANCHOR-TM côté agent
|
||||
protège en aval contre les vrais faux positifs.
|
||||
|
||||
Si ``is_valid=True`` → jamais de rejet (cas nominal).
|
||||
"""
|
||||
if is_valid:
|
||||
return False
|
||||
if observed is None:
|
||||
return False
|
||||
return bool(str(observed).strip())
|
||||
|
||||
|
||||
def _validate_text_at_position(
|
||||
screenshot_path: str,
|
||||
x_pct: float,
|
||||
@@ -2197,9 +2569,20 @@ def _validate_text_at_position(
|
||||
screen_width: int,
|
||||
screen_height: int,
|
||||
radius_px: int = 280,
|
||||
som_bbox_norm: Optional[List[float]] = None,
|
||||
) -> tuple:
|
||||
"""Pré-check sémantique : OCR sur une zone autour de (x_pct, y_pct) et
|
||||
vérifie que `expected_text` y est présent (substring ou fuzzy 50%).
|
||||
"""Pré-check sémantique : OCR sur une zone et vérifie que
|
||||
`expected_text` y est présent (substring ou fuzzy 50%).
|
||||
|
||||
Zone OCR (par priorité) :
|
||||
1. Si ``som_bbox_norm = [x1, y1, x2, y2]`` (normalisé 0..1) est
|
||||
fourni et a une largeur/hauteur > _SOM_BBOX_MIN_DIM_PX en
|
||||
pixels écran : OCR sur cette bbox élargie d'un padding court.
|
||||
Plus précis pour les éléments étroits (onglets Notepad
|
||||
moderne, ~30-40px haut) que le radius générique qui capture
|
||||
le texte voisin (status bar, etc.).
|
||||
2. Sinon : fallback historique → carré de ``radius_px`` autour
|
||||
de (x_pct, y_pct).
|
||||
|
||||
Retourne (is_valid: bool, observed_text: str, elapsed_ms: float).
|
||||
|
||||
@@ -2219,16 +2602,52 @@ def _validate_text_at_position(
|
||||
t0 = time.time()
|
||||
img = Image.open(screenshot_path).convert("RGB")
|
||||
img_w, img_h = img.size
|
||||
cx = int(x_pct * screen_width)
|
||||
cy = int(y_pct * screen_height)
|
||||
# Saturer dans les bornes de l'image (le screenshot peut être plus
|
||||
# large que la fenêtre logique — utiliser min(img_*, screen_*) en sécurité).
|
||||
max_x = min(img_w, screen_width)
|
||||
max_y = min(img_h, screen_height)
|
||||
x1 = max(0, cx - radius_px)
|
||||
y1 = max(0, cy - radius_px)
|
||||
x2 = min(max_x, cx + radius_px)
|
||||
y2 = min(max_y, cy + radius_px)
|
||||
|
||||
# --- Tentative 1 : zone OCR depuis la bbox SoM (préférée) ---
|
||||
x1 = y1 = x2 = y2 = None
|
||||
if (
|
||||
isinstance(som_bbox_norm, (list, tuple))
|
||||
and len(som_bbox_norm) == 4
|
||||
):
|
||||
try:
|
||||
bx1, by1, bx2, by2 = (float(v) for v in som_bbox_norm)
|
||||
# Tolérer ordre inversé.
|
||||
bx1, bx2 = sorted((bx1, bx2))
|
||||
by1, by2 = sorted((by1, by2))
|
||||
# Refuser les bboxes dégénérées AVANT padding : si
|
||||
# l'élément cible fait < _SOM_BBOX_MIN_DIM_PX en
|
||||
# natif, c'est probablement une bbox d'apparence
|
||||
# (curseur, séparateur 1px) — pas un label OCRable.
|
||||
raw_w = (bx2 - bx1) * screen_width
|
||||
raw_h = (by2 - by1) * screen_height
|
||||
if (
|
||||
raw_w >= _SOM_BBOX_MIN_DIM_PX
|
||||
and raw_h >= _SOM_BBOX_MIN_DIM_PX
|
||||
):
|
||||
# Conversion en pixels écran + clipping et padding.
|
||||
px1 = int(bx1 * screen_width) - _SOM_BBOX_OCR_PADDING_PX
|
||||
py1 = int(by1 * screen_height) - _SOM_BBOX_OCR_PADDING_PX
|
||||
px2 = int(bx2 * screen_width) + _SOM_BBOX_OCR_PADDING_PX
|
||||
py2 = int(by2 * screen_height) + _SOM_BBOX_OCR_PADDING_PX
|
||||
x1 = max(0, px1)
|
||||
y1 = max(0, py1)
|
||||
x2 = min(max_x, px2)
|
||||
y2 = min(max_y, py2)
|
||||
except (TypeError, ValueError):
|
||||
# Bbox malformée : fallback silencieux sur le radius.
|
||||
x1 = y1 = x2 = y2 = None
|
||||
|
||||
# --- Fallback : carré radius_px autour de (x_pct, y_pct) ---
|
||||
if x1 is None:
|
||||
cx = int(x_pct * screen_width)
|
||||
cy = int(y_pct * screen_height)
|
||||
x1 = max(0, cx - radius_px)
|
||||
y1 = max(0, cy - radius_px)
|
||||
x2 = min(max_x, cx + radius_px)
|
||||
y2 = min(max_y, cy + radius_px)
|
||||
|
||||
if x2 - x1 < 10 or y2 - y1 < 10:
|
||||
return True, "", 0.0
|
||||
crop = img.crop((x1, y1, x2, y2))
|
||||
@@ -2246,6 +2665,7 @@ def _validate_resolution_quality(
|
||||
result: Optional[Dict[str, Any]],
|
||||
fallback_x_pct: float,
|
||||
fallback_y_pct: float,
|
||||
target_spec: Optional[Dict[str, Any]] = None,
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Valide un résultat de résolution et le rejette s'il est peu fiable.
|
||||
|
||||
@@ -2263,6 +2683,16 @@ def _validate_resolution_quality(
|
||||
elle n'est PAS appelée par les méthodes internes de la cascade, mais
|
||||
uniquement depuis le handler HTTP `/resolve_target` après que la
|
||||
cascade a produit son meilleur candidat.
|
||||
|
||||
Argument optionnel `target_spec` : permet d'appliquer des relaxations
|
||||
contextuelles. Cas couvert (2026-05-22) : pour une cible
|
||||
`context_hints.interaction == "switch_tab"` qui dispose d'un
|
||||
`som_element.bbox_norm`, on abaisse le seuil des méthodes ``som_*``
|
||||
de 0.75 → 0.60. Justification : (1) le focus_change pré-clic
|
||||
prouve qu'on est dans la bonne fenêtre, (2) la bbox SoM a été
|
||||
calibrée à l'enregistrement et reste valide, (3) les onglets
|
||||
Notepad moderne sont visuellement quasi-identiques → score VLM
|
||||
inévitablement lower.
|
||||
"""
|
||||
if not result or not isinstance(result, dict):
|
||||
return result
|
||||
@@ -2291,6 +2721,52 @@ def _validate_resolution_quality(
|
||||
min_score = threshold
|
||||
break
|
||||
|
||||
# Relaxation contextuelle pour switch_tab + SoM calibré (2026-05-22).
|
||||
# Les onglets Notepad moderne (et apps similaires) sont visuellement
|
||||
# quasi-identiques : le grounding VLM/SoM produit fréquemment un
|
||||
# score 0.65-0.75, juste sous le seuil strict. Comme le contexte
|
||||
# `interaction=switch_tab` + bbox SoM enregistrée + focus_change
|
||||
# pré-clic confirment déjà la fenêtre et la zone, on relâche le
|
||||
# seuil des méthodes som_* à 0.60 dans CE cas précis uniquement.
|
||||
if (
|
||||
min_score is not None
|
||||
and target_spec
|
||||
and method.startswith("som_")
|
||||
):
|
||||
context_hints = target_spec.get("context_hints") or {}
|
||||
is_tab_switch = (
|
||||
context_hints.get("interaction") == "switch_tab"
|
||||
and target_spec.get("by_role") == "tab"
|
||||
)
|
||||
som_element = target_spec.get("som_element") or {}
|
||||
has_calibrated_som = bool(som_element.get("bbox_norm"))
|
||||
if is_tab_switch and has_calibrated_som:
|
||||
relaxed = 0.60
|
||||
if relaxed < min_score:
|
||||
logger.info(
|
||||
"[REPLAY] switch_tab + som_element calibré → seuil "
|
||||
"som_* relâché %.2f → %.2f (cible='%s')",
|
||||
min_score, relaxed,
|
||||
target_spec.get("by_text", ""),
|
||||
)
|
||||
min_score = relaxed
|
||||
|
||||
is_close_tab = (
|
||||
method == "som_anchor_match"
|
||||
and str((context_hints.get("interaction") or "")).strip().lower() == "close_tab"
|
||||
and not str(target_spec.get("by_text", "") or "").strip()
|
||||
and bool(target_spec.get("anchor_image_base64"))
|
||||
)
|
||||
if is_close_tab:
|
||||
relaxed = 0.70
|
||||
if relaxed < min_score:
|
||||
logger.info(
|
||||
"[REPLAY] close_tab + anchor-only → seuil som_anchor_match "
|
||||
"relâché %.2f → %.2f",
|
||||
min_score, relaxed,
|
||||
)
|
||||
min_score = relaxed
|
||||
|
||||
if min_score is not None and score < min_score:
|
||||
logger.warning(
|
||||
"[REPLAY] Resolution REJETÉE (score trop bas) : method=%s score=%.3f < %.2f",
|
||||
@@ -2306,13 +2782,40 @@ def _validate_resolution_quality(
|
||||
"y_pct": fallback_y_pct,
|
||||
}
|
||||
|
||||
if _is_close_tab_target(target_spec) and not _is_close_tab_result_plausible(
|
||||
resolved_x,
|
||||
resolved_y,
|
||||
target_spec,
|
||||
0,
|
||||
0,
|
||||
fallback_x_pct=fallback_x_pct,
|
||||
fallback_y_pct=fallback_y_pct,
|
||||
):
|
||||
logger.warning(
|
||||
"[REPLAY] Resolution REJETÉE (close_tab hors zone source) : "
|
||||
"method=%s resolved=(%.3f, %.3f) expected=(%.3f, %.3f)",
|
||||
method,
|
||||
resolved_x,
|
||||
resolved_y,
|
||||
fallback_x_pct,
|
||||
fallback_y_pct,
|
||||
)
|
||||
return {
|
||||
"resolved": False,
|
||||
"method": f"rejected_close_tab_zone_{method}",
|
||||
"reason": "close_tab_out_of_recorded_zone",
|
||||
"original_method": method,
|
||||
"original_score": score,
|
||||
"x_pct": fallback_x_pct,
|
||||
"y_pct": fallback_y_pct,
|
||||
}
|
||||
|
||||
# --- Check 2 : garde de proximité ---
|
||||
# On n'applique la garde que si les coordonnées enregistrées ont un
|
||||
# sens (pas des placeholders 0.5/0.5 des plans V4 ni des 0.0/0.0).
|
||||
_has_recorded_coords = (
|
||||
fallback_x_pct > 0.001
|
||||
and fallback_y_pct > 0.001
|
||||
and not (abs(fallback_x_pct - 0.5) < 0.001 and abs(fallback_y_pct - 0.5) < 0.001)
|
||||
_has_recorded_coords = _has_meaningful_recorded_coords(
|
||||
fallback_x_pct,
|
||||
fallback_y_pct,
|
||||
)
|
||||
if _has_recorded_coords:
|
||||
dx = abs(resolved_x - fallback_x_pct)
|
||||
|
||||
Reference in New Issue
Block a user