fix: template matching prioritaire pour icônes sans texte (by_text vide)
Quand by_text est vide (icônes : logo Windows, disquette, croix), le template matching du crop 80x80 est plus fiable que le VLM, qui choisit des éléments au hasard.

Cascade en mode strict :
0. Template matching (si by_text vide, seuil 0.70) — crop 80x80 discriminant
1. VLM Quick Find (compréhension sémantique)
1.5. SoM + VLM
2. Template matching (fallback avec seuil 0.90)
3. Échec → STOP

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3760,20 +3760,38 @@ def _resolve_target_sync(
|
|||||||
# MODE STRICT (replay sessions) — Stratégie VLM-FIRST
|
# MODE STRICT (replay sessions) — Stratégie VLM-FIRST
|
||||||
# ===================================================================
|
# ===================================================================
|
||||||
if strict_mode and anchor_image_b64:
|
if strict_mode and anchor_image_b64:
|
||||||
# ---------------------------------------------------------------
|
|
||||||
# Étape 1 : VLM Quick Find (compréhension sémantique)
|
|
||||||
# Le VLM reçoit le screenshot + le crop de référence + la description
|
|
||||||
# riche (titre fenêtre, position relative, texte visible, type).
|
|
||||||
# ---------------------------------------------------------------
|
|
||||||
vlm_description = target_spec.get("vlm_description", "")
|
vlm_description = target_spec.get("vlm_description", "")
|
||||||
|
by_text_strict = target_spec.get("by_text", "").strip()
|
||||||
|
|
||||||
# Fallback : construire la description depuis by_text/by_role
|
# Fallback : construire la description depuis by_text/by_role
|
||||||
if not vlm_description:
|
if not vlm_description:
|
||||||
by_text = target_spec.get("by_text", "").strip()
|
|
||||||
by_role = target_spec.get("by_role", "").strip()
|
by_role = target_spec.get("by_role", "").strip()
|
||||||
if by_text or by_role:
|
if by_text_strict or by_role:
|
||||||
vlm_description = _build_target_description(target_spec)
|
vlm_description = _build_target_description(target_spec)
|
||||||
|
|
||||||
# Toujours tenter le VLM si on a un anchor (multi-image) ou une description
|
# ---------------------------------------------------------------
|
||||||
|
# Étape 0 : Template matching PRIORITAIRE pour les icônes sans texte
|
||||||
|
# Les crops 80x80 sont très discriminants pour les icônes (logo Windows,
|
||||||
|
# disquette, croix). Le VLM se trompe souvent sur ces éléments.
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
if not by_text_strict:
|
||||||
|
result = _resolve_by_template_matching(
|
||||||
|
screenshot_path=screenshot_path,
|
||||||
|
anchor_image_b64=anchor_image_b64,
|
||||||
|
screen_width=screen_width,
|
||||||
|
screen_height=screen_height,
|
||||||
|
confidence_threshold=0.70,
|
||||||
|
)
|
||||||
|
if result and result.get("score", 0) >= 0.70:
|
||||||
|
logger.info(
|
||||||
|
"Strict resolve icon : template matching OK (score=%.3f) pour icône sans texte",
|
||||||
|
result.get("score", 0),
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
# Étape 1 : VLM Quick Find (compréhension sémantique)
|
||||||
|
# ---------------------------------------------------------------
|
||||||
if vlm_description or anchor_image_b64:
|
if vlm_description or anchor_image_b64:
|
||||||
vlm_result = _vlm_quick_find(
|
vlm_result = _vlm_quick_find(
|
||||||
screenshot_path=screenshot_path,
|
screenshot_path=screenshot_path,
|
||||||
|
|||||||
Reference in New Issue
Block a user