fix: stratégie hybride OCR→grounding VLM / icônes→template matching
Résolution 4/4 (100 %) validée localement :
- Texte OCR (by_text_source="ocr") → grounding Qwen2.5-VL (dist < 0.04)
- Icônes sans texte (by_text_source="") → template matching sur crop 80x80 (dist = 0.000)

L'identification d'élément par le VLM (_vlm_identify_element) est supprimée pour les icônes : ses descriptions non déterministes faisaient échouer le grounding. Le template matching est instantané et parfait quand le crop est net (80x80).

Ajout de by_text_source dans target_spec pour distinguer OCR vs VLM.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3970,12 +3970,14 @@ def _resolve_target_sync(
|
|||||||
vlm_description = _build_target_description(target_spec)
|
vlm_description = _build_target_description(target_spec)
|
||||||
|
|
||||||
# ---------------------------------------------------------------
|
# ---------------------------------------------------------------
|
||||||
# Étape 0 : Grounding VLM Direct (Qwen2.5-VL)
|
# Étape 0 : Choisir la stratégie selon le type d'élément
|
||||||
# Le VLM reçoit le screenshot + description textuelle et retourne
|
# - Texte OCR fiable → grounding VLM (description textuelle)
|
||||||
# directement les coordonnées. Plus fiable que SomEngine + numérotation.
|
# - Icône sans texte → template matching (crop 80x80)
|
||||||
# ---------------------------------------------------------------
|
# ---------------------------------------------------------------
|
||||||
grounding_desc = by_text_strict or vlm_description
|
by_text_source = target_spec.get("by_text_source", "")
|
||||||
if grounding_desc:
|
|
||||||
|
if by_text_strict and by_text_source == "ocr":
|
||||||
|
# Texte OCR fiable → grounding VLM direct
|
||||||
grounding_result = _resolve_by_grounding(
|
grounding_result = _resolve_by_grounding(
|
||||||
screenshot_path=screenshot_path,
|
screenshot_path=screenshot_path,
|
||||||
target_spec=target_spec,
|
target_spec=target_spec,
|
||||||
@@ -3987,14 +3989,11 @@ def _resolve_target_sync(
|
|||||||
"Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
|
"Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
|
||||||
grounding_result.get("x_pct", 0),
|
grounding_result.get("x_pct", 0),
|
||||||
grounding_result.get("y_pct", 0),
|
grounding_result.get("y_pct", 0),
|
||||||
grounding_desc[:50],
|
by_text_strict[:50],
|
||||||
)
|
)
|
||||||
return grounding_result
|
return grounding_result
|
||||||
|
|
||||||
# ---------------------------------------------------------------
|
if not by_text_strict or by_text_source != "ocr":
|
||||||
# Étape 0.5 : Template matching pour icônes sans texte (crop 80x80)
|
|
||||||
# ---------------------------------------------------------------
|
|
||||||
if not by_text_strict:
|
|
||||||
result = _resolve_by_template_matching(
|
result = _resolve_by_template_matching(
|
||||||
screenshot_path=screenshot_path,
|
screenshot_path=screenshot_path,
|
||||||
anchor_image_b64=anchor_image_b64,
|
anchor_image_b64=anchor_image_b64,
|
||||||
|
|||||||
@@ -1141,25 +1141,28 @@ def build_replay_from_raw_events(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Déterminer le texte de l'élément cliqué (by_text)
|
# Déterminer le texte de l'élément cliqué (by_text)
|
||||||
# Priorité : vision_info.text > som_element.label > VLM identification
|
# Priorité : vision_info.text > som_element.label
|
||||||
|
# Source "ocr" = fiable (texte réel) ; "" = icône sans texte (le nommage par VLM, bavardage non fiable, n'est plus utilisé)
|
||||||
element_text = ""
|
element_text = ""
|
||||||
element_type = ""
|
element_type = ""
|
||||||
|
text_source = "" # "ocr" ou "vlm"
|
||||||
if isinstance(vision_info, dict):
|
if isinstance(vision_info, dict):
|
||||||
element_text = vision_info.get("text", "")
|
element_text = vision_info.get("text", "")
|
||||||
element_type = vision_info.get("type", "")
|
element_type = vision_info.get("type", "")
|
||||||
|
if element_text:
|
||||||
|
text_source = "ocr"
|
||||||
if not element_text and som_elem and som_elem.get("label"):
|
if not element_text and som_elem and som_elem.get("label"):
|
||||||
element_text = som_elem["label"]
|
element_text = som_elem["label"]
|
||||||
|
text_source = "ocr"
|
||||||
|
|
||||||
# Si pas de texte (icône sans label), demander au VLM
|
# Icônes sans texte OCR → NE PAS utiliser le VLM pour nommer
|
||||||
# d'identifier CE QUE c'est à partir du crop
|
# (descriptions non-déterministes qui font échouer le grounding)
|
||||||
if not element_text and anchor_b64:
|
# Le template matching du crop 80x80 sera utilisé à la place
|
||||||
element_text = _vlm_identify_element(
|
|
||||||
anchor_b64, window_title,
|
|
||||||
)
|
|
||||||
|
|
||||||
action["target_spec"] = {
|
action["target_spec"] = {
|
||||||
"anchor_image_base64": anchor_b64,
|
"anchor_image_base64": anchor_b64,
|
||||||
"by_text": element_text, # CE QUE l'élément EST
|
"by_text": element_text,
|
||||||
|
"by_text_source": text_source, # "ocr" = fiable, "" = icône
|
||||||
"by_role": element_type or (som_elem.get("source", "") if som_elem else ""),
|
"by_role": element_type or (som_elem.get("source", "") if som_elem else ""),
|
||||||
"vlm_description": vlm_description,
|
"vlm_description": vlm_description,
|
||||||
"window_title": window_title,
|
"window_title": window_title,
|
||||||
|
|||||||
Reference in New Issue
Block a user