From 6724f4395042041c1caa3f75834ca47c62d7bc47 Mon Sep 17 00:00:00 2001 From: Dom Date: Tue, 31 Mar 2026 23:21:06 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20strat=C3=A9gie=20hybride=20OCR=E2=86=92g?= =?UTF-8?q?rounding=20VLM=20/=20ic=C3=B4nes=E2=86=92template=20matching?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Résolution 4/4 (100%) validée localement : - Texte OCR (by_text_source="ocr") → grounding Qwen2.5-VL (dist < 0.04) - Icônes sans texte (by_text_source="") → template matching crop 80x80 (dist = 0.000) Le VLM identify element est supprimé pour les icônes (descriptions non-déterministes qui faisaient échouer le grounding). Le template matching est instantané et parfait quand le crop est net (80x80). Ajout de by_text_source dans target_spec pour distinguer OCR vs VLM. Co-Authored-By: Claude Opus 4.6 (1M context) --- agent_v0/server_v1/api_stream.py | 19 +++++++++---------- agent_v0/server_v1/stream_processor.py | 19 +++++++++++-------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py index d84cca8bd..7da9e3e59 100644 --- a/agent_v0/server_v1/api_stream.py +++ b/agent_v0/server_v1/api_stream.py @@ -3970,12 +3970,14 @@ def _resolve_target_sync( vlm_description = _build_target_description(target_spec) # --------------------------------------------------------------- - # Étape 0 : Grounding VLM Direct (Qwen2.5-VL) - # Le VLM reçoit le screenshot + description textuelle et retourne - # directement les coordonnées. Plus fiable que SomEngine + numérotation. 
+ # Étape 0 : Choisir la stratégie selon le type d'élément + # - Texte OCR fiable → grounding VLM (description textuelle) + # - Icône sans texte → template matching (crop 80x80) # --------------------------------------------------------------- - grounding_desc = by_text_strict or vlm_description - if grounding_desc: + by_text_source = target_spec.get("by_text_source", "") + + if by_text_strict and by_text_source == "ocr": + # Texte OCR fiable → grounding VLM direct grounding_result = _resolve_by_grounding( screenshot_path=screenshot_path, target_spec=target_spec, @@ -3987,14 +3989,11 @@ def _resolve_target_sync( "Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'", grounding_result.get("x_pct", 0), grounding_result.get("y_pct", 0), - grounding_desc[:50], + by_text_strict[:50], ) return grounding_result - # --------------------------------------------------------------- - # Étape 0.5 : Template matching pour icônes sans texte (crop 80x80) - # --------------------------------------------------------------- - if not by_text_strict: + if not by_text_strict or by_text_source != "ocr": result = _resolve_by_template_matching( screenshot_path=screenshot_path, anchor_image_b64=anchor_image_b64, diff --git a/agent_v0/server_v1/stream_processor.py b/agent_v0/server_v1/stream_processor.py index c9a9925f9..44595b6e7 100644 --- a/agent_v0/server_v1/stream_processor.py +++ b/agent_v0/server_v1/stream_processor.py @@ -1141,25 +1141,28 @@ def build_replay_from_raw_events( ) # Déterminer le texte de l'élément cliqué (by_text) - # Priorité : vision_info.text > som_element.label > VLM identification + # Priorité : vision_info.text > som_element.label + # Source "ocr" = fiable (texte réel), "" = icône sans texte (template matching) element_text = "" element_type = "" + text_source = "" # "ocr" ou "" (icône sans texte) if isinstance(vision_info, dict): element_text = vision_info.get("text", "") element_type = vision_info.get("type", "") + if element_text: + text_source = "ocr" if not element_text and som_elem and
som_elem.get("label"): element_text = som_elem["label"] + text_source = "ocr" - # Si pas de texte (icône sans label), demander au VLM - # d'identifier CE QUE c'est à partir du crop - if not element_text and anchor_b64: - element_text = _vlm_identify_element( - anchor_b64, window_title, - ) + # Icônes sans texte OCR → NE PAS utiliser le VLM pour nommer + # (descriptions non-déterministes qui font échouer le grounding) + # Le template matching du crop 80x80 sera utilisé à la place action["target_spec"] = { "anchor_image_base64": anchor_b64, - "by_text": element_text, # CE QUE l'élément EST + "by_text": element_text, + "by_text_source": text_source, # "ocr" = fiable, "" = icône "by_role": element_type or (som_elem.get("source", "") if som_elem else ""), "vlm_description": vlm_description, "window_title": window_title,