From 6724f4395042041c1caa3f75834ca47c62d7bc47 Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Tue, 31 Mar 2026 23:21:06 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20strat=C3=A9gie=20hybride=20OCR=E2=86=92g?=
 =?UTF-8?q?rounding=20VLM=20/=20ic=C3=B4nes=E2=86=92template=20matching?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Résolution 4/4 (100%) validée localement :
- Texte OCR (by_text_source="ocr") → grounding Qwen2.5-VL (dist < 0.04)
- Icônes sans texte (by_text_source="") → template matching crop 80x80 (dist = 0.000)

L'appel à _vlm_identify_element est supprimé pour les icônes (ses
descriptions non déterministes faisaient échouer le grounding). Le template
matching est instantané et parfait quand le crop est net (80x80).

Ajout de by_text_source dans target_spec pour distinguer le texte OCR ("ocr") des icônes sans texte ("").

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 agent_v0/server_v1/api_stream.py       | 19 +++++++++----------
 agent_v0/server_v1/stream_processor.py | 19 +++++++++++--------
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py
index d84cca8bd..7da9e3e59 100644
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -3970,12 +3970,14 @@ def _resolve_target_sync(
                 vlm_description = _build_target_description(target_spec)
 
         # ---------------------------------------------------------------
-        # Étape 0 : Grounding VLM Direct (Qwen2.5-VL)
-        # Le VLM reçoit le screenshot + description textuelle et retourne
-        # directement les coordonnées. Plus fiable que SomEngine + numérotation.
+        # Étape 0 : Choisir la stratégie selon le type d'élément
+        # - Texte OCR fiable → grounding VLM (description textuelle)
+        # - Icône sans texte → template matching (crop 80x80)
         # ---------------------------------------------------------------
-        grounding_desc = by_text_strict or vlm_description
-        if grounding_desc:
+        by_text_source = target_spec.get("by_text_source", "")
+
+        if by_text_strict and by_text_source == "ocr":
+            # Texte OCR fiable → grounding VLM direct
             grounding_result = _resolve_by_grounding(
                 screenshot_path=screenshot_path,
                 target_spec=target_spec,
@@ -3987,14 +3989,11 @@ def _resolve_target_sync(
                     "Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
                     grounding_result.get("x_pct", 0),
                     grounding_result.get("y_pct", 0),
-                    grounding_desc[:50],
+                    by_text_strict[:50],
                 )
                 return grounding_result
 
-        # ---------------------------------------------------------------
-        # Étape 0.5 : Template matching pour icônes sans texte (crop 80x80)
-        # ---------------------------------------------------------------
-        if not by_text_strict:
+        if not by_text_strict or by_text_source != "ocr":
             result = _resolve_by_template_matching(
                 screenshot_path=screenshot_path,
                 anchor_image_b64=anchor_image_b64,
diff --git a/agent_v0/server_v1/stream_processor.py b/agent_v0/server_v1/stream_processor.py
index c9a9925f9..44595b6e7 100644
--- a/agent_v0/server_v1/stream_processor.py
+++ b/agent_v0/server_v1/stream_processor.py
@@ -1141,25 +1141,28 @@ def build_replay_from_raw_events(
                     )
 
                     # Déterminer le texte de l'élément cliqué (by_text)
-                    # Priorité : vision_info.text > som_element.label > VLM identification
+                    # Priorité : vision_info.text > som_element.label
+                    # Source "ocr" = fiable (texte réel), "" = icône sans texte (aucun nommage VLM)
                     element_text = ""
                     element_type = ""
+                    text_source = ""  # "ocr" ou "" (icône sans texte)
                     if isinstance(vision_info, dict):
                         element_text = vision_info.get("text", "")
                         element_type = vision_info.get("type", "")
+                        if element_text:
+                            text_source = "ocr"
                     if not element_text and som_elem and som_elem.get("label"):
                         element_text = som_elem["label"]
+                        text_source = "ocr"
 
-                    # Si pas de texte (icône sans label), demander au VLM
-                    # d'identifier CE QUE c'est à partir du crop
-                    if not element_text and anchor_b64:
-                        element_text = _vlm_identify_element(
-                            anchor_b64, window_title,
-                        )
+                    # Icônes sans texte OCR → NE PAS utiliser le VLM pour nommer
+                    # (descriptions non-déterministes qui font échouer le grounding)
+                    # Le template matching du crop 80x80 sera utilisé à la place
 
                     action["target_spec"] = {
                         "anchor_image_base64": anchor_b64,
-                        "by_text": element_text,  # CE QUE l'élément EST
+                        "by_text": element_text,
+                        "by_text_source": text_source,  # "ocr" = fiable, "" = icône
                         "by_role": element_type or (som_elem.get("source", "") if som_elem else ""),
                         "vlm_description": vlm_description,
                         "window_title": window_title,