From 875367dea9d2674b601fb9a1adbfa435dd5120b5 Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Tue, 31 Mar 2026 18:11:24 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20template=20matching=20prioritaire=20pour?=
 =?UTF-8?q?=20ic=C3=B4nes=20sans=20texte=20(by=5Ftext=20vide)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Quand by_text est vide (icônes : logo Windows, disquette, croix),
le template matching du crop 80x80 est plus fiable que le VLM qui
choisit des éléments au hasard.

Cascade strict mode :
0. Template matching (si by_text vide, seuil 0.70) — crop 80x80 discriminant
1. VLM Quick Find (compréhension sémantique)
1.5. SoM + VLM
2. Template matching (fallback avec seuil 0.90)
3. Échec → STOP

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 agent_v0/server_v1/api_stream.py | 34 ++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py
index 8c04bb02a..58ee4f15d 100644
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -3760,20 +3760,38 @@ def _resolve_target_sync(
     # MODE STRICT (replay sessions) — Stratégie VLM-FIRST
     # ===================================================================
     if strict_mode and anchor_image_b64:
-        # ---------------------------------------------------------------
-        # Étape 1 : VLM Quick Find (compréhension sémantique)
-        # Le VLM reçoit le screenshot + le crop de référence + la description
-        # riche (titre fenêtre, position relative, texte visible, type).
-        # ---------------------------------------------------------------
         vlm_description = target_spec.get("vlm_description", "")
+        by_text_strict = target_spec.get("by_text", "").strip()
+
         # Fallback : construire la description depuis by_text/by_role
         if not vlm_description:
-            by_text = target_spec.get("by_text", "").strip()
             by_role = target_spec.get("by_role", "").strip()
-            if by_text or by_role:
+            if by_text_strict or by_role:
                 vlm_description = _build_target_description(target_spec)
 
-        # Toujours tenter le VLM si on a un anchor (multi-image) ou une description
+        # ---------------------------------------------------------------
+        # Étape 0 : Template matching PRIORITAIRE pour les icônes sans texte
+        # Les crops 80x80 sont très discriminants pour les icônes (logo Windows,
+        # disquette, croix). Le VLM se trompe souvent sur ces éléments.
+        # ---------------------------------------------------------------
+        if not by_text_strict:
+            result = _resolve_by_template_matching(
+                screenshot_path=screenshot_path,
+                anchor_image_b64=anchor_image_b64,
+                screen_width=screen_width,
+                screen_height=screen_height,
+                confidence_threshold=0.70,
+            )
+            if result and result.get("score", 0) >= 0.70:
+                logger.info(
+                    "Strict resolve icon : template matching OK (score=%.3f) pour icône sans texte",
+                    result.get("score", 0),
+                )
+                return result
+
+        # ---------------------------------------------------------------
+        # Étape 1 : VLM Quick Find (compréhension sémantique)
+        # ---------------------------------------------------------------
         if vlm_description or anchor_image_b64:
             vlm_result = _vlm_quick_find(
                 screenshot_path=screenshot_path,