fix: anchor match sur screenshot entier + proximité élément SomEngine

Le template matching du crop anchor contre les régions YOLO échouait car l'anchor (150x150) est plus grand que les éléments détectés. Désormais : match sur le screenshot entier → centre du match → élément SomEngine le plus proche (distance max. 100 px). Fonctionne pour les icônes, mais reste limité par la taille du crop : un crop 150x150 de barre de titre peut matcher à plusieurs endroits.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
fix: rejeter bavardage VLM dans _vlm_identify_element
2026-03-31 15:51:18 +02:00 · 2026-03-31 15:44:56 +02:00 · 2026-03-31 15:38:38 +02:00
2 changed files with 55 additions and 44 deletions
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -3532,7 +3532,8 @@ def _resolve_by_som(
    # Pour les icônes sans texte : comparer le crop de référence contre
    # chaque région YOLO détectée par SomEngine.
    anchor_b64 = target_spec.get("anchor_image_base64", "")
-    if anchor_b64 and not anchor_label:
+    by_text = target_spec.get("by_text", "").strip()
    if anchor_b64 and (not anchor_label or not by_text):
        try:
            import cv2
            import numpy as np
@@ -3546,36 +3547,36 @@ def _resolve_by_som(
            screenshot_cv = cv2.imread(screenshot_path, cv2.IMREAD_GRAYSCALE)
            if anc_img is not None and screenshot_cv is not None:
-                best_elem = None
+                # Template matching de l'anchor sur le SCREENSHOT ENTIER
-                best_score = 0.0
+                # (pas sur les régions individuelles — l'anchor est souvent plus grand)
                anc_h, anc_w = anc_img.shape[:2]
                if screenshot_cv.shape[0] >= anc_h and screenshot_cv.shape[1] >= anc_w:
                    res = cv2.matchTemplate(screenshot_cv, anc_img, cv2.TM_CCOEFF_NORMED)
                    _, max_score, _, max_loc = cv2.minMaxLoc(res)
                    if max_score >= 0.5:
                        # Centre du match
                        match_cx = max_loc[0] + anc_w // 2
                        match_cy = max_loc[1] + anc_h // 2
                        # Trouver l'élément SomEngine le plus proche du centre du match
                        best_elem = None
                        best_dist = float("inf")
                        for elem in som_result.elements:
-                    x1, y1, x2, y2 = elem.bbox
+                            cx, cy = elem.center
-                    # Agrandir la zone de 20% pour tolérer les différences
+                            dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5
-                    margin_x = int((x2 - x1) * 0.2)
+                            if dist < best_dist:
-                    margin_y = int((y2 - y1) * 0.2)
+                                best_dist = dist
                    rx1 = max(0, x1 - margin_x)
                    ry1 = max(0, y1 - margin_y)
                    rx2 = min(screenshot_cv.shape[1], x2 + margin_x)
                    ry2 = min(screenshot_cv.shape[0], y2 + margin_y)
                    region = screenshot_cv[ry1:ry2, rx1:rx2]
                    if region.shape[0] < anc_h or region.shape[1] < anc_w:
                        continue
                    res = cv2.matchTemplate(region, anc_img, cv2.TM_CCOEFF_NORMED)
                    _, score, _, _ = cv2.minMaxLoc(res)
                    if score > best_score:
                        best_score = score
                                best_elem = elem
-                if best_elem and best_score >= 0.6:
+                        if best_elem and best_dist < 100:  # Max 100px de distance
                            elapsed = time.time() - t0
                            cx_norm, cy_norm = best_elem.center_norm
                            logger.info(
-                        "SoM resolve ANCHOR : match crop '#%d' score=%.3f → (%.4f, %.4f) en %.1fs",
+                                "SoM resolve ANCHOR : match crop score=%.3f → "
-                        best_elem.id, best_score, cx_norm, cy_norm, elapsed,
+                                "elem '#%d %s' (dist=%.0fpx) → (%.4f, %.4f) en %.1fs",
                                max_score, best_elem.id, best_elem.label,
                                best_dist, cx_norm, cy_norm, elapsed,
                            )
                            return {
                                "resolved": True,
@@ -3586,10 +3587,10 @@ def _resolve_by_som(
                                    "label": best_elem.label or f"icon #{best_elem.id}",
                                    "type": best_elem.source,
                                    "role": "som_anchor_match",
-                            "confidence": best_score,
+                                    "confidence": max_score,
                                    "som_id": best_elem.id,
                                },
-                        "score": best_score,
+                                "score": max_score,
                            }
        except ImportError:
            pass
--- a/agent_v0/server_v1/stream_processor.py
+++ b/agent_v0/server_v1/stream_processor.py
@@ -502,14 +502,24 @@ def _vlm_identify_element(anchor_b64: str, window_title: str = "") -> str:
                if raw.lower().startswith(prefix.lower()):
                    raw = raw[len(prefix):]
                    break
            # Rejeter les réponses qui sont du bavardage, pas un label
            reject_patterns = (
                "several", "multiple", "various", "image",
                "I can", "there are", "there is", "elements",
                "the following", "here are",
            )
            if any(p in raw.lower()[:30] for p in reject_patterns):
                logger.debug("VLM identify : réponse bavarde rejetée (raw='%s')", raw[:60])
                return ""
            # Prendre les 5 premiers mots utiles
            words = raw.split()[:5]
            label = " ".join(words).strip('",.\' ').rstrip(".")
-            if label and 2 <= len(label) <= 60:
+            if label and 2 <= len(label) <= 40:
                logger.info("VLM identify element : '%s'", label)
                return label
            else:
-                logger.debug("VLM identify : label trop court ou vide après nettoyage (raw='%s')", raw[:80])
+                logger.debug("VLM identify : label trop court/long après nettoyage (raw='%s')", raw[:80])
    except Exception as e:
        logger.debug("VLM identify element échoué : %s", e)