fix: by_text dans build_replay + anchor matching pour icônes sans texte

build_replay (stream_processor.py) :
- Remplir by_text depuis vision_info.text ou som_element.label
- VLM identification pour les éléments sans texte (icônes)
- Nettoyage du bavardage VLM (retrait des préfixes courants)

resolve_target (api_stream.py) :
- Nouveau som_anchor_match : template matching du crop anchor vs régions YOLO
- Pour les icônes sans texte (disquette, loupe, etc.)
- Cascade : text match → anchor match → VLM

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 15:28:31 +02:00
parent 5ceee9c393
commit ef5d595d98
2 changed files with 181 additions and 13 deletions
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -3528,6 +3528,74 @@ def _resolve_by_som(
                len(exact_matches), anchor_label,
            )

+    # ── 2.7. Fallback : template matching anchor vs éléments SomEngine ──
+    # Pour les icônes sans texte : comparer le crop de référence contre
+    # chaque région YOLO détectée par SomEngine.
+    anchor_b64 = target_spec.get("anchor_image_base64", "")
+    if anchor_b64 and not anchor_label:
+        try:
+            import cv2
+            import numpy as np
+
+            # Décoder l'anchor
+            anc_bytes = base64.b64decode(anchor_b64)
+            anc_array = np.frombuffer(anc_bytes, dtype=np.uint8)
+            anc_img = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE)
+
+            # Charger le screenshot en OpenCV
+            screenshot_cv = cv2.imread(screenshot_path, cv2.IMREAD_GRAYSCALE)
+
+            if anc_img is not None and screenshot_cv is not None:
+                best_elem = None
+                best_score = 0.0
+                anc_h, anc_w = anc_img.shape[:2]
+
+                for elem in som_result.elements:
+                    x1, y1, x2, y2 = elem.bbox
+                    # Agrandir la zone de 20% pour tolérer les différences
+                    margin_x = int((x2 - x1) * 0.2)
+                    margin_y = int((y2 - y1) * 0.2)
+                    rx1 = max(0, x1 - margin_x)
+                    ry1 = max(0, y1 - margin_y)
+                    rx2 = min(screenshot_cv.shape[1], x2 + margin_x)
+                    ry2 = min(screenshot_cv.shape[0], y2 + margin_y)
+                    region = screenshot_cv[ry1:ry2, rx1:rx2]
+
+                    if region.shape[0] < anc_h or region.shape[1] < anc_w:
+                        continue
+
+                    res = cv2.matchTemplate(region, anc_img, cv2.TM_CCOEFF_NORMED)
+                    _, score, _, _ = cv2.minMaxLoc(res)
+                    if score > best_score:
+                        best_score = score
+                        best_elem = elem
+
+                if best_elem and best_score >= 0.6:
+                    elapsed = time.time() - t0
+                    cx_norm, cy_norm = best_elem.center_norm
+                    logger.info(
+                        "SoM resolve ANCHOR : match crop '#%d' score=%.3f → (%.4f, %.4f) en %.1fs",
+                        best_elem.id, best_score, cx_norm, cy_norm, elapsed,
+                    )
+                    return {
+                        "resolved": True,
+                        "method": "som_anchor_match",
+                        "x_pct": round(cx_norm, 6),
+                        "y_pct": round(cy_norm, 6),
+                        "matched_element": {
+                            "label": best_elem.label or f"icon #{best_elem.id}",
+                            "type": best_elem.source,
+                            "role": "som_anchor_match",
+                            "confidence": best_score,
+                            "som_id": best_elem.id,
+                        },
+                        "score": best_score,
+                    }
+        except ImportError:
+            pass
+        except Exception as e:
+            logger.debug("SoM anchor match erreur : %s", e)
+
    # ── 3. Sauvegarder l'image annotée SoM temporairement ──
    if som_result.som_image is None:
        logger.debug("SoM resolve : pas d'image annotée, skip VLM")
--- a/agent_v0/server_v1/stream_processor.py
+++ b/agent_v0/server_v1/stream_processor.py
@@ -427,6 +427,95 @@ def _needs_post_wait(action: dict) -> int:
    return 0


+# ---------------------------------------------------------------------------
+# VLM identification d'éléments UI (pour les éléments sans texte OCR)
+# ---------------------------------------------------------------------------
+
+
+def _vlm_identify_element(anchor_b64: str, window_title: str = "") -> str:
+    """Ask the VLM for a short label describing a cropped UI element.
+
+    Used during build_replay when a clicked element has no OCR text (icon).
+
+    Args:
+        anchor_b64: Base64-encoded image crop of the clicked element.
+        window_title: Optional window title, added to the prompt as context.
+
+    Returns:
+        Short label (e.g. "search icon"), or "" when no label was obtained.
+    """
+    try:
+        import io
+        import os
+        import tempfile
+        from PIL import Image
+    except ImportError:
+        return ""
+
+    tmp_path = ""
+    try:
+        # Decode the base64 crop into a temporary PNG file for the VLM client.
+        img = Image.open(io.BytesIO(base64.b64decode(anchor_b64)))
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+            tmp_path = tmp.name
+            img.save(tmp, format="PNG")
+
+        from core.detection.ollama_client import OllamaClient
+        client = OllamaClient(
+            endpoint="http://localhost:11434",
+            model="qwen3-vl:8b",
+            timeout=15,
+        )
+        context = f" in the window '{window_title}'" if window_title else ""
+        result = client.generate(
+            prompt=(
+                f"This is a cropped UI element{context}. "
+                "What is it? Answer with a short label (2-5 words max). "
+                "Examples: 'search bar icon', 'Word application icon', 'close button', "
+                "'file menu', 'save button'.\n"
+                "Answer ONLY the label, nothing else."
+            ),
+            image_path=tmp_path,
+            system_prompt="You identify UI elements. Answer with a short label only.",
+            temperature=0.1,
+            max_tokens=20,
+        )
+        if result.get("success"):
+            raw = result.get("response", "").strip()
+            # The VLM is often chatty: strip one known lead-in, keep <= 5 words.
+            for prefix in (
+                "Based on the image, the UI element shown is a ",
+                "Based on the image, the UI element is a ",
+                "Based on the image, this is a ",
+                "Based on the image, it is a ",
+                "Based on the image, I can see ",
+                "Based on the image, ",
+                "The UI element shown is a ",
+                "The UI element is a ",
+                "The element is a ",
+                "This is a ", "It is a ", "It's a ", "I can see a ",
+                "I can see ", "A ",
+            ):
+                if raw.lower().startswith(prefix.lower()):
+                    raw = raw[len(prefix):]
+                    break
+            label = " ".join(raw.split()[:5]).strip('",.\' ').rstrip(".")
+            if label and 2 <= len(label) <= 60:
+                logger.info("VLM identify element : '%s'", label)
+                return label
+            logger.debug("VLM identify : label trop court ou vide après nettoyage (raw='%s')", raw[:80])
+    except Exception as e:
+        logger.debug("VLM identify element échoué : %s", e)
+    finally:
+        if tmp_path:  # remove the temp crop even when the VLM call raised
+            try:
+                os.unlink(tmp_path)
+            except OSError:
+                pass
+
+    return ""
+
+
 # ---------------------------------------------------------------------------
 # SomEngine — enrichissement Set-of-Mark des clics pendant le build_replay
 # ---------------------------------------------------------------------------
@@ -1025,8 +1114,32 @@ def build_replay_from_raw_events(
                            )
                    vlm_description = ", ".join(vlm_parts) if vlm_parts else ""

+                    # ── SomEngine : identifier l'élément cliqué ──
+                    som_elem = _som_identify_clicked_element(
+                        evt, session_dir_path, screen_w, screen_h,
+                    )
+
+                    # Déterminer le texte de l'élément cliqué (by_text)
+                    # Priorité : vision_info.text > som_element.label > VLM identification
+                    element_text = ""
+                    element_type = ""
+                    if isinstance(vision_info, dict):
+                        element_text = vision_info.get("text", "")
+                        element_type = vision_info.get("type", "")
+                    if not element_text and som_elem and som_elem.get("label"):
+                        element_text = som_elem["label"]
+
+                    # Si pas de texte (icône sans label), demander au VLM
+                    # d'identifier CE QUE c'est à partir du crop
+                    if not element_text and anchor_b64:
+                        element_text = _vlm_identify_element(
+                            anchor_b64, window_title,
+                        )
+
                    action["target_spec"] = {
                        "anchor_image_base64": anchor_b64,
+                        "by_text": element_text,  # CE QUE l'élément EST
+                        "by_role": element_type or (som_elem.get("source", "") if som_elem else ""),
                        "vlm_description": vlm_description,
                        "window_title": window_title,
                        "original_position": {
@@ -1034,22 +1147,9 @@ def build_replay_from_raw_events(
                            "y_relative": y_relative,
                        },
                    }
-                    # NE PAS mettre window_title comme by_text !
-                    # by_text doit être le texte de l'ÉLÉMENT cliqué, pas le titre de la fenêtre.
-                    # Sinon le template matching texte cherche "13071967.txt – Bloc-notes"
-                    # sur l'écran et clique sur la barre de titre au lieu du bon élément.

-                    # ── SomEngine : identifier l'élément cliqué ──
-                    som_elem = _som_identify_clicked_element(
-                        evt, session_dir_path, screen_w, screen_h,
-                    )
                    if som_elem:
                        action["target_spec"]["som_element"] = som_elem
-                        # Enrichir la description VLM avec le label SoM
-                        if som_elem.get("label") and not vision_info.get("text"):
-                            action["target_spec"]["vlm_description"] += (
-                                f", le texte de l'élément est '{som_elem['label']}'"
-                            )

        elif evt_type == "text_input":
            text = evt.get("text", "")