fix: by_text dans build_replay + anchor matching pour icônes sans texte

build_replay (stream_processor.py) : - Remplir by_text depuis vision_info.text ou som_element.label - VLM identification pour les éléments sans texte (icônes) - Nettoyage du bavardage VLM (retrait préfixes courants) resolve_target (api_stream.py) : - Nouveau som_anchor_match : template matching du crop anchor vs régions YOLO - Pour les icônes sans texte (disquette, loupe, etc.) - Cascade : text match → anchor match → VLM Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 15:28:31 +02:00
parent 5ceee9c393
commit ef5d595d98
2 changed files with 181 additions and 13 deletions
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -3528,6 +3528,74 @@ def _resolve_by_som(
                len(exact_matches), anchor_label,
            )
    # ── 2.7. Fallback : template matching anchor vs éléments SomEngine ──
    # Pour les icônes sans texte : comparer le crop de référence contre
    # chaque région YOLO détectée par SomEngine.
    anchor_b64 = target_spec.get("anchor_image_base64", "")
    if anchor_b64 and not anchor_label:
        try:
            import cv2
            import numpy as np
            # Décoder l'anchor
            anc_bytes = base64.b64decode(anchor_b64)
            anc_array = np.frombuffer(anc_bytes, dtype=np.uint8)
            anc_img = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE)
            # Charger le screenshot en OpenCV
            screenshot_cv = cv2.imread(screenshot_path, cv2.IMREAD_GRAYSCALE)
            if anc_img is not None and screenshot_cv is not None:
                best_elem = None
                best_score = 0.0
                anc_h, anc_w = anc_img.shape[:2]
                for elem in som_result.elements:
                    x1, y1, x2, y2 = elem.bbox
                    # Agrandir la zone de 20% pour tolérer les différences
                    margin_x = int((x2 - x1) * 0.2)
                    margin_y = int((y2 - y1) * 0.2)
                    rx1 = max(0, x1 - margin_x)
                    ry1 = max(0, y1 - margin_y)
                    rx2 = min(screenshot_cv.shape[1], x2 + margin_x)
                    ry2 = min(screenshot_cv.shape[0], y2 + margin_y)
                    region = screenshot_cv[ry1:ry2, rx1:rx2]
                    if region.shape[0] < anc_h or region.shape[1] < anc_w:
                        continue
                    res = cv2.matchTemplate(region, anc_img, cv2.TM_CCOEFF_NORMED)
                    _, score, _, _ = cv2.minMaxLoc(res)
                    if score > best_score:
                        best_score = score
                        best_elem = elem
                if best_elem and best_score >= 0.6:
                    elapsed = time.time() - t0
                    cx_norm, cy_norm = best_elem.center_norm
                    logger.info(
                        "SoM resolve ANCHOR : match crop '#%d' score=%.3f → (%.4f, %.4f) en %.1fs",
                        best_elem.id, best_score, cx_norm, cy_norm, elapsed,
                    )
                    return {
                        "resolved": True,
                        "method": "som_anchor_match",
                        "x_pct": round(cx_norm, 6),
                        "y_pct": round(cy_norm, 6),
                        "matched_element": {
                            "label": best_elem.label or f"icon #{best_elem.id}",
                            "type": best_elem.source,
                            "role": "som_anchor_match",
                            "confidence": best_score,
                            "som_id": best_elem.id,
                        },
                        "score": best_score,
                    }
        except ImportError:
            pass
        except Exception as e:
            logger.debug("SoM anchor match erreur : %s", e)
    # ── 3. Sauvegarder l'image annotée SoM temporairement ──
    if som_result.som_image is None:
        logger.debug("SoM resolve : pas d'image annotée, skip VLM")
--- a/agent_v0/server_v1/stream_processor.py
+++ b/agent_v0/server_v1/stream_processor.py
@@ -427,6 +427,95 @@ def _needs_post_wait(action: dict) -> int:
    return 0
 # ---------------------------------------------------------------------------
 # VLM identification d'éléments UI (pour les éléments sans texte OCR)
 # ---------------------------------------------------------------------------
 def _clean_vlm_label(raw: str) -> str:
    """Reduce a chatty VLM answer to a short label, or "" if nothing usable remains.

    At most one known lead-in phrase is removed (case-insensitive, first
    match in declaration order wins), then the first five whitespace-separated
    words are kept and surrounding quotes, commas, periods and spaces are
    trimmed. The result is accepted only if it is 2 to 60 characters long.
    """
    # Ordered from most specific to least specific: only the first matching
    # prefix is removed, so longer phrases must be tried before their tails.
    prefixes = (
        "Based on the image, the UI element shown is a ",
        "Based on the image, the UI element is a ",
        "Based on the image, this is a ",
        "Based on the image, it is a ",
        "Based on the image, I can see ",
        "Based on the image, ",
        "The UI element shown is a ",
        "The UI element is a ",
        "The element is a ",
        "This is a ", "It is a ", "It's a ", "I can see a ",
        "I can see ", "A ",
    )
    lowered = raw.lower()
    for prefix in prefixes:
        if lowered.startswith(prefix.lower()):
            raw = raw[len(prefix):]
            break
    # Keep the first five words only, then trim punctuation and quotes.
    label = " ".join(raw.split()[:5]).strip('",.\' ')
    return label if 2 <= len(label) <= 60 else ""


 def _vlm_identify_element(anchor_b64: str, window_title: str = "") -> str:
    """Ask the VLM to name a UI element from its cropped image.

    Used during build_replay when a clicked element has no visible text
    (a YOLO icon without an OCR label). The VLM says WHAT the element is
    (button, icon, menu) so it can be resolved semantically at replay time.

    Args:
        anchor_b64: Base64-encoded image crop of the element.
        window_title: Optional title of the window containing the element;
            when non-empty it is added to the prompt as context.

    Returns:
        A short label for the element (e.g. "search icon", "Word icon"),
        or an empty string if the crop is empty, a dependency is missing,
        the VLM call fails, or the cleaned answer is not usable.
    """
    # Nothing to identify without a crop; avoids a pointless decode/VLM call.
    if not anchor_b64:
        return ""
    try:
        import io
        import os
        import tempfile
        from PIL import Image
    except ImportError:
        return ""
    tmp_path = None
    try:
        # Decode the base64 crop into a temporary PNG file for the VLM client,
        # which takes an image path.
        img_bytes = base64.b64decode(anchor_b64)
        img = Image.open(io.BytesIO(img_bytes))
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            # Record the path before saving so the file is removed even if
            # the save itself fails (delete=False leaves cleanup to us).
            tmp_path = tmp.name
            img.save(tmp, format="PNG")
        from core.detection.ollama_client import OllamaClient
        client = OllamaClient(
            endpoint="http://localhost:11434",
            model="qwen3-vl:8b",
            timeout=15,  # presumably seconds; confirm against OllamaClient
        )
        context = f" in the window '{window_title}'" if window_title else ""
        result = client.generate(
            prompt=(
                f"This is a cropped UI element{context}. "
                "What is it? Answer with a short label (2-5 words max). "
                "Examples: 'search bar icon', 'Word application icon', 'close button', "
                "'file menu', 'save button'.\n"
                "Answer ONLY the label, nothing else."
            ),
            image_path=tmp_path,
            system_prompt="You identify UI elements. Answer with a short label only.",
            temperature=0.1,
            max_tokens=20,
        )
        if result.get("success"):
            # Tolerate a missing or None response instead of relying on the
            # blanket except below.
            raw = (result.get("response") or "").strip()
            # The VLM often answers with a full sentence; reduce it to a label.
            label = _clean_vlm_label(raw)
            if label:
                logger.info("VLM identify element : '%s'", label)
                return label
            logger.debug("VLM identify : label trop court ou vide après nettoyage (raw='%s')", raw[:80])
    except Exception as e:
        # Best effort: any decode/VLM failure simply means "no label".
        logger.debug("VLM identify element échoué : %s", e)
    finally:
        # Always remove the temporary crop, including on error paths.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
    return ""
 # ---------------------------------------------------------------------------
 # SomEngine — enrichissement Set-of-Mark des clics pendant le build_replay
 # ---------------------------------------------------------------------------
@@ -1025,8 +1114,32 @@ def build_replay_from_raw_events(
                            )
                    vlm_description = ", ".join(vlm_parts) if vlm_parts else ""
                    # ── SomEngine : identifier l'élément cliqué ──
                    som_elem = _som_identify_clicked_element(
                        evt, session_dir_path, screen_w, screen_h,
                    )
                    # Déterminer le texte de l'élément cliqué (by_text)
                    # Priorité : vision_info.text > som_element.label > VLM identification
                    element_text = ""
                    element_type = ""
                    if isinstance(vision_info, dict):
                        element_text = vision_info.get("text", "")
                        element_type = vision_info.get("type", "")
                    if not element_text and som_elem and som_elem.get("label"):
                        element_text = som_elem["label"]
                    # Si pas de texte (icône sans label), demander au VLM
                    # d'identifier CE QUE c'est à partir du crop
                    if not element_text and anchor_b64:
                        element_text = _vlm_identify_element(
                            anchor_b64, window_title,
                        )
                    action["target_spec"] = {
                        "anchor_image_base64": anchor_b64,
                        "by_text": element_text,  # CE QUE l'élément EST
                        "by_role": element_type or (som_elem.get("source", "") if som_elem else ""),
                        "vlm_description": vlm_description,
                        "window_title": window_title,
                        "original_position": {
@@ -1034,22 +1147,9 @@ def build_replay_from_raw_events(
                            "y_relative": y_relative,
                        },
                    }
                    # NE PAS mettre window_title comme by_text !
                    # by_text doit être le texte de l'ÉLÉMENT cliqué, pas le titre de la fenêtre.
                    # Sinon le template matching texte cherche "13071967.txt – Bloc-notes"
                    # sur l'écran et clique sur la barre de titre au lieu du bon élément.
                    # ── SomEngine : identifier l'élément cliqué ──
                    som_elem = _som_identify_clicked_element(
                        evt, session_dir_path, screen_w, screen_h,
                    )
                    if som_elem:
                        action["target_spec"]["som_element"] = som_elem
                        # Enrichir la description VLM avec le label SoM
                        if som_elem.get("label") and not vision_info.get("text"):
                            action["target_spec"]["vlm_description"] += (
                                f", le texte de l'élément est '{som_elem['label']}'"
                            )
        elif evt_type == "text_input":
            text = evt.get("text", "")