diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py index 9e86342f5..49e22ec98 100644 --- a/agent_v0/server_v1/api_stream.py +++ b/agent_v0/server_v1/api_stream.py @@ -3528,6 +3528,74 @@ def _resolve_by_som( len(exact_matches), anchor_label, ) + # ── 2.7. Fallback : template matching anchor vs éléments SomEngine ── + # Pour les icônes sans texte : comparer le crop de référence contre + # chaque région YOLO détectée par SomEngine. + anchor_b64 = target_spec.get("anchor_image_base64", "") + if anchor_b64 and not anchor_label: + try: + import cv2 + import numpy as np + + # Décoder l'anchor + anc_bytes = base64.b64decode(anchor_b64) + anc_array = np.frombuffer(anc_bytes, dtype=np.uint8) + anc_img = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE) + + # Charger le screenshot en OpenCV + screenshot_cv = cv2.imread(screenshot_path, cv2.IMREAD_GRAYSCALE) + + if anc_img is not None and screenshot_cv is not None: + best_elem = None + best_score = 0.0 + anc_h, anc_w = anc_img.shape[:2] + + for elem in som_result.elements: + x1, y1, x2, y2 = elem.bbox + # Agrandir la zone de 20% pour tolérer les différences + margin_x = int((x2 - x1) * 0.2) + margin_y = int((y2 - y1) * 0.2) + rx1 = max(0, x1 - margin_x) + ry1 = max(0, y1 - margin_y) + rx2 = min(screenshot_cv.shape[1], x2 + margin_x) + ry2 = min(screenshot_cv.shape[0], y2 + margin_y) + region = screenshot_cv[ry1:ry2, rx1:rx2] + + if region.shape[0] < anc_h or region.shape[1] < anc_w: + continue + + res = cv2.matchTemplate(region, anc_img, cv2.TM_CCOEFF_NORMED) + _, score, _, _ = cv2.minMaxLoc(res) + if score > best_score: + best_score = score + best_elem = elem + + if best_elem and best_score >= 0.6: + elapsed = time.time() - t0 + cx_norm, cy_norm = best_elem.center_norm + logger.info( + "SoM resolve ANCHOR : match crop '#%d' score=%.3f → (%.4f, %.4f) en %.1fs", + best_elem.id, best_score, cx_norm, cy_norm, elapsed, + ) + return { + "resolved": True, + "method": "som_anchor_match", + "x_pct": round(cx_norm, 6), + "y_pct": round(cy_norm, 6), + "matched_element": { + "label": best_elem.label or f"icon #{best_elem.id}", + "type": best_elem.source, + "role": "som_anchor_match", + "confidence": best_score, + "som_id": best_elem.id, + }, + "score": best_score, + } + except ImportError: + pass + except Exception as e: + logger.debug("SoM anchor match erreur : %s", e) + # ── 3. Sauvegarder l'image annotée SoM temporairement ── if som_result.som_image is None: logger.debug("SoM resolve : pas d'image annotée, skip VLM") diff --git a/agent_v0/server_v1/stream_processor.py b/agent_v0/server_v1/stream_processor.py index 234abaa2e..74f113de2 100644 --- a/agent_v0/server_v1/stream_processor.py +++ b/agent_v0/server_v1/stream_processor.py @@ -427,6 +427,95 @@ def _needs_post_wait(action: dict) -> int: return 0 +# --------------------------------------------------------------------------- +# VLM identification d'éléments UI (pour les éléments sans texte OCR) +# --------------------------------------------------------------------------- + + +def _vlm_identify_element(anchor_b64: str, window_title: str = "") -> str: + """Demander au VLM de décrire un élément UI à partir de son crop. + + Utilisé pendant le build_replay quand un élément cliqué n'a pas de + texte visible (icône YOLO sans label OCR). Le VLM décrit CE QUE c'est + (bouton, icône, menu) pour permettre la résolution sémantique au replay. + + Returns: + Description courte de l'élément (ex: "search icon", "Word icon") + ou chaîne vide si le VLM n'est pas disponible. + """ + try: + import io + import tempfile + from PIL import Image + except ImportError: + return "" + + try: + # Décoder le crop base64 → fichier temporaire pour le VLM + img_bytes = base64.b64decode(anchor_b64) + img = Image.open(io.BytesIO(img_bytes)) + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: + img.save(tmp, format="PNG") + tmp_path = tmp.name + + from core.detection.ollama_client import OllamaClient + client = OllamaClient( + endpoint="http://localhost:11434", + model="qwen3-vl:8b", + timeout=15, + ) + context = f" in the window '{window_title}'" if window_title else "" + result = client.generate( + prompt=( + f"This is a cropped UI element{context}. " + "What is it? Answer with a short label (2-5 words max). " + "Examples: 'search bar icon', 'Word application icon', 'close button', " + "'file menu', 'save button'.\n" + "Answer ONLY the label, nothing else." + ), + image_path=tmp_path, + system_prompt="You identify UI elements. Answer with a short label only.", + temperature=0.1, + max_tokens=20, + ) + + import os + os.unlink(tmp_path) + + if result.get("success"): + raw = result.get("response", "").strip() + # Extraire un label court depuis la réponse (le VLM bavarde souvent) + # Retirer les préfixes courants + for prefix in ( + "Based on the image, the UI element shown is a ", + "Based on the image, the UI element is a ", + "Based on the image, this is a ", + "Based on the image, it is a ", + "Based on the image, I can see ", + "Based on the image, ", + "The UI element shown is a ", + "The UI element is a ", + "The element is a ", + "This is a ", "It is a ", "It's a ", "I can see a ", + "I can see ", "A ", + ): + if raw.lower().startswith(prefix.lower()): + raw = raw[len(prefix):] + break + # Prendre les 5 premiers mots utiles + words = raw.split()[:5] + label = " ".join(words).strip('",.\' ').rstrip(".") + if label and 2 <= len(label) <= 60: + logger.info("VLM identify element : '%s'", label) + return label + else: + logger.debug("VLM identify : label trop court ou vide après nettoyage (raw='%s')", raw[:80]) + except Exception as e: + logger.debug("VLM identify element échoué : %s", e) + + return "" + + # --------------------------------------------------------------------------- # SomEngine — enrichissement Set-of-Mark des clics pendant le build_replay # --------------------------------------------------------------------------- @@ -1025,8 +1114,32 @@ def build_replay_from_raw_events( ) vlm_description = ", ".join(vlm_parts) if vlm_parts else "" + # ── SomEngine : identifier l'élément cliqué ── + som_elem = _som_identify_clicked_element( + evt, session_dir_path, screen_w, screen_h, + ) + + # Déterminer le texte de l'élément cliqué (by_text) + # Priorité : vision_info.text > som_element.label > VLM identification + element_text = "" + element_type = "" + if isinstance(vision_info, dict): + element_text = vision_info.get("text", "") + element_type = vision_info.get("type", "") + if not element_text and som_elem and som_elem.get("label"): + element_text = som_elem["label"] + + # Si pas de texte (icône sans label), demander au VLM + # d'identifier CE QUE c'est à partir du crop + if not element_text and anchor_b64: + element_text = _vlm_identify_element( + anchor_b64, window_title, + ) + action["target_spec"] = { "anchor_image_base64": anchor_b64, + "by_text": element_text, # CE QUE l'élément EST + "by_role": element_type or (som_elem.get("source", "") if som_elem else ""), "vlm_description": vlm_description, "window_title": window_title, "original_position": { @@ -1034,22 +1147,9 @@ def build_replay_from_raw_events( "y_relative": y_relative, }, } - # NE PAS mettre window_title comme by_text ! - # by_text doit être le texte de l'ÉLÉMENT cliqué, pas le titre de la fenêtre. - # Sinon le template matching texte cherche "13071967.txt – Bloc-notes" - # sur l'écran et clique sur la barre de titre au lieu du bon élément. - # ── SomEngine : identifier l'élément cliqué ── - som_elem = _som_identify_clicked_element( - evt, session_dir_path, screen_w, screen_h, - ) if som_elem: action["target_spec"]["som_element"] = som_elem - # Enrichir la description VLM avec le label SoM - if som_elem.get("label") and not vision_info.get("text"): - action["target_spec"]["vlm_description"] += ( - f", le texte de l'élément est '{som_elem['label']}'" - ) elif evt_type == "text_input": text = evt.get("text", "")