fix: by_text dans build_replay + anchor matching pour icônes sans texte

build_replay (stream_processor.py) :
- Remplir by_text depuis vision_info.text ou som_element.label
- VLM identification pour les éléments sans texte (icônes)
- Nettoyage du bavardage VLM (retrait préfixes courants)

resolve_target (api_stream.py) :
- Nouveau som_anchor_match : template matching du crop anchor vs régions YOLO
- Pour les icônes sans texte (disquette, loupe, etc.)
- Cascade : text match → anchor match → VLM

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-03-31 15:28:31 +02:00
parent 5ceee9c393
commit ef5d595d98
2 changed files with 181 additions and 13 deletions

View File

@@ -3528,6 +3528,74 @@ def _resolve_by_som(
len(exact_matches), anchor_label, len(exact_matches), anchor_label,
) )
# ── 2.7. Fallback : template matching anchor vs éléments SomEngine ──
# Pour les icônes sans texte : comparer le crop de référence contre
# chaque région YOLO détectée par SomEngine.
anchor_b64 = target_spec.get("anchor_image_base64", "")
if anchor_b64 and not anchor_label:
try:
import cv2
import numpy as np
# Décoder l'anchor
anc_bytes = base64.b64decode(anchor_b64)
anc_array = np.frombuffer(anc_bytes, dtype=np.uint8)
anc_img = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE)
# Charger le screenshot en OpenCV
screenshot_cv = cv2.imread(screenshot_path, cv2.IMREAD_GRAYSCALE)
if anc_img is not None and screenshot_cv is not None:
best_elem = None
best_score = 0.0
anc_h, anc_w = anc_img.shape[:2]
for elem in som_result.elements:
x1, y1, x2, y2 = elem.bbox
# Agrandir la zone de 20% pour tolérer les différences
margin_x = int((x2 - x1) * 0.2)
margin_y = int((y2 - y1) * 0.2)
rx1 = max(0, x1 - margin_x)
ry1 = max(0, y1 - margin_y)
rx2 = min(screenshot_cv.shape[1], x2 + margin_x)
ry2 = min(screenshot_cv.shape[0], y2 + margin_y)
region = screenshot_cv[ry1:ry2, rx1:rx2]
if region.shape[0] < anc_h or region.shape[1] < anc_w:
continue
res = cv2.matchTemplate(region, anc_img, cv2.TM_CCOEFF_NORMED)
_, score, _, _ = cv2.minMaxLoc(res)
if score > best_score:
best_score = score
best_elem = elem
if best_elem and best_score >= 0.6:
elapsed = time.time() - t0
cx_norm, cy_norm = best_elem.center_norm
logger.info(
"SoM resolve ANCHOR : match crop '#%d' score=%.3f → (%.4f, %.4f) en %.1fs",
best_elem.id, best_score, cx_norm, cy_norm, elapsed,
)
return {
"resolved": True,
"method": "som_anchor_match",
"x_pct": round(cx_norm, 6),
"y_pct": round(cy_norm, 6),
"matched_element": {
"label": best_elem.label or f"icon #{best_elem.id}",
"type": best_elem.source,
"role": "som_anchor_match",
"confidence": best_score,
"som_id": best_elem.id,
},
"score": best_score,
}
except ImportError:
pass
except Exception as e:
logger.debug("SoM anchor match erreur : %s", e)
# ── 3. Sauvegarder l'image annotée SoM temporairement ── # ── 3. Sauvegarder l'image annotée SoM temporairement ──
if som_result.som_image is None: if som_result.som_image is None:
logger.debug("SoM resolve : pas d'image annotée, skip VLM") logger.debug("SoM resolve : pas d'image annotée, skip VLM")

View File

@@ -427,6 +427,95 @@ def _needs_post_wait(action: dict) -> int:
return 0 return 0
# ---------------------------------------------------------------------------
# VLM identification d'éléments UI (pour les éléments sans texte OCR)
# ---------------------------------------------------------------------------
def _vlm_identify_element(anchor_b64: str, window_title: str = "") -> str:
"""Demander au VLM de décrire un élément UI à partir de son crop.
Utilisé pendant le build_replay quand un élément cliqué n'a pas de
texte visible (icône YOLO sans label OCR). Le VLM décrit CE QUE c'est
(bouton, icône, menu) pour permettre la résolution sémantique au replay.
Returns:
Description courte de l'élément (ex: "search icon", "Word icon")
ou chaîne vide si le VLM n'est pas disponible.
"""
try:
import io
import tempfile
from PIL import Image
except ImportError:
return ""
try:
# Décoder le crop base64 → fichier temporaire pour le VLM
img_bytes = base64.b64decode(anchor_b64)
img = Image.open(io.BytesIO(img_bytes))
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
img.save(tmp, format="PNG")
tmp_path = tmp.name
from core.detection.ollama_client import OllamaClient
client = OllamaClient(
endpoint="http://localhost:11434",
model="qwen3-vl:8b",
timeout=15,
)
context = f" in the window '{window_title}'" if window_title else ""
result = client.generate(
prompt=(
f"This is a cropped UI element{context}. "
"What is it? Answer with a short label (2-5 words max). "
"Examples: 'search bar icon', 'Word application icon', 'close button', "
"'file menu', 'save button'.\n"
"Answer ONLY the label, nothing else."
),
image_path=tmp_path,
system_prompt="You identify UI elements. Answer with a short label only.",
temperature=0.1,
max_tokens=20,
)
import os
os.unlink(tmp_path)
if result.get("success"):
raw = result.get("response", "").strip()
# Extraire un label court depuis la réponse (le VLM bavarde souvent)
# Retirer les préfixes courants
for prefix in (
"Based on the image, the UI element shown is a ",
"Based on the image, the UI element is a ",
"Based on the image, this is a ",
"Based on the image, it is a ",
"Based on the image, I can see ",
"Based on the image, ",
"The UI element shown is a ",
"The UI element is a ",
"The element is a ",
"This is a ", "It is a ", "It's a ", "I can see a ",
"I can see ", "A ",
):
if raw.lower().startswith(prefix.lower()):
raw = raw[len(prefix):]
break
# Prendre les 5 premiers mots utiles
words = raw.split()[:5]
label = " ".join(words).strip('",.\' ').rstrip(".")
if label and 2 <= len(label) <= 60:
logger.info("VLM identify element : '%s'", label)
return label
else:
logger.debug("VLM identify : label trop court ou vide après nettoyage (raw='%s')", raw[:80])
except Exception as e:
logger.debug("VLM identify element échoué : %s", e)
return ""
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# SomEngine — enrichissement Set-of-Mark des clics pendant le build_replay # SomEngine — enrichissement Set-of-Mark des clics pendant le build_replay
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -1025,8 +1114,32 @@ def build_replay_from_raw_events(
) )
vlm_description = ", ".join(vlm_parts) if vlm_parts else "" vlm_description = ", ".join(vlm_parts) if vlm_parts else ""
# ── SomEngine : identifier l'élément cliqué ──
som_elem = _som_identify_clicked_element(
evt, session_dir_path, screen_w, screen_h,
)
# Déterminer le texte de l'élément cliqué (by_text)
# Priorité : vision_info.text > som_element.label > VLM identification
element_text = ""
element_type = ""
if isinstance(vision_info, dict):
element_text = vision_info.get("text", "")
element_type = vision_info.get("type", "")
if not element_text and som_elem and som_elem.get("label"):
element_text = som_elem["label"]
# Si pas de texte (icône sans label), demander au VLM
# d'identifier CE QUE c'est à partir du crop
if not element_text and anchor_b64:
element_text = _vlm_identify_element(
anchor_b64, window_title,
)
action["target_spec"] = { action["target_spec"] = {
"anchor_image_base64": anchor_b64, "anchor_image_base64": anchor_b64,
"by_text": element_text, # CE QUE l'élément EST
"by_role": element_type or (som_elem.get("source", "") if som_elem else ""),
"vlm_description": vlm_description, "vlm_description": vlm_description,
"window_title": window_title, "window_title": window_title,
"original_position": { "original_position": {
@@ -1034,22 +1147,9 @@ def build_replay_from_raw_events(
"y_relative": y_relative, "y_relative": y_relative,
}, },
} }
# NE PAS mettre window_title comme by_text !
# by_text doit être le texte de l'ÉLÉMENT cliqué, pas le titre de la fenêtre.
# Sinon le template matching texte cherche "13071967.txt Bloc-notes"
# sur l'écran et clique sur la barre de titre au lieu du bon élément.
# ── SomEngine : identifier l'élément cliqué ──
som_elem = _som_identify_clicked_element(
evt, session_dir_path, screen_w, screen_h,
)
if som_elem: if som_elem:
action["target_spec"]["som_element"] = som_elem action["target_spec"]["som_element"] = som_elem
# Enrichir la description VLM avec le label SoM
if som_elem.get("label") and not vision_info.get("text"):
action["target_spec"]["vlm_description"] += (
f", le texte de l'élément est '{som_elem['label']}'"
)
elif evt_type == "text_input": elif evt_type == "text_input":
text = evt.get("text", "") text = evt.get("text", "")