fix: by_text dans build_replay + anchor matching pour icônes sans texte
build_replay (stream_processor.py) :
- Remplir by_text depuis vision_info.text ou som_element.label
- Identification par VLM des éléments sans texte (icônes)
- Nettoyage du bavardage VLM (retrait des préfixes courants)

resolve_target (api_stream.py) :
- Nouveau som_anchor_match : template matching du crop anchor vs régions YOLO
- Pour les icônes sans texte (disquette, loupe, etc.)
- Cascade : text match → anchor match → VLM

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3528,6 +3528,74 @@ def _resolve_by_som(
|
|||||||
len(exact_matches), anchor_label,
|
len(exact_matches), anchor_label,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ── 2.7. Fallback : template matching anchor vs éléments SomEngine ──
|
||||||
|
# Pour les icônes sans texte : comparer le crop de référence contre
|
||||||
|
# chaque région YOLO détectée par SomEngine.
|
||||||
|
anchor_b64 = target_spec.get("anchor_image_base64", "")
|
||||||
|
if anchor_b64 and not anchor_label:
|
||||||
|
try:
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Décoder l'anchor
|
||||||
|
anc_bytes = base64.b64decode(anchor_b64)
|
||||||
|
anc_array = np.frombuffer(anc_bytes, dtype=np.uint8)
|
||||||
|
anc_img = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE)
|
||||||
|
|
||||||
|
# Charger le screenshot en OpenCV
|
||||||
|
screenshot_cv = cv2.imread(screenshot_path, cv2.IMREAD_GRAYSCALE)
|
||||||
|
|
||||||
|
if anc_img is not None and screenshot_cv is not None:
|
||||||
|
best_elem = None
|
||||||
|
best_score = 0.0
|
||||||
|
anc_h, anc_w = anc_img.shape[:2]
|
||||||
|
|
||||||
|
for elem in som_result.elements:
|
||||||
|
x1, y1, x2, y2 = elem.bbox
|
||||||
|
# Agrandir la zone de 20% pour tolérer les différences
|
||||||
|
margin_x = int((x2 - x1) * 0.2)
|
||||||
|
margin_y = int((y2 - y1) * 0.2)
|
||||||
|
rx1 = max(0, x1 - margin_x)
|
||||||
|
ry1 = max(0, y1 - margin_y)
|
||||||
|
rx2 = min(screenshot_cv.shape[1], x2 + margin_x)
|
||||||
|
ry2 = min(screenshot_cv.shape[0], y2 + margin_y)
|
||||||
|
region = screenshot_cv[ry1:ry2, rx1:rx2]
|
||||||
|
|
||||||
|
if region.shape[0] < anc_h or region.shape[1] < anc_w:
|
||||||
|
continue
|
||||||
|
|
||||||
|
res = cv2.matchTemplate(region, anc_img, cv2.TM_CCOEFF_NORMED)
|
||||||
|
_, score, _, _ = cv2.minMaxLoc(res)
|
||||||
|
if score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_elem = elem
|
||||||
|
|
||||||
|
if best_elem and best_score >= 0.6:
|
||||||
|
elapsed = time.time() - t0
|
||||||
|
cx_norm, cy_norm = best_elem.center_norm
|
||||||
|
logger.info(
|
||||||
|
"SoM resolve ANCHOR : match crop '#%d' score=%.3f → (%.4f, %.4f) en %.1fs",
|
||||||
|
best_elem.id, best_score, cx_norm, cy_norm, elapsed,
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"resolved": True,
|
||||||
|
"method": "som_anchor_match",
|
||||||
|
"x_pct": round(cx_norm, 6),
|
||||||
|
"y_pct": round(cy_norm, 6),
|
||||||
|
"matched_element": {
|
||||||
|
"label": best_elem.label or f"icon #{best_elem.id}",
|
||||||
|
"type": best_elem.source,
|
||||||
|
"role": "som_anchor_match",
|
||||||
|
"confidence": best_score,
|
||||||
|
"som_id": best_elem.id,
|
||||||
|
},
|
||||||
|
"score": best_score,
|
||||||
|
}
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("SoM anchor match erreur : %s", e)
|
||||||
|
|
||||||
# ── 3. Sauvegarder l'image annotée SoM temporairement ──
|
# ── 3. Sauvegarder l'image annotée SoM temporairement ──
|
||||||
if som_result.som_image is None:
|
if som_result.som_image is None:
|
||||||
logger.debug("SoM resolve : pas d'image annotée, skip VLM")
|
logger.debug("SoM resolve : pas d'image annotée, skip VLM")
|
||||||
|
|||||||
@@ -427,6 +427,95 @@ def _needs_post_wait(action: dict) -> int:
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# VLM identification d'éléments UI (pour les éléments sans texte OCR)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _vlm_identify_element(anchor_b64: str, window_title: str = "") -> str:
    """Ask the VLM for a short label describing a cropped UI element.

    Used during build_replay when a clicked element has no visible text
    (an icon detected without an OCR label). The VLM says WHAT the element
    is (button, icon, menu) so it can be resolved semantically at replay.

    Args:
        anchor_b64: Base64-encoded image crop of the clicked element.
        window_title: Optional window title, injected into the prompt as
            context when non-empty.

    Returns:
        A short label (e.g. "search icon", "Word icon"), or an empty string
        when there is no crop, Pillow is not installed, the VLM call fails,
        or the cleaned answer is not a usable label.
    """
    if not anchor_b64:
        return ""

    try:
        import io
        import os
        import tempfile
        from PIL import Image
    except ImportError:
        return ""

    tmp_path = ""
    try:
        # Import the client before touching the disk so that a missing
        # module cannot leave a temporary file behind.
        from core.detection.ollama_client import OllamaClient

        # Decode the base64 crop and write it to a temporary PNG, because
        # the client is given the image as a file path.
        img_bytes = base64.b64decode(anchor_b64)
        with Image.open(io.BytesIO(img_bytes)) as img:
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                tmp_path = tmp.name
                img.save(tmp, format="PNG")

        client = OllamaClient(
            endpoint="http://localhost:11434",
            model="qwen3-vl:8b",
            timeout=15,
        )
        context = f" in the window '{window_title}'" if window_title else ""
        result = client.generate(
            prompt=(
                f"This is a cropped UI element{context}. "
                "What is it? Answer with a short label (2-5 words max). "
                "Examples: 'search bar icon', 'Word application icon', 'close button', "
                "'file menu', 'save button'.\n"
                "Answer ONLY the label, nothing else."
            ),
            image_path=tmp_path,
            system_prompt="You identify UI elements. Answer with a short label only.",
            temperature=0.1,
            max_tokens=20,
        )

        if result.get("success"):
            # Tolerate a missing or None "response" field.
            raw = (result.get("response") or "").strip()

            # The VLM is often chatty: drop the first matching boilerplate
            # prefix. Longer prefixes come first so they win over their
            # shorter variants.
            lowered = raw.lower()
            for prefix in (
                "Based on the image, the UI element shown is a ",
                "Based on the image, the UI element is a ",
                "Based on the image, this is a ",
                "Based on the image, it is a ",
                "Based on the image, I can see ",
                "Based on the image, ",
                "The UI element shown is a ",
                "The UI element is a ",
                "The element is a ",
                "This is a ", "It is a ", "It's a ", "I can see a ",
                "I can see ", "A ",
            ):
                if lowered.startswith(prefix.lower()):
                    raw = raw[len(prefix):]
                    break

            # Keep at most the first five words, then trim surrounding
            # quotes, punctuation and spaces.
            label = " ".join(raw.split()[:5]).strip('",.\' ')
            if label and 2 <= len(label) <= 60:
                logger.info("VLM identify element : '%s'", label)
                return label
            logger.debug("VLM identify : label trop court ou vide après nettoyage (raw='%s')", raw[:80])

    except Exception as e:
        # Best effort: any failure (bad base64, unreadable image, VLM
        # unreachable) simply yields no label.
        logger.debug("VLM identify element échoué : %s", e)
    finally:
        # Always remove the temporary crop, including when the VLM call raised.
        if tmp_path:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    return ""
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# SomEngine — enrichissement Set-of-Mark des clics pendant le build_replay
|
# SomEngine — enrichissement Set-of-Mark des clics pendant le build_replay
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -1025,8 +1114,32 @@ def build_replay_from_raw_events(
|
|||||||
)
|
)
|
||||||
vlm_description = ", ".join(vlm_parts) if vlm_parts else ""
|
vlm_description = ", ".join(vlm_parts) if vlm_parts else ""
|
||||||
|
|
||||||
|
# ── SomEngine : identifier l'élément cliqué ──
|
||||||
|
som_elem = _som_identify_clicked_element(
|
||||||
|
evt, session_dir_path, screen_w, screen_h,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Déterminer le texte de l'élément cliqué (by_text)
|
||||||
|
# Priorité : vision_info.text > som_element.label > VLM identification
|
||||||
|
element_text = ""
|
||||||
|
element_type = ""
|
||||||
|
if isinstance(vision_info, dict):
|
||||||
|
element_text = vision_info.get("text", "")
|
||||||
|
element_type = vision_info.get("type", "")
|
||||||
|
if not element_text and som_elem and som_elem.get("label"):
|
||||||
|
element_text = som_elem["label"]
|
||||||
|
|
||||||
|
# Si pas de texte (icône sans label), demander au VLM
|
||||||
|
# d'identifier CE QUE c'est à partir du crop
|
||||||
|
if not element_text and anchor_b64:
|
||||||
|
element_text = _vlm_identify_element(
|
||||||
|
anchor_b64, window_title,
|
||||||
|
)
|
||||||
|
|
||||||
action["target_spec"] = {
|
action["target_spec"] = {
|
||||||
"anchor_image_base64": anchor_b64,
|
"anchor_image_base64": anchor_b64,
|
||||||
|
"by_text": element_text, # CE QUE l'élément EST
|
||||||
|
"by_role": element_type or (som_elem.get("source", "") if som_elem else ""),
|
||||||
"vlm_description": vlm_description,
|
"vlm_description": vlm_description,
|
||||||
"window_title": window_title,
|
"window_title": window_title,
|
||||||
"original_position": {
|
"original_position": {
|
||||||
@@ -1034,22 +1147,9 @@ def build_replay_from_raw_events(
|
|||||||
"y_relative": y_relative,
|
"y_relative": y_relative,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
# NE PAS mettre window_title comme by_text !
|
|
||||||
# by_text doit être le texte de l'ÉLÉMENT cliqué, pas le titre de la fenêtre.
|
|
||||||
# Sinon le template matching texte cherche "13071967.txt – Bloc-notes"
|
|
||||||
# sur l'écran et clique sur la barre de titre au lieu du bon élément.
|
|
||||||
|
|
||||||
# ── SomEngine : identifier l'élément cliqué ──
|
|
||||||
som_elem = _som_identify_clicked_element(
|
|
||||||
evt, session_dir_path, screen_w, screen_h,
|
|
||||||
)
|
|
||||||
if som_elem:
|
if som_elem:
|
||||||
action["target_spec"]["som_element"] = som_elem
|
action["target_spec"]["som_element"] = som_elem
|
||||||
# Enrichir la description VLM avec le label SoM
|
|
||||||
if som_elem.get("label") and not vision_info.get("text"):
|
|
||||||
action["target_spec"]["vlm_description"] += (
|
|
||||||
f", le texte de l'élément est '{som_elem['label']}'"
|
|
||||||
)
|
|
||||||
|
|
||||||
elif evt_type == "text_input":
|
elif evt_type == "text_input":
|
||||||
text = evt.get("text", "")
|
text = evt.get("text", "")
|
||||||
|
|||||||
Reference in New Issue
Block a user