feat: intégration SomEngine dans build_replay (Phase 1) et resolve_target (Phase 2)
Phase 1 : enrichit chaque clic avec som_element (id, label, bbox) via YOLO+docTR.
Phase 2 : nouvelle résolution SoM+VLM — SomEngine numérote, VLM identifie le mark.
10 tests unitaires ajoutés, conftest unit/ pour le bon path agent_v0.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3322,6 +3322,197 @@ def _vlm_quick_find(
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Résolution Set-of-Mark : SomEngine (détection) + VLM (identification)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Module-level cache for the SomEngine instance.
#   None  -> not initialised yet
#   False -> initialisation was attempted and failed (not retried)
_som_engine_api = None


def _get_som_engine_api():
    """Return the shared SomEngine instance, creating it on first use.

    The engine is imported and constructed lazily with ``device="cuda"``.
    If the import or the construction raises, the failure is logged and
    remembered, so later calls return ``None`` without trying again.

    Returns:
        The cached SomEngine instance, or ``None`` when it is unavailable.
    """
    global _som_engine_api

    if _som_engine_api is None:
        try:
            # Imported here so the detection stack is only loaded on demand.
            from core.detection.som_engine import SomEngine

            _som_engine_api = SomEngine(device="cuda")
            logger.info("SomEngine API initialisé (lazy singleton)")
        except Exception as exc:
            logger.warning("SomEngine API non disponible : %s", exc)
            # Sentinel: distinguishes "failed" from "never attempted".
            _som_engine_api = False

    if _som_engine_api is False:
        return None
    return _som_engine_api
|
||||
|
||||
|
||||
def _discard_som_temp_file(path):
    """Delete the temporary annotated image, ignoring filesystem errors."""
    import os

    if not path:
        return
    try:
        os.unlink(path)
    except OSError:
        # Best-effort cleanup: a leftover temp file must not fail resolution.
        pass


def _parse_som_vlm_answer(parsed):
    """Extract ``(mark_id, confidence)`` from the VLM's JSON answer.

    The VLM output is untrusted: values may be missing, null, or of the wrong
    type. Anything that cannot be converted yields ``mark_id=None`` and/or
    ``confidence=0.0`` instead of raising.

    Args:
        parsed: Dict decoded from the VLM response.

    Returns:
        A ``(mark_id, confidence)`` pair where ``mark_id`` is an ``int`` or
        ``None`` and ``confidence`` is a finite ``float``.
    """
    import math

    try:
        confidence = float(parsed.get("confidence", 0.0))
    except (TypeError, ValueError):
        confidence = 0.0
    # NaN compares False against any threshold, so it would slip through
    # a "confidence < limit" rejection test.
    if not math.isfinite(confidence):
        confidence = 0.0

    raw_mark = parsed.get("mark_id")
    # bool is an int subclass; true/false is not a meaningful mark number.
    if raw_mark is None or isinstance(raw_mark, bool):
        return None, confidence
    try:
        return int(raw_mark), confidence
    except (TypeError, ValueError, OverflowError):
        return None, confidence


def _resolve_by_som(
    screenshot_path: str,
    target_spec: Dict[str, Any],
    screen_width: int,
    screen_height: int,
) -> Optional[Dict[str, Any]]:
    """Resolve a UI target via Set-of-Mark detection plus VLM identification.

    Pipeline:
        1. SomEngine analyses the screenshot and returns numbered elements
           together with an annotated image.
        2. The VLM receives the annotated image and a textual description of
           the target.
        3. The VLM answers with a mark number; the coordinates come from the
           matching SomEngine element rather than from the VLM.

    Args:
        screenshot_path: Path of the current screenshot.
        target_spec: Target specification. This function reads
            ``som_element`` (its ``label``) and ``vlm_description``.
        screen_width: Screen width in pixels (not used by this function).
        screen_height: Screen height in pixels (not used by this function).

    Returns:
        A dict with ``resolved=True``, ``method="som_vlm"``, ``x_pct`` /
        ``y_pct`` taken from the element's ``center_norm``, a
        ``matched_element`` description and ``score``; or ``None`` when the
        engine or VLM client is unavailable, no description can be built,
        nothing is detected, or the VLM answer is unusable.
    """
    engine = _get_som_engine_api()
    if engine is None:
        return None

    client = _get_vlm_client()
    if client is None:
        return None

    t0 = time.time()

    # ── 1. Build the target description ──
    # Done before the detection pass: without a description the VLM has
    # nothing to look for, so running the analysis would be wasted work.
    # "som_element" may be present but null, hence the isinstance guard.
    som_element = target_spec.get("som_element")
    if not isinstance(som_element, dict):
        som_element = {}
    vlm_description = target_spec.get("vlm_description") or ""
    anchor_label = som_element.get("label") or ""

    target_parts = []
    if anchor_label:
        target_parts.append(f"texte '{anchor_label}'")
    if vlm_description:
        target_parts.append(vlm_description)
    if not target_parts:
        logger.debug("SoM resolve : pas de description pour identifier l'élément")
        return None

    target_desc = ", ".join(target_parts)

    # ── 2. Run SomEngine on the current screenshot ──
    try:
        from PIL import Image as PILImage

        # The context manager releases the file handle; convert() returns an
        # independent in-memory image.
        with PILImage.open(screenshot_path) as raw_img:
            img = raw_img.convert("RGB")
        som_result = engine.analyze(img)
    except Exception as e:
        logger.warning("SoM resolve : erreur analyse — %s", e)
        return None

    if not som_result.elements:
        logger.info("SoM resolve : 0 éléments détectés")
        return None

    # ── 3. Build the VLM prompt ──
    # List the labelled elements with their number to help the VLM
    # (only the first 50 elements are considered).
    elements_list = "\n".join(
        f"  #{e.id}: '{e.label}' ({e.source})"
        for e in som_result.elements[:50]
        if e.label
    )

    prompt = (
        "This screenshot has numbered marks (red badges) on each UI element.\n\n"
        f"I'm looking for this element: {target_desc}\n\n"
    )
    if elements_list:
        prompt += f"Detected elements:\n{elements_list}\n\n"
    prompt += (
        "Which mark number corresponds to this element?\n"
        'Return ONLY a JSON object: {"mark_id": N, "confidence": 0.XX}\n'
        "If not found, return: {\"mark_id\": null, \"confidence\": 0.0}"
    )

    system_prompt = "You are a UI element identifier. Look at numbered marks on the screenshot. Output raw JSON only."

    # ── 4. Save the annotated image and ask the VLM for the mark number ──
    import tempfile

    som_img_path = None
    try:
        try:
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
                # Record the path first so the file is removed even if
                # save() fails.
                som_img_path = tmp.name
                som_result.som_image.save(tmp, format="JPEG", quality=85)
        except Exception as e:
            logger.warning("SoM resolve : erreur sauvegarde image annotée — %s", e)
            return None

        try:
            result = client.generate(
                prompt=prompt,
                image_path=som_img_path,
                system_prompt=system_prompt,
                temperature=0.1,
                max_tokens=100,
                force_json=False,
            )
        except Exception as e:
            logger.warning("SoM resolve : erreur VLM — %s", e)
            return None
    finally:
        _discard_som_temp_file(som_img_path)

    elapsed = time.time() - t0

    if not result.get("success"):
        logger.info("SoM resolve : VLM échoué (%.1fs)", elapsed)
        return None

    # ── 5. Parse the answer and return the coordinates ──
    response_text = (result.get("response") or "").strip()
    parsed = client._extract_json_from_response(response_text)
    # Anything other than a JSON object (None, list, scalar) is unusable.
    if not isinstance(parsed, dict):
        logger.info("SoM resolve : réponse non-JSON (%.1fs) — %.80s", elapsed, response_text)
        return None

    mark_id, confidence = _parse_som_vlm_answer(parsed)

    # Answers below 0.3 confidence are treated as "not found".
    if mark_id is None or confidence < 0.3:
        logger.info(
            "SoM resolve : mark non trouvé ou confiance trop basse (mark=%s, conf=%.2f, %.1fs)",
            parsed.get("mark_id"), confidence, elapsed,
        )
        return None

    elem = som_result.get_element_by_id(mark_id)
    if elem is None:
        logger.warning("SoM resolve : mark #%d inexistant (%.1fs)", mark_id, elapsed)
        return None

    cx_norm, cy_norm = elem.center_norm
    logger.info(
        "SoM resolve OK : mark #%d '%s' → (%.4f, %.4f) conf=%.2f en %.1fs (%d éléments)",
        mark_id, elem.label, cx_norm, cy_norm, confidence, elapsed, len(som_result.elements),
    )

    return {
        "resolved": True,
        "method": "som_vlm",
        "x_pct": round(cx_norm, 6),
        "y_pct": round(cy_norm, 6),
        "matched_element": {
            "label": elem.label or f"mark #{mark_id}",
            "type": elem.source,
            "role": "som_identified",
            "confidence": confidence,
            "som_id": mark_id,
        },
        "score": confidence,
    }
|
||||
|
||||
|
||||
def _resolve_target_sync(
|
||||
screenshot_path: str,
|
||||
target_spec: Dict[str, Any],
|
||||
@@ -3336,6 +3527,7 @@ def _resolve_target_sync(
|
||||
Hiérarchie de résolution (strict_mode=True, replay sessions) — VLM-FIRST :
|
||||
1. VLM Quick Find (~3-8s) — compréhension sémantique de l'écran, multi-image
|
||||
(screenshot + crop de référence + description riche)
|
||||
1.5. SoM + VLM (~5-15s) — SomEngine numérote les éléments, VLM identifie le bon
|
||||
2. Template matching OpenCV (~100ms) — fallback pixel, seuil STRICT 0.90
|
||||
3. resolved=False → STOP le replay
|
||||
|
||||
@@ -3394,6 +3586,30 @@ def _resolve_target_sync(
|
||||
vlm_description[:60] if vlm_description else "(anchor)",
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Étape 1.5 : SoM + VLM (Set-of-Mark + identification)
|
||||
# SomEngine numérote les éléments, VLM identifie le bon numéro.
|
||||
# Plus fiable que le VLM direct car le VLM n'a qu'à identifier,
|
||||
# pas localiser — et les coordonnées sont pixel-perfect.
|
||||
# ---------------------------------------------------------------
|
||||
som_element = target_spec.get("som_element", {})
|
||||
if som_element or vlm_description:
|
||||
som_result = _resolve_by_som(
|
||||
screenshot_path=screenshot_path,
|
||||
target_spec=target_spec,
|
||||
screen_width=screen_width,
|
||||
screen_height=screen_height,
|
||||
)
|
||||
if som_result and som_result.get("resolved"):
|
||||
logger.info(
|
||||
"Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)",
|
||||
som_result.get("score", 0),
|
||||
som_result.get("matched_element", {}).get("som_id", "?"),
|
||||
)
|
||||
return som_result
|
||||
else:
|
||||
logger.info("Strict resolve SoM+VLM : échoué, passage template matching")
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Étape 2 : Template matching (fallback pixel) — seuil STRICT 0.90
|
||||
# ---------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user