feat: intégration SomEngine dans build_replay (Phase 1) et resolve_target (Phase 2)
Phase 1 : enrichit chaque clic avec som_element (id, label, bbox) via YOLO+docTR. Phase 2 : nouvelle résolution SoM+VLM — SomEngine numérote, le VLM identifie le mark. 10 tests unitaires ajoutés, conftest unit/ pour le bon path agent_v0. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -427,6 +427,111 @@ def _needs_post_wait(action: dict) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SomEngine — enrichissement Set-of-Mark des clics pendant le build_replay
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Module-level cache for the SomEngine instance.
#   None  -> initialisation has not been attempted yet
#   False -> initialisation was attempted and failed (do not retry)
#   other -> the ready-to-use SomEngine instance
_som_engine = None


def _get_som_engine():
    """Return the shared SomEngine instance, creating it on first use.

    The engine is imported and constructed lazily (on the "cuda" device) the
    first time this function is called. If the import or the construction
    fails, the failure is logged once and remembered, so later calls return
    None immediately without retrying.

    Returns:
        The cached SomEngine instance, or None when it could not be created.
    """
    global _som_engine

    if _som_engine is None:
        try:
            # Imported here so the module can be loaded without the
            # detection stack being importable.
            from core.detection.som_engine import SomEngine as engine_cls

            instance = engine_cls(device="cuda")
            logger.info("SomEngine initialisé (lazy singleton)")
            _som_engine = instance
        except Exception as e:
            logger.warning("SomEngine non disponible : %s", e)
            # Remember the failure so we do not retry on every call.
            _som_engine = False

    if _som_engine is False:
        return None
    return _som_engine
|
||||
|
||||
|
||||
def _som_identify_clicked_element(
    event_data: dict,
    session_dir: Optional[Path],
    screen_w: int,
    screen_h: int,
) -> Optional[dict]:
    """Identify the UI element under a recorded click using SomEngine.

    Loads the full screenshot referenced by the event, runs SomEngine on it
    to detect the on-screen elements, then looks up the element located at
    the click position (with a 30-unit margin passed to ``find_element_at``).

    All cheap preconditions (session directory, click position, screenshot
    file) are checked before the engine is obtained and before any image is
    decoded, so the expensive analysis only runs when a lookup is possible.

    Args:
        event_data: Raw event dict. Reads ``screenshot_id`` (stem of the
            screenshot file) and ``pos`` (sequence whose first two items are
            the click x and y).
        session_dir: Session directory expected to contain a ``shots``
            sub-directory, or None.
        screen_w: Screen width. Currently unused by this function; kept for
            interface compatibility.
        screen_h: Screen height. Currently unused by this function; kept for
            interface compatibility.

    Returns:
        A dict with the keys ``id``, ``label``, ``source``, ``bbox_norm``,
        ``center_norm``, ``confidence`` and ``element_count``, or None when
        SomEngine is unavailable, the screenshot or click position is missing
        or invalid, the analysis fails, or no element is found at the click.
    """
    if not session_dir:
        return None

    # Validate the click position first: without it the analysis is useless.
    pos = event_data.get("pos", [])
    if not pos or len(pos) < 2:
        return None
    try:
        click_x, click_y = int(pos[0]), int(pos[1])
    except (TypeError, ValueError, KeyError):
        # Best-effort enrichment: malformed coordinates must not abort the
        # replay build.
        logger.debug("SoM: position de clic invalide : %r", pos)
        return None

    screenshot_id = event_data.get("screenshot_id", "")
    if not screenshot_id:
        return None

    shots_dir = session_dir / "shots"
    if not shots_dir.is_dir():
        return None

    # Locate the full screenshot, falling back to the name without "_full".
    full_path = shots_dir / f"{screenshot_id}_full.png"
    if not full_path.is_file():
        full_path = shots_dir / f"{screenshot_id}.png"
        if not full_path.is_file():
            return None

    # Only now obtain the (potentially heavy, lazily created) engine.
    engine = _get_som_engine()
    if engine is None:
        return None

    try:
        from PIL import Image

        # The context manager closes the underlying file as soon as the
        # converted in-memory copy has been produced.
        with Image.open(full_path) as raw_img:
            img = raw_img.convert("RGB")
    except Exception as e:
        logger.debug("SoM: impossible de charger %s : %s", full_path, e)
        return None

    try:
        result = engine.analyze(img)
    except Exception as e:
        logger.warning("SoM: erreur d'analyse : %s", e)
        return None

    if not result.elements:
        return None

    # NOTE(review): the click coordinates are passed through unchanged;
    # presumably they are in the same pixel space as the screenshot —
    # confirm, since screen_w / screen_h are not used for any rescaling.
    elem = result.find_element_at(click_x, click_y, margin=30)
    if elem is None:
        logger.debug(
            "SoM: aucun élément trouvé au clic (%d, %d) parmi %d éléments",
            click_x, click_y, len(result.elements),
        )
        return None

    logger.info(
        "SoM: clic (%d,%d) → élément #%d '%s' (source=%s, conf=%.2f)",
        click_x, click_y, elem.id, elem.label, elem.source, elem.confidence,
    )
    return {
        "id": elem.id,
        "label": elem.label,
        "source": elem.source,
        "bbox_norm": list(elem.bbox_norm),
        "center_norm": list(elem.center_norm),
        "confidence": elem.confidence,
        "element_count": len(result.elements),
    }
|
||||
|
||||
|
||||
def _load_crop_for_event(
|
||||
event_data: dict,
|
||||
session_dir: Optional[Path],
|
||||
@@ -919,6 +1024,18 @@ def build_replay_from_raw_events(
|
||||
# Sinon le template matching texte cherche "13071967.txt – Bloc-notes"
|
||||
# sur l'écran et clique sur la barre de titre au lieu du bon élément.
|
||||
|
||||
# ── SomEngine : identifier l'élément cliqué ──
|
||||
som_elem = _som_identify_clicked_element(
|
||||
evt, session_dir_path, screen_w, screen_h,
|
||||
)
|
||||
if som_elem:
|
||||
action["target_spec"]["som_element"] = som_elem
|
||||
# Enrichir la description VLM avec le label SoM
|
||||
if som_elem.get("label") and not vision_info.get("text"):
|
||||
action["target_spec"]["vlm_description"] += (
|
||||
f", le texte de l'élément est '{som_elem['label']}'"
|
||||
)
|
||||
|
||||
elif evt_type == "text_input":
|
||||
text = evt.get("text", "")
|
||||
if not text:
|
||||
|
||||
Reference in New Issue
Block a user