feat: intégration SomEngine dans build_replay (Phase 1) et resolve_target (Phase 2)

Phase 1 : enrichit chaque clic avec som_element (id, label, bbox) via YOLO+docTR
Phase 2 : nouvelle résolution SoM+VLM — SomEngine numérote, VLM identifie le mark
10 tests unitaires ajoutés, conftest unit/ pour le bon path agent_v0

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-03-31 09:30:14 +02:00
parent 2ddccff108
commit 4c76dca992
4 changed files with 660 additions and 0 deletions

View File

@@ -427,6 +427,111 @@ def _needs_post_wait(action: dict) -> int:
return 0
# ---------------------------------------------------------------------------
# SomEngine — Set-of-Mark enrichment of clicks during build_replay
# ---------------------------------------------------------------------------

# Tri-state cache for the shared engine:
#   None  -> loading has not been attempted yet
#   False -> loading was attempted and failed (do not retry)
#   other -> the ready-to-use SomEngine instance
_som_engine = None


def _get_som_engine():
    """Return the shared SomEngine instance, creating it on first use.

    The engine is imported and constructed lazily (with ``device="cuda"``) the
    first time this function is called. If the import or the construction
    raises, the failure is logged as a warning and remembered, so later calls
    return ``None`` immediately without retrying.

    Returns:
        The cached SomEngine instance, or ``None`` when it is unavailable.
    """
    global _som_engine

    # A previous attempt already failed: stay disabled.
    if _som_engine is False:
        return None

    # Already loaded: reuse the cached instance.
    if _som_engine is not None:
        return _som_engine

    try:
        from core.detection.som_engine import SomEngine

        _som_engine = SomEngine(device="cuda")
        logger.info("SomEngine initialisé (lazy singleton)")
    except Exception as exc:
        logger.warning("SomEngine non disponible : %s", exc)
        _som_engine = False  # "unavailable" marker
        return None

    return _som_engine
def _som_identify_clicked_element(
event_data: dict,
session_dir: Optional[Path],
screen_w: int,
screen_h: int,
) -> Optional[dict]:
"""Identifier l'élément UI cliqué via SomEngine (YOLO + docTR).
Charge le full screenshot de l'événement, lance SomEngine pour détecter
tous les éléments, puis identifie celui qui se trouve sous le clic.
Returns:
Dict avec id, label, source, bbox_norm, center_norm, confidence
ou None si SomEngine indisponible ou élément non trouvé.
"""
engine = _get_som_engine()
if engine is None:
return None
if not session_dir:
return None
shots_dir = session_dir / "shots"
if not shots_dir.is_dir():
return None
# Trouver le full screenshot
screenshot_id = event_data.get("screenshot_id", "")
if not screenshot_id:
return None
full_path = shots_dir / f"{screenshot_id}_full.png"
if not full_path.is_file():
# Fallback : essayer sans le suffixe _full
full_path = shots_dir / f"{screenshot_id}.png"
if not full_path.is_file():
return None
try:
from PIL import Image
img = Image.open(full_path).convert("RGB")
except Exception as e:
logger.debug("SoM: impossible de charger %s : %s", full_path, e)
return None
# Lancer SomEngine
try:
result = engine.analyze(img)
except Exception as e:
logger.warning("SoM: erreur d'analyse : %s", e)
return None
if not result.elements:
return None
# Trouver l'élément cliqué
pos = event_data.get("pos", [])
if not pos or len(pos) < 2:
return None
click_x, click_y = int(pos[0]), int(pos[1])
elem = result.find_element_at(click_x, click_y, margin=30)
if elem is None:
logger.debug(
"SoM: aucun élément trouvé au clic (%d, %d) parmi %d éléments",
click_x, click_y, len(result.elements),
)
return None
logger.info(
"SoM: clic (%d,%d) → élément #%d '%s' (source=%s, conf=%.2f)",
click_x, click_y, elem.id, elem.label, elem.source, elem.confidence,
)
return {
"id": elem.id,
"label": elem.label,
"source": elem.source,
"bbox_norm": list(elem.bbox_norm),
"center_norm": list(elem.center_norm),
"confidence": elem.confidence,
"element_count": len(result.elements),
}
def _load_crop_for_event(
event_data: dict,
session_dir: Optional[Path],
@@ -919,6 +1024,18 @@ def build_replay_from_raw_events(
# Sinon le template matching texte cherche "13071967.txt Bloc-notes"
# sur l'écran et clique sur la barre de titre au lieu du bon élément.
# ── SomEngine : identifier l'élément cliqué ──
som_elem = _som_identify_clicked_element(
evt, session_dir_path, screen_w, screen_h,
)
if som_elem:
action["target_spec"]["som_element"] = som_elem
# Enrichir la description VLM avec le label SoM
if som_elem.get("label") and not vision_info.get("text"):
action["target_spec"]["vlm_description"] += (
f", le texte de l'élément est '{som_elem['label']}'"
)
elif evt_type == "text_input":
text = evt.get("text", "")
if not text: