feat: contrôle visuel post-action (template matching + VLM fallback)
- Screenshots de référence (res_shot_XXXX.png) attachés aux actions click/key_combo
- _attach_expected_screenshots() charge les screenshots résultat de l'enregistrement
- _verify_visual_state() dans executor : 2 étages de vérification
- Étage 1 : template matching rapide (~100ms), score > 0.7 = OK, < 0.3 = FAIL
- Étage 2 : VLM compare current vs expected (~4s), MATCH/MISMATCH
- Résultat attaché à chaque action (visual_verification dans result)
- Note : executor sur Windows (/tmp/executor_win.py) à synchroniser manuellement

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -554,6 +554,76 @@ def _load_crop_for_event(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _attach_expected_screenshots(
|
||||||
|
actions: list, raw_events: list, session_dir: Path,
|
||||||
|
) -> None:
|
||||||
|
"""Attacher les screenshots de référence (résultat attendu) aux actions.
|
||||||
|
|
||||||
|
Pour chaque action de type click ou key_combo, cherche le screenshot
|
||||||
|
res_shot_XXXX.png (capturé 1s après l'action pendant l'enregistrement)
|
||||||
|
et l'attache comme expected_screenshot_b64.
|
||||||
|
|
||||||
|
Le screenshot est compressé en JPEG qualité 40 (~30-50 KB en b64)
|
||||||
|
pour limiter le poids de chaque action.
|
||||||
|
"""
|
||||||
|
import base64
|
||||||
|
from PIL import Image as _Image
|
||||||
|
|
||||||
|
shots_dir = session_dir / "shots"
|
||||||
|
if not shots_dir.is_dir():
|
||||||
|
return
|
||||||
|
|
||||||
|
# Mapper les screenshot_id des événements originaux aux actions
|
||||||
|
# Les événements click/key_combo ont un "screenshot_id" (ex: "shot_0003")
|
||||||
|
# Le screenshot résultat est "res_shot_0003.png"
|
||||||
|
action_idx = 0
|
||||||
|
for raw_evt in raw_events:
|
||||||
|
event_data = raw_evt.get("event", raw_evt)
|
||||||
|
screenshot_id = event_data.get("screenshot_id", "")
|
||||||
|
if not screenshot_id:
|
||||||
|
continue
|
||||||
|
|
||||||
|
evt_type = event_data.get("type", "")
|
||||||
|
if evt_type not in ("mouse_click", "key_combo", "key_press"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Trouver l'action correspondante (même type, index croissant)
|
||||||
|
while action_idx < len(actions):
|
||||||
|
a = actions[action_idx]
|
||||||
|
a_type = a.get("type", "")
|
||||||
|
if a_type in ("click", "key_combo"):
|
||||||
|
break
|
||||||
|
action_idx += 1
|
||||||
|
else:
|
||||||
|
break # Plus d'actions
|
||||||
|
|
||||||
|
# Charger le screenshot résultat
|
||||||
|
res_file = shots_dir / f"res_{screenshot_id}.png"
|
||||||
|
if not res_file.is_file():
|
||||||
|
action_idx += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
img = _Image.open(res_file)
|
||||||
|
# Redimensionner pour réduire le poids (800px de large)
|
||||||
|
if img.width > 800:
|
||||||
|
ratio = 800 / img.width
|
||||||
|
img = img.resize((800, int(img.height * ratio)), _Image.LANCZOS)
|
||||||
|
import io
|
||||||
|
buf = io.BytesIO()
|
||||||
|
img.save(buf, format="JPEG", quality=40)
|
||||||
|
b64 = base64.b64encode(buf.getvalue()).decode()
|
||||||
|
actions[action_idx]["expected_screenshot_b64"] = b64
|
||||||
|
logger.debug(
|
||||||
|
"Screenshot de référence attaché à action %d : %s (%d KB)",
|
||||||
|
action_idx, res_file.name, len(b64) // 1024,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("Erreur chargement screenshot ref %s : %s", res_file, e)
|
||||||
|
|
||||||
|
action_idx += 1
|
||||||
|
|
||||||
|
|
||||||
def build_replay_from_raw_events(
|
def build_replay_from_raw_events(
|
||||||
events: list,
|
events: list,
|
||||||
session_id: str = "",
|
session_id: str = "",
|
||||||
@@ -917,16 +987,23 @@ def build_replay_from_raw_events(
|
|||||||
continue
|
continue
|
||||||
result.append(a)
|
result.append(a)
|
||||||
|
|
||||||
|
# ── 8. Attacher les screenshots de référence (état attendu après action) ──
|
||||||
|
# Les screenshots res_shot_XXXX.png capturés 1s après chaque action pendant
|
||||||
|
# l'enregistrement servent de référence pour le contrôle visuel.
|
||||||
|
if session_dir_path:
|
||||||
|
_attach_expected_screenshots(result, events, session_dir_path)
|
||||||
|
|
||||||
# Stats visual replay
|
# Stats visual replay
|
||||||
visual_clicks = sum(
|
visual_clicks = sum(
|
||||||
1 for a in result
|
1 for a in result
|
||||||
if a.get("type") == "click" and a.get("visual_mode")
|
if a.get("type") == "click" and a.get("visual_mode")
|
||||||
)
|
)
|
||||||
total_clicks = sum(1 for a in result if a.get("type") == "click")
|
total_clicks = sum(1 for a in result if a.get("type") == "click")
|
||||||
|
verified_count = sum(1 for a in result if a.get("expected_screenshot_b64"))
|
||||||
logger.info(
|
logger.info(
|
||||||
"build_replay_from_raw_events(%s) : %d actions propres produites "
|
"build_replay_from_raw_events(%s) : %d actions propres produites "
|
||||||
"(%d/%d clics avec visual_mode)",
|
"(%d/%d clics avec visual_mode, %d avec screenshot de référence)",
|
||||||
session_id, len(result), visual_clicks, total_clicks,
|
session_id, len(result), visual_clicks, total_clicks, verified_count,
|
||||||
)
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user