feat: premier replay E2E + mode apprentissage supervisé
Premier replay fonctionnel de bout en bout (Bloc-notes, Chrome). Corrections critiques : - Fix double-lancement agent (Lea.bat start /b + verrou PID) - Sérialisation replay (threading.Lock dans poll_and_execute) - Garde UIA bbox >50% écran (rejet conteneurs "Bureau") - Filtre fenêtres bruit système (systray overflow) - Auto-nettoyage replays bloqués (paused_need_help) Cascade visuelle complète dans session_cleaner : - UIA local (10ms) → template matching (100ms) → serveur docTR/VLM - Nettoyage bureau pré-replay (clic "Afficher le bureau") - Crops 80x80 + vlm_description pour chaque clic Grounding contraint à la fenêtre active : - Capture croppée à la fenêtre au lieu de l'écran entier - Conversion coordonnées fenêtre → écran - Élimine les faux positifs taskbar/systray Mode apprentissage supervisé (SUPERVISE → capture humaine) : - Léa passe en mode capture quand elle est perdue - Capture mini-workflow humain (clics + frappes + combos) - Fin par Ctrl+Shift+L ou timeout inactivité 10s - Correction stockée dans target_memory.db via serveur Deploy Windows complet (grounding.py, policy.py, uia_helper.py). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -875,17 +875,174 @@ def _find_session_dir(machine_id: str, session_id: str) -> Optional[Path]:
|
||||
return None
|
||||
|
||||
|
||||
def _load_crop_as_base64(session_dir: Path, screenshot_id: str) -> str:
|
||||
"""Charger un crop screenshot et le retourner en base64.
|
||||
|
||||
Le crop (80x80 autour du clic) sert d'ancre pour le template matching —
|
||||
le GroundingEngine compare cette vignette a l'ecran actuel via OpenCV.
|
||||
"""
|
||||
if not screenshot_id:
|
||||
return ""
|
||||
crop_path = session_dir / "shots" / f"{screenshot_id}_crop.png"
|
||||
if not crop_path.is_file():
|
||||
return ""
|
||||
try:
|
||||
import base64
|
||||
data = crop_path.read_bytes()
|
||||
return base64.b64encode(data).decode("ascii")
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def _build_vlm_description(
|
||||
uia_snapshot: Dict[str, Any], window_info: Dict[str, Any],
|
||||
) -> str:
|
||||
"""Construire une description naturelle pour le VLM.
|
||||
|
||||
Le VLM recoit cette phrase + le screenshot actuel et doit localiser
|
||||
l'element decrit. Plus la description est precise, meilleur le grounding.
|
||||
"""
|
||||
name = uia_snapshot.get("name", "")
|
||||
control_type = uia_snapshot.get("control_type", "")
|
||||
window_title = window_info.get("title", "") if window_info else ""
|
||||
|
||||
parts = []
|
||||
if control_type:
|
||||
parts.append(f"le {control_type}")
|
||||
if name:
|
||||
parts.append(f"'{name}'")
|
||||
if window_title and window_title != "unknown_window":
|
||||
parts.append(f"dans la fenetre '{window_title}'")
|
||||
|
||||
if parts:
|
||||
return " ".join(parts)
|
||||
return ""
|
||||
|
||||
|
||||
def _build_full_target_spec(
    event: Dict[str, Any], session_dir: Path,
) -> Dict[str, Any]:
    """Build a complete target_spec feeding the visual resolution cascade.

    Exploits every piece of data captured during recording:
    - uia_snapshot     -> local UIA resolution (lea_uia.exe, 10-20ms)
    - crop screenshot  -> OpenCV template matching (~100ms)
    - UIA name + title -> docTR OCR + VLM grounding (1-5s)

    Cascade order: UIA -> template -> server (docTR+VLM) -> local VLM.
    If everything fails, the replay pauses for supervision (no blind click).

    Args:
        event: Raw recorded event (expects keys "uia_snapshot", "window",
            "screenshot_id"; all optional).
        session_dir: Session directory holding the "shots" crops.

    Returns:
        A target_spec dict; "resolve_order" is never empty because the
        server and local-VLM fallbacks are always appended.
    """
    uia_snapshot = event.get("uia_snapshot", {})
    window_info = event.get("window", {})
    screenshot_id = event.get("screenshot_id", "")

    name = uia_snapshot.get("name", "") if uia_snapshot else ""
    control_type = uia_snapshot.get("control_type", "") if uia_snapshot else ""
    automation_id = uia_snapshot.get("automation_id", "") if uia_snapshot else ""
    parent_path = uia_snapshot.get("parent_path", []) if uia_snapshot else []
    window_title = window_info.get("title", "") if window_info else ""

    # Resolution cascade — UIA first (fastest), then vision fallbacks.
    resolve_order = []

    # UIA: usable when we have a name or an automation_id.
    has_uia = bool(name or automation_id)
    if has_uia:
        resolve_order.append("uia")

    # Template matching: usable when a crop anchor exists.
    anchor_b64 = _load_crop_as_base64(session_dir, screenshot_id)
    if anchor_b64:
        resolve_order.append("template")

    # Server (docTR OCR + SomEngine + VLM) then local VLM are always-on
    # fallbacks, so the cascade is guaranteed non-empty.
    resolve_order.append("server")
    resolve_order.append("vlm_local")

    target_spec: Dict[str, Any] = {
        "resolve_order": resolve_order,
        "window_title": window_title,
    }

    # UIA target description for the local resolver.
    if has_uia:
        target_spec["uia_target"] = {
            "name": name,
            "control_type": control_type,
            "automation_id": automation_id,
            "parent_path": parent_path,
        }

    # Anchor thumbnail for template matching.
    if anchor_b64:
        target_spec["anchor_image_base64"] = anchor_b64

    # Text for server-side OCR (docTR).
    if name:
        target_spec["by_text"] = name

    # Natural-language description for the VLM.
    vlm_desc = _build_vlm_description(uia_snapshot or {}, window_info or {})
    if vlm_desc:
        target_spec["vlm_description"] = vlm_desc

    return target_spec
|
||||
|
||||
|
||||
def _build_desktop_cleanup_actions(screen_w: int, screen_h: int) -> List[Dict[str, Any]]:
|
||||
"""Construire les actions de nettoyage bureau AVANT le replay.
|
||||
|
||||
Sur Windows 11, un clic sur l'extreme droite de la barre des taches
|
||||
(le pixel invisible 'Afficher le bureau') minimise toutes les fenetres.
|
||||
C'est exactement ce qu'un humain ferait avant de commencer un travail :
|
||||
repartir d'un bureau propre.
|
||||
|
||||
100% visuel — pas de raccourci clavier injecte (cf feedback_100pct_visual).
|
||||
"""
|
||||
# Le bouton 'Afficher le bureau' est au pixel tout en bas a droite
|
||||
# de la taskbar. Sur Win11, c'est une fine bande cliquable.
|
||||
x_pct = round((screen_w - 2) / screen_w, 6) # avant-dernier pixel
|
||||
y_pct = round((screen_h - 2) / screen_h, 6) # idem vertical
|
||||
|
||||
return [
|
||||
{
|
||||
"action_id": f"act_setup_desktop_{uuid.uuid4().hex[:6]}",
|
||||
"type": "click",
|
||||
"x_pct": x_pct,
|
||||
"y_pct": y_pct,
|
||||
"button": "left",
|
||||
"visual_mode": False, # position fixe, pas besoin de grounding
|
||||
"wait_before": 0.3,
|
||||
"_setup_action": True, # marqueur pour le distinguer des vrais clics
|
||||
},
|
||||
{
|
||||
"action_id": f"act_setup_wait_{uuid.uuid4().hex[:6]}",
|
||||
"type": "wait",
|
||||
"duration_ms": 1000,
|
||||
"wait_before": 0,
|
||||
"_setup_action": True,
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def _simple_build_replay(events: List[Dict[str, Any]], session_dir: Path) -> List[Dict[str, Any]]:
|
||||
"""Construire un replay simplifie sans dependre de stream_processor.
|
||||
"""Construire un replay visuel depuis les evenements bruts.
|
||||
|
||||
Convertit les evenements bruts en actions normalisees simples :
|
||||
- mouse_click -> action click (coordonnees en pixels)
|
||||
- text_input / type -> action type
|
||||
- key_combo / key_press -> action key_combo
|
||||
Chaque clic est enrichi avec un target_spec complet qui alimente
|
||||
la cascade de resolution du GroundingEngine :
|
||||
UIA local (10ms) → template matching (100ms) → serveur docTR/VLM (2-5s)
|
||||
|
||||
C'est un fallback pour quand build_replay_from_raw_events n'est pas disponible.
|
||||
Les coordonnees ne sont PAS converties en pourcentages (le serveur les accepte
|
||||
aussi en pixels).
|
||||
Les coordonnees x_pct/y_pct sont incluses comme hint de derniere chance.
|
||||
Lea ne clique pas en aveugle — elle VOIT l'ecran et CHERCHE l'element.
|
||||
|
||||
Le replay commence par un nettoyage du bureau (clic 'Afficher le bureau')
|
||||
pour partir d'un etat propre — exactement comme un humain.
|
||||
"""
|
||||
actions: List[Dict[str, Any]] = []
|
||||
click_count = 0
|
||||
@@ -900,6 +1057,9 @@ def _simple_build_replay(events: List[Dict[str, Any]], session_dir: Path) -> Lis
|
||||
screen_w, screen_h = int(res[0]), int(res[1])
|
||||
break
|
||||
|
||||
# ── Étape 0 : nettoyer le bureau ──
|
||||
actions.extend(_build_desktop_cleanup_actions(screen_w, screen_h))
|
||||
|
||||
for ev in events:
|
||||
inner = ev.get("event", {})
|
||||
etype = inner.get("type", "")
|
||||
@@ -913,15 +1073,35 @@ def _simple_build_replay(events: List[Dict[str, Any]], session_dir: Path) -> Lis
|
||||
pos = inner.get("pos", [0, 0])
|
||||
click_count += 1
|
||||
|
||||
x_pct = round(pos[0] / screen_w, 6) if screen_w else 0.0
|
||||
y_pct = round(pos[1] / screen_h, 6) if screen_h else 0.0
|
||||
|
||||
action = {
|
||||
"action_id": action_id,
|
||||
"type": "click",
|
||||
"x_pct": round(pos[0] / screen_w, 6) if screen_w else 0.0,
|
||||
"y_pct": round(pos[1] / screen_h, 6) if screen_h else 0.0,
|
||||
"x_pct": x_pct,
|
||||
"y_pct": y_pct,
|
||||
"button": inner.get("button", "left"),
|
||||
"visual_mode": False, # pas d'enrichissement → coords brutes
|
||||
"wait_before": 0.5,
|
||||
}
|
||||
|
||||
# Enrichir avec la cascade visuelle complete
|
||||
target_spec = _build_full_target_spec(inner, session_dir)
|
||||
if target_spec:
|
||||
action["visual_mode"] = True
|
||||
action["target_spec"] = target_spec
|
||||
uia_name = inner.get("uia_snapshot", {}).get("name", "?")
|
||||
methods = target_spec.get("resolve_order", [])
|
||||
logger.info(
|
||||
"Action %s enrichie [%s] : '%s' (%s)",
|
||||
action_id, "+".join(methods), uia_name,
|
||||
inner.get("uia_snapshot", {}).get("control_type", "?"),
|
||||
)
|
||||
else:
|
||||
# Pas de donnee visuelle du tout → coords brutes en dernier recours
|
||||
action["visual_mode"] = False
|
||||
logger.warning("Action %s : aucune donnee visuelle, coords brutes", action_id)
|
||||
|
||||
actions.append(action)
|
||||
|
||||
elif etype in ("text_input", "type"):
|
||||
|
||||
Reference in New Issue
Block a user