# NOTE: commit message accidentally embedded at the top of this file — kept as
# a comment so the module still parses:
# 1. Le grounding se déclenche pour by_text_source="vlm" (pas juste "ocr") :
#    les textes lus par gemma4 (onglets, labels) sont du texte visible, le
#    grounding doit les chercher comme n'importe quel texte OCR.
# 2. gemma4 est automatiquement déchargé après le build_replay pour libérer la
#    VRAM et permettre à qwen2.5vl de charger au replay.
# Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
"""
|
||
StreamProcessor — Pont entre le streaming Agent V1 et le core pipeline RPA Vision V3.
|
||
|
||
Orchestre les composants core (ScreenAnalyzer, CLIP, FAISS, GraphBuilder)
|
||
pour traiter en temps réel les screenshots et événements reçus via fibre.
|
||
|
||
Tous les calculs GPU tournent ici (serveur RTX 5070).
|
||
"""
|
||
|
||
import base64
|
||
import hashlib
|
||
import logging
|
||
import os
|
||
import threading
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
import numpy as np
|
||
|
||
from .live_session_manager import LiveSessionManager
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Filtrage des événements parasites (modificateurs seuls, text_input vide, etc.)
|
||
# Utilisé à 3 niveaux : réception (process_event), expansion (compound), et
|
||
# en amont dans GraphBuilder._find_transition_events / _build_compound_action.
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Keys that carry no user intent on their own (only meaningful inside a combo).
_MODIFIER_ONLY_KEYS = {
    "ctrl", "ctrl_l", "ctrl_r", "control", "control_l", "control_r",
    "alt", "alt_l", "alt_r", "alt_gr",
    "shift", "shift_l", "shift_r",
    "win", "win_l", "win_r", "cmd", "cmd_l", "cmd_r",
    "meta", "meta_l", "meta_r", "super", "super_l", "super_r",
}

# Numpad vk codes -> characters (layout-independent mapping).
_NUMPAD_VK_MAP = {
    96: '0', 97: '1', 98: '2', 99: '3', 100: '4',
    101: '5', 102: '6', 103: '7', 104: '8', 105: '9',
    106: '*', 107: '+', 109: '-', 110: '.', 111: '/',
}

# Control characters -> readable keys (\x01..\x1a map to 'a'..'z').
# Produced by some agents that capture raw keycodes instead of characters.
_CONTROL_CHAR_MAP = {
    '\x01': 'a', '\x02': 'b', '\x03': 'c', '\x04': 'd', '\x05': 'e',
    '\x06': 'f', '\x07': 'g', '\x08': 'h', '\x09': 'i', '\x0a': 'j',
    '\x0b': 'k', '\x0c': 'l', '\x0d': 'm', '\x0e': 'n', '\x0f': 'o',
    '\x10': 'p', '\x11': 'q', '\x12': 'r', '\x13': 's', '\x14': 't',
    '\x15': 'u', '\x16': 'v', '\x17': 'w', '\x18': 'x', '\x19': 'y',
    '\x1a': 'z',
}

# Parasitic event types ignored when building enriched actions.
_PARASITIC_ACTION_TYPES = frozenset({
    'heartbeat', 'focus_change', 'window_focus_change',
    'screenshot', 'status', 'ping', 'pong',
})
|
||
|
||
|
||
def _is_modifier_only(keys: list) -> bool:
|
||
"""Retourne True si la liste de touches ne contient que des modificateurs."""
|
||
if not keys:
|
||
return True
|
||
return all(k.lower() in _MODIFIER_ONLY_KEYS for k in keys)
|
||
|
||
|
||
def _sanitize_keys(keys: list) -> list:
|
||
"""Nettoyer une liste de touches : convertir les caractères de contrôle."""
|
||
cleaned = []
|
||
for k in keys:
|
||
if not k:
|
||
continue
|
||
if k in _CONTROL_CHAR_MAP:
|
||
cleaned.append(_CONTROL_CHAR_MAP[k])
|
||
else:
|
||
cleaned.append(k)
|
||
return cleaned
|
||
|
||
|
||
def _is_parasitic_event(event_data: Dict[str, Any]) -> bool:
|
||
"""Retourne True si l'événement est parasite et doit être filtré.
|
||
|
||
Événements rejetés :
|
||
- key_press / key_combo avec uniquement des modificateurs seuls
|
||
- key_press / key_combo avec liste de touches vide
|
||
- text_input avec texte vide
|
||
"""
|
||
event_type = event_data.get("type", "")
|
||
|
||
if event_type in ("key_press", "key_combo"):
|
||
keys = event_data.get("keys", event_data.get("data", {}).get("keys", []))
|
||
if not keys or _is_modifier_only(keys):
|
||
return True
|
||
|
||
elif event_type == "text_input":
|
||
text = event_data.get("text", event_data.get("data", {}).get("text", ""))
|
||
if not text:
|
||
return True
|
||
|
||
return False
|
||
|
||
|
||
def _reconstruct_text_from_raw_keys(raw_keys: list) -> str:
|
||
"""Reconstruire le texte correct à partir des vk codes des raw_keys.
|
||
|
||
Corrige les problèmes de capture AZERTY, notamment :
|
||
- Numpad / (vk=111) capturé comme char='!' → corrigé en '/'
|
||
- Numpad 0-9 (vk=96-105) capturés comme char=None → corrigés en '0'-'9'
|
||
"""
|
||
text_parts = []
|
||
for event in raw_keys:
|
||
if event.get("action") != "press":
|
||
continue
|
||
vk = event.get("vk", 0)
|
||
char = event.get("char")
|
||
kind = event.get("kind", "")
|
||
name = event.get("name", "")
|
||
|
||
# Ignorer les modificateurs (releases qui traînent dans le buffer)
|
||
if kind == "key" and name in _MODIFIER_ONLY_KEYS:
|
||
continue
|
||
|
||
# Numpad : mapping fixe (layout-indépendant)
|
||
if vk in _NUMPAD_VK_MAP:
|
||
text_parts.append(_NUMPAD_VK_MAP[vk])
|
||
# Touche normale avec caractère valide
|
||
elif char and len(char) == 1 and char.isprintable():
|
||
text_parts.append(char)
|
||
return "".join(text_parts)
|
||
|
||
|
||
def _key_combo_printable_char(keys: list) -> Optional[str]:
    """If the key_combo produces a single printable character, return it.

    Examples:
      - ['ctrl', '@'] -> '@' (AltGr+0 on AZERTY, captured as ctrl+@)
      - ['shift', 'A'] -> 'A'
      - ['ctrl', 'c'] -> None (a shortcut, not a character)
      - ['enter'] -> None (not a printable character)
    """
    if not keys:
        return None
    non_modifiers = [k for k in keys if k.lower() not in _MODIFIER_ONLY_KEYS]
    if len(non_modifiers) != 1:
        return None
    char = non_modifiers[0]
    # A single printable character (not a special key name like 'enter').
    if len(char) == 1 and char.isprintable():
        # Make sure it is not a common shortcut (ctrl+c, ctrl+v, ...).
        modifiers = {k.lower() for k in keys if k.lower() in _MODIFIER_ONLY_KEYS}
        if modifiers <= {"shift", "shift_l", "shift_r"}:
            # Shift + char = uppercase/special character -> OK
            return char
        if "alt_gr" in modifiers or (
            "ctrl" in modifiers and ("alt" in modifiers or "alt_r" in modifiers)
        ):
            # AltGr + char = special character (@ # € etc.) -> OK
            return char
        # Ctrl + NON-alphabetic character = probably residual AltGr.
        # On AZERTY, AltGr+0 yields @, captured as ['ctrl', 'alt_gr'] + ['ctrl', '@'];
        # the first combo is filtered (modifier-only), the second has just 'ctrl' + '@'.
        if "ctrl" in modifiers and not char.isalpha():
            return char
        # Ctrl + lone letter = shortcut (Ctrl+S, Ctrl+C) -> not a character.
        return None
    return None
|
||
|
||
|
||
def _merge_consecutive_text_inputs(steps: list) -> list:
|
||
"""Fusionne les text_input consécutifs en un seul."""
|
||
merged = []
|
||
for step in steps:
|
||
if (step.get("type") in ("text_input", "type")
|
||
and merged
|
||
and merged[-1].get("type") in ("text_input", "type")):
|
||
merged[-1]["text"] = merged[-1].get("text", "") + step.get("text", "")
|
||
else:
|
||
merged.append(dict(step)) # copie pour ne pas muter l'original
|
||
return merged
|
||
|
||
|
||
def _dedup_consecutive_combos(steps: list) -> list:
|
||
"""Supprime les key_combo dupliqués consécutifs."""
|
||
deduped = []
|
||
for step in steps:
|
||
if (step.get("type") in ("key_combo", "key_press")
|
||
and deduped
|
||
and deduped[-1].get("type") in ("key_combo", "key_press")
|
||
and deduped[-1].get("keys") == step.get("keys")):
|
||
continue # Doublon → skip
|
||
deduped.append(step)
|
||
return deduped
|
||
|
||
|
||
def _filter_parasitic_steps(steps: list) -> list:
|
||
"""Supprime les steps key_combo/key_press avec uniquement des modificateurs seuls."""
|
||
return [
|
||
s for s in steps
|
||
if not (
|
||
s.get("type") in ("key_combo", "key_press")
|
||
and _is_modifier_only(s.get("keys", []))
|
||
)
|
||
]
|
||
|
||
|
||
def _ensure_min_waits(steps: list, min_wait_ms: int = 300) -> list:
|
||
"""Ajoute un wait de min_wait_ms entre les steps si aucun wait n'existe."""
|
||
if not steps:
|
||
return steps
|
||
result = [steps[0]]
|
||
for step in steps[1:]:
|
||
if result[-1].get("type") != "wait" and step.get("type") != "wait":
|
||
result.append({"type": "wait", "duration_ms": min_wait_ms})
|
||
result.append(step)
|
||
return result
|
||
|
||
|
||
def clean_compound_steps(steps: list) -> list:
    """Full cleanup pipeline for the steps of a compound action.

    Applied in order:
      1. Drop modifier-only steps
      2. Merge consecutive text_input steps
      3. Deduplicate identical consecutive key_combo steps
      4. Insert minimum waits between steps when missing
    """
    pipeline = (
        _filter_parasitic_steps,
        _merge_consecutive_text_inputs,
        _dedup_consecutive_combos,
        _ensure_min_waits,
    )
    result = steps
    for stage in pipeline:
        result = stage(result)
    return result
|
||
|
||
|
||
def clean_enriched_actions(actions: list) -> list:
    """Clean a list of enriched actions to eliminate replay noise.

    Applied after all enriched actions have been built (post-BFS);
    operates on replay-format actions (type, keys, text, ...).

    Filters applied in order:
      1. Drop parasitic types (heartbeat, focus_change, screenshot, ...)
      2. Sanitize keys (control characters -> letters)
      3. Drop key_combo actions made of modifier keys only
      4. Drop 'type' actions whose text is empty/whitespace
         (NOTE(review): only type='type' is checked here, not 'text_input')
      5. Deduplicate identical consecutive key_combo actions
      6. Merge consecutive 'type' actions targeting the same window
      7. Collapse consecutive waits (keep the longest)
    """
    if not actions:
        return actions

    # ── Steps 1-4: filter out parasitic actions ──
    filtered = []
    for a in actions:
        atype = a.get('type', '')

        # Parasitic types coming from the raw stream.
        if atype in _PARASITIC_ACTION_TYPES:
            continue

        # key_combo: sanitize the keys, then drop modifier-only combos.
        if atype == 'key_combo':
            keys = _sanitize_keys(a.get('keys', []))
            if _is_modifier_only(keys):
                continue
            if not keys:
                continue
            # Copy with sanitized keys so the original action is untouched.
            a = dict(a, keys=keys)

        # 'type' actions: drop when the text is empty.
        if atype == 'type' and not a.get('text', '').strip():
            continue

        filtered.append(a)

    # ── Step 5: deduplicate identical consecutive key_combo actions ──
    deduped = []
    for a in filtered:
        if (deduped
                and a.get('type') == 'key_combo'
                and deduped[-1].get('type') == 'key_combo'
                and a.get('keys') == deduped[-1].get('keys')):
            continue
        deduped.append(a)

    # ── Step 6: merge consecutive 'type' (text_input) actions ──
    merged = []
    for a in deduped:
        if (merged
                and a.get('type') == 'type'
                and merged[-1].get('type') == 'type'
                # Same target window (or no window info on either side).
                and a.get('window_title', '') == merged[-1].get('window_title', '')):
            merged[-1] = dict(merged[-1], text=merged[-1].get('text', '') + a.get('text', ''))
            continue
        merged.append(a)

    # ── Step 7: collapse consecutive waits (keep the longest) ──
    cleaned = []
    for a in merged:
        if (cleaned
                and a.get('type') == 'wait'
                and cleaned[-1].get('type') == 'wait'):
            if a.get('duration_ms', 0) > cleaned[-1].get('duration_ms', 0):
                cleaned[-1] = a
            continue
        cleaned.append(a)

    return cleaned
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Replay direct depuis événements bruts (sans VLM/GraphBuilder)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Event types ignored when building the replay from raw events.
_IGNORED_EVENT_TYPES = frozenset({
    'heartbeat', 'focus_change', 'window_focus_change',
    'screenshot', 'action_result', 'status', 'ping', 'pong',
})

# Special shortcut combos that need a wait inserted after the action.
_POST_COMBO_WAITS = {
    # (tuple of normalized keys, lowercase) -> wait_ms
    # NB: the tuples are sorted() alphabetically before lookup.
    ('r', 'win'): 3000,  # Win+R -> Run dialog
    ('r', 'super'): 3000,
    ('meta', 'r'): 3000,
    ('enter',): 2000,  # Enter (confirmation)
    ('return',): 2000,
    ('ctrl', 's'): 3000,  # Ctrl+S
    ('ctrl', 's', 'shift'): 3000,  # Ctrl+Shift+S
    ('alt', 'f4'): 2000,  # Alt+F4
}
|
||
|
||
|
||
def _extract_screen_resolution(events: list) -> tuple:
|
||
"""Extraire la résolution d'écran depuis les métadonnées des événements.
|
||
|
||
Cherche d'abord le champ `screen_resolution` dans `screen_metadata`,
|
||
puis infère depuis les positions maximales des clics.
|
||
|
||
Returns:
|
||
Tuple (width, height).
|
||
"""
|
||
# Priorité 1 : screen_metadata.screen_resolution explicite
|
||
for evt in events:
|
||
event_data = evt.get("event", evt)
|
||
sm = event_data.get("screen_metadata", {})
|
||
sr = sm.get("screen_resolution")
|
||
if sr and isinstance(sr, (list, tuple)) and len(sr) == 2:
|
||
w, h = int(sr[0]), int(sr[1])
|
||
if w > 0 and h > 0:
|
||
return (w, h)
|
||
|
||
# Priorité 2 : inférer depuis les positions max des clics
|
||
return StreamProcessor._infer_screen_resolution(events)
|
||
|
||
|
||
def _should_cut_after_event(
    event_data: dict,
    saw_save_combo: bool = False,
    actions_count: int = 0,
) -> bool:
    """Return True if the replay must be cut after this event.

    Cuts when:
      - Alt+F4 (close via shortcut)
      - a systray click AFTER meaningful actions (not at the very start)
      - after Ctrl+S/Ctrl+Shift+S followed by a click in a non-application
        window (terminal, agent, systray)

    Args:
        event_data: Raw event dict (type, keys, window, ...).
        saw_save_combo: True once a save shortcut was seen earlier.
        actions_count: Number of significant actions recorded so far.
    """
    # Never cut at the very beginning: the first clicks are often on the
    # taskbar/search to open an application. Note this also suppresses an
    # early Alt+F4 cut.
    if actions_count < 3:
        return False
    evt_type = event_data.get("type", "")

    # Alt+F4
    if evt_type in ("key_combo", "key_press"):
        keys = event_data.get("keys", [])
        keys_lower = {k.lower() for k in keys if k}
        if "f4" in keys_lower and ("alt" in keys_lower or "alt_l" in keys_lower
                                   or "alt_r" in keys_lower):
            return True

    # Click in the systray (end-of-session windows)
    if evt_type == "mouse_click":
        window = event_data.get("window", {})
        title = (window.get("title", "") if isinstance(window, dict) else "").lower()
        if any(t in title for t in [
            "unknown_window", "dépassement de capacité",
            "fenêtre de dépassement", "overflow",
        ]):
            return True

        # After a Ctrl+S/Ctrl+Shift+S: cut on a click in a non-application
        # window (terminal, agent, systray).
        # NOTE(review): this check only applies to mouse_click events, since
        # it reads `title`/`window` bound in this branch — confirm intent.
        if saw_save_combo:
            _CUT_WINDOW_PATTERNS = [
                "cmd.exe", "system32", "dépassement", "unknown",
                "powershell", "windowsterminal", "python.exe",
                "terminal", "systray",
            ]
            if any(t in title for t in _CUT_WINDOW_PATTERNS):
                return True
            # Also cut on app_name (more robust than the title).
            app_name = (window.get("app_name", "") if isinstance(window, dict) else "").lower()
            if any(t in app_name for t in [
                "windowsterminal", "cmd.exe", "powershell",
                "python.exe", "explorer.exe",
            ]):
                return True

    return False
|
||
|
||
|
||
def _needs_post_wait(action: dict) -> int:
|
||
"""Retourne le wait en ms à insérer après cette action, ou 0."""
|
||
if action.get("type") == "key_combo":
|
||
keys = action.get("keys", [])
|
||
key_tuple = tuple(sorted(k.lower() for k in keys if k))
|
||
wait_ms = _POST_COMBO_WAITS.get(key_tuple, 0)
|
||
if wait_ms:
|
||
return wait_ms
|
||
return 0
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Gemma4 : lecture du texte visible sur les éléments sans OCR
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Port of the Ollama 0.20 Docker container serving gemma4 (env-overridable).
_GEMMA4_PORT = os.environ.get("GEMMA4_PORT", "11435")
|
||
|
||
|
||
def _unload_gemma4():
    """Unload gemma4 from the Docker GPU to free VRAM for qwen2.5vl."""
    try:
        import requests as _req

        payload = {"model": "gemma4:e4b", "keep_alive": 0}
        _req.post(
            f"http://localhost:{_GEMMA4_PORT}/api/generate",
            json=payload,
            timeout=5,
        )
        logger.info("gemma4 déchargé du GPU (VRAM libérée)")
    except Exception:
        # Best effort: never fail the caller over an unload problem.
        pass
|
||
|
||
|
||
def _gemma4_read_element(
    img_b64: str,
    window_title: str = "",
    click_pos: tuple = None,
) -> str:
    """Ask gemma4 to identify the clicked element.

    Accepts either a crop (80x80) or a full window screenshot. When
    click_pos is given, the image is a window screenshot and gemma4 must
    identify the element at that position.

    Returns:
        The text read (e.g. "voiture electrique.txt") or an empty string.
    """
    import requests as _requests

    context = f" in '{window_title}'" if window_title else ""
    if click_pos:
        prompt = (
            f"This is a screenshot of a window{context}. "
            f"The user clicked at position ({click_pos[0]}, {click_pos[1]}). "
            "What is the exact text or label of the element that was clicked? "
            "Answer ONLY the text, nothing else."
        )
    else:
        prompt = (
            f"This is a cropped UI element{context}. "
            "Read the exact text on this element. "
            "If it's an icon with no text, describe it in 2-3 words.\n"
            "Answer ONLY the text or label, nothing else."
        )

    payload = {
        "model": "gemma4:e4b",
        "messages": [{"role": "user", "content": prompt, "images": [img_b64]}],
        "stream": False,
        "think": False,
        "options": {"temperature": 0.1, "num_predict": 30},
    }
    try:
        reply = _requests.post(
            f"http://localhost:{_GEMMA4_PORT}/api/chat", json=payload, timeout=15,
        )
        if reply.ok:
            text = reply.json().get("message", {}).get("content", "").strip()
            # Clean up: strip quotes, trailing dot, leftover whitespace.
            text = text.strip('"\'').rstrip(".").strip()
            if text and 2 <= len(text) <= 60:
                return text
    except Exception as e:
        logger.debug("gemma4 read element échoué : %s", e)

    return ""
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# VLM identification d'éléments UI (pour les éléments sans texte OCR)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _vlm_identify_element(anchor_b64: str, window_title: str = "") -> str:
    """Ask the VLM to describe a UI element from its crop.

    Used during build_replay when a clicked element has no visible text
    (YOLO icon without an OCR label). The VLM describes WHAT it is
    (button, icon, menu) to allow semantic resolution at replay time.

    Args:
        anchor_b64: Base64-encoded PNG crop of the element.
        window_title: Title of the window the crop comes from (optional).

    Returns:
        Short element description (e.g. "search icon", "Word icon"),
        or an empty string if the VLM is unavailable or answers noise.
    """
    try:
        import io

        from PIL import Image
    except ImportError:
        return ""

    try:
        # Decode the base64 crop and re-encode it through PIL in memory.
        # (The previous implementation round-tripped through a temp file
        # that leaked when the HTTP call raised and left an unclosed file
        # handle; BytesIO produces the same PNG bytes leak-free.)
        img_bytes = base64.b64decode(anchor_b64)
        img = Image.open(io.BytesIO(img_bytes))
        buf = io.BytesIO()
        img.save(buf, format="PNG")
        crop_b64 = base64.b64encode(buf.getvalue()).decode()

        import requests as _requests
        from core.detection.vlm_config import get_vlm_model

        # Configurable VLM model (gemma4:e4b by default).
        _enrich_model = get_vlm_model()
        context = f" from the window '{window_title}'" if window_title else ""
        resp = _requests.post("http://localhost:11434/api/chat", json={
            "model": _enrich_model,
            "messages": [
                {"role": "system", "content": "You name UI elements in 2-5 words. No explanation."},
                {"role": "user", "content": (
                    f"This is a UI element{context}. "
                    "Name it in 2-5 words. Examples: 'save icon in title bar', "
                    "'Windows search icon', 'close button', 'file menu'."
                ), "images": [crop_b64]},
            ],
            "stream": False,
            "options": {"temperature": 0.1, "num_predict": 20},
        }, timeout=30)

        if resp.ok:
            raw = resp.json().get("message", {}).get("content", "").strip()
            # Extract a short label from the answer (the VLM often chats).
            # Strip the common prefixes first.
            for prefix in (
                "Based on the image, the UI element shown is a ",
                "Based on the image, the UI element is a ",
                "Based on the image, this is a ",
                "Based on the image, it is a ",
                "Based on the image, I can see ",
                "Based on the image, ",
                "The UI element shown is a ",
                "The UI element is a ",
                "The element is a ",
                "This is a ", "It is a ", "It's a ", "I can see a ",
                "I can see ", "A ",
            ):
                if raw.lower().startswith(prefix.lower()):
                    raw = raw[len(prefix):]
                    break
            # Reject answers that are chatter rather than a label.
            reject_patterns = (
                "several", "multiple", "various", "image",
                "I can", "there are", "there is", "elements",
                "the following", "here are",
            )
            if any(p in raw.lower()[:30] for p in reject_patterns):
                logger.debug("VLM identify : réponse bavarde rejetée (raw='%s')", raw[:60])
                return ""

            # Keep the first 5 useful words.
            words = raw.split()[:5]
            label = " ".join(words).strip('",.\' ').rstrip(".")
            if label and 2 <= len(label) <= 40:
                logger.info("VLM identify element : '%s'", label)
                return label
            logger.debug("VLM identify : label trop court/long après nettoyage (raw='%s')", raw[:80])
    except Exception as e:
        logger.debug("VLM identify element échoué : %s", e)

    return ""
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# SomEngine — enrichissement Set-of-Mark des clics pendant le build_replay
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Cache of SomEngine results during build_replay: screenshot_id -> SomResult.
_som_cache: Dict[str, Any] = {}
# Maximum cached entries before FIFO eviction.
_SOM_CACHE_MAX = 50
|
||
|
||
|
||
def _get_som_engine():
|
||
"""Singleton SomEngine partagé."""
|
||
try:
|
||
from core.detection.som_engine import get_shared_engine
|
||
return get_shared_engine()
|
||
except ImportError:
|
||
return None
|
||
|
||
|
||
def _som_identify_clicked_element(
    event_data: dict,
    session_dir: Optional[Path],
    screen_w: int,
    screen_h: int,
) -> Optional[dict]:
    """Identify the clicked UI element via SomEngine (YOLO + docTR).

    Loads the event's full screenshot, runs SomEngine to detect every
    element, then picks the one located under the click. Results are
    cached per (session_dir, screenshot_id) with FIFO eviction.

    Args:
        event_data: Raw mouse_click event (screenshot_id, pos, ...).
        session_dir: Session directory containing shots/.
        screen_w: Screen width in pixels (currently unused here).
        screen_h: Screen height in pixels (currently unused here).

    Returns:
        Dict with id, label, source, bbox_norm, center_norm, confidence,
        element_count, or None when SomEngine is unavailable or no
        element matches.
    """
    engine = _get_som_engine()
    if engine is None:
        return None

    if not session_dir:
        return None

    shots_dir = session_dir / "shots"
    if not shots_dir.is_dir():
        return None

    # Locate the full screenshot for this event.
    screenshot_id = event_data.get("screenshot_id", "")
    if not screenshot_id:
        return None

    full_path = shots_dir / f"{screenshot_id}_full.png"
    if not full_path.is_file():
        # Fallback: try without the _full suffix.
        full_path = shots_dir / f"{screenshot_id}.png"
        if not full_path.is_file():
            return None

    # Check the SomResult cache keyed by (session_dir, screenshot_id).
    cache_key = f"{session_dir}:{screenshot_id}"
    if cache_key in _som_cache:
        result = _som_cache[cache_key]
    else:
        try:
            from PIL import Image
            img = Image.open(full_path).convert("RGB")
        except Exception as e:
            logger.debug("SoM: impossible de charger %s : %s", full_path, e)
            return None

        # Run SomEngine on the full screenshot.
        try:
            result = engine.analyze(img)
        except Exception as e:
            logger.warning("SoM: erreur d'analyse : %s", e)
            return None

        # Cache the elements only, not the annotated image.
        from core.detection.som_engine import SomResult
        cached = SomResult(
            elements=result.elements,
            width=result.width,
            height=result.height,
            analysis_time_ms=result.analysis_time_ms,
        )
        if len(_som_cache) >= _SOM_CACHE_MAX:
            # Evict the oldest entry (FIFO — dicts keep insertion order).
            oldest_key = next(iter(_som_cache))
            del _som_cache[oldest_key]
        _som_cache[cache_key] = cached

    if not result.elements:
        return None

    # Find the element under the click.
    pos = event_data.get("pos", [])
    if not pos or len(pos) < 2:
        return None

    click_x, click_y = int(pos[0]), int(pos[1])
    elem = result.find_element_at(click_x, click_y, margin=30)
    if elem is None:
        logger.debug(
            "SoM: aucun élément trouvé au clic (%d, %d) parmi %d éléments",
            click_x, click_y, len(result.elements),
        )
        return None

    logger.info(
        "SoM: clic (%d,%d) → élément #%d '%s' (source=%s, conf=%.2f)",
        click_x, click_y, elem.id, elem.label, elem.source, elem.confidence,
    )
    return {
        "id": elem.id,
        "label": elem.label,
        "source": elem.source,
        "bbox_norm": list(elem.bbox_norm),
        "center_norm": list(elem.center_norm),
        "confidence": elem.confidence,
        "element_count": len(result.elements),
    }
|
||
|
||
|
||
def _load_crop_for_event(
    event_data: dict,
    session_dir: Optional[Path],
    screen_w: int = 0,
    screen_h: int = 0,
) -> Optional[str]:
    """Load the reference crop (anchor) tied to a mouse_click event.

    Search strategy (by priority):
      1. vision_info.crop -> extract the filename, look in session_dir/shots/
      2. screenshot_id -> look for {screenshot_id}_crop.png in session_dir/shots/
      3. Timestamp -> find the closest focus_XXXX.png in session_dir/shots/
      4. Fallback -> crop the full screenshot around the click (80x80 box)

    Args:
        event_data: Raw event (type mouse_click).
        session_dir: Session directory (contains shots/).
        screen_w: Screen width (only gates the fallback crop).
        screen_h: Screen height (only gates the fallback crop).

    Returns:
        Crop image encoded as base64, or None when nothing was found.
    """
    if not session_dir:
        return None

    shots_dir = Path(session_dir) / "shots"
    if not shots_dir.is_dir():
        return None

    def _read_png_b64(path: Path) -> Optional[str]:
        """Read a non-empty PNG file and return its base64 content."""
        try:
            if path.exists() and path.stat().st_size > 0:
                return base64.b64encode(path.read_bytes()).decode("utf-8")
        except Exception as e:
            logger.debug("Impossible de lire le crop %s : %s", path, e)
        return None

    # ── Strategy 1: vision_info.crop (Windows path -> local filename lookup) ──
    vision_info = event_data.get("vision_info", {})
    if isinstance(vision_info, dict):
        crop_path_str = vision_info.get("crop", "")
        if crop_path_str:
            # Extract the filename from the Windows path, e.g.
            # "C:\\rpa_vision\\...\\shots\\shot_0002_crop.png" -> "shot_0002_crop.png"
            crop_filename = crop_path_str.replace("\\", "/").split("/")[-1]
            result = _read_png_b64(shots_dir / crop_filename)
            if result:
                logger.debug("Crop trouvé via vision_info : %s", crop_filename)
                return result

    # ── Strategy 2: screenshot_id -> {screenshot_id}_crop.png ──
    screenshot_id = event_data.get("screenshot_id", "")
    if screenshot_id:
        crop_filename = f"{screenshot_id}_crop.png"
        result = _read_png_b64(shots_dir / crop_filename)
        if result:
            logger.debug("Crop trouvé via screenshot_id : %s", crop_filename)
            return result

    # ── Strategy 3: timestamp -> closest focus_XXXX.png ──
    evt_ts = float(event_data.get("timestamp", 0))
    if evt_ts > 0:
        try:
            focus_files = sorted(shots_dir.glob("focus_*.png"))
            if focus_files:
                best_file = None
                best_delta = float("inf")
                for f in focus_files:
                    # Extract the timestamp from the name: focus_1774437474.png
                    try:
                        ts_str = f.stem.split("_", 1)[1]
                        file_ts = float(ts_str)
                        delta = abs(file_ts - evt_ts)
                        if delta < best_delta:
                            best_delta = delta
                            best_file = f
                    except (ValueError, IndexError):
                        continue
                # Accept only when the focus shot is within 5 s of the click.
                if best_file and best_delta < 5.0:
                    result = _read_png_b64(best_file)
                    if result:
                        logger.debug(
                            "Crop trouvé via timestamp (focus, delta=%.1fs) : %s",
                            best_delta, best_file.name,
                        )
                        return result
        except Exception as e:
            logger.debug("Erreur recherche focus par timestamp : %s", e)

    # ── Strategy 4: fallback -> crop the full screenshot around the click ──
    if screenshot_id and screen_w > 0 and screen_h > 0:
        full_path = shots_dir / f"{screenshot_id}_full.png"
        if full_path.exists():
            try:
                from PIL import Image
                import io

                img = Image.open(full_path)
                pos = event_data.get("pos", [])
                if pos and len(pos) == 2:
                    cx, cy = int(pos[0]), int(pos[1])
                    # 80x80 crop centred on the click (+/- 40 px, clamped to
                    # the image bounds) — discriminating enough for icons.
                    crop_size = 40
                    x1 = max(0, cx - crop_size)
                    y1 = max(0, cy - crop_size)
                    x2 = min(img.width, cx + crop_size)
                    y2 = min(img.height, cy + crop_size)
                    cropped = img.crop((x1, y1, x2, y2))
                    buf = io.BytesIO()
                    cropped.save(buf, format="PNG")
                    result = base64.b64encode(buf.getvalue()).decode("utf-8")
                    logger.debug(
                        "Crop fallback (full screenshot cropped) : %s, zone=%dx%d",
                        full_path.name, x2 - x1, y2 - y1,
                    )
                    return result
            except ImportError:
                logger.debug("PIL non disponible pour le crop fallback")
            except Exception as e:
                logger.debug("Erreur crop fallback depuis full screenshot : %s", e)

    return None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Enrichissement VLM partagé — utilisé par build_replay ET reprocess_session
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def enrich_click_from_screenshot(
    screenshot_path: Path,
    click_x: int,
    click_y: int,
    screen_w: int,
    screen_h: int,
    window_title: str = "",
    vision_info: Optional[dict] = None,
    session_dir: Optional[Path] = None,
    screenshot_id: str = "",
) -> Dict[str, Any]:
    """Enrich a click with visual information extracted from the screenshot.

    Shared between build_replay_from_raw_events() and
    _enrich_workflow_targets() to avoid duplicating the logic.

    Steps:
    1. 80x80 crop around the click -> anchor_image_base64
    2. SomEngine -> detection of the clicked element (label, type, bbox)
    3. Positional VLM description (top/bottom, left/right)
    4. Element text (vision_info OCR > SomEngine label)
    5b. gemma4 fallback when OCR/SomEngine found no text

    Args:
        screenshot_path: Path to the full screenshot (PNG).
        click_x: Click X position in pixels.
        click_y: Click Y position in pixels.
        screen_w: Screen width in pixels.
        screen_h: Screen height in pixels.
        window_title: Title of the active window at click time.
        vision_info: vision_info dict from the original event (optional).
        session_dir: Session directory (for the SomEngine cache).
        screenshot_id: Screenshot identifier (for the SomEngine cache).

    Returns:
        Dict with keys: anchor_image_base64, by_text, by_text_source,
        by_role, vlm_description, window_title, original_position, by_position,
        som_element (optional). Returns an empty dict if the screenshot is
        missing or unreadable.
    """
    import io

    if not screenshot_path or not Path(screenshot_path).is_file():
        return {}

    # -- 1. 80x80 crop centered on the click (anchor_image_base64) --
    # crop_size is a radius: 40px each side around (click_x, click_y),
    # clamped to the image bounds.
    anchor_b64 = ""
    try:
        from PIL import Image
        img = Image.open(screenshot_path)
        crop_size = 40
        x1 = max(0, click_x - crop_size)
        y1 = max(0, click_y - crop_size)
        x2 = min(img.width, click_x + crop_size)
        y2 = min(img.height, click_y + crop_size)
        cropped = img.crop((x1, y1, x2, y2))
        buf = io.BytesIO()
        cropped.save(buf, format="PNG")
        anchor_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
    except Exception as e:
        logger.debug("enrich_click: crop échoué pour %s : %s", screenshot_path, e)

    # Without an anchor crop the whole enrichment is useless downstream.
    if not anchor_b64:
        return {}

    # -- 2. Relative position of the click on screen (French labels are
    # intentional: they feed the French VLM description below) --
    y_relative = ""
    x_relative = ""
    if screen_h > 0:
        y_relative = (
            "en bas" if click_y / screen_h > 0.8
            else "en haut" if click_y / screen_h < 0.2
            else "au milieu"
        )
    if screen_w > 0:
        x_relative = (
            "à gauche" if click_x / screen_w < 0.3
            else "à droite" if click_x / screen_w > 0.7
            else "au centre"
        )

    # -- 3. Positional VLM description (human-readable sentence) --
    vlm_parts = []
    if window_title:
        vlm_parts.append(f"Dans la fenêtre '{window_title}'")
    position_desc = " ".join(p for p in [y_relative, x_relative] if p)
    if position_desc:
        vlm_parts.append(f"l'élément cliqué se trouve {position_desc} de l'écran")

    # Add the visible text (vision_info OCR), if any.
    if isinstance(vision_info, dict):
        vis_text = vision_info.get("text", "")
        vis_type = vision_info.get("type", "")
        if vis_text:
            vlm_parts.append(f"le texte visible est '{vis_text}'")
        if vis_type:
            vlm_parts.append(f"c'est un élément de type '{vis_type}'")
    vlm_description = ", ".join(vlm_parts) if vlm_parts else ""

    # -- 4. SomEngine: identify the clicked element --
    som_elem = None
    if session_dir and screenshot_id:
        # Call _som_identify_clicked_element through a minimal event_data
        # shaped like a real recorded event.
        fake_event = {
            "screenshot_id": screenshot_id,
            "pos": [click_x, click_y],
        }
        som_elem = _som_identify_clicked_element(
            fake_event, session_dir, screen_w, screen_h,
        )

    # -- 5. Determine the element's text and type --
    # Priority: vision_info OCR text first, then the SomEngine label.
    element_text = ""
    element_type = ""
    text_source = ""
    if isinstance(vision_info, dict):
        element_text = vision_info.get("text", "")
        element_type = vision_info.get("type", "")
        if element_text:
            text_source = "ocr"
    if not element_text and som_elem and som_elem.get("label"):
        element_text = som_elem["label"]
        # SomEngine labels are treated as OCR-grade text for grounding.
        text_source = "ocr"

    # -- 5b. gemma4: identify the clicked element from the window screenshot --
    # When OCR and SomEngine found no text, gemma4 (port 11435) receives the
    # window screenshot plus the click position and describes the element.
    # One single call, made once, at recording time.
    if not element_text:
        # Prefer the window screenshot (full context) over the small crop.
        win_screenshot = None
        if session_dir and screenshot_id:
            win_path = Path(session_dir) / "shots" / f"{screenshot_id}_window.png"
            if win_path.is_file():
                win_screenshot = base64.b64encode(win_path.read_bytes()).decode()
        # Fall back to the 80x80 anchor crop.
        img_b64 = win_screenshot or anchor_b64
        element_text = _gemma4_read_element(
            img_b64, window_title,
            # Click coordinates only make sense against the window screenshot.
            click_pos=(click_x, click_y) if win_screenshot else None,
        )
        if element_text:
            # "vlm" source still triggers grounding downstream (visible text).
            text_source = "vlm"
            logger.info("gemma4 a lu l'élément : '%s'", element_text)

    # -- 6. Normalized coordinates (fractions of screen size) --
    by_position = [
        round(click_x / screen_w, 6) if screen_w > 0 else 0.0,
        round(click_y / screen_h, 6) if screen_h > 0 else 0.0,
    ]

    # -- Assemble the result --
    result = {
        "anchor_image_base64": anchor_b64,
        "by_text": element_text,
        "by_text_source": text_source,
        "by_role": element_type or (som_elem.get("source", "") if som_elem else ""),
        "vlm_description": vlm_description,
        "window_title": window_title,
        "original_position": {
            "x_relative": x_relative,
            "y_relative": y_relative,
        },
        "by_position": by_position,
    }

    if som_elem:
        result["som_element"] = som_elem

    return result
|
||
|
||
|
||
def _attach_expected_screenshots(
    actions: list, raw_events: list, session_dir: Path,
) -> None:
    """Attach the reference screenshots (expected result) to the actions.

    For each click or key_combo action, look up the screenshot
    res_shot_XXXX.png (captured 1s after the action during recording)
    and attach it as expected_screenshot_b64.

    The screenshot is compressed to JPEG quality 40 (~30-50 KB in b64)
    to keep each action lightweight.

    NOTE(review): the event->action mapping below is positional — it assumes
    actions and raw_events preserve the same relative order of
    click/key_combo items. Verify this holds after clean_enriched_actions.
    """
    import base64
    from PIL import Image as _Image

    shots_dir = session_dir / "shots"
    if not shots_dir.is_dir():
        return

    # Map the screenshot_ids of the original events onto the actions.
    # click/key_combo events carry a "screenshot_id" (e.g. "shot_0003");
    # the result screenshot is "res_shot_0003.png".
    action_idx = 0
    for raw_evt in raw_events:
        event_data = raw_evt.get("event", raw_evt)
        screenshot_id = event_data.get("screenshot_id", "")
        if not screenshot_id:
            continue

        evt_type = event_data.get("type", "")
        if evt_type not in ("mouse_click", "key_combo", "key_press"):
            continue

        # Find the matching action (same kind, increasing index).
        while action_idx < len(actions):
            a = actions[action_idx]
            a_type = a.get("type", "")
            if a_type in ("click", "key_combo"):
                break
            action_idx += 1
        else:
            break  # No actions left

        # Load the result screenshot.
        res_file = shots_dir / f"res_{screenshot_id}.png"
        if not res_file.is_file():
            action_idx += 1
            continue

        try:
            img = _Image.open(res_file)
            # Downscale to reduce payload size (800px wide max).
            if img.width > 800:
                ratio = 800 / img.width
                img = img.resize((800, int(img.height * ratio)), _Image.LANCZOS)
            import io
            buf = io.BytesIO()
            img.save(buf, format="JPEG", quality=40)
            b64 = base64.b64encode(buf.getvalue()).decode()
            actions[action_idx]["expected_screenshot_b64"] = b64
            logger.debug(
                "Screenshot de référence attaché à action %d : %s (%d KB)",
                action_idx, res_file.name, len(b64) // 1024,
            )
        except Exception as e:
            # Best-effort: a broken reference image must not abort the build.
            logger.debug("Erreur chargement screenshot ref %s : %s", res_file, e)

        action_idx += 1
|
||
|
||
|
||
def build_replay_from_raw_events(
    events: list,
    session_id: str = "",
    session_dir: Optional[str] = None,
) -> list:
    """Build a clean replay directly from the raw events of a session.

    No dependency on the VLM, the GraphBuilder or the workflows.
    Works immediately after capture.

    Processing pipeline:
    1. Filter parasitic events (heartbeat, focus_change, action_result)
    2. Extract the screen resolution from the metadata
    3. Cut after Alt+F4 or after post-save systray clicks
    4. Merge consecutive text_input events (even <500ms apart)
    5. Convert to normalized actions (coordinates in %, adapted waits)
    6. For clicks: enable visual_mode and attach the reference crop (anchor)
    7. Apply clean_enriched_actions() (dedup combos, sanitize, merge text)
    8. Insert contextual waits after critical shortcuts

    Args:
        events: Raw events loaded from live_events.jsonl.
            Format: [{"session_id": ..., "event": {...}}, ...]
        session_id: Session identifier (for logging).
        session_dir: Session directory (contains shots/). When provided,
            reference crops are attached to clicks for the visual replay.

    Returns:
        List of actions ready for the replay queue.
    """
    import uuid

    if not events:
        return []

    # Resolve the session directory used for visual crops.
    session_dir_path = Path(session_dir) if session_dir else None
    if session_dir_path and not session_dir_path.is_dir():
        logger.warning(
            "session_dir '%s' n'existe pas — visual replay désactivé", session_dir,
        )
        session_dir_path = None

    # -- 1. Extract the screen resolution --
    screen_w, screen_h = _extract_screen_resolution(events)
    logger.info(
        "build_replay_from_raw_events(%s) : %d événements bruts, résolution=%dx%d, visual=%s",
        session_id, len(events), screen_w, screen_h,
        bool(session_dir_path),
    )

    # -- 2. Filter and normalize the events --
    actionable_events = []
    saw_save_combo = False  # Track Ctrl+S / Ctrl+Shift+S for the systray cut
    for raw_evt in events:
        event_data = raw_evt.get("event", raw_evt)
        evt_type = event_data.get("type", "")

        # Skip parasitic event types.
        if evt_type in _IGNORED_EVENT_TYPES:
            continue

        # Track save shortcuts (Ctrl+S, Ctrl+Shift+S).
        if evt_type in ("key_combo", "key_press"):
            keys = _sanitize_keys(event_data.get("keys", []))
            keys_lower = {k.lower() for k in keys if k}
            if "s" in keys_lower and ("ctrl" in keys_lower or "ctrl_l" in keys_lower
                                      or "ctrl_r" in keys_lower):
                saw_save_combo = True

        # Check the cut condition BEFORE appending the event.
        # For post-save clicks on non-application windows, we do NOT
        # want to include the click that triggers the cut.
        if _should_cut_after_event(event_data, saw_save_combo=saw_save_combo, actions_count=len(actionable_events)):
            # Alt+F4 is an application action -> include it.
            is_alt_f4 = False
            if evt_type in ("key_combo", "key_press"):
                _keys_lower = {k.lower() for k in event_data.get("keys", []) if k}
                is_alt_f4 = "f4" in _keys_lower and (
                    "alt" in _keys_lower or "alt_l" in _keys_lower or "alt_r" in _keys_lower
                )
            if is_alt_f4:
                actionable_events.append(event_data)
            # Otherwise it is a stray click -> do NOT include it.
            logger.debug(
                "Coupure du replay (saw_save=%s, type=%s, included=%s)",
                saw_save_combo, evt_type, is_alt_f4,
            )
            break

        actionable_events.append(event_data)

    # -- 3. Merge consecutive text_input events --
    # All consecutive text_inputs are merged into one, regardless of the
    # time gap: the user types letter by letter but the replay wants a
    # single "type" with the whole text.
    # key_combos that produce a printable character (e.g. AltGr+0 -> @)
    # are converted to text_input so they merge with adjacent text.
    # Only a window change (different app) breaks the merge.
    merged_events = []
    _altgr_seq_got_char = False  # True once the character of an AltGr sequence was captured
    _in_altgr_seq = False  # True inside an AltGr sequence (after a modifier-only ctrl+alt_gr)
    for evt in actionable_events:
        evt_type = evt.get("type", "")
        evt_ts = float(evt.get("timestamp", 0))

        # Convert key_combos producing a printable character to text_input
        # so they merge with adjacent text.
        # E.g. AltGr+0 captured as ['ctrl', '@'] -> text_input '@'
        if evt_type in ("key_combo", "key_press"):
            keys = _sanitize_keys(evt.get("keys", []))

            # Skip modifier-only key_combos (e.g. ['ctrl', 'alt_gr'] alone)
            # — they mark the START of an AltGr sequence.
            non_mods = [k for k in keys if k.lower() not in _MODIFIER_ONLY_KEYS]
            if not non_mods:
                _in_altgr_seq = True  # Start of an AltGr sequence
                _altgr_seq_got_char = False
                continue

            printable = _key_combo_printable_char(keys)

            # Filter AltGr ghosts: on AZERTY, AltGr+key produces 3 events
            # (ctrl+alt_gr, ctrl+@, ctrl+]). The 3rd is a ghost from the
            # physical key release. Keep ONLY the 1st character and ignore
            # the rest of the same sequence.
            if _in_altgr_seq and printable:
                if not _altgr_seq_got_char:
                    # First character of the AltGr sequence -> keep it
                    _altgr_seq_got_char = True
                    evt = dict(evt, type="text_input", text=printable)
                    evt_type = "text_input"
                else:
                    # AltGr ghost -> drop entirely
                    continue
            elif printable:
                _in_altgr_seq = False
                _altgr_seq_got_char = False
                evt = dict(evt, type="text_input", text=printable)
                evt_type = "text_input"
            else:
                _in_altgr_seq = False
                _altgr_seq_got_char = False

            if not printable and evt_type != "text_input":
                # AltGr alone (AZERTY): the character is in the raw_keys.
                raw_keys = evt.get("raw_keys", [])
                for rk in raw_keys:
                    ch = rk.get("char", "")
                    if ch and len(ch) == 1 and ch.isprintable() and rk.get("action") == "release":
                        printable = ch
                        break
                if printable and evt_type != "text_input":
                    # Turn into text_input so it merges with adjacent text.
                    evt = dict(evt, type="text_input", text=printable)
                    evt_type = "text_input"
                    # No raw_keys for this character (will be pasted via clipboard)

        if evt_type == "text_input":
            text = evt.get("text", "")
            if not text:
                continue

            # \n and \t are NOT text — they are Enter/Tab keystrokes
            # that must become key_combo actions for the replay.
            if text == "\n":
                merged_events.append({
                    "type": "key_combo",
                    "keys": ["enter"],
                    "timestamp": evt_ts,
                })
                continue
            if text == "\t":
                merged_events.append({
                    "type": "key_combo",
                    "keys": ["tab"],
                    "timestamp": evt_ts,
                })
                continue

            # Merge with the previous text_input if same application.
            # Compare by app_name (not title: the title changes while typing).
            if merged_events and merged_events[-1].get("type") == "text_input":
                prev_app = merged_events[-1].get("window", {}).get("app_name", "")
                curr_app = evt.get("window", {}).get("app_name", "")
                # Same application (or unknown application) -> merge
                if not prev_app or not curr_app or prev_app == curr_app:
                    merged_events[-1]["text"] = merged_events[-1].get("text", "") + text
                    merged_events[-1]["_end_ts"] = evt_ts
                    # Also merge the raw_keys (exact replay).
                    if evt.get("raw_keys"):
                        prev_raw = merged_events[-1].get("raw_keys", [])
                        merged_events[-1]["raw_keys"] = prev_raw + evt["raw_keys"]
                    continue

            merged_events.append(dict(evt, _end_ts=evt_ts))
        else:
            merged_events.append(dict(evt))

    # -- 3b. Rebuild the correct text from the raw_keys --
    # raw_keys hold the exact vk codes (layout-independent), which lets us
    # fix AZERTY capture errors (e.g. numpad / captured as '!' -> '/').
    # CAUTION: only rebuild when the rebuilt text has the SAME length as
    # the original. Characters coming from converted key_combos (e.g. @
    # from AltGr) have no raw_keys and a rebuild would lose them.
    for evt in merged_events:
        if evt.get("type") == "text_input" and evt.get("raw_keys"):
            reconstructed = _reconstruct_text_from_raw_keys(evt["raw_keys"])
            original = evt.get("text", "")
            if reconstructed and len(reconstructed) == len(original):
                # Same length -> safe replacement (fixes numpad chars)
                evt["text"] = reconstructed
                if reconstructed != original:
                    logger.debug(
                        "Texte reconstruit depuis raw_keys : '%s' → '%s'",
                        original[:50], reconstructed[:50],
                    )
            elif reconstructed and len(reconstructed) < len(original):
                # Different length -> some chars come from converted key_combos
                # (e.g. @ from AltGr merged into the text but absent from raw_keys).
                # Keep the original text AND drop raw_keys to force
                # clipboard paste at replay time (raw_keys are incomplete).
                del evt["raw_keys"]
                logger.debug(
                    "Texte corrigé (key_combo fusionné) : '%s' → raw_keys supprimé, "
                    "replay par copier-coller",
                    original[:50],
                )

    # -- 4. Convert to normalized replay actions --
    actions = []
    last_ts = 0.0

    for evt in merged_events:
        evt_type = evt.get("type", "")
        evt_ts = float(evt.get("timestamp", 0))

        # Insert a wait on significant pauses (> 2s, capped at 5s).
        if last_ts > 0 and evt_ts > last_ts:
            delta_ms = int((evt_ts - last_ts) * 1000)
            if delta_ms > 2000:
                capped_ms = min(delta_ms, 5000)
                actions.append({
                    "action_id": f"act_raw_{uuid.uuid4().hex[:8]}",
                    "type": "wait",
                    "duration_ms": capped_ms,
                })

        # Advance the timestamp (merged text spans use their _end_ts).
        end_ts = float(evt.get("_end_ts", evt_ts))
        last_ts = max(last_ts, end_ts if end_ts > 0 else evt_ts)

        action = {"action_id": f"act_raw_{uuid.uuid4().hex[:8]}"}

        if evt_type == "mouse_click":
            pos = evt.get("pos", [])
            if not pos or len(pos) != 2:
                continue
            action["type"] = "click"
            action["x_pct"] = round(pos[0] / screen_w, 6)
            action["y_pct"] = round(pos[1] / screen_h, 6)
            action["button"] = evt.get("button", "left")
            # Enrich with the window title when available.
            window = evt.get("window", {})
            if window.get("title"):
                action["window_title"] = window["title"]

            # -- Visual replay: VLM enrichment of the click --
            if session_dir_path:
                # Crop strategy: first look for a pre-existing crop
                # (vision_info.crop, screenshot_id_crop.png, focus_XXXX.png),
                # then fall back to the 80x80 crop of the full screenshot.
                anchor_b64_preexist = _load_crop_for_event(
                    evt, session_dir_path, screen_w, screen_h,
                )

                # Check whether a real-time enrichment already exists
                # (computed by SomEngine during recording via api_stream).
                enrichment = evt.get("enrichment")
                if enrichment:
                    logger.debug(
                        "Enrichissement temps réel trouvé pour %s (by_text='%s')",
                        evt.get("screenshot_id", "?"),
                        enrichment.get("by_text", ""),
                    )
                else:
                    # No pre-computed enrichment -> classic SomEngine call.
                    screenshot_id = evt.get("screenshot_id", "")
                    full_path = session_dir_path / "shots" / f"{screenshot_id}_full.png" if screenshot_id else None
                    enrichment = enrich_click_from_screenshot(
                        screenshot_path=full_path,
                        click_x=int(pos[0]),
                        click_y=int(pos[1]),
                        screen_w=screen_w,
                        screen_h=screen_h,
                        window_title=window.get("title", ""),
                        vision_info=evt.get("vision_info"),
                        session_dir=session_dir_path,
                        screenshot_id=screenshot_id,
                    )

                # Prefer the pre-existing crop (more reliable: agent crop, focus).
                if anchor_b64_preexist and enrichment:
                    enrichment["anchor_image_base64"] = anchor_b64_preexist
                # If enrich_click could not crop but a pre-existing crop exists.
                elif anchor_b64_preexist and not enrichment:
                    enrichment = {"anchor_image_base64": anchor_b64_preexist}

                if enrichment and enrichment.get("anchor_image_base64"):
                    action["visual_mode"] = True
                    action["target_spec"] = {
                        k: v for k, v in enrichment.items()
                        if k != "by_position"  # by_position is already in x_pct/y_pct
                    }
                    # Add the window metadata for targeted grounding.
                    wc = evt.get("window_capture", {})
                    if wc.get("rect"):
                        action["target_spec"]["window_capture"] = {
                            "rect": wc["rect"],
                            "window_size": wc.get("window_size"),
                            "click_relative": wc.get("click_relative"),
                        }

        elif evt_type == "text_input":
            text = evt.get("text", "")
            if not text:
                continue
            action["type"] = "type"
            action["text"] = text
            # Propagate the raw_keys for exact replay (AZERTY solution)
            # UNLESS the text contains chars merged from key_combos
            # (e.g. @ from AltGr) — in that case the raw_keys are
            # incomplete and the replay must use clipboard paste.
            if evt.get("raw_keys"):
                reconstructed = _reconstruct_text_from_raw_keys(evt["raw_keys"])
                if len(reconstructed) >= len(text):
                    action["raw_keys"] = evt["raw_keys"]
                else:
                    logger.debug(
                        "raw_keys incomplets pour '%s' (recon=%d < text=%d) → copier-coller",
                        text[:30], len(reconstructed), len(text),
                    )

        elif evt_type in ("key_press", "key_combo"):
            keys = evt.get("keys", [])
            if not keys:
                key = evt.get("key", "")
                if key:
                    keys = [key]
            if not keys:
                continue
            keys = _sanitize_keys(keys)
            if _is_modifier_only(keys):
                continue
            action["type"] = "key_combo"
            action["keys"] = keys
            # Propagate the raw_keys for exact replay (AZERTY solution).
            if evt.get("raw_keys"):
                action["raw_keys"] = evt["raw_keys"]

        elif evt_type == "scroll":
            pos = evt.get("pos", [])
            action["type"] = "scroll"
            if pos and len(pos) == 2:
                action["x_pct"] = round(pos[0] / screen_w, 6)
                action["y_pct"] = round(pos[1] / screen_h, 6)
            action["delta"] = evt.get("delta", -3)

        else:
            continue

        actions.append(action)

    # -- 5. Global cleanup (dedup combos, sanitize, merge text, waits) --
    actions = clean_enriched_actions(actions)

    # -- 6. Insert contextual waits after critical shortcuts --
    final_actions = []
    for action in actions:
        final_actions.append(action)
        post_wait = _needs_post_wait(action)
        if post_wait > 0:
            # Consecutive waits are collapsed in the next pass.
            final_actions.append({
                "action_id": f"act_raw_{uuid.uuid4().hex[:8]}",
                "type": "wait",
                "duration_ms": post_wait,
            })

    # -- 7. Final pass: collapse consecutive waits (keep the longest) --
    result = []
    for a in final_actions:
        if (result
                and a.get("type") == "wait"
                and result[-1].get("type") == "wait"):
            # Keep the longer of the two.
            if a.get("duration_ms", 0) > result[-1].get("duration_ms", 0):
                result[-1] = a
            continue
        result.append(a)

    # -- 8. Attach the reference screenshots (expected state after action) --
    # The res_shot_XXXX.png screenshots captured 1s after each action during
    # recording serve as references for the visual verification.
    if session_dir_path:
        _attach_expected_screenshots(result, events, session_dir_path)

    # -- 9. Enrich with expected_window_title (expected window title after the click) --
    # For post-action verification: the window title AFTER a click is the
    # window_title of the NEXT click in the sequence.
    click_indices = [i for i, a in enumerate(result) if a.get("type") == "click"]
    for j, ci in enumerate(click_indices):
        if j + 1 < len(click_indices):
            next_ci = click_indices[j + 1]
            next_title = result[next_ci].get("target_spec", {}).get("window_title", "")
            if next_title:
                result[ci]["expected_window_title"] = next_title

    # Visual replay stats.
    visual_clicks = sum(
        1 for a in result
        if a.get("type") == "click" and a.get("visual_mode")
    )
    total_clicks = sum(1 for a in result if a.get("type") == "click")
    verified_count = sum(1 for a in result if a.get("expected_screenshot_b64"))
    logger.info(
        "build_replay_from_raw_events(%s) : %d actions propres produites "
        "(%d/%d clics avec visual_mode, %d avec screenshot de référence)",
        session_id, len(result), visual_clicks, total_clicks, verified_count,
    )

    # Unload gemma4 from the GPU so qwen2.5vl can load at replay time.
    _unload_gemma4()

    return result
|
||
|
||
|
||
class StreamProcessor:
|
||
"""
|
||
Processeur de streaming qui connecte les données Agent V1 au core pipeline.
|
||
|
||
Cycle de vie :
|
||
1. register_session() — crée l'état mémoire
|
||
2. process_event() — accumule événements, extrait contexte fenêtre
|
||
3. process_screenshot() — analyse via ScreenAnalyzer + CLIP embedding
|
||
4. finalize_session() — construit le Workflow via GraphBuilder (DBSCAN)
|
||
"""
|
||
|
||
    def __init__(self, data_dir: str = "data/training"):
        """Create the processor and load any persisted workflows.

        Args:
            data_dir: Root directory holding streaming_sessions/,
                live_sessions/, workflows/ and embeddings/.
        """
        self.data_dir = Path(data_dir)
        persist_dir = str(self.data_dir / "streaming_sessions")
        live_sessions_dir = str(self.data_dir / "live_sessions")
        self.session_manager = LiveSessionManager(
            persist_dir=persist_dir,
            live_sessions_dir=live_sessions_dir,
        )
        self._lock = threading.Lock()

        # Core components (lazy-loaded to avoid heavy imports at startup)
        self._screen_analyzer = None
        self._clip_embedder = None
        self._state_embedding_builder = None  # P0-3: unified embedding pipeline (multi-modal fusion)
        self._faiss_manager = None
        self._initialized = False

        # Lock for concurrent access to session data (screen_states, embeddings, workflows)
        self._data_lock = threading.Lock()

        # Lock for FAISS access (IndexFlat.add() is not thread-safe)
        self._faiss_lock = threading.Lock()

        # Suspension flag: while a replay is active the worker suspends
        # itself to yield the GPU to the replay's VLM resolve_target.
        # Set from api_stream.py via set_replay_flag().
        self._replay_active_flag: Optional[threading.Event] = None

        # Analysis results per session
        self._screen_states: Dict[str, list] = {}  # session_id -> List[ScreenState]
        self._embeddings: Dict[str, list] = {}  # session_id -> List[np.ndarray]

        # Built workflows (used for matching)
        self._workflows: Dict[str, Any] = {}

        # Load existing workflows from disk
        self._load_persisted_workflows()
|
||
|
||
def _load_persisted_workflows(self):
|
||
"""Charger les workflows sauvegardés depuis le disque au démarrage.
|
||
|
||
Scanne le dossier workflows/ principal et les sous-dossiers par machine
|
||
(workflows/{machine_id}/) pour la rétrocompatibilité.
|
||
"""
|
||
workflows_dir = self.data_dir / "workflows"
|
||
if not workflows_dir.exists():
|
||
return
|
||
|
||
try:
|
||
from core.models.workflow_graph import Workflow
|
||
|
||
count = 0
|
||
# Charger les workflows du dossier racine (rétrocompatibilité)
|
||
for wf_file in sorted(workflows_dir.glob("*.json")):
|
||
try:
|
||
wf = Workflow.load_from_file(wf_file)
|
||
self._workflows[wf.workflow_id] = wf
|
||
count += 1
|
||
except Exception as e:
|
||
logger.warning(f"Impossible de charger {wf_file.name}: {e}")
|
||
|
||
# Charger les workflows des sous-dossiers par machine
|
||
for machine_dir in sorted(workflows_dir.iterdir()):
|
||
if not machine_dir.is_dir():
|
||
continue
|
||
for wf_file in sorted(machine_dir.glob("*.json")):
|
||
try:
|
||
wf = Workflow.load_from_file(wf_file)
|
||
# Stocker le machine_id dans les métadonnées du workflow
|
||
if not hasattr(wf, '_machine_id'):
|
||
wf._machine_id = machine_dir.name
|
||
self._workflows[wf.workflow_id] = wf
|
||
count += 1
|
||
except Exception as e:
|
||
logger.warning(f"Impossible de charger {wf_file.name}: {e}")
|
||
|
||
if count:
|
||
logger.info(f"{count} workflow(s) chargé(s) depuis {workflows_dir}")
|
||
except ImportError:
|
||
logger.debug("core.models.workflow_graph non disponible, skip chargement")
|
||
|
||
def set_replay_flag(self, flag: threading.Event):
|
||
"""Associer le flag de replay actif (depuis api_stream.py).
|
||
|
||
Quand ce flag est set(), reprocess_session() se suspend entre chaque
|
||
screenshot pour libérer le GPU au replay (resolve_target VLM).
|
||
"""
|
||
self._replay_active_flag = flag
|
||
logger.info("Flag de suspension replay configuré sur le StreamProcessor")
|
||
|
||
def _wait_if_replay_active(self, context: str = "") -> bool:
|
||
"""Suspendre le traitement si un replay est en cours.
|
||
|
||
Vérifie le flag _replay_active_flag et attend qu'il se clear.
|
||
Timeout de sécurité : 60s max pour éviter un blocage si le replay
|
||
plante sans clear le flag.
|
||
|
||
Args:
|
||
context: Description pour les logs (ex: "screenshot 3/10").
|
||
|
||
Returns:
|
||
True si on a dû attendre (replay était actif), False sinon.
|
||
"""
|
||
if not self._replay_active_flag or not self._replay_active_flag.is_set():
|
||
return False
|
||
|
||
import time
|
||
suspend_start = time.time()
|
||
waited = False
|
||
while self._replay_active_flag.is_set():
|
||
elapsed = time.time() - suspend_start
|
||
if elapsed > 60:
|
||
logger.warning(
|
||
f"Worker : timeout suspension (60s), reprise forcée ({context})"
|
||
)
|
||
break
|
||
if not waited:
|
||
logger.info(f"Worker suspendu — replay en cours ({context})")
|
||
waited = True
|
||
time.sleep(2)
|
||
|
||
if waited:
|
||
total_wait = time.time() - suspend_start
|
||
logger.info(
|
||
f"Worker reprend après {total_wait:.1f}s de suspension ({context})"
|
||
)
|
||
return waited
|
||
|
||
def _ensure_initialized(self):
|
||
"""Charger les composants core GPU si pas encore fait.
|
||
|
||
DÉSACTIVÉ dans le serveur HTTP : les composants GPU (ScreenAnalyzer,
|
||
CLIP, FAISS) bloquent le GIL Python et rendent le serveur non-réactif.
|
||
Ces composants sont chargés uniquement par le worker séparé (run_worker.py).
|
||
Le serveur HTTP ne fait que stocker les screenshots et distribuer les replays.
|
||
"""
|
||
if self._initialized:
|
||
return
|
||
# Marquer comme initialisé SANS charger les composants GPU
|
||
self._initialized = True
|
||
logger.info("StreamProcessor initialisé en mode LÉGER (pas de GPU, pas de VLM)")
|
||
return
|
||
|
||
with self._lock:
|
||
if self._initialized:
|
||
return
|
||
|
||
logger.info("Initialisation des composants core (GPU)...")
|
||
|
||
try:
|
||
from core.pipeline.screen_analyzer import ScreenAnalyzer
|
||
self._screen_analyzer = ScreenAnalyzer(session_id="stream_server")
|
||
logger.info(" ScreenAnalyzer prêt")
|
||
except Exception as e:
|
||
logger.error(f" Erreur init ScreenAnalyzer: {e}")
|
||
self._screen_analyzer = None
|
||
|
||
try:
|
||
from core.embedding.clip_embedder import CLIPEmbedder
|
||
self._clip_embedder = CLIPEmbedder()
|
||
logger.info(" CLIPEmbedder prêt (singleton, ne sera plus rechargé)")
|
||
except Exception as e:
|
||
logger.error(f" Erreur init CLIPEmbedder: {e}")
|
||
self._clip_embedder = None
|
||
|
||
# P0-3 : Initialiser le StateEmbeddingBuilder pour unifier l'espace d'embedding
|
||
# Utilise le même CLIPEmbedder (pas de rechargement du modèle) + FusionEngine
|
||
# pour produire des vecteurs fusionnés (image+text+title+ui) identiques à GraphBuilder
|
||
try:
|
||
from core.embedding.state_embedding_builder import StateEmbeddingBuilder
|
||
if self._clip_embedder is not None:
|
||
# Injecter le CLIPEmbedder déjà chargé pour éviter un double chargement
|
||
self._state_embedding_builder = StateEmbeddingBuilder(
|
||
embedders={
|
||
"image": self._clip_embedder,
|
||
"text": self._clip_embedder,
|
||
"title": self._clip_embedder,
|
||
"ui": self._clip_embedder,
|
||
},
|
||
output_dir=self.data_dir / "embeddings",
|
||
use_clip=False, # Pas besoin, on fournit les embedders directement
|
||
)
|
||
else:
|
||
# Fallback : laisser le builder créer son propre CLIPEmbedder
|
||
self._state_embedding_builder = StateEmbeddingBuilder(
|
||
output_dir=self.data_dir / "embeddings",
|
||
use_clip=True,
|
||
)
|
||
logger.info(" StateEmbeddingBuilder prêt (fusion multi-modale unifiée)")
|
||
except Exception as e:
|
||
logger.warning(f" StateEmbeddingBuilder non disponible, fallback CLIP pur: {e}")
|
||
self._state_embedding_builder = None
|
||
|
||
try:
|
||
from core.embedding.faiss_manager import FAISSManager
|
||
self._faiss_manager = FAISSManager(
|
||
dimensions=512,
|
||
index_type="Flat",
|
||
metric="cosine",
|
||
)
|
||
logger.info(" FAISSManager prêt (512 dims, cosine)")
|
||
except Exception as e:
|
||
logger.error(f" Erreur init FAISSManager: {e}")
|
||
self._faiss_manager = None
|
||
|
||
self._initialized = True
|
||
logger.info("Composants core initialisés.")
|
||
|
||
# =========================================================================
|
||
# Événements
|
||
# =========================================================================
|
||
|
||
def process_event(self, session_id: str, event_data: Dict[str, Any]) -> Dict[str, Any]:
    """Record an incoming event into the live session.

    Parasitic events are dropped on arrival:
    - key_combo/key_press carrying only bare modifiers (ctrl, alt, shift, ...)
    - key_combo/key_press with an empty key list
    - text_input with empty text
    """
    if not _is_parasitic_event(event_data):
        self.session_manager.add_event(session_id, event_data)
        return {"status": "event_recorded", "session_id": session_id}

    # Trace what was dropped, for debugging.
    payload = event_data.get('keys', event_data.get('text', ''))
    logger.debug(
        f"Événement parasite filtré (session {session_id}): "
        f"type={event_data.get('type')}, data={payload}"
    )
    return {"status": "event_filtered", "session_id": session_id, "reason": "parasitic"}
|
||
|
||
# =========================================================================
|
||
# Screenshots
|
||
# =========================================================================
|
||
|
||
def process_screenshot(self, session_id: str, shot_id: str, file_path: str) -> Dict[str, Any]:
    """
    Analyze a full screenshot through the core pipeline.

    1. ScreenAnalyzer -> ScreenState (OCR, UI detection)
    2. StateEmbeddingBuilder -> fused 512-d vector (image+text+title+ui),
       the same embedding space as GraphBuilder (P0-3).
       Fallback: CLIP embed_image() when StateEmbeddingBuilder fails.
    3. FAISS indexing -> real-time matching against known workflows.

    Args:
        session_id: Live session identifier.
        shot_id: Screenshot identifier (e.g. "shot_0001_full").
        file_path: Path of the PNG file on disk.

    Returns:
        Dict with state_id, ui_elements_count, text_detected,
        embedding_indexed and an optional real-time match entry.
    """
    self._ensure_initialized()
    self.session_manager.add_screenshot(session_id, shot_id, file_path)

    result = {
        "shot_id": shot_id,
        "session_id": session_id,
        "state_id": None,
        "ui_elements_count": 0,
        "text_detected": 0,
        "embedding_indexed": False,
        "match": None,
    }

    # 1. Build the ScreenState.
    if self._screen_analyzer is None:
        logger.warning("ScreenAnalyzer non disponible, skip analyse")
        return result

    session = self.session_manager.get_session(session_id)
    # Prefer the per-shot window mapping when available (reprocessing path).
    shot_map = getattr(session, '_shot_window_map', None) if session else None
    if shot_map and shot_id in shot_map:
        window_info = shot_map[shot_id]
    else:
        window_info = session.last_window_info if session else {}

    try:
        screen_state = self._screen_analyzer.analyze(
            screenshot_path=file_path,
            window_info=window_info,
        )
        result["state_id"] = screen_state.screen_state_id
        result["ui_elements_count"] = len(screen_state.ui_elements)
        result["text_detected"] = len(
            getattr(screen_state.perception, "detected_text", [])
        )

        # Keep the ScreenState around for the final workflow build.
        with self._data_lock:
            if session_id not in self._screen_states:
                self._screen_states[session_id] = []
            self._screen_states[session_id].append(screen_state)

        logger.info(
            f"Screenshot analysé: {shot_id} | "
            f"{result['ui_elements_count']} UI elements, "
            f"{result['text_detected']} textes"
        )
    except Exception as e:
        logger.error(f"Erreur analyse screenshot {shot_id}: {e}")
        return result

    # 2. Build the fused embedding via StateEmbeddingBuilder (P0-3).
    # Same pipeline as GraphBuilder (image+text+title+ui fusion) so the
    # FAISS vectors live in the same embedding space.
    embedding_vector = None

    if self._state_embedding_builder is not None:
        try:
            state_embedding = self._state_embedding_builder.build(screen_state)
            # Pull the fused vector out of the StateEmbedding.
            fused_vec = state_embedding.get_vector()
            if fused_vec is not None:
                embedding_vector = fused_vec.astype(np.float32)
                logger.debug(
                    f"Embedding fusionné multi-modal calculé pour {shot_id} "
                    f"(dim={embedding_vector.shape[0]})"
                )
        except Exception as e:
            logger.warning(
                f"StateEmbeddingBuilder échoué pour {shot_id}: {e}, "
                f"fallback sur CLIP pur"
            )

    # Fallback: singleton CLIPEmbedder (image-only embedding).
    if embedding_vector is None and self._clip_embedder is not None:
        try:
            from PIL import Image
            pil_image = Image.open(file_path)
            embedding_vector = self._clip_embedder.embed_image(pil_image)
        except Exception as e:
            logger.debug(f"CLIP embedding échoué: {e}")

    if embedding_vector is not None:
        # Keep it for the final workflow build.
        with self._data_lock:
            if session_id not in self._embeddings:
                self._embeddings[session_id] = []
            self._embeddings[session_id].append(embedding_vector)

        # 3. Index in FAISS (guarded by _faiss_lock: IndexFlat.add is not thread-safe).
        if self._faiss_manager is not None:
            try:
                with self._faiss_lock:
                    self._faiss_manager.add_embedding(
                        embedding_id=screen_state.screen_state_id,
                        vector=embedding_vector,
                        metadata={
                            "session_id": session_id,
                            "shot_id": shot_id,
                            "window_title": window_info.get("title", ""),
                        },
                    )
                result["embedding_indexed"] = True
            except Exception as e:
                logger.error(f"Erreur FAISS indexation: {e}")

    # 4. Real-time matching against the known workflows.
    with self._data_lock:
        has_workflows = bool(self._workflows)
    if embedding_vector is not None and has_workflows:
        result["match"] = self._try_match(embedding_vector)

    return result
|
||
|
||
def process_crop(self, session_id: str, shot_id: str, file_path: str) -> Dict[str, Any]:
    """Store a 400x400 crop without running ScreenAnalyzer.

    A crop is only a fragment of the screen, not a full capture, so it
    is persisted as-is and never analyzed.
    """
    manager = self.session_manager
    manager.add_screenshot(session_id, shot_id, file_path)
    return {"shot_id": shot_id, "status": "crop_stored"}
|
||
|
||
# =========================================================================
|
||
# Finalisation
|
||
# =========================================================================
|
||
|
||
def finalize_session(self, session_id: str) -> Dict[str, Any]:
    """
    Build a Workflow from the data accumulated during streaming.

    Uses the core GraphBuilder with the ScreenStates and embeddings
    collected while the session was live.

    Args:
        session_id: Identifier of the session to finalize.

    Returns:
        Result dict: "workflow_built" with graph stats on success,
        "insufficient_data" when fewer than 2 states were collected,
        or an "error" entry on failure.
    """
    self._ensure_initialized()

    session = self.session_manager.finalize(session_id)
    if not session:
        return {"error": f"Session {session_id} non trouvée"}

    with self._data_lock:
        states = list(self._screen_states.get(session_id, []))
        embeddings = list(self._embeddings.get(session_id, []))

    # At least two states are needed to express a transition.
    if len(states) < 2:
        logger.warning(
            f"Session {session_id}: seulement {len(states)} states, "
            f"pas assez pour construire un workflow"
        )
        return {
            "session_id": session_id,
            "status": "insufficient_data",
            "states_count": len(states),
            "min_required": 2,
        }

    # Convert to a RawSession for the GraphBuilder.
    raw_dict = self.session_manager.to_raw_session(session_id)
    if not raw_dict:
        return {"error": "Conversion RawSession échouée"}

    try:
        from core.models.raw_session import RawSession
        raw_session = RawSession.from_dict(raw_dict)
    except Exception as e:
        logger.error(f"Erreur construction RawSession: {e}")
        # Fallback: build the RawSession manually.
        try:
            raw_session = self._build_raw_session_fallback(session, raw_dict)
        except Exception as e2:
            return {"error": f"Erreur RawSession: {e2}"}

    # Build the workflow via GraphBuilder.
    try:
        from core.graph.graph_builder import GraphBuilder

        # Scale the minimum pattern repetitions with the session size.
        n = len(states)
        min_reps = 1 if n < 6 else 2 if n <= 30 else min(3, n // 10)

        builder = GraphBuilder(
            embedding_builder=self._state_embedding_builder,  # reuse the same CLIP model
            min_pattern_repetitions=min_reps,
            clustering_eps=0.08,
            clustering_min_samples=2,
        )

        # Derive a meaningful workflow name from the window titles.
        workflow_name = self._generate_workflow_name(session_id)

        # Embeddings precomputed during streaming.
        with self._data_lock:
            precomputed_embs = list(self._embeddings.get(session_id, []))

        # Enrich the ScreenStates with event timestamps: required for
        # _find_transition_events() to associate user actions with the
        # right transitions.
        session_state = self.session_manager.get_session(session_id)
        shot_ts_map = getattr(session_state, '_shot_ts_map', {}) if session_state else {}
        if shot_ts_map:
            self._enrich_states_with_timestamps(states, shot_ts_map)

        # Sequential mode: for single-pass recordings, each screenshot
        # is a distinct workflow step. DBSCAN clustering would merge
        # similar screenshots (e.g. several Notepad views) into one
        # node and lose actions; sequential mode keeps every step.
        use_sequential = len(raw_session.events) > 0

        # Inject the precomputed ScreenStates/embeddings to avoid
        # re-analyzing and re-embedding (triple computation otherwise).
        workflow = builder.build_from_session(
            raw_session,
            workflow_name=workflow_name,
            precomputed_states=states,
            precomputed_embeddings=precomputed_embs if len(precomputed_embs) == len(states) else None,
            sequential=use_sequential,
        )

        with self._data_lock:
            self._workflows[workflow.workflow_id] = workflow

        # Persist to disk (under the source machine's folder).
        machine_id = session.machine_id if hasattr(session, 'machine_id') else "default"
        saved_path = self._persist_workflow(workflow, session_id, machine_id=machine_id)
        # Remember the machine_id on the workflow for later filtering.
        workflow._machine_id = machine_id

        # Collect the session's application metadata.
        session_state = self.session_manager.get_session(session_id)
        app_context = {}
        if session_state:
            app_context = {
                "window_titles": dict(session_state.window_titles_seen),
                "app_names": dict(session_state.app_names_seen),
                "primary_app": sorted(
                    session_state.app_names_seen.items(),
                    key=lambda x: -x[1]
                )[0][0] if session_state.app_names_seen else None,
                "multi_app": len(session_state.app_names_seen) >= 3,
            }

        result = {
            "session_id": session_id,
            "machine_id": machine_id,
            "status": "workflow_built",
            "workflow_id": workflow.workflow_id,
            "workflow_name": workflow_name,
            "nodes": len(workflow.nodes),
            "edges": len(workflow.edges),
            "states_analyzed": len(states),
            "embeddings_indexed": len(embeddings),
            "saved_path": str(saved_path) if saved_path else None,
            "app_context": app_context,
        }

        logger.info(
            f"Workflow construit: '{workflow_name}' ({workflow.workflow_id}) | "
            f"{result['nodes']} nodes, {result['edges']} edges"
            + (f" | apps: {list(app_context.get('app_names', {}).keys())}" if app_context.get('app_names') else "")
        )

        # Free the session data (can be heavy in memory).
        self._cleanup_session_data(session_id)

        return result

    except Exception as e:
        logger.error(f"Erreur construction workflow: {e}")
        return {"error": f"GraphBuilder: {e}", "session_id": session_id}
|
||
|
||
# =========================================================================
|
||
# Matching
|
||
# =========================================================================
|
||
|
||
def _try_match(self, embedding_vector: np.ndarray) -> Optional[Dict[str, Any]]:
|
||
"""Matcher un embedding contre les workflows connus."""
|
||
if self._faiss_manager is None or self._faiss_manager.index.ntotal == 0:
|
||
return None
|
||
|
||
try:
|
||
results = self._faiss_manager.search_similar(
|
||
query_vector=embedding_vector,
|
||
k=1,
|
||
min_similarity=0.85,
|
||
)
|
||
if results:
|
||
best = results[0]
|
||
return {
|
||
"matched_id": best.embedding_id,
|
||
"similarity": round(best.similarity, 4),
|
||
"metadata": best.metadata,
|
||
}
|
||
except Exception as e:
|
||
logger.debug(f"Erreur matching: {e}")
|
||
|
||
return None
|
||
|
||
# =========================================================================
|
||
# Enrichissement VLM des workflows (target_spec sur chaque edge)
|
||
# =========================================================================
|
||
|
||
def _enrich_workflow_targets(
    self,
    workflow,
    session_dir: Path,
) -> int:
    """Enrich the target_spec of a workflow's edges with VLM data.

    For every edge whose action is a click (mouse_click or a compound
    action containing a mouse_click step), this method:
    1. Finds the matching screenshot (from_node -> shot_XXXX_full.png)
    2. Extracts the click position from action.parameters.position
    3. Calls enrich_click_from_screenshot() to obtain:
       - anchor_image_base64 (80x80 crop)
       - by_text (OCR text of the element)
       - by_role (element type)
       - vlm_description (positional description)
       - som_element (SomEngine detection)
    4. Updates the edge's TargetSpec and stores the extra data in
       context_hints and edge.metadata

    Args:
        workflow: Workflow object whose edges are to be enriched.
        session_dir: Session directory (contains shots/).

    Returns:
        Number of edges that were enriched.
    """
    shots_dir = session_dir / "shots"
    if not shots_dir.is_dir():
        logger.warning(
            "enrich_workflow_targets: dossier shots/ introuvable dans %s",
            session_dir,
        )
        return 0

    # -- Screen resolution from the session events (or fallback) --
    screen_w, screen_h = self._get_screen_resolution_from_session(session_dir)

    # -- Mapping node_id -> screenshot path --
    # In sequential mode node_000 maps to the first screenshot, node_001
    # to the second, etc. Rebuilt here from the sorted shot_XXXX_full.png.
    all_shots = sorted(shots_dir.glob("shot_*_full.png"))
    node_to_shot: Dict[str, Path] = {}
    for i, shot_path in enumerate(all_shots):
        node_id = f"node_{i:03d}"
        node_to_shot[node_id] = shot_path

    # -- Enrich each edge --
    enriched_count = 0

    for edge in workflow.edges:
        # The click happens on the source node's screenshot.
        shot_path = node_to_shot.get(edge.from_node)
        if not shot_path or not shot_path.is_file():
            logger.debug(
                "enrich: pas de screenshot pour node %s (edge %s)",
                edge.from_node, edge.edge_id,
            )
            continue

        # Derive the screenshot_id from the file name:
        # shot_0001_full.png -> shot_0001
        screenshot_id = shot_path.stem.replace("_full", "")

        # -- Click position --
        click_positions = self._extract_click_positions(edge)
        if not click_positions:
            continue

        # Enrich the first click (main action).
        click_x, click_y = click_positions[0]

        # Window title from the source node's template when available.
        window_title = ""
        source_node = workflow.get_node(edge.from_node)
        if source_node and hasattr(source_node, 'template'):
            tpl = source_node.template
        # WindowConstraint inside the template
            if hasattr(tpl, 'window') and tpl.window:
                if tpl.window.title_pattern:
                    window_title = tpl.window.title_pattern
                elif tpl.window.title_contains:
                    window_title = tpl.window.title_contains
                elif tpl.window.process_name:
                    window_title = tpl.window.process_name
        # Fallback: constraints carried by the edge itself.
        if not window_title and edge.constraints:
            window_title = (
                edge.constraints.required_window_title
                or edge.constraints.required_app_name
                or ""
            )

        # -- Shared enrichment helper --
        enrichment = enrich_click_from_screenshot(
            screenshot_path=shot_path,
            click_x=int(click_x),
            click_y=int(click_y),
            screen_w=screen_w,
            screen_h=screen_h,
            window_title=window_title,
            session_dir=session_dir,
            screenshot_id=screenshot_id,
        )

        if not enrichment:
            continue

        # -- Update the edge's TargetSpec --
        target = edge.action.target

        # First-class TargetSpec fields
        if enrichment.get("by_text"):
            target.by_text = enrichment["by_text"]
        if enrichment.get("by_role"):
            target.by_role = enrichment["by_role"]
        if enrichment.get("by_position"):
            target.by_position = tuple(enrichment["by_position"])

        # Extra fields go into context_hints
        if not target.context_hints:
            target.context_hints = {}
        if enrichment.get("vlm_description"):
            target.context_hints["vlm_description"] = enrichment["vlm_description"]
        if enrichment.get("window_title"):
            target.context_hints["window_title"] = enrichment["window_title"]
        if enrichment.get("original_position"):
            target.context_hints["original_position"] = enrichment["original_position"]
        if enrichment.get("anchor_image_base64"):
            target.context_hints["anchor_image_base64"] = enrichment["anchor_image_base64"]
        if enrichment.get("by_text_source"):
            target.context_hints["by_text_source"] = enrichment["by_text_source"]
        if enrichment.get("som_element"):
            target.context_hints["som_element"] = enrichment["som_element"]

        # Mark the edge as enriched in its metadata.
        edge.metadata["vlm_enriched"] = True
        edge.metadata["enrichment_source"] = "reprocess_session"

        enriched_count += 1
        logger.debug(
            "Edge %s enrichi : by_text='%s', by_role='%s', anchor=%s",
            edge.edge_id,
            enrichment.get("by_text", ""),
            enrichment.get("by_role", ""),
            "oui" if enrichment.get("anchor_image_base64") else "non",
        )

    logger.info(
        "Enrichissement VLM terminé : %d/%d edges enrichis pour workflow '%s'",
        enriched_count, len(workflow.edges), workflow.name,
    )
    return enriched_count
|
||
|
||
def _extract_click_positions(self, edge) -> List[tuple]:
|
||
"""Extraire les positions de clic depuis un edge du workflow.
|
||
|
||
Supporte les actions simples (mouse_click) et compound (steps).
|
||
|
||
Returns:
|
||
Liste de tuples (x, y) en pixels. Peut être vide si pas de clic.
|
||
"""
|
||
action = edge.action
|
||
positions = []
|
||
|
||
if action.type == "mouse_click":
|
||
pos = action.parameters.get("position", [])
|
||
if pos and len(pos) == 2:
|
||
positions.append((pos[0], pos[1]))
|
||
|
||
elif action.type == "compound":
|
||
# Chercher les steps de type mouse_click
|
||
for step in action.parameters.get("steps", []):
|
||
if step.get("type") == "mouse_click":
|
||
pos = step.get("position", [])
|
||
if pos and len(pos) == 2:
|
||
positions.append((pos[0], pos[1]))
|
||
|
||
return positions
|
||
|
||
def _get_screen_resolution_from_session(
|
||
self, session_dir: Path,
|
||
) -> tuple:
|
||
"""Extraire la résolution d'écran depuis les événements d'une session.
|
||
|
||
Lit live_events.jsonl et cherche screen_metadata.screen_resolution
|
||
ou infère depuis les positions des clics.
|
||
|
||
Returns:
|
||
Tuple (width, height). Fallback: (1920, 1080).
|
||
"""
|
||
import json as _json
|
||
|
||
events_file = session_dir / "live_events.jsonl"
|
||
if not events_file.exists():
|
||
return (1920, 1080)
|
||
|
||
events = []
|
||
try:
|
||
for line in events_file.read_text().splitlines():
|
||
if not line.strip():
|
||
continue
|
||
try:
|
||
events.append(_json.loads(line))
|
||
except _json.JSONDecodeError:
|
||
continue
|
||
except Exception:
|
||
return (1920, 1080)
|
||
|
||
if events:
|
||
return _extract_screen_resolution(events)
|
||
return (1920, 1080)
|
||
|
||
# =========================================================================
|
||
# Retraitement (appelé par le SessionWorker)
|
||
# =========================================================================
|
||
|
||
def reprocess_session(
    self,
    session_id: str,
    progress_callback=None,
) -> Dict[str, Any]:
    """Reprocess a finalized session: analyze all screenshots, then build the workflow.

    Used by the SessionWorker to handle sessions in the background.
    Finds the shot_*_full.png files on disk, analyzes them one by one
    through process_screenshot(), then calls finalize_session() to
    build the workflow.

    Args:
        session_id: Identifier of the session to reprocess.
        progress_callback: Callable(session_id, current, total, shot_id) reporting progress.

    Returns:
        The finalize_session() result dict, or an error dict.
    """
    logger.info(f"Retraitement de la session {session_id}")

    # Locate the session folder on disk. Screenshots may live in:
    # - data/training/live_sessions/{session_id}/shots/
    # - data/training/live_sessions/{machine_id}/{session_id}/shots/
    session_dir = self._find_session_dir(session_id)
    if not session_dir:
        return {"error": f"Dossier session {session_id} introuvable sur disque"}

    shots_dir = session_dir / "shots"
    if not shots_dir.exists():
        return {"error": f"Dossier shots/ introuvable pour {session_id}"}

    # List the full screenshots (shot_XXXX_full.png), sorted by name.
    all_shots = sorted(shots_dir.glob("shot_*_full.png"))
    if not all_shots:
        return {
            "error": f"Aucun screenshot shot_*_full.png trouvé dans {shots_dir}",
            "session_id": session_id,
        }

    # Smart selection: keep only meaningful screenshots so we do not
    # analyze near-identical, redundant captures.
    key_shots = self._select_key_screenshots(session_id, all_shots)
    total_all = len(all_shots)
    total = len(key_shots)
    logger.info(
        f"Screenshots sélectionnés : {total}/{total_all} "
        f"(déduplication perceptuelle) dans {shots_dir}"
    )

    # Make sure the session is registered in the session_manager.
    self.session_manager.get_or_create(session_id)

    # Restore window_info from live_events.jsonl so ScreenAnalyzer
    # creates ScreenStates carrying the right window titles.
    self._restore_window_events(session_id, session_dir)

    # Restore user events (mouse_click, text_input, key_press) from
    # live_events.jsonl into session.events so that to_raw_session()
    # can hand them to the GraphBuilder (edge/action construction).
    self._restore_user_events(session_id, session_dir)

    # Clear in-memory data (in case a previous run failed mid-way).
    with self._data_lock:
        self._screen_states.pop(session_id, None)
        self._embeddings.pop(session_id, None)

    # Analyze screenshots in parallel (max 2 workers for VRAM reasons).
    # Each process_screenshot() is independent: ScreenAnalyzer + CLIP + FAISS.
    # Shared structures (_screen_states, _embeddings) are guarded by _data_lock.
    max_parallel = min(2, total)
    errors = 0
    processed_count = 0

    def _analyze_one(shot_file, index=0):
        """Analyze one screenshot; returns (shot_id, result, error).

        Checks before each analysis whether a replay is in progress
        (GPU flag). If so, waits for the replay to finish before
        launching the VLM.
        """
        shot_id = shot_file.stem
        # Suspend while a replay is in progress (frees the GPU).
        self._wait_if_replay_active(
            context=f"screenshot {index + 1}/{total} ({shot_id})"
        )
        try:
            res = self.process_screenshot(session_id, shot_id, str(shot_file))
            return (shot_id, res, None)
        except Exception as e:
            return (shot_id, None, str(e))

    if total <= 1:
        # A single screenshot — no need for a pool.
        for shot_file in key_shots:
            shot_id, res, err = _analyze_one(shot_file, index=0)
            processed_count += 1
            if progress_callback:
                try:
                    progress_callback(session_id, processed_count, total, shot_id)
                except Exception:
                    pass
            if err:
                logger.error(f"Erreur analyse screenshot {shot_id}: {err}")
                errors += 1
            elif res and res.get("state_id") is None:
                logger.warning(f"Screenshot {shot_id} : analyse échouée (pas de state_id)")
                errors += 1
    else:
        # Parallel processing — 2 screenshots at a time.
        # Note: _analyze_one checks the replay flag before each VLM call,
        # so workers automatically pause while a replay is running.
        logger.info(f"Traitement parallèle : {max_parallel} workers pour {total} screenshots")
        with ThreadPoolExecutor(max_workers=max_parallel, thread_name_prefix="vlm") as pool:
            futures = {pool.submit(_analyze_one, sf, i): sf for i, sf in enumerate(key_shots)}
            for future in as_completed(futures):
                shot_id, res, err = future.result()
                processed_count += 1
                if progress_callback:
                    try:
                        progress_callback(session_id, processed_count, total, shot_id)
                    except Exception:
                        pass
                if err:
                    logger.error(f"Erreur analyse screenshot {shot_id}: {err}")
                    errors += 1
                elif res and res.get("state_id") is None:
                    logger.warning(f"Screenshot {shot_id} : analyse échouée (pas de state_id)")
                    errors += 1

    # Count how many states were actually produced.
    with self._data_lock:
        states_count = len(self._screen_states.get(session_id, []))

    logger.info(
        f"Session {session_id} : {states_count}/{total} screenshots analysés "
        f"({errors} erreurs, {total_all - total} skippés par dédup)"
    )

    # Build the workflow via finalize_session().
    # Note: the session_manager's finalize() already ran when the session
    # was marked finalized; no need to call it again.
    # finalize_session() consumes the accumulated screen_states.
    result = self.finalize_session(session_id)

    # -- VLM enrichment of the workflow's target_specs --
    # After building the workflow, enrich each edge with the visual
    # data (by_text, by_role, vlm_description, anchor_image) extracted
    # from the screenshots via SomEngine.
    if result.get("status") == "workflow_built" and result.get("workflow_id"):
        workflow_id = result["workflow_id"]
        with self._data_lock:
            workflow = self._workflows.get(workflow_id)
        if workflow and session_dir:
            try:
                enriched_count = self._enrich_workflow_targets(workflow, session_dir)
                result["enriched_edges"] = enriched_count
                result["total_edges"] = len(workflow.edges)

                # Re-save the enriched workflow to disk.
                if enriched_count > 0 and result.get("saved_path"):
                    saved_path = Path(result["saved_path"])
                    workflow.save_to_file(saved_path)
                    logger.info(
                        "Workflow enrichi re-sauvegardé : %s "
                        "(%d/%d edges enrichis)",
                        saved_path, enriched_count, len(workflow.edges),
                    )
            except Exception as e:
                logger.error(
                    "Erreur enrichissement VLM du workflow %s : %s",
                    workflow_id, e,
                )
                result["enrichment_error"] = str(e)

    return result
|
||
|
||
def _select_key_screenshots(self, session_id: str, shot_paths: List[Path]) -> List[Path]:
|
||
"""Sélectionner uniquement les screenshots significatifs pour éviter les analyses redondantes.
|
||
|
||
Critères :
|
||
1. Les screenshots d'action (shot_*_full) sont TOUJOURS conservés
|
||
car chacun correspond à une action utilisateur et est nécessaire
|
||
pour le mode séquentiel du GraphBuilder.
|
||
2. Pour les heartbeats ou autres, comparer au précédent via hash perceptuel.
|
||
3. Garder le premier et le dernier screenshot (toujours).
|
||
"""
|
||
if len(shot_paths) <= 2:
|
||
return list(shot_paths)
|
||
|
||
from PIL import Image
|
||
|
||
selected = []
|
||
last_hash = None
|
||
|
||
for path in shot_paths:
|
||
basename = os.path.basename(str(path))
|
||
|
||
# Les screenshots d'action (shot_*_full) sont systématiquement gardés.
|
||
# Chacun correspond à un clic ou une saisie clavier et constitue
|
||
# une étape distincte du workflow — ne pas les dédupliquer.
|
||
is_action = 'shot_' in basename and '_full' in basename
|
||
if is_action:
|
||
selected.append(path)
|
||
# Mettre à jour le hash pour la comparaison des heartbeats suivants
|
||
try:
|
||
img = Image.open(str(path)).resize((32, 32)).convert('L')
|
||
last_hash = hashlib.md5(img.tobytes()).hexdigest()
|
||
except Exception:
|
||
pass
|
||
continue
|
||
|
||
# Hash perceptuel pour les non-actions (heartbeats, etc.)
|
||
try:
|
||
img = Image.open(str(path)).resize((32, 32)).convert('L')
|
||
current_hash = hashlib.md5(img.tobytes()).hexdigest()
|
||
except Exception as e:
|
||
logger.debug(f"Impossible de hasher {basename}: {e}")
|
||
selected.append(path)
|
||
continue
|
||
|
||
if last_hash is None or current_hash != last_hash:
|
||
selected.append(path)
|
||
last_hash = current_hash
|
||
|
||
# Garantir que le premier et le dernier sont toujours inclus
|
||
if shot_paths[0] not in selected:
|
||
selected.insert(0, shot_paths[0])
|
||
if shot_paths[-1] not in selected:
|
||
selected.append(shot_paths[-1])
|
||
|
||
return selected
|
||
|
||
def _restore_window_events(self, session_id: str, session_dir: Path):
    """Restore window_info from live_events.jsonl during reprocessing.

    Builds a chronological mapping timestamp -> window_info so that
    each screenshot is associated with the right window title. Stored
    on the session as session._shot_window_map : {shot_stem -> window_info}.

    The map is consumed by process_screenshot(); last_window_info is
    also updated as a fallback for shots taken after the last focus
    change.

    Fix: removed the unused local ``import re`` present in the
    original body.
    """
    import json as _json

    events_file = session_dir / "live_events.jsonl"
    if not events_file.exists():
        logger.debug(f"Pas de live_events.jsonl pour {session_id}")
        return

    try:
        # Collect every window_focus_change together with its timestamp.
        window_changes = []  # [(timestamp, {"title": ..., "app_name": ...})]

        for line in events_file.read_text().splitlines():
            if not line.strip():
                continue
            try:
                evt = _json.loads(line)
            except _json.JSONDecodeError:
                continue

            # Events may be wrapped in an envelope: {"event": {...}}.
            event_data = evt.get("event", evt)
            evt_type = event_data.get("type", "")
            ts = float(event_data.get("timestamp", evt.get("timestamp", 0)))

            if evt_type == "window_focus_change":
                to_info = event_data.get("to") or event_data.get("window") or {}
                title = to_info.get("title", "")
                if title:
                    window_changes.append((ts, {
                        "title": title,
                        "app_name": to_info.get("app_name", "unknown"),
                    }))

        if not window_changes:
            logger.debug(f"Pas de window_focus_change dans {session_id}")
            return

        window_changes.sort(key=lambda x: x[0])

        # Map each shot_XXXX to a window_info by timestamp.
        shots_dir = session_dir / "shots"
        shot_files = sorted(shots_dir.glob("shot_*_full.png"))

        # The shot timestamp is taken from the file's mtime.
        shot_window_map = {}
        for shot_file in shot_files:
            shot_ts = shot_file.stat().st_mtime
            # Pick the most recent focus change that happened before this shot.
            best_window = window_changes[0][1]  # default: first known window
            for wc_ts, wc_info in window_changes:
                if wc_ts <= shot_ts:
                    best_window = wc_info
                else:
                    break
            shot_window_map[shot_file.stem] = best_window

        # Attach to the session so process_screenshot() can use it.
        session = self.session_manager.get_session(session_id)
        if session:
            session._shot_window_map = shot_window_map
            # Last known title acts as a fallback.
            session.last_window_info = window_changes[-1][1]

        titles_seen = set(w.get("title", "") for w in shot_window_map.values())
        logger.info(
            f"Window events restaurés pour {session_id}: "
            f"{len(shot_window_map)} shots mappés, "
            f"{len(titles_seen)} titres uniques: {titles_seen}"
        )

    except Exception as e:
        logger.warning(f"Erreur restauration window events pour {session_id}: {e}")
|
||
|
||
def _restore_user_events(self, session_id: str, session_dir: Path):
|
||
"""Restaurer les événements utilisateur depuis live_events.jsonl.
|
||
|
||
Charge les événements d'action (mouse_click, text_input, key_press)
|
||
dans session.events via session_manager.add_event().
|
||
Sans cela, to_raw_session() retourne une liste d'events vide,
|
||
et le GraphBuilder ne peut pas construire les actions des edges.
|
||
|
||
Construit aussi un mapping shot_id → timestamp pour enrichir les
|
||
ScreenStates avec le bon event_time (nécessaire pour
|
||
_find_transition_events).
|
||
|
||
Note : vide d'abord les events existants de la session pour
|
||
éviter les doublons (la session peut avoir été restaurée depuis
|
||
un fichier de persistance au démarrage).
|
||
"""
|
||
import json as _json
|
||
|
||
events_file = session_dir / "live_events.jsonl"
|
||
if not events_file.exists():
|
||
logger.debug(f"Pas de live_events.jsonl pour {session_id}")
|
||
return
|
||
|
||
try:
|
||
# Vider les events existants pour éviter les doublons
|
||
# (la session peut avoir été pré-chargée depuis la persistance)
|
||
session = self.session_manager.get_session(session_id)
|
||
if session and session.events:
|
||
logger.debug(
|
||
f"Nettoyage de {len(session.events)} events "
|
||
f"pré-existants pour {session_id}"
|
||
)
|
||
session.events.clear()
|
||
|
||
action_count = 0
|
||
shot_ts_map = {} # shot_id → timestamp de l'événement d'action
|
||
|
||
for line in events_file.read_text().splitlines():
|
||
if not line.strip():
|
||
continue
|
||
try:
|
||
raw = _json.loads(line)
|
||
except _json.JSONDecodeError:
|
||
continue
|
||
|
||
event_data = raw.get("event", raw)
|
||
evt_type = event_data.get("type", "")
|
||
ts = float(event_data.get("timestamp", raw.get("timestamp", 0)))
|
||
|
||
if evt_type not in ("mouse_click", "text_input", "key_press"):
|
||
continue
|
||
|
||
# Construire le dict d'événement pour add_event()
|
||
evt_dict = {
|
||
"type": evt_type,
|
||
"timestamp": ts,
|
||
}
|
||
|
||
# Copier les données spécifiques au type
|
||
if evt_type == "mouse_click":
|
||
evt_dict["pos"] = event_data.get("pos", [0, 0])
|
||
evt_dict["button"] = event_data.get("button", "left")
|
||
elif evt_type == "text_input":
|
||
evt_dict["text"] = event_data.get("text", "")
|
||
elif evt_type == "key_press":
|
||
evt_dict["keys"] = event_data.get("keys", [])
|
||
|
||
# Copier window info si disponible
|
||
window = event_data.get("window")
|
||
if window:
|
||
evt_dict["window"] = window
|
||
|
||
# Copier screenshot_id si disponible
|
||
shot_id = event_data.get("screenshot_id")
|
||
if shot_id:
|
||
evt_dict["screenshot_id"] = shot_id
|
||
# Mapper shot_id → timestamp pour enrichir les ScreenStates
|
||
shot_ts_map[shot_id] = ts
|
||
|
||
# Copier screen_metadata si disponible
|
||
screen_meta = event_data.get("screen_metadata")
|
||
if screen_meta:
|
||
# Propager la résolution pour to_raw_session()
|
||
if "screen_resolution" in screen_meta:
|
||
evt_dict["screen_resolution"] = screen_meta["screen_resolution"]
|
||
|
||
self.session_manager.add_event(session_id, evt_dict)
|
||
action_count += 1
|
||
|
||
# Stocker le mapping shot → timestamp dans la session
|
||
# pour enrichir les ScreenStates dans finalize_session()
|
||
session = self.session_manager.get_session(session_id)
|
||
if session:
|
||
session._shot_ts_map = shot_ts_map
|
||
|
||
logger.info(
|
||
f"User events restaurés pour {session_id}: "
|
||
f"{action_count} actions chargées, "
|
||
f"{len(shot_ts_map)} shots avec timestamp"
|
||
)
|
||
|
||
except Exception as e:
|
||
logger.warning(f"Erreur restauration user events pour {session_id}: {e}")
|
||
|
||
def _find_session_dir(self, session_id: str) -> Optional[Path]:
|
||
"""Trouver le dossier d'une session sur disque.
|
||
|
||
Cherche dans (par ordre de priorité) :
|
||
1. data/training/{session_id}/
|
||
2. data/training/{subdir}/{session_id}/ (ex: live_sessions)
|
||
3. data/training/{subdir}/{machine_id}/{session_id}/ (multi-machine)
|
||
"""
|
||
# Chemin direct
|
||
direct = self.data_dir / session_id
|
||
if direct.is_dir() and (direct / "shots").exists():
|
||
return direct
|
||
|
||
# Chercher dans les sous-dossiers (1 niveau : live_sessions/{session_id})
|
||
parent = self.data_dir
|
||
if parent.exists():
|
||
for subdir in parent.iterdir():
|
||
if subdir.is_dir():
|
||
candidate = subdir / session_id
|
||
if candidate.is_dir() and (candidate / "shots").exists():
|
||
return candidate
|
||
# Chercher 1 niveau plus profond (live_sessions/{machine_id}/{session_id})
|
||
if subdir.is_dir():
|
||
for machine_dir in subdir.iterdir():
|
||
if machine_dir.is_dir():
|
||
candidate = machine_dir / session_id
|
||
if candidate.is_dir() and (candidate / "shots").exists():
|
||
return candidate
|
||
|
||
# Chercher aussi dans le parent du data_dir (cas où data_dir = streaming_sessions)
|
||
parent_parent = self.data_dir.parent
|
||
if parent_parent.exists() and parent_parent != self.data_dir:
|
||
direct2 = parent_parent / session_id
|
||
if direct2.is_dir() and (direct2 / "shots").exists():
|
||
return direct2
|
||
for subdir in parent_parent.iterdir():
|
||
if subdir.is_dir() and subdir.name != self.data_dir.name:
|
||
candidate = subdir / session_id
|
||
if candidate.is_dir() and (candidate / "shots").exists():
|
||
return candidate
|
||
|
||
return None
|
||
|
||
def find_pending_sessions(self) -> List[str]:
    """Return the finalized sessions that have not been processed yet.

    A session is "pending" when:
    - it is marked finalized in the session_manager,
    - it has 0 ScreenStates in memory (never analyzed, or analysis lost),
    - no workflow exists for it (neither in memory nor on disk),
    - full screenshots are present on disk.

    Returns:
        List of session_ids to process.
    """
    pending: List[str] = []
    for sid in self.session_manager.session_ids:
        session = self.session_manager.get_session(sid)
        if session is None or not session.finalized:
            continue

        # Skip sessions that already have in-memory states.
        with self._data_lock:
            if self._screen_states.get(sid, []):
                continue

        # Skip sessions that already produced a workflow.
        # 1. In-memory check (_source_session attribute on workflows).
        with self._data_lock:
            already_built = any(
                getattr(wf, '_source_session', None) == sid
                for wf in self._workflows.values()
            )
        if already_built:
            continue

        # 2. On-disk check (metadata.source_session_id in the JSON files).
        if self._workflow_exists_on_disk(sid):
            continue

        # Require full screenshots on disk.
        session_dir = self._find_session_dir(sid)
        if not session_dir:
            continue
        shots_dir = session_dir / "shots"
        if not shots_dir.exists():
            continue
        full_shots = list(shots_dir.glob("shot_*_full.png"))
        if full_shots:
            logger.info(
                f"Session pending trouvée : {sid} "
                f"({len(full_shots)} screenshots full)"
            )
            pending.append(sid)

    return pending
|
||
|
||
def _workflow_exists_on_disk(self, session_id: str) -> bool:
|
||
"""Vérifier si un workflow a déjà été produit pour cette session.
|
||
|
||
Parcourt les fichiers JSON dans data/training/workflows/ et cherche
|
||
source_session_id dans les métadonnées.
|
||
Utilise un cache pour éviter de re-lire les fichiers à chaque appel.
|
||
"""
|
||
import json as _json
|
||
|
||
if not hasattr(self, '_processed_sessions_cache'):
|
||
# Construire le cache au premier appel
|
||
self._processed_sessions_cache = set()
|
||
workflows_dir = self.data_dir / "workflows"
|
||
if workflows_dir.exists():
|
||
for wf_file in workflows_dir.rglob("*.json"):
|
||
try:
|
||
with open(wf_file, 'r') as f:
|
||
data = _json.load(f)
|
||
src = data.get('metadata', {}).get('source_session_id', '')
|
||
if src:
|
||
self._processed_sessions_cache.add(src)
|
||
except Exception:
|
||
continue
|
||
logger.info(
|
||
f"Cache sessions traitées : {len(self._processed_sessions_cache)} workflows existants"
|
||
)
|
||
|
||
return session_id in self._processed_sessions_cache
|
||
|
||
def _enrich_states_with_timestamps(
|
||
self,
|
||
states: List,
|
||
shot_ts_map: Dict[str, float],
|
||
):
|
||
"""Enrichir les ScreenStates avec les timestamps des événements d'action.
|
||
|
||
Le GraphBuilder utilise metadata['shot_timestamp'] (ou 'event_time')
|
||
pour associer les événements utilisateur aux transitions entre states.
|
||
Sans cette info, _find_transition_events() ne sait pas quels événements
|
||
appartiennent à quelle transition.
|
||
|
||
On fait un mapping par nom de shot :
|
||
- shot_XXXX → ScreenState dont le screen_state_id contient "XXXX"
|
||
ou par index séquentiel (shot_0001 → state[0], etc.)
|
||
|
||
Args:
|
||
states: Liste de ScreenStates à enrichir (modifiés in-place)
|
||
shot_ts_map: Mapping {shot_id: timestamp_epoch}
|
||
"""
|
||
if not shot_ts_map:
|
||
return
|
||
|
||
import re
|
||
|
||
# Trier les shot_ids par numéro pour correspondre à l'ordre des states
|
||
sorted_shots = sorted(
|
||
shot_ts_map.items(),
|
||
key=lambda x: x[0],
|
||
)
|
||
|
||
# Stratégie 1 : mapping par index séquentiel
|
||
# shot_0001 → state[0], shot_0002 → state[1], etc.
|
||
enriched = 0
|
||
for i, (shot_id, ts) in enumerate(sorted_shots):
|
||
if i < len(states):
|
||
state = states[i]
|
||
if state.metadata is None:
|
||
state.metadata = {}
|
||
state.metadata["shot_timestamp"] = ts
|
||
state.metadata["shot_id"] = shot_id
|
||
enriched += 1
|
||
|
||
# Pour les states restants (sans shot correspondant),
|
||
# interpoler entre les timestamps voisins
|
||
for i, state in enumerate(states):
|
||
if state.metadata and state.metadata.get("shot_timestamp"):
|
||
continue
|
||
if state.metadata is None:
|
||
state.metadata = {}
|
||
# Chercher les timestamps voisins
|
||
prev_ts = 0
|
||
next_ts = float('inf')
|
||
for j in range(i - 1, -1, -1):
|
||
t = states[j].metadata.get("shot_timestamp") if states[j].metadata else None
|
||
if t:
|
||
prev_ts = t
|
||
break
|
||
for j in range(i + 1, len(states)):
|
||
t = states[j].metadata.get("shot_timestamp") if states[j].metadata else None
|
||
if t:
|
||
next_ts = t
|
||
break
|
||
if prev_ts > 0 and next_ts < float('inf'):
|
||
state.metadata["shot_timestamp"] = (prev_ts + next_ts) / 2
|
||
elif prev_ts > 0:
|
||
state.metadata["shot_timestamp"] = prev_ts + 1.0
|
||
elif next_ts < float('inf'):
|
||
state.metadata["shot_timestamp"] = next_ts - 1.0
|
||
|
||
logger.debug(
|
||
f"Timestamps enrichis: {enriched}/{len(states)} states "
|
||
f"depuis {len(shot_ts_map)} shots"
|
||
)
|
||
|
||
def _cleanup_session_data(self, session_id: str):
|
||
"""Libérer la mémoire des ScreenStates et embeddings après finalization."""
|
||
with self._data_lock:
|
||
states = self._screen_states.pop(session_id, [])
|
||
embeddings = self._embeddings.pop(session_id, [])
|
||
logger.info(
|
||
f"Mémoire libérée pour {session_id}: "
|
||
f"{len(states)} states, {len(embeddings)} embeddings"
|
||
)
|
||
|
||
# =========================================================================
|
||
# Helpers
|
||
# =========================================================================
|
||
|
||
def _generate_workflow_name(self, session_id: str) -> str:
    """
    Generate a human-readable task name from the window titles.

    Analyses the titles seen during the session to extract:
    - The main application (the most frequent one)
    - The document context (the part after the dash in the title)
    - An action description inferred from that context

    Example results:
        "Chrome - Facturation DPI" → "Chrome — Facturation DPI"
        "Excel - Budget_2026.xlsx" → "Excel — Budget 2026"
        3 apps → "Chrome, Excel et Word"
        No context → "Tâche du 17 mars à 14h"
    """
    import re

    session = self.session_manager.get_session(session_id)
    if not session:
        return self._fallback_task_name()

    # Frequency counters gathered during the session.
    titles = session.window_titles_seen
    apps = session.app_names_seen

    if not titles and not apps:
        return self._fallback_task_name()

    # Sort by decreasing frequency
    sorted_titles = sorted(titles.items(), key=lambda x: -x[1])
    sorted_apps = sorted(apps.items(), key=lambda x: -x[1])

    # Most frequent title / app drive the name
    primary_title = sorted_titles[0][0] if sorted_titles else ""
    primary_app = sorted_apps[0][0] if sorted_apps else ""

    # Clean the application name for human display
    app_display = self._humanize_app_name(primary_app) if primary_app else ""

    # Extract the contextual part of the title (around the separator)
    context_part = ""
    for sep in [" - ", " — ", " – ", " | ", ": "]:
        if sep in primary_title:
            parts = primary_title.split(sep)
            if len(parts) >= 2:
                candidates = [p.strip() for p in parts]
                app_lower = primary_app.lower()
                # Prefer a part that does not merely repeat the app name
                context_candidates = [
                    c for c in candidates
                    if app_lower not in c.lower()
                    and c.lower() not in app_lower
                ]
                if context_candidates:
                    context_part = context_candidates[0]
                else:
                    context_part = candidates[0]
            break

    # Build the readable name
    distinct_apps = [a for a, _ in sorted_apps if a.lower() not in ("unknown", "explorer")]

    if len(distinct_apps) >= 3:
        # Multi-app session: "Chrome, Excel et Word"
        app_names = [self._humanize_app_name(a) for a in distinct_apps[:3]]
        if len(app_names) == 3:
            name = f"{app_names[0]}, {app_names[1]} et {app_names[2]}"
        else:
            name = " et ".join(app_names)
    elif context_part:
        # Clean the context to make it readable
        clean_context = re.sub(r'[<>:"/\\|?*\[\]]', '', context_part)
        # Strip common file extensions
        clean_context = re.sub(r'\.(xlsx?|csv|docx?|pdf|txt)$', '', clean_context, flags=re.IGNORECASE)
        # Replace underscores with spaces, cap the length at 40 chars
        clean_context = clean_context.replace('_', ' ').strip()[:40]
        if app_display:
            name = f"{app_display} \u2014 {clean_context}"
        else:
            name = clean_context
    elif app_display:
        name = f"{app_display} \u2014 session"
    else:
        name = self._fallback_task_name()

    # Deduplicate if a task with this name already exists
    base_name = name
    counter = 1
    with self._data_lock:
        existing_names = {
            getattr(w, 'name', '') for w in self._workflows.values()
        }
        while name in existing_names:
            counter += 1
            name = f"{base_name} ({counter})"

    return name
|
||
|
||
@staticmethod
|
||
def _fallback_task_name() -> str:
|
||
"""Générer un nom de tâche par défaut basé sur la date et l'heure."""
|
||
now = datetime.now()
|
||
# Noms de mois en français
|
||
mois = [
|
||
"", "janvier", "février", "mars", "avril", "mai", "juin",
|
||
"juillet", "août", "septembre", "octobre", "novembre", "décembre"
|
||
]
|
||
return f"Tâche du {now.day} {mois[now.month]} à {now.hour}h{now.minute:02d}"
|
||
|
||
@staticmethod
|
||
def _humanize_app_name(app_name: str) -> str:
|
||
"""Convertir un nom d'application technique en nom lisible.
|
||
|
||
Exemples :
|
||
"notepad.exe" → "Bloc-notes"
|
||
"chrome.exe" → "Chrome"
|
||
"WindowsTerminal" → "Terminal"
|
||
"""
|
||
import re
|
||
# Supprimer l'extension .exe et les chemins
|
||
name = app_name.split("\\")[-1].split("/")[-1]
|
||
name = re.sub(r'\.exe$', '', name, flags=re.IGNORECASE).strip()
|
||
|
||
# Dictionnaire de noms humains pour les applications courantes
|
||
app_human_names = {
|
||
"notepad": "Bloc-notes",
|
||
"notepad++": "Notepad++",
|
||
"chrome": "Chrome",
|
||
"msedge": "Edge",
|
||
"firefox": "Firefox",
|
||
"explorer": "Explorateur",
|
||
"windowsterminal": "Terminal",
|
||
"cmd": "Invite de commandes",
|
||
"powershell": "PowerShell",
|
||
"excel": "Excel",
|
||
"winword": "Word",
|
||
"powerpnt": "PowerPoint",
|
||
"outlook": "Outlook",
|
||
"teams": "Teams",
|
||
"code": "VS Code",
|
||
"searchhost": "Recherche",
|
||
"applicationframehost": "Application",
|
||
"calc": "Calculatrice",
|
||
"mspaint": "Paint",
|
||
"snippingtool": "Capture d'écran",
|
||
}
|
||
|
||
name_lower = name.lower()
|
||
if name_lower in app_human_names:
|
||
return app_human_names[name_lower]
|
||
|
||
# Capitaliser le nom si pas dans le dictionnaire
|
||
return name.capitalize() if name else "Application"
|
||
|
||
@staticmethod
|
||
def _clean_app_name(app_name: str) -> str:
|
||
"""Nettoyer un nom d'application pour l'utiliser dans un nom de workflow."""
|
||
import re
|
||
# Supprimer l'extension .exe et les chemins
|
||
name = app_name.split("\\")[-1].split("/")[-1]
|
||
name = re.sub(r'\.exe$', '', name, flags=re.IGNORECASE)
|
||
# Capitaliser
|
||
name = name.strip().capitalize()
|
||
# Supprimer les caractères spéciaux
|
||
name = re.sub(r'[^a-zA-Z0-9àâäéèêëïîôùûüÿçÀÂÄÉÈÊËÏÎÔÙÛÜŸÇ_]', '', name)
|
||
return name or "App"
|
||
|
||
def _persist_workflow(self, workflow, session_id: str, machine_id: str = "default") -> Optional[Path]:
|
||
"""Sauvegarder le workflow JSON sur disque.
|
||
|
||
Les workflows sont sauvegardés dans un sous-dossier par machine :
|
||
data/training/workflows/{machine_id}/wf_xxx.json
|
||
|
||
Cela permet de distinguer les workflows appris sur des machines différentes.
|
||
"""
|
||
try:
|
||
# Dossier par machine (ou racine pour "default")
|
||
if machine_id and machine_id != "default":
|
||
workflows_dir = self.data_dir / "workflows" / machine_id
|
||
else:
|
||
workflows_dir = self.data_dir / "workflows"
|
||
workflows_dir.mkdir(parents=True, exist_ok=True)
|
||
filepath = workflows_dir / f"{workflow.workflow_id}.json"
|
||
# Stocker le session_id source et machine_id dans les métadonnées
|
||
if hasattr(workflow, 'metadata') and isinstance(workflow.metadata, dict):
|
||
workflow.metadata['source_session_id'] = session_id
|
||
workflow.metadata['machine_id'] = machine_id
|
||
if not hasattr(workflow, '_machine_id'):
|
||
workflow._machine_id = machine_id
|
||
workflow._source_session = session_id
|
||
workflow.save_to_file(filepath)
|
||
logger.info(f"Workflow sauvegardé: {filepath} (session={session_id}, machine={machine_id})")
|
||
return filepath
|
||
except Exception as e:
|
||
logger.error(f"Erreur sauvegarde workflow {session_id}: {e}")
|
||
return None
|
||
|
||
def _build_raw_session_fallback(self, session, raw_dict):
    """Build a RawSession manually when from_dict fails.

    Reconstructs Event / Screenshot / RawWindowContext objects from the
    plain dicts in raw_dict, supplying defaults for missing fields.
    """
    from core.models.raw_session import RawSession, Event, Screenshot, RawWindowContext

    events = []
    for evt_dict in raw_dict.get("events", []):
        window_data = evt_dict.get("window", {"title": "", "app_name": "unknown"})
        window = RawWindowContext(
            title=window_data.get("title", ""),
            app_name=window_data.get("app_name", "unknown"),
        )
        events.append(Event(
            t=evt_dict.get("t", 0.0),
            type=evt_dict.get("type", "unknown"),
            window=window,
            # Everything not mapped to a dedicated field goes into data.
            data={k: v for k, v in evt_dict.items()
                  if k not in ("t", "type", "window", "screenshot_id")},
            screenshot_id=evt_dict.get("screenshot_id"),
        ))

    screenshots = []
    for ss_dict in raw_dict.get("screenshots", []):
        screenshots.append(Screenshot(
            screenshot_id=ss_dict["screenshot_id"],
            # Older payloads used "path" instead of "relative_path".
            relative_path=ss_dict.get("relative_path", ss_dict.get("path", "")),
            captured_at=ss_dict.get("captured_at", datetime.now().isoformat()),
        ))

    return RawSession(
        session_id=session.session_id,
        agent_version="agent_v1_stream",
        environment=raw_dict.get("environment", {}),
        user=raw_dict.get("user", {"id": "remote_agent"}),
        context=raw_dict.get("context", {}),
        started_at=session.created_at,
        ended_at=datetime.now(),
        events=events,
        screenshots=screenshots,
    )
|
||
|
||
def list_sessions(self, machine_id: Optional[str] = None) -> List[Dict[str, Any]]:
    """List the sessions with their current state.

    Args:
        machine_id: When given, only sessions from that machine are
            returned; when None, every session is included.
    """
    out: List[Dict[str, Any]] = []
    for sid in self.session_manager.session_ids:
        session = self.session_manager.get_session(sid)
        if session is None:
            continue
        # Optional per-machine filter
        if machine_id and session.machine_id != machine_id:
            continue
        with self._data_lock:
            n_states = len(self._screen_states.get(sid, []))
            n_embeddings = len(self._embeddings.get(sid, []))
        out.append({
            "session_id": session.session_id,
            "machine_id": session.machine_id,
            "events_count": len(session.events),
            "screenshots_count": len(session.shot_paths),
            "states_count": n_states,
            "embeddings_count": n_embeddings,
            "last_window": session.last_window_info,
            "created_at": session.created_at.isoformat(),
            "last_activity": session.last_activity.isoformat(),
            "finalized": session.finalized,
        })
    return out
|
||
|
||
def list_workflows(self, machine_id: Optional[str] = None) -> List[Dict[str, Any]]:
    """List the built workflows.

    Args:
        machine_id: When given, only workflows from that machine are
            returned; when None, every workflow is included.
    """
    # Snapshot under the lock, then build the result without it.
    with self._data_lock:
        workflows_snapshot = list(self._workflows.items())

    result: List[Dict[str, Any]] = []
    for wf_id, wf in workflows_snapshot:
        wf_machine = getattr(wf, '_machine_id', 'default')
        # Optional per-machine filter
        if machine_id and wf_machine != machine_id:
            continue
        result.append({
            "workflow_id": wf_id,
            "machine_id": wf_machine,
            "nodes": len(wf.nodes) if hasattr(wf, "nodes") else 0,
            "edges": len(wf.edges) if hasattr(wf, "edges") else 0,
            "name": getattr(wf, "name", wf_id),
        })
    return result
|
||
|
||
def reload_workflows(self) -> int:
    """Reload the workflows from disk.

    Useful after a new workflow has been exported from the VWB or
    learned by the streaming pipeline. Returns the number of workflows
    loaded.
    """
    # Drop the in-memory set, re-load from disk, then count the result.
    with self._data_lock:
        self._workflows.clear()
    self._load_persisted_workflows()
    with self._data_lock:
        loaded = len(self._workflows)
    logger.info("Workflows rechargés depuis le disque : %d", loaded)
    return loaded
|
||
|
||
# =========================================================================
|
||
# Extraction d'actions enrichies depuis un workflow appris
|
||
# =========================================================================
|
||
|
||
def extract_enriched_actions(
    self,
    workflow,
    params: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Extract the enriched actions of a learned workflow (nodes + edges + events).

    Walks the graph breadth-first from the entry_nodes and builds, for
    each edge, an enriched action containing:
    - The normalized coordinates (x_pct, y_pct) from the original events
    - The visual-targeting info (by_text, by_role, window_title)
    - The node identifiers (from_node, to_node) for pre-check/post-check
    - The visual_mode=True flag enabling visual resolution on the agent

    Args:
        workflow: Workflow object or raw dict with nodes/edges
        params: Substitution parameters (${var} variables)

    Returns:
        List of enriched actions ready for the replay queue, or an empty
        list when the workflow has no usable edges.
    """
    # (fix: removed an unused `import uuid` that the original carried)
    from collections import defaultdict

    params = params or {}

    # Access the workflow data (object or dict)
    if hasattr(workflow, 'edges'):
        edges = workflow.edges
        entry_nodes = workflow.entry_nodes if hasattr(workflow, 'entry_nodes') else []
        nodes_list = workflow.nodes if hasattr(workflow, 'nodes') else []
    elif isinstance(workflow, dict):
        edges = workflow.get('edges', [])
        entry_nodes = workflow.get('entry_nodes', [])
        nodes_list = workflow.get('nodes', [])
    else:
        return []

    if not edges:
        return []

    # Index the nodes by ID
    node_index = {}
    for n in nodes_list:
        nid = n.node_id if hasattr(n, 'node_id') else n.get('node_id', '')
        node_index[nid] = n

    # Index the outgoing edges per node
    outgoing: Dict[str, list] = defaultdict(list)
    for edge in edges:
        fn = edge.from_node if hasattr(edge, 'from_node') else edge.get('from_node', '')
        outgoing[fn].append(edge)

    # Find the original events of the source session
    original_events = self._load_original_events_for_workflow(workflow)

    # Find the source-session folder (for the crops/anchors)
    source_session_dir = self._find_session_dir_for_workflow(workflow)

    # Infer the screen resolution from the maximal event positions
    inferred_resolution = self._infer_screen_resolution(original_events)

    # BFS from the entry_nodes
    if not entry_nodes:
        # Fallback: first node of the edges
        first_edge = edges[0]
        fn = first_edge.from_node if hasattr(first_edge, 'from_node') else first_edge.get('from_node', '')
        entry_nodes = [fn]

    visited = set()
    queue = list(entry_nodes)
    ordered_edges = []

    while queue:
        node_id = queue.pop(0)
        if node_id in visited:
            continue
        visited.add(node_id)

        for edge in outgoing.get(node_id, []):
            ordered_edges.append(edge)
            tn = edge.to_node if hasattr(edge, 'to_node') else edge.get('to_node', '')
            if tn not in visited:
                queue.append(tn)

    # Build the enriched actions from the ordered edges
    actions = []
    for edge in ordered_edges:
        enriched = self._edge_to_enriched_action(
            edge, node_index, original_events, params, inferred_resolution,
            source_session_dir,
        )
        if enriched:
            actions.extend(enriched)

    # Global cleanup: drop spurious actions, merge consecutive
    # text_input actions, deduplicate key_combo, etc.
    raw_count = len(actions)
    actions = clean_enriched_actions(actions)

    logger.info(
        "Actions enrichies extraites : %d actions (nettoyées depuis %d brutes) "
        "depuis %d edges (events originaux : %d)",
        len(actions), raw_count, len(ordered_edges), len(original_events),
    )
    return actions
|
||
|
||
def _load_original_events_for_workflow(self, workflow) -> List[Dict[str, Any]]:
|
||
"""Charger les événements originaux (live_events.jsonl) liés à un workflow.
|
||
|
||
Stratégie de recherche :
|
||
1. metadata.source_session_id (si le workflow le stocke)
|
||
2. Parcourir les sessions existantes pour trouver un match temporel
|
||
3. Utiliser le workflow_id comme hint (parfois contient le session_id)
|
||
|
||
Returns:
|
||
Liste d'events bruts (dicts), ou liste vide si introuvable.
|
||
"""
|
||
import json
|
||
|
||
# Stratégie 1 : metadata.source_session_id
|
||
metadata = workflow.metadata if hasattr(workflow, 'metadata') else (
|
||
workflow.get('metadata', {}) if isinstance(workflow, dict) else {}
|
||
)
|
||
source_sid = metadata.get('source_session_id', '')
|
||
|
||
if source_sid:
|
||
events = self._load_events_from_session(source_sid)
|
||
if events:
|
||
return events
|
||
|
||
# Stratégie 2 : workflow_id peut contenir ou être un session_id
|
||
wf_id = workflow.workflow_id if hasattr(workflow, 'workflow_id') else (
|
||
workflow.get('workflow_id', '') if isinstance(workflow, dict) else ''
|
||
)
|
||
if wf_id.startswith('sess_'):
|
||
events = self._load_events_from_session(wf_id)
|
||
if events:
|
||
return events
|
||
|
||
# Stratégie 3 : chercher les sessions les plus proches temporellement
|
||
created_at = None
|
||
if hasattr(workflow, 'created_at'):
|
||
created_at = workflow.created_at
|
||
elif isinstance(workflow, dict) and 'created_at' in workflow:
|
||
try:
|
||
from datetime import datetime
|
||
created_at = datetime.fromisoformat(workflow['created_at'])
|
||
except (ValueError, TypeError):
|
||
pass
|
||
|
||
if created_at:
|
||
events = self._find_closest_session_events(created_at)
|
||
if events:
|
||
return events
|
||
|
||
return []
|
||
|
||
def _find_session_dir_for_workflow(self, workflow) -> Optional[Path]:
|
||
"""Trouver le dossier de la session source associée à un workflow.
|
||
|
||
Utilise la même logique de recherche que _load_original_events_for_workflow
|
||
mais retourne le chemin du dossier au lieu des événements. Nécessaire pour
|
||
accéder aux crops (anchor images) stockés dans {session_dir}/shots/.
|
||
|
||
Returns:
|
||
Path du dossier session, ou None si introuvable.
|
||
"""
|
||
# Stratégie 1 : metadata.source_session_id
|
||
metadata = workflow.metadata if hasattr(workflow, 'metadata') else (
|
||
workflow.get('metadata', {}) if isinstance(workflow, dict) else {}
|
||
)
|
||
source_sid = metadata.get('source_session_id', '')
|
||
|
||
if source_sid:
|
||
session_dir = self._find_session_dir(source_sid)
|
||
if session_dir:
|
||
return session_dir
|
||
|
||
# Stratégie 2 : workflow_id peut contenir ou être un session_id
|
||
wf_id = workflow.workflow_id if hasattr(workflow, 'workflow_id') else (
|
||
workflow.get('workflow_id', '') if isinstance(workflow, dict) else ''
|
||
)
|
||
if wf_id.startswith('sess_'):
|
||
session_dir = self._find_session_dir(wf_id)
|
||
if session_dir:
|
||
return session_dir
|
||
|
||
# Stratégie 3 : chercher la session la plus proche temporellement
|
||
created_at = None
|
||
if hasattr(workflow, 'created_at'):
|
||
created_at = workflow.created_at
|
||
elif isinstance(workflow, dict) and 'created_at' in workflow:
|
||
try:
|
||
from datetime import datetime as _dt
|
||
created_at = _dt.fromisoformat(workflow['created_at'])
|
||
except (ValueError, TypeError):
|
||
pass
|
||
|
||
if created_at:
|
||
session_dir = self._find_closest_session_dir(created_at)
|
||
if session_dir:
|
||
return session_dir
|
||
|
||
return None
|
||
|
||
def _find_closest_session_dir(self, workflow_created_at) -> Optional[Path]:
|
||
"""Trouver le dossier de la session la plus proche temporellement.
|
||
|
||
Même logique que _find_closest_session_events mais retourne le Path
|
||
du dossier au lieu de charger les événements.
|
||
|
||
Returns:
|
||
Path du dossier session, ou None si aucun match dans les 10 minutes.
|
||
"""
|
||
from datetime import datetime as _dt
|
||
|
||
best_dir = None
|
||
best_delta = float('inf')
|
||
|
||
search_dirs = [self.data_dir]
|
||
if self.data_dir.exists():
|
||
for subdir in self.data_dir.iterdir():
|
||
if subdir.is_dir() and not subdir.name.startswith('.'):
|
||
search_dirs.append(subdir)
|
||
|
||
for search_dir in search_dirs:
|
||
if not search_dir.exists():
|
||
continue
|
||
for session_dir in search_dir.iterdir():
|
||
if not session_dir.is_dir():
|
||
continue
|
||
name = session_dir.name
|
||
if not name.startswith('sess_'):
|
||
continue
|
||
try:
|
||
ts_part = name.split('_')[1]
|
||
session_dt = _dt.strptime(ts_part, '%Y%m%dT%H%M%S')
|
||
delta = abs((workflow_created_at - session_dt).total_seconds())
|
||
if delta < best_delta:
|
||
best_delta = delta
|
||
best_dir = session_dir
|
||
except (IndexError, ValueError):
|
||
continue
|
||
|
||
if best_dir and best_delta < 600:
|
||
return best_dir
|
||
|
||
return None
|
||
|
||
def _load_anchor_crop(
|
||
self,
|
||
matched_event: Dict[str, Any],
|
||
session_dir: Path,
|
||
) -> Optional[str]:
|
||
"""Charger le crop de référence (anchor image) pour un événement clic.
|
||
|
||
Cherche le crop dans le dossier shots/ de la session source, en utilisant
|
||
le screenshot_id de l'événement original. Si le crop n'existe pas, tente
|
||
de le recréer à partir du screenshot full en croppant autour de la position
|
||
du clic.
|
||
|
||
Args:
|
||
matched_event: Événement original (dict avec screenshot_id et pos)
|
||
session_dir: Dossier de la session source
|
||
|
||
Returns:
|
||
Image crop encodée en base64, ou None si introuvable.
|
||
"""
|
||
import base64
|
||
|
||
screenshot_id = matched_event.get('screenshot_id', '')
|
||
if not screenshot_id:
|
||
return None
|
||
|
||
shots_dir = session_dir / "shots"
|
||
if not shots_dir.exists():
|
||
return None
|
||
|
||
# Stratégie 1 : crop déjà capturé par l'agent (shot_XXXX_crop.png)
|
||
crop_path = shots_dir / f"{screenshot_id}_crop.png"
|
||
if crop_path.exists():
|
||
try:
|
||
crop_b64 = base64.b64encode(crop_path.read_bytes()).decode()
|
||
logger.debug("Anchor crop chargé : %s", crop_path.name)
|
||
return crop_b64
|
||
except Exception as e:
|
||
logger.warning("Erreur lecture crop %s : %s", crop_path, e)
|
||
|
||
# Stratégie 1b : crop en JPEG (compression possible côté agent)
|
||
crop_jpg = shots_dir / f"{screenshot_id}_crop.jpg"
|
||
if crop_jpg.exists():
|
||
try:
|
||
crop_b64 = base64.b64encode(crop_jpg.read_bytes()).decode()
|
||
logger.debug("Anchor crop JPEG chargé : %s", crop_jpg.name)
|
||
return crop_b64
|
||
except Exception as e:
|
||
logger.warning("Erreur lecture crop JPEG %s : %s", crop_jpg, e)
|
||
|
||
# Stratégie 2 : cropper le full screenshot autour de la position du clic
|
||
full_path = shots_dir / f"{screenshot_id}_full.png"
|
||
if not full_path.exists():
|
||
full_path = shots_dir / f"{screenshot_id}_full.jpg"
|
||
if not full_path.exists():
|
||
return None
|
||
|
||
pos = matched_event.get('pos', [])
|
||
if not pos or len(pos) < 2:
|
||
return None
|
||
|
||
try:
|
||
from PIL import Image
|
||
import io
|
||
|
||
img = Image.open(full_path)
|
||
x, y = int(pos[0]), int(pos[1])
|
||
|
||
# Crop 400x400 centré sur le clic (même taille que le captor)
|
||
crop_size = 200 # demi-côté
|
||
left = max(0, x - crop_size)
|
||
top = max(0, y - crop_size)
|
||
right = min(img.width, x + crop_size)
|
||
bottom = min(img.height, y + crop_size)
|
||
|
||
crop = img.crop((left, top, right, bottom))
|
||
|
||
buf = io.BytesIO()
|
||
crop.save(buf, format="PNG", optimize=True)
|
||
crop_b64 = base64.b64encode(buf.getvalue()).decode()
|
||
|
||
logger.debug(
|
||
"Anchor crop généré depuis %s (pos=%s, crop=%dx%d)",
|
||
full_path.name, pos, crop.width, crop.height,
|
||
)
|
||
return crop_b64
|
||
except Exception as e:
|
||
logger.warning("Erreur génération crop depuis %s : %s", full_path, e)
|
||
return None
|
||
|
||
def _load_events_from_session(self, session_id: str) -> List[Dict[str, Any]]:
|
||
"""Charger les événements depuis le live_events.jsonl d'une session."""
|
||
import json
|
||
|
||
session_dir = self._find_session_dir(session_id)
|
||
if not session_dir:
|
||
return []
|
||
|
||
events_file = session_dir / "live_events.jsonl"
|
||
if not events_file.exists():
|
||
return []
|
||
|
||
events = []
|
||
try:
|
||
with open(events_file, 'r', encoding='utf-8') as f:
|
||
for line in f:
|
||
line = line.strip()
|
||
if not line:
|
||
continue
|
||
try:
|
||
data = json.loads(line)
|
||
event = data.get('event', data)
|
||
events.append(event)
|
||
except json.JSONDecodeError:
|
||
continue
|
||
except Exception as e:
|
||
logger.warning("Erreur lecture events %s : %s", events_file, e)
|
||
|
||
return events
|
||
|
||
def _find_closest_session_events(self, workflow_created_at) -> List[Dict[str, Any]]:
|
||
"""Trouver la session la plus proche temporellement du workflow.
|
||
|
||
Parcourt les dossiers de sessions sur disque et compare les dates
|
||
de création (encodées dans le nom du dossier sess_YYYYMMDDTHHMMSS_xxx).
|
||
"""
|
||
import json
|
||
from datetime import datetime
|
||
|
||
best_dir = None
|
||
best_delta = float('inf')
|
||
|
||
# Chercher dans data_dir et ses sous-dossiers (machine_id)
|
||
search_dirs = [self.data_dir]
|
||
if self.data_dir.exists():
|
||
for subdir in self.data_dir.iterdir():
|
||
if subdir.is_dir() and not subdir.name.startswith('.'):
|
||
search_dirs.append(subdir)
|
||
|
||
for search_dir in search_dirs:
|
||
for session_dir in search_dir.iterdir():
|
||
if not session_dir.is_dir():
|
||
continue
|
||
name = session_dir.name
|
||
if not name.startswith('sess_'):
|
||
continue
|
||
# Extraire le timestamp du nom : sess_YYYYMMDDTHHMMSS_xxx
|
||
try:
|
||
ts_part = name.split('_')[1] # YYYYMMDDTHHMMSS
|
||
session_dt = datetime.strptime(ts_part, '%Y%m%dT%H%M%S')
|
||
delta = abs((workflow_created_at - session_dt).total_seconds())
|
||
if delta < best_delta:
|
||
best_delta = delta
|
||
best_dir = session_dir
|
||
except (IndexError, ValueError):
|
||
continue
|
||
|
||
if best_dir and best_delta < 600: # Max 10 minutes d'écart
|
||
events_file = best_dir / "live_events.jsonl"
|
||
if events_file.exists():
|
||
events = []
|
||
try:
|
||
with open(events_file, 'r', encoding='utf-8') as f:
|
||
for line in f:
|
||
line = line.strip()
|
||
if not line:
|
||
continue
|
||
try:
|
||
data = json.loads(line)
|
||
event = data.get('event', data)
|
||
events.append(event)
|
||
except json.JSONDecodeError:
|
||
continue
|
||
if events:
|
||
logger.info(
|
||
"Events originaux trouvés dans %s (delta=%ds, %d events)",
|
||
best_dir.name, int(best_delta), len(events),
|
||
)
|
||
return events
|
||
except Exception as e:
|
||
logger.warning("Erreur lecture events %s : %s", events_file, e)
|
||
|
||
return []
|
||
|
||
def _find_session_wide_search(
    self,
    workflow,
    return_dir: bool = False,
):
    """Widened search for the source session of a workflow.

    Used when ``_load_original_events_for_workflow`` fails (no
    ``source_session_id`` and the 10-minute time window was too tight).

    Widens the window to 3 hours and uses the workflow's ``_machine_id``
    (or ``metadata['machine_id']``) to filter candidate directories.

    Args:
        workflow: Workflow object or dict.
        return_dir: Currently unused — the return shape is the same
            either way. Kept for caller compatibility.

    Returns:
        Tuple ``(events_list, session_dir_path)``; ``([], None)`` when no
        session within 3 hours is found or its events cannot be read.
    """
    import json as _json
    from datetime import datetime as _dt

    # Resolve the workflow creation time from either the attribute
    # (Workflow object) or the dict key (raw dict, ISO string).
    created_at = None
    if hasattr(workflow, 'created_at'):
        created_at = workflow.created_at
    elif isinstance(workflow, dict) and 'created_at' in workflow:
        try:
            created_at = _dt.fromisoformat(workflow['created_at'])
        except (ValueError, TypeError):
            pass

    if not created_at:
        return ([], None)

    # machine_id of the workflow (private attribute first, then metadata).
    machine_id = getattr(workflow, '_machine_id', None)
    if not machine_id:
        metadata = workflow.metadata if hasattr(workflow, 'metadata') else (
            workflow.get('metadata', {}) if isinstance(workflow, dict) else {}
        )
        machine_id = metadata.get('machine_id', '')

    best_dir = None
    best_delta = float('inf')
    max_delta = 10800  # 3 hours

    # Search data_dir and its sub-directories (one per machine_id).
    search_dirs = [self.data_dir]
    if self.data_dir.exists():
        for subdir in self.data_dir.iterdir():
            if subdir.is_dir() and not subdir.name.startswith('.'):
                search_dirs.append(subdir)

    for search_dir in search_dirs:
        if not search_dir.exists():
            continue
        for session_dir in search_dir.iterdir():
            if not session_dir.is_dir():
                continue
            name = session_dir.name
            if not name.startswith('sess_'):
                continue

            # Filter by machine_id when known: the parent directory must
            # be named after the machine_id, or be data_dir itself.
            if machine_id and machine_id != "default":
                parent_name = search_dir.name
                # Accept when the parent is the machine_id or the root data_dir.
                if parent_name != machine_id and search_dir != self.data_dir:
                    continue

            # Session folders are named sess_YYYYMMDDTHHMMSS_xxx.
            try:
                ts_part = name.split('_')[1]
                session_dt = _dt.strptime(ts_part, '%Y%m%dT%H%M%S')
                delta = abs((created_at - session_dt).total_seconds())
                if delta < best_delta:
                    best_delta = delta
                    best_dir = session_dir
            except (IndexError, ValueError):
                continue

    if best_dir and best_delta < max_delta:
        events_file = best_dir / "live_events.jsonl"
        if events_file.exists():
            events = []
            try:
                with open(events_file, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            data = _json.loads(line)
                            event = data.get('event', data)
                            events.append(event)
                        except _json.JSONDecodeError:
                            # Skip malformed lines, keep the rest.
                            continue
                if events:
                    logger.info(
                        "Recherche élargie : session trouvée dans %s "
                        "(delta=%ds, %d events, machine=%s)",
                        best_dir.name, int(best_delta), len(events),
                        machine_id or "?",
                    )
                    return (events, best_dir)
            except Exception as e:
                logger.warning("Erreur lecture events %s : %s", events_file, e)

    return ([], None)
||
@staticmethod
|
||
def _infer_screen_resolution(events: List[Dict[str, Any]]) -> tuple:
|
||
"""Inférer la résolution d'écran depuis les positions maximales des events.
|
||
|
||
Analyse les coordonnées de tous les clics pour estimer la résolution
|
||
de l'écran source. Si un clic a x=1800 ou y=1500, la résolution
|
||
est au moins 1800+marge x 1500+marge.
|
||
|
||
Utilise une heuristique : arrondir vers la résolution standard la plus
|
||
proche parmi les plus courantes (1920x1080, 2560x1440, 2560x1600,
|
||
3840x2160, 1366x768, 1280x720).
|
||
|
||
Returns:
|
||
Tuple (width, height) de la résolution inférée, ou (1920, 1080) par défaut.
|
||
"""
|
||
# Résolutions standard connues
|
||
STANDARD_RESOLUTIONS = [
|
||
(1280, 720), (1366, 768), (1440, 900), (1600, 900),
|
||
(1920, 1080), (1920, 1200), (2560, 1440), (2560, 1600),
|
||
(3440, 1440), (3840, 2160),
|
||
]
|
||
|
||
max_x = 0
|
||
max_y = 0
|
||
for evt in events:
|
||
pos = evt.get('pos', [])
|
||
if pos and len(pos) == 2:
|
||
max_x = max(max_x, pos[0])
|
||
max_y = max(max_y, pos[1])
|
||
|
||
if max_x == 0 and max_y == 0:
|
||
return (1920, 1080)
|
||
|
||
# Trouver la résolution standard minimale qui contient tous les clics
|
||
for w, h in STANDARD_RESOLUTIONS:
|
||
if w >= max_x and h >= max_y:
|
||
return (w, h)
|
||
|
||
# Si aucune résolution standard ne convient, arrondir vers le haut
|
||
# par paliers de 100 pixels
|
||
inferred_w = ((max_x // 100) + 1) * 100
|
||
inferred_h = ((max_y // 100) + 1) * 100
|
||
return (inferred_w, inferred_h)
|
||
|
||
def _edge_to_enriched_action(
    self,
    edge,
    node_index: Dict[str, Any],
    original_events: List[Dict[str, Any]],
    params: Dict[str, Any],
    inferred_resolution: tuple = (1920, 1080),
    source_session_dir: Optional[Path] = None,
) -> List[Dict[str, Any]]:
    """Convert a workflow edge into one or more enriched actions.

    Each action is enriched with:
    - Normalized coordinates derived from the original events
    - Visual-targeting hints (by_text, by_role, window_title)
    - An anchor image (crop) for visual template matching
    - A visual_mode flag for agent-side visual resolution
    - from_node/to_node identifiers for pre-check and post-conditions

    Args:
        edge: WorkflowEdge (object or dict)
        node_index: Node index keyed by node ID
        original_events: Original events of the source session
        params: Substitution variables
        inferred_resolution: Inferred screen resolution
        source_session_dir: Source session directory (for crops/anchors)

    Returns:
        List of enriched actions (1 for a simple edge, N for a compound;
        [] when the edge is unusable or resolves to a parasite action).
    """
    import uuid

    # Extract the edge payload (object or dict form).
    if hasattr(edge, 'edge_id'):
        edge_id = edge.edge_id
        from_node = edge.from_node
        to_node = edge.to_node
        action_obj = edge.action
        edge_metadata = edge.metadata if hasattr(edge, 'metadata') else {}
    else:
        edge_id = edge.get('edge_id', '')
        from_node = edge.get('from_node', '')
        to_node = edge.get('to_node', '')
        action_obj = edge.get('action', {})
        edge_metadata = edge.get('metadata', {})

    # Extract the action payload (object or dict form).
    if hasattr(action_obj, 'type'):
        action_type = action_obj.type
        target = action_obj.target
        action_params = action_obj.parameters or {}
    elif isinstance(action_obj, dict):
        action_type = action_obj.get('type', 'unknown')
        target = action_obj.get('target', {})
        action_params = action_obj.get('parameters', {})
    else:
        # No usable action payload: nothing to emit.
        return []

    # Extract the targeting info (object or dict form).
    if hasattr(target, 'by_role'):
        by_role = target.by_role or ''
        by_text = target.by_text or ''
        by_position = target.by_position
    elif isinstance(target, dict):
        by_role = target.get('by_role', '') or ''
        by_text = target.get('by_text', '') or ''
        by_position = target.get('by_position')
    else:
        by_role = ''
        by_text = ''
        by_position = None

    # Source/target node data (for pre-check and window_title).
    source_node = node_index.get(from_node)
    target_node = node_index.get(to_node)

    window_title = self._extract_window_title(source_node)
    # NOTE(review): target_window_title is computed but not used below —
    # presumably reserved for post-condition checks; confirm before removing.
    target_window_title = self._extract_window_title(target_node)

    # Look up the original event this edge was derived from.
    matched_event = self._match_edge_to_event(
        edge_metadata, action_type, action_params, original_events
    )

    # Coordinate resolution, by decreasing reliability:
    # 1. target's by_position (explicit, reliable)
    # 2. position in action_params (set by GraphBuilder from the original event)
    # 3. matched_event (searched in session events — less reliable)
    # 4. (0, 0) → will be resolved visually by the agent
    x_pct = 0.0
    y_pct = 0.0
    text = ''
    keys = []
    button = action_params.get('button', 'left') if isinstance(action_params, dict) else 'left'

    # Priority 1: explicit by_position on the target.
    if by_position and isinstance(by_position, (list, tuple)) and len(by_position) == 2:
        px, py = by_position
        if px <= 1.0 and py <= 1.0:
            # Already normalized percentages.
            x_pct = px
            y_pct = py
        elif px > 0 or py > 0:
            # Absolute pixels: normalize against the reference resolution.
            rw = (action_params.get('ref_width', 1920) or 1920) if isinstance(action_params, dict) else 1920
            rh = (action_params.get('ref_height', 1080) or 1080) if isinstance(action_params, dict) else 1080
            x_pct = round(px / rw, 6)
            y_pct = round(py / rh, 6)

    # Priority 2: position inside action_params (from GraphBuilder).
    if x_pct == 0.0 and y_pct == 0.0 and isinstance(action_params, dict):
        pos = action_params.get('position', [])
        if pos and len(pos) == 2 and (pos[0] > 0 or pos[1] > 0):
            rw = (action_params.get('ref_width') or inferred_resolution[0]) if isinstance(action_params, dict) else inferred_resolution[0]
            rh = (action_params.get('ref_height') or inferred_resolution[1]) if isinstance(action_params, dict) else inferred_resolution[1]
            x_pct = round(pos[0] / rw, 6)
            y_pct = round(pos[1] / rh, 6)

    # Priority 3: matched_event (closest session).
    if x_pct == 0.0 and y_pct == 0.0 and matched_event:
        pos = matched_event.get('pos', [])
        if pos and len(pos) == 2:
            ref_width = matched_event.get('screen_width') or inferred_resolution[0]
            ref_height = matched_event.get('screen_height') or inferred_resolution[1]
            x_pct = round(pos[0] / ref_width, 6)
            y_pct = round(pos[1] / ref_height, 6)

    # Safety: clamp to [0, 1].
    x_pct = max(0.0, min(1.0, x_pct))
    y_pct = max(0.0, min(1.0, y_pct))

    # Text and keys: action_params first, matched_event as a complement.
    if isinstance(action_params, dict):
        text = action_params.get('text', '')
        keys = action_params.get('keys', [])
    if not text and matched_event:
        text = matched_event.get('text', '')
    if not keys and matched_event:
        keys = matched_event.get('keys', [])
    if matched_event:
        button = matched_event.get('button', button)
        # Enrich the window_title when missing.
        event_window = matched_event.get('window', {})
        if not window_title and event_window:
            window_title = event_window.get('title', '')

    # Sanitize the keys: convert control characters.
    if keys:
        keys = _sanitize_keys(keys)
        # Only lone modifiers left → parasite action, skip.
        if _is_modifier_only(keys):
            return []

    # Variable substitution in the text.
    if text and params:
        text = self._substitute_vars(text, params, action_params)

    # Determine the normalized action type.
    if action_type == 'mouse_click':
        norm_type = 'click'
    elif action_type == 'text_input':
        norm_type = 'type'
    elif action_type == 'key_press':
        norm_type = 'key_combo'
    elif action_type == 'compound':
        # Expand compounds into sub-actions.
        steps = action_params.get('steps', []) if isinstance(action_params, dict) else []
        return self._expand_compound_enriched(
            steps, edge_id, from_node, to_node, window_title, params
        )
    elif action_type in ('unknown', 'unknown_element'):
        # "unknown" actions: try to guess from the original event.
        if matched_event:
            evt_type = matched_event.get('type', '')
            if evt_type == 'mouse_click':
                norm_type = 'click'
            elif evt_type == 'text_input':
                norm_type = 'type'
            elif evt_type == 'key_press':
                norm_type = 'key_combo'
            else:
                # Event found but unrecognized type: default to click.
                norm_type = 'click'
                logger.debug(
                    "Edge %s : action unknown, event type=%s -> défaut click",
                    edge_id, evt_type,
                )
        else:
            # No original event: default to click (a transition between
            # two states is almost always caused by a click).
            norm_type = 'click'
            logger.debug(
                "Edge %s : action unknown, pas d'event original -> défaut click",
                edge_id,
            )
    else:
        norm_type = action_type

    # Build the target_spec for visual resolution.
    target_spec = {}
    if by_text and by_text not in ('', 'null', 'None'):
        target_spec['by_text'] = by_text
    if by_role and by_role not in ('', 'unknown', 'unknown_element', 'null'):
        target_spec['by_role'] = by_role
    if window_title:
        target_spec['window_title'] = window_title

    # Enrich the target_spec with the source node's (OCR) texts.
    source_texts = self._extract_required_texts(source_node)
    if source_texts:
        target_spec['context_hints'] = {'screen_texts': source_texts[:3]}

    # Enrich with the anchor image (reference crop) for clicks.
    if norm_type == 'click' and matched_event and source_session_dir:
        anchor_b64 = self._load_anchor_crop(matched_event, source_session_dir)
        if anchor_b64:
            target_spec['anchor_image_base64'] = anchor_b64
            logger.debug(
                "Anchor image chargée pour edge %s (screenshot_id=%s)",
                edge_id, matched_event.get('screenshot_id', '?'),
            )

    # Build the enriched action.
    action = {
        'action_id': f'act_{uuid.uuid4().hex[:8]}',
        'type': norm_type,
        'edge_id': edge_id,
        'from_node': from_node,
        'to_node': to_node,
        'x_pct': x_pct,
        'y_pct': y_pct,
        'window_title': window_title,
    }

    # Add the type-specific fields.
    if norm_type == 'click':
        action['button'] = button
    elif norm_type == 'type':
        action['text'] = text
    elif norm_type == 'key_combo':
        action['keys'] = keys

    # Enable visual resolution when we have semantic criteria
    # OR when the coordinates are null (resolution required).
    if target_spec or (x_pct == 0.0 and y_pct == 0.0 and norm_type == 'click'):
        action['visual_mode'] = True
        action['target_spec'] = target_spec

    return [action]
||
def _match_edge_to_event(
|
||
self,
|
||
edge_metadata: Dict[str, Any],
|
||
action_type: str,
|
||
action_params: Dict[str, Any],
|
||
original_events: List[Dict[str, Any]],
|
||
) -> Optional[Dict[str, Any]]:
|
||
"""Trouver l'événement original correspondant à un edge.
|
||
|
||
Stratégie de matching :
|
||
1. Par type d'action (mouse_click, text_input, key_press)
|
||
2. Par position approximative (si position dans action_params)
|
||
3. Par ordre chronologique (premier événement non-matché du bon type)
|
||
|
||
Returns:
|
||
L'événement matché ou None.
|
||
"""
|
||
if not original_events:
|
||
return None
|
||
|
||
# Type d'événement attendu
|
||
expected_types = set()
|
||
if action_type in ('mouse_click', 'click', 'unknown'):
|
||
expected_types.add('mouse_click')
|
||
if action_type in ('text_input', 'type', 'unknown'):
|
||
expected_types.add('text_input')
|
||
if action_type in ('key_press', 'key_combo', 'unknown'):
|
||
expected_types.add('key_press')
|
||
|
||
if not expected_types:
|
||
expected_types = {'mouse_click', 'text_input', 'key_press'}
|
||
|
||
# Filtrer les événements du bon type
|
||
candidates = [
|
||
e for e in original_events
|
||
if e.get('type', '') in expected_types
|
||
]
|
||
|
||
if not candidates:
|
||
return None
|
||
|
||
# Si on a une position dans action_params, chercher l'événement le plus proche
|
||
ref_pos = (action_params.get('position', []) if isinstance(action_params, dict) else [])
|
||
if ref_pos and len(ref_pos) == 2 and ref_pos[0] > 0:
|
||
best_event = None
|
||
best_dist = float('inf')
|
||
for evt in candidates:
|
||
evt_pos = evt.get('pos', [])
|
||
if evt_pos and len(evt_pos) == 2:
|
||
dx = ref_pos[0] - evt_pos[0]
|
||
dy = ref_pos[1] - evt_pos[1]
|
||
dist = (dx * dx + dy * dy) ** 0.5
|
||
if dist < best_dist:
|
||
best_dist = dist
|
||
best_event = evt
|
||
if best_event and best_dist < 200: # Max 200px d'écart
|
||
return best_event
|
||
|
||
# Fallback : le premier événement du bon type
|
||
# On utilise created_from_event du edge metadata comme hint
|
||
created_from = edge_metadata.get('created_from_event', '')
|
||
if created_from:
|
||
for evt in candidates:
|
||
if evt.get('type') == created_from:
|
||
return evt
|
||
|
||
return candidates[0] if candidates else None
|
||
|
||
def _extract_window_title(self, node) -> str:
|
||
"""Extraire le titre de fenêtre depuis un node (objet ou dict)."""
|
||
if node is None:
|
||
return ''
|
||
if hasattr(node, 'template'):
|
||
tpl = node.template
|
||
if tpl and hasattr(tpl, 'window') and tpl.window:
|
||
return tpl.window.title_contains or tpl.window.title_pattern or ''
|
||
elif isinstance(node, dict):
|
||
template = node.get('template', {})
|
||
if isinstance(template, dict):
|
||
window = template.get('window', {})
|
||
if isinstance(window, dict):
|
||
return window.get('title_contains', '') or window.get('title_pattern', '') or ''
|
||
return ''
|
||
|
||
def _extract_required_texts(self, node) -> List[str]:
|
||
"""Extraire les textes requis depuis le template d'un node."""
|
||
if node is None:
|
||
return []
|
||
if hasattr(node, 'template'):
|
||
tpl = node.template
|
||
if tpl and hasattr(tpl, 'text') and tpl.text:
|
||
texts = tpl.text.required_texts or []
|
||
# Filtrer les textes trop courts ou trop longs
|
||
return [t for t in texts if 3 <= len(t) <= 80]
|
||
elif isinstance(node, dict):
|
||
template = node.get('template', {})
|
||
if isinstance(template, dict):
|
||
text_spec = template.get('text', {})
|
||
if isinstance(text_spec, dict):
|
||
texts = text_spec.get('required_texts', [])
|
||
return [t for t in texts if isinstance(t, str) and 3 <= len(t) <= 80]
|
||
return []
|
||
|
||
@staticmethod
|
||
def _substitute_vars(text: str, params: Dict[str, Any], action_params: Dict[str, Any]) -> str:
|
||
"""Substituer les variables ${var} dans un texte."""
|
||
import re
|
||
defaults = action_params.get('defaults', {}) if isinstance(action_params, dict) else {}
|
||
|
||
def replacer(match):
|
||
var_name = match.group(1)
|
||
return str(params.get(var_name, defaults.get(var_name, match.group(0))))
|
||
|
||
return re.sub(r'\$\{(\w+)\}', replacer, text)
|
||
|
||
def _expand_compound_enriched(
    self,
    steps: List[Dict[str, Any]],
    edge_id: str,
    from_node: str,
    to_node: str,
    window_title: str,
    params: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Expand a compound into individual enriched actions.

    Cleans parasite steps before the expansion:
    - drops lone modifiers (ctrl, alt, shift, etc.)
    - merges consecutive text_input steps
    - deduplicates identical consecutive key_combo steps

    Supported step types, as produced by
    GraphBuilder._build_compound_action():
    - mouse_click / click : mouse click with x_pct/y_pct or position/ref_*
    - text_input / type : text entry
    - key_press / key_combo : key combination
    - wait : pause between actions
    """
    import uuid

    # Strip parasite steps up front.
    cleaned_steps = clean_compound_steps(steps)

    expanded = []
    for step in cleaned_steps:
        kind = step.get('type', 'unknown')
        base = {
            'action_id': f'act_{uuid.uuid4().hex[:8]}',
            'edge_id': edge_id,
            'from_node': from_node,
            'to_node': to_node,
            'window_title': window_title,
        }
        if kind in ('key_press', 'key_combo'):
            base['type'] = 'key_combo'
            combo = step.get('keys', [])
            if not combo and step.get('key'):
                combo = [step['key']]
            base['keys'] = combo
        elif kind in ('text_input', 'type'):
            base['type'] = 'type'
            raw_text = step.get('text', '')
            base['text'] = self._substitute_vars(raw_text, params, step)
        elif kind == 'wait':
            base['type'] = 'wait'
            base['duration_ms'] = step.get('duration_ms', 500)
        elif kind in ('mouse_click', 'click'):
            base['type'] = 'click'
            # Direct normalized coordinates (x_pct/y_pct) when available...
            px = step.get('x_pct', 0.0)
            py = step.get('y_pct', 0.0)
            # ...otherwise derive them from the absolute position and the
            # reference resolution.
            if px == 0.0 and py == 0.0:
                pos = step.get('position', [])
                if pos and len(pos) == 2 and (pos[0] > 0 or pos[1] > 0):
                    ref_w = step.get('ref_width', 1920) or 1920
                    ref_h = step.get('ref_height', 1080) or 1080
                    px = round(pos[0] / ref_w, 6)
                    py = round(pos[1] / ref_h, 6)
            base['x_pct'] = px
            base['y_pct'] = py
            base['button'] = step.get('button', 'left')
        else:
            # Unknown step type: drop it.
            continue
        expanded.append(base)

    # Post-expansion cleanup of the enriched actions.
    return clean_enriched_actions(expanded)
||
# =========================================================================
# Hybrid replay: raw events + workflow structure
# =========================================================================
||
def build_hybrid_replay(
    self,
    workflow,
    session_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Build a hybrid replay combining raw events and workflow structure.

    The hybrid replay uses:
    - the raw events (live_events.jsonl) as the SOURCE OF ACTIONS
    - the workflow structure (nodes) as the VERIFICATION STRUCTURE

    Events are grouped per node transition. Between each group, a
    ``verify_screen`` action is inserted to check that the screen
    matches the expected node before continuing.

    Args:
        workflow: Workflow object or raw dict with nodes/edges.
        session_id: Explicit session identifier (optional; otherwise
            derived from the workflow metadata).
            NOTE(review): currently unused in this body — confirm whether
            it should feed the session lookup.

    Returns:
        List of clean actions ready for the replay queue, with
        ``verify_screen`` steps interleaved between transition groups;
        [] when the session or its events cannot be located/mapped.
    """
    import uuid

    # 1. Load the raw events.
    original_events = self._load_original_events_for_workflow(workflow)

    # When the standard lookup fails, widen the time window.
    # The SessionWorker may build the workflow 1-2h after the capture
    # (long VLM analysis), so the 10-minute window is too tight.
    if not original_events:
        original_events, session_dir_hint = self._find_session_wide_search(workflow)
    else:
        session_dir_hint = None

    if not original_events:
        logger.warning("build_hybrid_replay : aucun événement brut trouvé")
        return []

    # 2. Locate the source session directory.
    session_dir = session_dir_hint or self._find_session_dir_for_workflow(workflow)
    if not session_dir:
        # Fallback: search again with the wide window.
        if not session_dir_hint:
            _, session_dir = self._find_session_wide_search(workflow, return_dir=True)
        if not session_dir:
            logger.warning("build_hybrid_replay : dossier session introuvable")
            return []

    # 3. Map the session screenshots onto the workflow nodes.
    node_timeline = self._map_screenshots_to_nodes(session_dir, workflow)
    if not node_timeline:
        logger.warning(
            "build_hybrid_replay : impossible de mapper les screenshots aux nodes, "
            "fallback sur extract_enriched_actions"
        )
        return []

    # 4. Group the events by node transition.
    groups = self._group_events_by_transition(original_events, node_timeline)
    if not groups:
        logger.warning("build_hybrid_replay : aucun groupe de transition construit")
        return []

    # 5. Infer the screen resolution.
    screen_w, screen_h = self._infer_screen_resolution(original_events)

    # 6. Build the action list.
    actions = []
    for group_idx, group in enumerate(groups):
        group_events = group["events"]
        to_node = group["to_node"]

        # Convert the raw events into replay actions.
        group_actions = self._events_to_replay_actions(
            group_events, screen_w, screen_h, group_idx,
        )

        # Clean the group (drop parasites, merge text, dedupe combos, waits).
        group_actions = clean_enriched_actions(group_actions)

        actions.extend(group_actions)

        # Insert a visual verification after each group.
        if to_node:
            actions.append({
                "action_id": f"act_verify_{uuid.uuid4().hex[:8]}",
                "type": "verify_screen",
                "expected_node": to_node,
                "timeout_ms": 5000,
                "group": group_idx,
            })

    logger.info(
        "Replay hybride construit : %d actions (%d groupes, %d events bruts, "
        "résolution=%dx%d)",
        len(actions), len(groups), len(original_events), screen_w, screen_h,
    )
    return actions
||
def _map_screenshots_to_nodes(
    self,
    session_dir: Path,
    workflow,
) -> List[tuple]:
    """Map each screenshot of the session to the matching workflow node.

    Uses window_focus_change events to associate a timestamp with a
    window_title, then the window_title with a node (via
    node.template.window.title_contains).

    Args:
        session_dir: Session directory containing ``live_events.jsonl``.
        workflow: Workflow object exposing ``.nodes``, or a plain dict.

    Returns:
        List of (timestamp, node_id) sorted chronologically, or an empty
        list when the mapping is impossible.
    """
    import json as _json

    # Extract window titles from the workflow nodes
    if hasattr(workflow, 'nodes'):
        nodes = workflow.nodes
    elif isinstance(workflow, dict):
        nodes = workflow.get('nodes', [])
    else:
        return []

    # Build an index title_fragment -> node_id.
    # A node may carry a title_contains such as "Bloc-notes" or "Sans titre".
    title_to_node = {}
    node_order = []  # Topological order (by index)
    for node in nodes:
        nid = node.node_id if hasattr(node, 'node_id') else node.get('node_id', '')
        node_order.append(nid)
        win_title = self._extract_window_title(node)
        if win_title:
            title_to_node[win_title.lower()] = nid

    if not title_to_node:
        logger.debug("Aucun titre de fenêtre dans les nodes du workflow")
        return []

    # Load window changes from live_events.jsonl
    events_file = session_dir / "live_events.jsonl"
    if not events_file.exists():
        return []

    window_changes = []  # [(timestamp, window_title)]
    try:
        for line in events_file.read_text(encoding="utf-8").splitlines():
            if not line.strip():
                continue
            try:
                evt = _json.loads(line)
            except _json.JSONDecodeError:
                continue

            event_data = evt.get("event", evt)
            if event_data.get("type", "") != "window_focus_change":
                continue

            # Parse the timestamp only for events we actually keep, and
            # guard the conversion: previously float() ran on EVERY event
            # before the type check, so a single malformed timestamp on an
            # unrelated event bubbled to the outer except and discarded
            # the whole mapping.
            try:
                ts = float(event_data.get("timestamp", evt.get("timestamp", 0)))
            except (TypeError, ValueError):
                continue

            to_info = event_data.get("to") or event_data.get("window") or {}
            title = to_info.get("title", "")
            if title:
                window_changes.append((ts, title))
    except Exception as e:
        logger.warning("Erreur lecture events pour mapping screenshots : %s", e)
        return []

    if not window_changes:
        # Fallback: use the screenshot timestamps (mtime) and map them to
        # the nodes in workflow order
        return self._map_screenshots_by_order(session_dir, node_order)

    # Associate each window change with the matching node
    timeline = []
    last_node_id = None
    for ts, title in sorted(window_changes, key=lambda x: x[0]):
        matched_node = self._match_title_to_node(title, title_to_node)
        if matched_node and matched_node != last_node_id:
            timeline.append((ts, matched_node))
            last_node_id = matched_node

    # If the timeline is empty, try a looser mapping using the screenshots
    if not timeline:
        return self._map_screenshots_by_order(session_dir, node_order)

    logger.info(
        "Timeline node mappée : %d transitions (%s)",
        len(timeline),
        " → ".join(nid for _, nid in timeline),
    )
    return timeline
|
||
|
||
def _match_title_to_node(self, window_title: str, title_to_node: dict) -> Optional[str]:
|
||
"""Matcher un titre de fenêtre à un node via les fragments de titre.
|
||
|
||
Le matching est insensible à la casse et cherche si le fragment du node
|
||
est contenu dans le titre de la fenêtre.
|
||
|
||
Returns:
|
||
Le node_id correspondant, ou None.
|
||
"""
|
||
title_lower = window_title.lower()
|
||
best_match = None
|
||
best_len = 0
|
||
for fragment, node_id in title_to_node.items():
|
||
if fragment in title_lower and len(fragment) > best_len:
|
||
best_match = node_id
|
||
best_len = len(fragment)
|
||
return best_match
|
||
|
||
def _map_screenshots_by_order(
|
||
self,
|
||
session_dir: Path,
|
||
node_order: List[str],
|
||
) -> List[tuple]:
|
||
"""Fallback : mapper les screenshots aux nodes dans l'ordre topologique.
|
||
|
||
Utilisé quand les window_focus_change ne sont pas disponibles. Distribue
|
||
les screenshots uniformément entre les nodes.
|
||
|
||
Returns:
|
||
Liste de (timestamp, node_id).
|
||
"""
|
||
shots_dir = session_dir / "shots"
|
||
if not shots_dir.exists() or not node_order:
|
||
return []
|
||
|
||
shot_files = sorted(shots_dir.glob("shot_*_full.png"))
|
||
if not shot_files:
|
||
return []
|
||
|
||
timeline = []
|
||
shots_per_node = max(1, len(shot_files) // max(1, len(node_order)))
|
||
|
||
for i, node_id in enumerate(node_order):
|
||
shot_idx = min(i * shots_per_node, len(shot_files) - 1)
|
||
ts = shot_files[shot_idx].stat().st_mtime
|
||
timeline.append((ts, node_id))
|
||
|
||
return timeline
|
||
|
||
def _group_events_by_transition(
    self,
    events: List[Dict[str, Any]],
    node_timeline: List[tuple],
) -> List[Dict[str, Any]]:
    """Group raw events by node transition.

    For each event, determines in which transition interval it falls
    (between one node change and the next).

    Args:
        events: Raw events (dicts with ``timestamp`` and ``type``).
        node_timeline: (timestamp, node_id) list sorted chronologically.

    Returns:
        List of groups:
        ``[{"from_node": "node_000", "to_node": "node_001", "events": [...]}, ...]``
    """
    if not node_timeline or not events:
        return []

    # Filter out non-actionable events ONCE, instead of re-testing every
    # event's type inside the per-transition loop (was O(transitions ×
    # events) type checks for the same result).
    actionable = []
    for evt in events:
        evt_type = evt.get("type", "")
        if evt_type in _PARASITIC_ACTION_TYPES:
            continue
        if evt_type in ("window_focus_change", "screenshot", "heartbeat"):
            continue
        actionable.append(evt)

    # Build the transition intervals: each transition runs from
    # node_timeline[i] to node_timeline[i+1]; the last one is open-ended.
    groups = []
    for i in range(len(node_timeline)):
        from_node = node_timeline[i][1]
        to_node = node_timeline[i + 1][1] if i + 1 < len(node_timeline) else ""
        start_ts = node_timeline[i][0]
        end_ts = node_timeline[i + 1][0] if i + 1 < len(node_timeline) else float("inf")

        # Collect the events falling inside this interval
        group_events = [
            evt for evt in actionable
            if start_ts <= float(evt.get("timestamp", 0)) < end_ts
        ]

        if group_events:
            groups.append({
                "from_node": from_node,
                "to_node": to_node,
                "events": group_events,
            })

    # If no event carried a usable timestamp, or everything was filtered
    # out as parasitic, fall back to sequential (timestamp-free) grouping.
    if not groups:
        groups = self._group_events_sequential(events, node_timeline)

    return groups
|
||
|
||
def _group_events_sequential(
|
||
self,
|
||
events: List[Dict[str, Any]],
|
||
node_timeline: List[tuple],
|
||
) -> List[Dict[str, Any]]:
|
||
"""Groupement séquentiel des événements quand les timestamps ne matchent pas.
|
||
|
||
Distribue les événements actionnables entre les transitions du workflow
|
||
de manière proportionnelle.
|
||
|
||
Returns:
|
||
Liste de groupes comme ``_group_events_by_transition``.
|
||
"""
|
||
# Filtrer les événements actionnables
|
||
actionable_types = {
|
||
"mouse_click", "text_input", "key_press", "key_combo", "scroll",
|
||
}
|
||
actionable = [
|
||
e for e in events
|
||
if e.get("type", "") in actionable_types
|
||
]
|
||
|
||
if not actionable or len(node_timeline) < 2:
|
||
# Un seul node → tout dans un seul groupe
|
||
if actionable and node_timeline:
|
||
return [{
|
||
"from_node": node_timeline[0][1],
|
||
"to_node": node_timeline[-1][1] if len(node_timeline) > 1 else "",
|
||
"events": actionable,
|
||
}]
|
||
return []
|
||
|
||
# Distribuer proportionnellement
|
||
n_transitions = len(node_timeline) - 1
|
||
events_per_group = max(1, len(actionable) // n_transitions)
|
||
groups = []
|
||
|
||
for i in range(n_transitions):
|
||
start_idx = i * events_per_group
|
||
end_idx = (i + 1) * events_per_group if i < n_transitions - 1 else len(actionable)
|
||
group_events = actionable[start_idx:end_idx]
|
||
if group_events:
|
||
groups.append({
|
||
"from_node": node_timeline[i][1],
|
||
"to_node": node_timeline[i + 1][1],
|
||
"events": group_events,
|
||
})
|
||
|
||
return groups
|
||
|
||
def _events_to_replay_actions(
    self,
    events: List[Dict[str, Any]],
    screen_w: int,
    screen_h: int,
    group_idx: int,
) -> List[Dict[str, Any]]:
    """Convert a list of raw events into normalized replay actions.

    Pre-merges consecutive text_input events (individual keystrokes) into
    a single text block before converting them into timed actions. This
    avoids producing dozens of single-character ``type`` actions.

    Inserts a ``wait`` between actions of different types when the natural
    delay is significant (> 2s = the user pausing to think).

    Args:
        events: Raw events of one group.
        screen_w: Source screen width (pixels).
        screen_h: Source screen height (pixels).
        group_idx: Index of the transition group.

    Returns:
        List of normalized actions for the replay.
    """
    import uuid

    # ── Phase 1: pre-merge consecutive text_input events ──
    # Keystrokes produce one text_input per character. They are merged
    # into a single event carrying the full text and the timestamp of
    # the first character.
    merged_events = []
    for evt in events:
        evt_type = evt.get("type", "")

        # Drop non-actionable events
        if evt_type in _PARASITIC_ACTION_TYPES:
            continue
        if evt_type in ("window_focus_change", "screenshot", "heartbeat"):
            continue

        if evt_type == "text_input":
            text = evt.get("text", "")
            if not text:
                continue
            # Merge with the previous entry if it is also a text_input
            if merged_events and merged_events[-1].get("type") == "text_input":
                merged_events[-1]["text"] = merged_events[-1].get("text", "") + text
                # Keep the most recent timestamp for the next delta computation
                merged_events[-1]["_end_ts"] = float(evt.get("timestamp", 0))
                continue

        # Shallow copy so the in-place merge above never mutates the
        # caller's event dicts
        merged_events.append(dict(evt))

    # ── Phase 2: convert the merged events into replay actions ──
    actions = []
    last_action_end_ts = 0.0  # end timestamp of the previous action; 0.0 = none yet

    for evt in merged_events:
        evt_type = evt.get("type", "")
        evt_ts = float(evt.get("timestamp", 0))

        # Compute the delay between the end of the previous action and
        # the start of this one. Waits are only inserted for significant
        # pauses (> 2s), not between every keystroke.
        if last_action_end_ts > 0 and evt_ts > last_action_end_ts:
            delta_ms = int((evt_ts - last_action_end_ts) * 1000)
            if delta_ms > 2000:
                # Long idle periods are capped at 5s in the replay
                capped_ms = min(delta_ms, 5000)
                actions.append({
                    "action_id": f"act_wait_{uuid.uuid4().hex[:8]}",
                    "type": "wait",
                    "duration_ms": capped_ms,
                    "group": group_idx,
                })

        # Update the end-of-action timestamp. NOTE: this runs before the
        # dispatch below, so events skipped by a `continue` still advance
        # the clock used for the next wait.
        end_ts = float(evt.get("_end_ts", evt_ts))
        if end_ts > 0:
            last_action_end_ts = end_ts
        elif evt_ts > 0:
            last_action_end_ts = evt_ts

        action = {
            "action_id": f"act_hyb_{uuid.uuid4().hex[:8]}",
            "group": group_idx,
        }

        if evt_type == "mouse_click":
            # pos is assumed to be [x, y] in source-screen pixels — the
            # division below normalizes to screen percentages. TODO confirm
            # against the recorder.
            pos = evt.get("pos", [])
            if pos and len(pos) == 2:
                action["type"] = "click"
                action["x_pct"] = round(pos[0] / screen_w, 6)
                action["y_pct"] = round(pos[1] / screen_h, 6)
                action["button"] = evt.get("button", "left")
            else:
                # A click without coordinates cannot be replayed
                continue

        elif evt_type == "text_input":
            text = evt.get("text", "")
            if not text:
                continue
            action["type"] = "type"
            action["text"] = text

        elif evt_type in ("key_press", "key_combo"):
            keys = evt.get("keys", [])
            if not keys:
                # Single-key events may carry "key" instead of "keys"
                key = evt.get("key", "")
                if key:
                    keys = [key]
            if not keys:
                continue
            keys = _sanitize_keys(keys)
            if _is_modifier_only(keys):
                # A bare modifier (ctrl/shift/...) is not replayable
                continue
            action["type"] = "key_combo"
            action["keys"] = keys

        elif evt_type == "scroll":
            # A scroll without coordinates is still emitted (no x/y_pct)
            pos = evt.get("pos", [])
            action["type"] = "scroll"
            if pos and len(pos) == 2:
                action["x_pct"] = round(pos[0] / screen_w, 6)
                action["y_pct"] = round(pos[1] / screen_h, 6)
            # Default delta when absent; sign/units depend on the
            # recorder — TODO confirm
            action["delta"] = evt.get("delta", -3)

        else:
            continue

        actions.append(action)

    return actions
|
||
|
||
@property
def stats(self) -> Dict[str, Any]:
    """Snapshot of the processor's statistics."""
    # Build the whole snapshot while holding the data lock, then release
    # it before returning.
    with self._data_lock:
        snapshot = {
            "active_sessions": self.session_manager.active_session_count,
            "total_sessions": len(self.session_manager.session_ids),
            "total_workflows": len(self._workflows),
            "faiss_vectors": self._faiss_manager.index.ntotal if self._faiss_manager else 0,
            "initialized": self._initialized,
        }
    return snapshot
|