Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 12s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 15s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
VWB Executor : - _check_screen_for_patterns() : capture écran + OCR + pattern matching - _handle_detected_pattern() : clic automatique sur dialogues connus - Vérifie entre chaque étape en mode intelligent/debug - Si un dialogue bloque (OK, Save, Cancel), Léa le gère seule Stream Processor : - Enrichit les ScreenState avec ui_pattern/ui_pattern_action/ui_pattern_target - Les patterns détectés sont loggés et stockés dans les résultats - Permet au GraphBuilder de savoir quels écrans sont des dialogues Phase 2 du plan "connaissance native de l'environnement". Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
5008 lines
203 KiB
Python
5008 lines
203 KiB
Python
"""
|
||
StreamProcessor — Pont entre le streaming Agent V1 et le core pipeline RPA Vision V3.
|
||
|
||
Orchestre les composants core (ScreenAnalyzer, CLIP, FAISS, GraphBuilder)
|
||
pour traiter en temps réel les screenshots et événements reçus via fibre.
|
||
|
||
Tous les calculs GPU tournent ici (serveur RTX 5070).
|
||
"""
|
||
|
||
import base64
|
||
import hashlib
|
||
import logging
|
||
import os
|
||
import threading
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
import numpy as np
|
||
|
||
from .live_session_manager import LiveSessionManager
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# ---------------------------------------------------------------------------
# Filtering of parasitic events (lone modifiers, empty text_input, etc.)
# Used at 3 levels: reception (process_event), expansion (compound), and
# upstream in GraphBuilder._find_transition_events / _build_compound_action.
# ---------------------------------------------------------------------------

# Key names that are pure modifiers: an event made solely of these keys
# carries no user intent and is filtered out.
_MODIFIER_ONLY_KEYS = {
    "ctrl", "ctrl_l", "ctrl_r", "control", "control_l", "control_r",
    "alt", "alt_l", "alt_r", "alt_gr",
    "shift", "shift_l", "shift_r",
    "win", "win_l", "win_r", "cmd", "cmd_l", "cmd_r",
    "meta", "meta_l", "meta_r", "super", "super_l", "super_r",
}

# Numpad vk codes -> characters (layout-independent)
_NUMPAD_VK_MAP = {
    96: '0', 97: '1', 98: '2', 99: '3', 100: '4',
    101: '5', 102: '6', 103: '7', 104: '8', 105: '9',
    106: '*', 107: '+', 109: '-', 110: '.', 111: '/',
}

# Conversion table from control characters back to readable keys
# (produced by some agents that capture raw keycodes)
_CONTROL_CHAR_MAP = {
    '\x01': 'a', '\x02': 'b', '\x03': 'c', '\x04': 'd', '\x05': 'e',
    '\x06': 'f', '\x07': 'g', '\x08': 'h', '\x09': 'i', '\x0a': 'j',
    '\x0b': 'k', '\x0c': 'l', '\x0d': 'm', '\x0e': 'n', '\x0f': 'o',
    '\x10': 'p', '\x11': 'q', '\x12': 'r', '\x13': 's', '\x14': 't',
    '\x15': 'u', '\x16': 'v', '\x17': 'w', '\x18': 'x', '\x19': 'y',
    '\x1a': 'z',
}

# Parasitic event types to ignore in enriched actions
_PARASITIC_ACTION_TYPES = frozenset({
    'heartbeat', 'focus_change', 'window_focus_change',
    'screenshot', 'status', 'ping', 'pong',
})
|
||
|
||
|
||
def _is_modifier_only(keys: list) -> bool:
    """Return True when *keys* is empty or made up exclusively of modifier keys."""
    if not keys:
        return True
    for key in keys:
        if key.lower() not in _MODIFIER_ONLY_KEYS:
            return False
    return True
|
||
|
||
|
||
def _sanitize_keys(keys: list) -> list:
    """Clean a key list: drop empty entries, map control chars to letters.

    Some agents capture raw keycodes, turning e.g. Ctrl+A into '\\x01';
    _CONTROL_CHAR_MAP restores the readable letter.
    """
    return [_CONTROL_CHAR_MAP.get(k, k) for k in keys if k]
|
||
|
||
|
||
def _is_parasitic_event(event_data: Dict[str, Any]) -> bool:
    """Tell whether an event is noise that must be filtered out.

    Rejected events:
      - key_press / key_combo whose key list is empty or modifier-only
      - text_input carrying an empty text
    """
    etype = event_data.get("type", "")
    payload = event_data.get("data", {})

    if etype in ("key_press", "key_combo"):
        keys = event_data.get("keys", payload.get("keys", []))
        return not keys or _is_modifier_only(keys)

    if etype == "text_input":
        text = event_data.get("text", payload.get("text", ""))
        return not text

    return False
|
||
|
||
|
||
def _reconstruct_text_from_raw_keys(raw_keys: list) -> str:
    """Rebuild the typed text from the vk codes of *raw_keys*.

    Fixes AZERTY capture issues, notably:
      - Numpad / (vk=111) captured as char='!' -> corrected to '/'
      - Numpad 0-9 (vk=96-105) captured as char=None -> corrected to '0'-'9'
    """
    chars = []
    for event in raw_keys:
        if event.get("action") != "press":
            continue
        # Stray modifier events left in the buffer are skipped.
        if event.get("kind", "") == "key" and event.get("name", "") in _MODIFIER_ONLY_KEYS:
            continue
        vk = event.get("vk", 0)
        if vk in _NUMPAD_VK_MAP:
            # Numpad keys: fixed, layout-independent mapping.
            chars.append(_NUMPAD_VK_MAP[vk])
            continue
        char = event.get("char")
        # Regular key with a valid single printable character
        if char and len(char) == 1 and char.isprintable():
            chars.append(char)
    return "".join(chars)
|
||
|
||
|
||
def _key_combo_printable_char(keys: list) -> Optional[str]:
    """Return the single printable character a key_combo produces, if any.

    Examples:
      - ['ctrl', '@'] -> '@' (AltGr+0 on AZERTY, captured as ctrl+@)
      - ['shift', 'A'] -> 'A'
      - ['ctrl', 'c'] -> None (a shortcut, not a character)
      - ['enter'] -> None (not a printable character)
    """
    if not keys:
        return None
    non_modifiers = [k for k in keys if k.lower() not in _MODIFIER_ONLY_KEYS]
    if len(non_modifiers) != 1:
        return None
    char = non_modifiers[0]
    if len(char) != 1 or not char.isprintable():
        # A special key name (enter, f5, ...) rather than a character.
        return None

    modifiers = {k.lower() for k in keys if k.lower() in _MODIFIER_ONLY_KEYS}
    if modifiers <= {"shift", "shift_l", "shift_r"}:
        # Shift (or no modifier) + char = uppercase/special character -> OK
        return char
    has_altgr = "alt_gr" in modifiers
    has_ctrl_alt = "ctrl" in modifiers and ("alt" in modifiers or "alt_r" in modifiers)
    if has_altgr or has_ctrl_alt:
        # AltGr + char = special character (@ # € etc.) -> OK
        return char
    if "ctrl" in modifiers and not char.isalpha():
        # Ctrl + NON-alphabetic char is probably residual AltGr.
        # On AZERTY, AltGr+0 produces @, captured as ['ctrl', 'alt_gr'] then
        # ['ctrl', '@']; the first combo is filtered (modifier-only), the
        # second only has 'ctrl' + '@'.
        return char
    # ctrl + letter alone = shortcut (Ctrl+S, Ctrl+C) -> not a character
    return None
|
||
|
||
|
||
def _merge_consecutive_text_inputs(steps: list) -> list:
|
||
"""Fusionne les text_input consécutifs en un seul."""
|
||
merged = []
|
||
for step in steps:
|
||
if (step.get("type") in ("text_input", "type")
|
||
and merged
|
||
and merged[-1].get("type") in ("text_input", "type")):
|
||
merged[-1]["text"] = merged[-1].get("text", "") + step.get("text", "")
|
||
else:
|
||
merged.append(dict(step)) # copie pour ne pas muter l'original
|
||
return merged
|
||
|
||
|
||
def _dedup_consecutive_combos(steps: list) -> list:
|
||
"""Supprime les key_combo dupliqués consécutifs."""
|
||
deduped = []
|
||
for step in steps:
|
||
if (step.get("type") in ("key_combo", "key_press")
|
||
and deduped
|
||
and deduped[-1].get("type") in ("key_combo", "key_press")
|
||
and deduped[-1].get("keys") == step.get("keys")):
|
||
continue # Doublon → skip
|
||
deduped.append(step)
|
||
return deduped
|
||
|
||
|
||
def _filter_parasitic_steps(steps: list) -> list:
    """Remove key_combo/key_press steps made only of lone modifier keys."""
    kept = []
    for step in steps:
        is_parasitic = (step.get("type") in ("key_combo", "key_press")
                        and _is_modifier_only(step.get("keys", [])))
        if not is_parasitic:
            kept.append(step)
    return kept
|
||
|
||
|
||
def _ensure_min_waits(steps: list, min_wait_ms: int = 300) -> list:
|
||
"""Ajoute un wait de min_wait_ms entre les steps si aucun wait n'existe."""
|
||
if not steps:
|
||
return steps
|
||
result = [steps[0]]
|
||
for step in steps[1:]:
|
||
if result[-1].get("type") != "wait" and step.get("type") != "wait":
|
||
result.append({"type": "wait", "duration_ms": min_wait_ms})
|
||
result.append(step)
|
||
return result
|
||
|
||
|
||
def clean_compound_steps(steps: list) -> list:
    """Full cleaning pipeline for the steps of a compound action.

    Applied in order:
      1. Drop modifier-only key steps
      2. Merge consecutive text_input steps
      3. Deduplicate identical consecutive key_combo steps
      4. Insert minimum waits between steps when missing
    """
    stages = (
        _filter_parasitic_steps,
        _merge_consecutive_text_inputs,
        _dedup_consecutive_combos,
        _ensure_min_waits,
    )
    result = steps
    for stage in stages:
        result = stage(result)
    return result
|
||
|
||
|
||
def clean_enriched_actions(actions: list) -> list:
    """Clean a list of enriched actions to eliminate replay noise.

    Applied after all enriched actions have been built (post-BFS); works on
    actions in replay format (type, keys, text, etc.).

    Filters applied in order:
      1. Drop parasitic types (heartbeat, focus_change, screenshot, ...)
      2. Sanitize keys (control characters -> letters)
      3. Drop key_combo actions made only of lone modifiers
      4. Drop type/text_input actions with empty text
      5. Deduplicate identical consecutive key_combo actions
      6. Merge consecutive text_input (type) actions in the same window
      7. Drop consecutive waits (keep the longest)
    """
    if not actions:
        return actions

    # -- Steps 1-4: filter out parasitic actions --
    filtered = []
    for a in actions:
        atype = a.get('type', '')

        # Noise types coming from the raw stream
        if atype in _PARASITIC_ACTION_TYPES:
            continue

        # key_combo: sanitize keys, then drop modifier-only or empty combos
        if atype == 'key_combo':
            keys = _sanitize_keys(a.get('keys', []))
            if _is_modifier_only(keys):
                continue
            if not keys:
                continue
            # Copy with the sanitized keys so the original action is untouched.
            a = dict(a, keys=keys)

        # type/text_input: drop when the text is empty
        # NOTE(review): only 'type' is checked here although the docstring
        # also mentions text_input — confirm upstream normalization.
        if atype == 'type' and not a.get('text', '').strip():
            continue

        filtered.append(a)

    # -- Step 5: deduplicate identical consecutive key_combo actions --
    deduped = []
    for a in filtered:
        if (deduped
                and a.get('type') == 'key_combo'
                and deduped[-1].get('type') == 'key_combo'
                and a.get('keys') == deduped[-1].get('keys')):
            continue
        deduped.append(a)

    # -- Step 6: merge consecutive text_input (type) actions --
    merged = []
    for a in deduped:
        if (merged
                and a.get('type') == 'type'
                and merged[-1].get('type') == 'type'
                # Same target window (or no window at all)
                and a.get('window_title', '') == merged[-1].get('window_title', '')):
            merged[-1] = dict(merged[-1], text=merged[-1].get('text', '') + a.get('text', ''))
            continue
        merged.append(a)

    # -- Step 7: drop consecutive waits (keep the longest) --
    cleaned = []
    for a in merged:
        if (cleaned
                and a.get('type') == 'wait'
                and cleaned[-1].get('type') == 'wait'):
            if a.get('duration_ms', 0) > cleaned[-1].get('duration_ms', 0):
                cleaned[-1] = a
            continue
        cleaned.append(a)

    return cleaned
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Direct replay from raw events (without VLM/GraphBuilder)
# ---------------------------------------------------------------------------

# Event types ignored when building the raw replay
_IGNORED_EVENT_TYPES = frozenset({
    'heartbeat', 'focus_change', 'window_focus_change',
    'screenshot', 'action_result', 'status', 'ping', 'pong',
})

# Special shortcut combos that require a post-action wait
_POST_COMBO_WAITS = {
    # (tuple of keys normalized to lower case) -> wait_ms
    # NB: the tuples are sorted() alphabetically, matching _needs_post_wait.
    ('r', 'win'): 3000,  # Win+R -> Run dialog
    ('r', 'super'): 3000,
    ('meta', 'r'): 3000,
    ('enter',): 2000,  # Enter (confirmation)
    ('return',): 2000,
    ('ctrl', 's'): 3000,  # Ctrl+S
    ('ctrl', 's', 'shift'): 3000,  # Ctrl+Shift+S
    ('alt', 'f4'): 2000,  # Alt+F4
}
|
||
|
||
|
||
def _extract_screen_resolution(events: list) -> tuple:
|
||
"""Extraire la résolution d'écran depuis les métadonnées des événements.
|
||
|
||
Cherche d'abord le champ `screen_resolution` dans `screen_metadata`,
|
||
puis infère depuis les positions maximales des clics.
|
||
|
||
Returns:
|
||
Tuple (width, height).
|
||
"""
|
||
# Priorité 1 : screen_metadata.screen_resolution explicite
|
||
for evt in events:
|
||
event_data = evt.get("event", evt)
|
||
sm = event_data.get("screen_metadata", {})
|
||
sr = sm.get("screen_resolution")
|
||
if sr and isinstance(sr, (list, tuple)) and len(sr) == 2:
|
||
w, h = int(sr[0]), int(sr[1])
|
||
if w > 0 and h > 0:
|
||
return (w, h)
|
||
|
||
# Priorité 2 : inférer depuis les positions max des clics
|
||
return StreamProcessor._infer_screen_resolution(events)
|
||
|
||
|
||
def _should_cut_after_event(
    event_data: dict,
    saw_save_combo: bool = False,
    actions_count: int = 0,
) -> bool:
    """Return True when the replay must be cut right after this event.

    Cuts when:
      - Alt+F4 (close via shortcut)
      - Click in the systray AFTER meaningful actions (not at the start)
      - After Ctrl+S/Ctrl+Shift+S followed by a click in a non-application window
    """
    # Never cut at the very beginning (the first clicks are often on the
    # taskbar/search box to open an application)
    if actions_count < 3:
        return False
    evt_type = event_data.get("type", "")

    # Alt+F4
    if evt_type in ("key_combo", "key_press"):
        keys = event_data.get("keys", [])
        keys_lower = {k.lower() for k in keys if k}
        if "f4" in keys_lower and ("alt" in keys_lower or "alt_l" in keys_lower
                                   or "alt_r" in keys_lower):
            return True

    # Click in the systray (end-of-session windows)
    if evt_type == "mouse_click":
        window = event_data.get("window", {})
        title = (window.get("title", "") if isinstance(window, dict) else "").lower()
        if any(t in title for t in [
            "unknown_window", "dépassement de capacité",
            "fenêtre de dépassement", "overflow",
        ]):
            return True

        # After Ctrl+S/Ctrl+Shift+S: cut when the click lands in a
        # non-application window (terminal, agent, systray)
        if saw_save_combo:
            _CUT_WINDOW_PATTERNS = [
                "cmd.exe", "system32", "dépassement", "unknown",
                "powershell", "windowsterminal", "python.exe",
                "terminal", "systray",
            ]
            if any(t in title for t in _CUT_WINDOW_PATTERNS):
                return True
            # Also cut on app_name (more robust than the window title)
            app_name = (window.get("app_name", "") if isinstance(window, dict) else "").lower()
            if any(t in app_name for t in [
                "windowsterminal", "cmd.exe", "powershell",
                "python.exe", "explorer.exe",
            ]):
                return True

    return False
|
||
|
||
|
||
def _needs_post_wait(action: dict) -> int:
    """Return the wait (ms) to insert after this action, or 0 when none."""
    if action.get("type") != "key_combo":
        return 0
    # Normalize to the sorted lower-case tuple used as _POST_COMBO_WAITS keys.
    normalized = tuple(sorted(k.lower() for k in action.get("keys", []) if k))
    return _POST_COMBO_WAITS.get(normalized, 0)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Gemma4: reading the visible text of UI elements without OCR
# ---------------------------------------------------------------------------

# Port of the Ollama 0.20 Docker container hosting gemma4
# (overridable through the GEMMA4_PORT environment variable).
_GEMMA4_PORT = os.environ.get("GEMMA4_PORT", "11435")
|
||
|
||
|
||
def _unload_gemma4() -> None:
    """Unload gemma4 from the Docker GPU to free VRAM for qwen2.5vl."""
    try:
        import requests as _req
        # keep_alive=0 asks Ollama to evict the model immediately.
        _req.post(
            f"http://localhost:{_GEMMA4_PORT}/api/generate",
            json={"model": "gemma4:e4b", "keep_alive": 0},
            timeout=5,
        )
        logger.info("gemma4 déchargé du GPU (VRAM libérée)")
    except Exception:
        # Best-effort: failing to unload must never break the caller.
        pass
|
||
|
||
|
||
def _gemma4_read_element(
    img_b64: str,
    window_title: str = "",
    click_pos: Optional[tuple] = None,
) -> str:
    """Ask gemma4 to identify the clicked element.

    Accepts either a crop (80x80) or a full window screenshot.
    When click_pos is given, the image is a window screenshot and gemma4
    must identify the element at that position.

    Args:
        img_b64: Image encoded in base64.
        window_title: Title of the active window (adds prompt context).
        click_pos: (x, y) click position, or None for a pre-cropped element.

    Returns:
        The text that was read (e.g. "voiture electrique.txt") or "".
    """
    import requests as _requests

    context = f" in '{window_title}'" if window_title else ""
    if click_pos:
        prompt = (
            f"This is a screenshot of a window{context}. "
            f"The user clicked at position ({click_pos[0]}, {click_pos[1]}). "
            "What is the exact text or label of the element that was clicked? "
            "Answer ONLY the text, nothing else."
        )
    else:
        prompt = (
            f"This is a cropped UI element{context}. "
            "Read the exact text on this element. "
            "If it's an icon with no text, describe it in 2-3 words.\n"
            "Answer ONLY the text or label, nothing else."
        )

    try:
        # Low temperature + short num_predict: we want a terse literal answer.
        resp = _requests.post(f"http://localhost:{_GEMMA4_PORT}/api/chat", json={
            "model": "gemma4:e4b",
            "messages": [{"role": "user", "content": prompt, "images": [img_b64]}],
            "stream": False,
            "think": False,
            "options": {"temperature": 0.1, "num_predict": 30},
        }, timeout=15)
        if resp.ok:
            content = resp.json().get("message", {}).get("content", "").strip()
            # Clean up: strip quotes, trailing dot, surrounding whitespace
            content = content.strip('"\'').rstrip(".").strip()
            # Keep only plausible labels (2-60 chars)
            if content and 2 <= len(content) <= 60:
                return content
    except Exception as e:
        logger.debug("gemma4 read element échoué : %s", e)

    return ""
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# VLM identification d'éléments UI (pour les éléments sans texte OCR)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _vlm_identify_element(anchor_b64: str, window_title: str = "") -> str:
    """Ask the VLM to describe a UI element from its crop.

    Used during build_replay when a clicked element has no visible text
    (YOLO icon without an OCR label). The VLM describes WHAT it is
    (button, icon, menu) so semantic resolution can happen at replay time.

    Args:
        anchor_b64: Crop of the element, base64-encoded.
        window_title: Title of the source window (adds prompt context).

    Returns:
        Short description of the element (e.g. "search icon", "Word icon"),
        or an empty string when the VLM is unavailable or unhelpful.
    """
    try:
        import io
        from PIL import Image
    except ImportError:
        return ""

    try:
        # Decode the base64 crop and re-encode it as PNG fully in memory.
        # (The previous implementation round-tripped through a temp file,
        # leaking the read handle and — on any exception — the file itself.)
        img = Image.open(io.BytesIO(base64.b64decode(anchor_b64)))
        buf = io.BytesIO()
        img.save(buf, format="PNG")
        crop_b64 = base64.b64encode(buf.getvalue()).decode()

        import requests as _requests
        from core.detection.vlm_config import get_vlm_model
        # Configurable VLM model (gemma4:e4b by default)
        _enrich_model = get_vlm_model()
        context = f" from the window '{window_title}'" if window_title else ""
        resp = _requests.post("http://localhost:11434/api/chat", json={
            "model": _enrich_model,
            "messages": [
                {"role": "system", "content": "You name UI elements in 2-5 words. No explanation."},
                {"role": "user", "content": (
                    f"This is a UI element{context}. "
                    "Name it in 2-5 words. Examples: 'save icon in title bar', "
                    "'Windows search icon', 'close button', 'file menu'."
                ), "images": [crop_b64]},
            ],
            "stream": False,
            "options": {"temperature": 0.1, "num_predict": 20},
        }, timeout=30)

        if resp.ok:
            raw = resp.json().get("message", {}).get("content", "").strip()
            # Extract a short label from the answer (the VLM is often chatty):
            # strip the common lead-in prefixes first.
            for prefix in (
                "Based on the image, the UI element shown is a ",
                "Based on the image, the UI element is a ",
                "Based on the image, this is a ",
                "Based on the image, it is a ",
                "Based on the image, I can see ",
                "Based on the image, ",
                "The UI element shown is a ",
                "The UI element is a ",
                "The element is a ",
                "This is a ", "It is a ", "It's a ", "I can see a ",
                "I can see ", "A ",
            ):
                if raw.lower().startswith(prefix.lower()):
                    raw = raw[len(prefix):]
                    break
            # Reject answers that are chatter rather than a label
            reject_patterns = (
                "several", "multiple", "various", "image",
                "I can", "there are", "there is", "elements",
                "the following", "here are",
            )
            if any(p in raw.lower()[:30] for p in reject_patterns):
                logger.debug("VLM identify : réponse bavarde rejetée (raw='%s')", raw[:60])
                return ""

            # Keep the first 5 useful words
            words = raw.split()[:5]
            label = " ".join(words).strip('",.\' ').rstrip(".")
            if label and 2 <= len(label) <= 40:
                logger.info("VLM identify element : '%s'", label)
                return label
            logger.debug("VLM identify : label trop court/long après nettoyage (raw='%s')", raw[:80])
    except Exception as e:
        logger.debug("VLM identify element échoué : %s", e)

    return ""
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# SomEngine — Set-of-Mark enrichment of clicks during build_replay
# ---------------------------------------------------------------------------

# screenshot_id -> SomResult (build_replay cache: avoids re-running detection)
_som_cache: Dict[str, Any] = {}
# Maximum number of cached SomResults before FIFO eviction.
_SOM_CACHE_MAX = 50
|
||
|
||
|
||
def _get_som_engine():
|
||
"""Singleton SomEngine partagé."""
|
||
try:
|
||
from core.detection.som_engine import get_shared_engine
|
||
return get_shared_engine()
|
||
except ImportError:
|
||
return None
|
||
|
||
|
||
def _som_identify_clicked_element(
    event_data: dict,
    session_dir: Optional[Path],
    screen_w: int,
    screen_h: int,
) -> Optional[dict]:
    """Identify the clicked UI element via SomEngine (YOLO + docTR).

    Loads the event's full screenshot, runs SomEngine to detect every
    element, then picks the one located under the click position.

    Note: screen_w/screen_h are currently unread in this body — presumably
    kept for interface parity with sibling helpers (TODO confirm).

    Returns:
        Dict with id, label, source, bbox_norm, center_norm, confidence and
        element_count, or None when SomEngine is unavailable or no element
        is found under the click.
    """
    engine = _get_som_engine()
    if engine is None:
        return None

    if not session_dir:
        return None

    shots_dir = session_dir / "shots"
    if not shots_dir.is_dir():
        return None

    # Locate the full screenshot for this event
    screenshot_id = event_data.get("screenshot_id", "")
    if not screenshot_id:
        return None

    full_path = shots_dir / f"{screenshot_id}_full.png"
    if not full_path.is_file():
        # Fallback: try without the _full suffix
        full_path = shots_dir / f"{screenshot_id}.png"
        if not full_path.is_file():
            return None

    # Check the SomResult cache keyed by (session_dir, screenshot_id)
    cache_key = f"{session_dir}:{screenshot_id}"
    if cache_key in _som_cache:
        result = _som_cache[cache_key]
    else:
        try:
            from PIL import Image
            img = Image.open(full_path).convert("RGB")
        except Exception as e:
            logger.debug("SoM: impossible de charger %s : %s", full_path, e)
            return None

        # Run SomEngine on the full screenshot
        try:
            result = engine.analyze(img)
        except Exception as e:
            logger.warning("SoM: erreur d'analyse : %s", e)
            return None

        # Store in the cache (elements only, not the annotated image)
        from core.detection.som_engine import SomResult
        cached = SomResult(
            elements=result.elements,
            width=result.width,
            height=result.height,
            analysis_time_ms=result.analysis_time_ms,
        )
        if len(_som_cache) >= _SOM_CACHE_MAX:
            # Evict the oldest entry (FIFO: dicts preserve insertion order)
            oldest_key = next(iter(_som_cache))
            del _som_cache[oldest_key]
        _som_cache[cache_key] = cached

    if not result.elements:
        return None

    # Find the element under the click
    pos = event_data.get("pos", [])
    if not pos or len(pos) < 2:
        return None

    click_x, click_y = int(pos[0]), int(pos[1])
    elem = result.find_element_at(click_x, click_y, margin=30)
    if elem is None:
        logger.debug(
            "SoM: aucun élément trouvé au clic (%d, %d) parmi %d éléments",
            click_x, click_y, len(result.elements),
        )
        return None

    logger.info(
        "SoM: clic (%d,%d) → élément #%d '%s' (source=%s, conf=%.2f)",
        click_x, click_y, elem.id, elem.label, elem.source, elem.confidence,
    )
    return {
        "id": elem.id,
        "label": elem.label,
        "source": elem.source,
        "bbox_norm": list(elem.bbox_norm),
        "center_norm": list(elem.center_norm),
        "confidence": elem.confidence,
        "element_count": len(result.elements),
    }
|
||
|
||
|
||
def _load_crop_for_event(
    event_data: dict,
    session_dir: Optional[Path],
    screen_w: int = 0,
    screen_h: int = 0,
) -> Optional[str]:
    """Load the reference crop (anchor) associated with a mouse_click event.

    Search strategy (by priority):
      1. vision_info.crop -> extract the file name, look in session_dir/shots/
      2. screenshot_id -> look for {screenshot_id}_crop.png in session_dir/shots/
      3. Timestamp -> find the closest focus_XXXX.png in session_dir/shots/
      4. Fallback -> crop the full screenshot around the click position (80x80)

    Args:
        event_data: Raw event (type mouse_click).
        session_dir: Session directory (contains shots/).
        screen_w: Screen width (gates the fallback crop).
        screen_h: Screen height (gates the fallback crop).

    Returns:
        Crop image encoded in base64, or None when nothing was found.
    """
    if not session_dir:
        return None

    shots_dir = Path(session_dir) / "shots"
    if not shots_dir.is_dir():
        return None

    def _read_png_b64(path: Path) -> Optional[str]:
        """Read a PNG file and return its content as base64 (None on failure)."""
        try:
            # Empty files are treated as missing.
            if path.exists() and path.stat().st_size > 0:
                return base64.b64encode(path.read_bytes()).decode("utf-8")
        except Exception as e:
            logger.debug("Impossible de lire le crop %s : %s", path, e)
        return None

    # -- Strategy 1: vision_info.crop (Windows file name -> local lookup) --
    vision_info = event_data.get("vision_info", {})
    if isinstance(vision_info, dict):
        crop_path_str = vision_info.get("crop", "")
        if crop_path_str:
            # Extract the file name from the Windows path, e.g.
            # "C:\\rpa_vision\\...\\shots\\shot_0002_crop.png" -> "shot_0002_crop.png"
            crop_filename = crop_path_str.replace("\\", "/").split("/")[-1]
            result = _read_png_b64(shots_dir / crop_filename)
            if result:
                logger.debug("Crop trouvé via vision_info : %s", crop_filename)
                return result

    # -- Strategy 2: screenshot_id -> {screenshot_id}_crop.png --
    screenshot_id = event_data.get("screenshot_id", "")
    if screenshot_id:
        crop_filename = f"{screenshot_id}_crop.png"
        result = _read_png_b64(shots_dir / crop_filename)
        if result:
            logger.debug("Crop trouvé via screenshot_id : %s", crop_filename)
            return result

    # -- Strategy 3: timestamp -> closest focus_XXXX.png --
    evt_ts = float(event_data.get("timestamp", 0))
    if evt_ts > 0:
        try:
            focus_files = sorted(shots_dir.glob("focus_*.png"))
            if focus_files:
                best_file = None
                best_delta = float("inf")
                for f in focus_files:
                    # Extract the timestamp from the name: focus_1774437474.png
                    try:
                        ts_str = f.stem.split("_", 1)[1]
                        file_ts = float(ts_str)
                        delta = abs(file_ts - evt_ts)
                        if delta < best_delta:
                            best_delta = delta
                            best_file = f
                    except (ValueError, IndexError):
                        continue
                # Accept only when the focus shot is within 5 seconds of the click
                if best_file and best_delta < 5.0:
                    result = _read_png_b64(best_file)
                    if result:
                        logger.debug(
                            "Crop trouvé via timestamp (focus, delta=%.1fs) : %s",
                            best_delta, best_file.name,
                        )
                        return result
        except Exception as e:
            logger.debug("Erreur recherche focus par timestamp : %s", e)

    # -- Strategy 4: fallback -> crop the full screenshot around the click --
    if screenshot_id and screen_w > 0 and screen_h > 0:
        full_path = shots_dir / f"{screenshot_id}_full.png"
        if full_path.exists():
            try:
                from PIL import Image
                import io

                img = Image.open(full_path)
                pos = event_data.get("pos", [])
                if pos and len(pos) == 2:
                    cx, cy = int(pos[0]), int(pos[1])
                    # 80x80 crop centred on the click (discriminative for icons)
                    crop_size = 40
                    x1 = max(0, cx - crop_size)
                    y1 = max(0, cy - crop_size)
                    x2 = min(img.width, cx + crop_size)
                    y2 = min(img.height, cy + crop_size)
                    cropped = img.crop((x1, y1, x2, y2))
                    buf = io.BytesIO()
                    cropped.save(buf, format="PNG")
                    result = base64.b64encode(buf.getvalue()).decode("utf-8")
                    logger.debug(
                        "Crop fallback (full screenshot cropped) : %s, zone=%dx%d",
                        full_path.name, x2 - x1, y2 - y1,
                    )
                    return result
            except ImportError:
                logger.debug("PIL non disponible pour le crop fallback")
            except Exception as e:
                logger.debug("Erreur crop fallback depuis full screenshot : %s", e)

    return None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Enrichissement VLM partagé — utilisé par build_replay ET reprocess_session
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def enrich_click_from_screenshot(
    screenshot_path: Path,
    click_x: int,
    click_y: int,
    screen_w: int,
    screen_h: int,
    window_title: str = "",
    vision_info: Optional[dict] = None,
    session_dir: Optional[Path] = None,
    screenshot_id: str = "",
) -> Dict[str, Any]:
    """Enrich a click with visual information extracted from the screenshot.

    Shared between build_replay_from_raw_events() and
    _enrich_workflow_targets() to avoid duplicating the logic.

    Steps:
        1. 80x80 crop around the click -> anchor_image_base64
        2. SomEngine -> detection of the clicked element (label, type, bbox)
        3. Positional VLM description (top/bottom, left/right)
        4. Element text (vision_info OCR > SomEngine label > gemma4 VLM)

    Args:
        screenshot_path: Path to the full screenshot (PNG).
        click_x: X position of the click, in pixels.
        click_y: Y position of the click, in pixels.
        screen_w: Screen width in pixels.
        screen_h: Screen height in pixels.
        window_title: Title of the active window at click time.
        vision_info: ``vision_info`` dict from the original event (optional).
        session_dir: Session directory (used for the SomEngine cache and the
            per-window screenshots).
        screenshot_id: Screenshot identifier (used for the SomEngine cache).

    Returns:
        Dict with keys: anchor_image_base64, by_text, by_text_source,
        by_role, vlm_description, window_title, original_position,
        by_position, som_element (optional). Returns an empty dict if the
        screenshot is missing or unreadable.
    """
    import io

    if not screenshot_path or not Path(screenshot_path).is_file():
        return {}

    # ── 1. 80x80 crop centred on the click (anchor_image_base64) ──
    # crop_size is the half-width: 40px on each side of the click point.
    anchor_b64 = ""
    try:
        from PIL import Image
        img = Image.open(screenshot_path)
        crop_size = 40
        x1 = max(0, click_x - crop_size)
        y1 = max(0, click_y - crop_size)
        x2 = min(img.width, click_x + crop_size)
        y2 = min(img.height, click_y + crop_size)
        cropped = img.crop((x1, y1, x2, y2))
        buf = io.BytesIO()
        cropped.save(buf, format="PNG")
        anchor_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
    except Exception as e:
        logger.debug("enrich_click: crop échoué pour %s : %s", screenshot_path, e)

    # Without an anchor crop the enrichment is useless downstream — bail out.
    if not anchor_b64:
        return {}

    # ── 2. Relative position on screen ──
    # Coarse buckets: bottom >80%, top <20%, left <30%, right >70%.
    # The bucket labels are French because they feed French VLM prompts.
    y_relative = ""
    x_relative = ""
    if screen_h > 0:
        y_relative = (
            "en bas" if click_y / screen_h > 0.8
            else "en haut" if click_y / screen_h < 0.2
            else "au milieu"
        )
    if screen_w > 0:
        x_relative = (
            "à gauche" if click_x / screen_w < 0.3
            else "à droite" if click_x / screen_w > 0.7
            else "au centre"
        )

    # ── 3. Positional VLM description ──
    vlm_parts = []
    if window_title:
        vlm_parts.append(f"Dans la fenêtre '{window_title}'")
    position_desc = " ".join(p for p in [y_relative, x_relative] if p)
    if position_desc:
        vlm_parts.append(f"l'élément cliqué se trouve {position_desc} de l'écran")

    # Append the visible text (vision_info OCR), when available.
    if isinstance(vision_info, dict):
        vis_text = vision_info.get("text", "")
        vis_type = vision_info.get("type", "")
        if vis_text:
            vlm_parts.append(f"le texte visible est '{vis_text}'")
        if vis_type:
            vlm_parts.append(f"c'est un élément de type '{vis_type}'")
    vlm_description = ", ".join(vlm_parts) if vlm_parts else ""

    # ── 4. SomEngine: identify the clicked element ──
    som_elem = None
    if session_dir and screenshot_id:
        # Call _som_identify_clicked_element through a minimal event_data.
        fake_event = {
            "screenshot_id": screenshot_id,
            "pos": [click_x, click_y],
        }
        som_elem = _som_identify_clicked_element(
            fake_event, session_dir, screen_w, screen_h,
        )

    # ── 5. Determine the element's text and type ──
    # Priority: OCR text from vision_info, then the SomEngine label.
    element_text = ""
    element_type = ""
    text_source = ""
    if isinstance(vision_info, dict):
        element_text = vision_info.get("text", "")
        element_type = vision_info.get("type", "")
        if element_text:
            text_source = "ocr"
    if not element_text and som_elem and som_elem.get("label"):
        element_text = som_elem["label"]
        # NOTE(review): the SomEngine label is also tagged "ocr" here —
        # possibly deliberate (downstream may only distinguish ocr vs vlm),
        # but confirm it should not be a distinct source tag.
        text_source = "ocr"

    # ── 5b. Gemma4: identify the clicked element via the window screenshot ──
    # When neither OCR nor SomEngine found any text, gemma4 (port 11435)
    # receives the window screenshot + the click position and describes the
    # element. A single call, made once, at recording time.
    if not element_text:
        # Prefer the window screenshot (full context) over the small crop.
        win_screenshot = None
        if session_dir and screenshot_id:
            win_path = Path(session_dir) / "shots" / f"{screenshot_id}_window.png"
            if win_path.is_file():
                win_screenshot = base64.b64encode(win_path.read_bytes()).decode()
        # Fall back to the 80x80 anchor crop.
        img_b64 = win_screenshot or anchor_b64
        # click_pos is only meaningful relative to the window screenshot;
        # for the crop the element already fills the image.
        element_text = _gemma4_read_element(
            img_b64, window_title,
            click_pos=(click_x, click_y) if win_screenshot else None,
        )
        if element_text:
            text_source = "vlm"
            logger.info("gemma4 a lu l'élément : '%s'", element_text)

    # ── 6. Normalized coordinates (fraction of the screen, 6 decimals) ──
    by_position = [
        round(click_x / screen_w, 6) if screen_w > 0 else 0.0,
        round(click_y / screen_h, 6) if screen_h > 0 else 0.0,
    ]

    # ── Assemble the result ──
    result = {
        "anchor_image_base64": anchor_b64,
        "by_text": element_text,
        "by_text_source": text_source,
        "by_role": element_type or (som_elem.get("source", "") if som_elem else ""),
        "vlm_description": vlm_description,
        "window_title": window_title,
        "original_position": {
            "x_relative": x_relative,
            "y_relative": y_relative,
        },
        "by_position": by_position,
    }

    if som_elem:
        result["som_element"] = som_elem

    return result
|
||
|
||
|
||
def _attach_expected_screenshots(
    actions: list, raw_events: list, session_dir: Path,
) -> None:
    """Attach the reference screenshots (expected result) to the actions.

    For every click / key_combo action, look up the screenshot
    res_shot_XXXX.png (captured 1s after the action during recording)
    and attach it as ``expected_screenshot_b64``.

    The screenshot is compressed to JPEG quality 40 (~30-50 KB as b64)
    to keep each action lightweight. Modifies ``actions`` in-place.
    """
    import base64
    import io
    from PIL import Image as _Image

    shots_dir = session_dir / "shots"
    if not shots_dir.is_dir():
        return

    # Walk the raw events and the actions in lockstep: each click/key_combo
    # raw event (carrying "screenshot_id", e.g. "shot_0003") is paired with
    # the next click/key_combo action; its result shot is "res_shot_0003.png".
    cursor = 0
    for raw_evt in raw_events:
        payload = raw_evt.get("event", raw_evt)
        shot_id = payload.get("screenshot_id", "")
        if not shot_id:
            continue
        if payload.get("type", "") not in ("mouse_click", "key_combo", "key_press"):
            continue

        # Advance the cursor to the next attachable action (same ordering).
        while cursor < len(actions) and actions[cursor].get("type", "") not in (
            "click", "key_combo",
        ):
            cursor += 1
        if cursor >= len(actions):
            break  # No actions left to annotate.

        res_file = shots_dir / f"res_{shot_id}.png"
        if not res_file.is_file():
            cursor += 1
            continue

        try:
            img = _Image.open(res_file)
            # Downscale to 800px wide to keep the payload small.
            if img.width > 800:
                scale = 800 / img.width
                img = img.resize((800, int(img.height * scale)), _Image.LANCZOS)
            buf = io.BytesIO()
            img.save(buf, format="JPEG", quality=40)
            encoded = base64.b64encode(buf.getvalue()).decode()
            actions[cursor]["expected_screenshot_b64"] = encoded
            logger.debug(
                "Screenshot de référence attaché à action %d : %s (%d KB)",
                cursor, res_file.name, len(encoded) // 1024,
            )
        except Exception as e:
            logger.debug("Erreur chargement screenshot ref %s : %s", res_file, e)

        cursor += 1
|
||
|
||
|
||
def _enrich_actions_with_intentions(
    actions: list,
    session_dir: Path,
    domain_id: str = "",
) -> None:
    """Enrich the actions with intention + expected_result via gemma4.

    For each action, gemma4 receives:
    - the business context (TIM ICD-10 coding, office work, etc.)
    - the screenshot BEFORE the action (visual context)
    - the description of the action (click on X, typing Y)
    - the position inside the workflow (action N/total)

    And produces:
    - intention: what the user wants to accomplish (in business terms)
    - expected_result: what should change on screen AFTER the action
    - expected_state: description of the expected state BEFORE the action

    These fields feed the Critic (semantic verification) and the Observer
    (screen pre-analysis). This is Phase 1 of the actor plan.

    One single gemma4 call per action — done at build time, not at replay.
    Modifies the actions in-place.

    Args:
        actions: Normalized replay actions (mutated in-place).
        session_dir: Session directory (shots/ lives under it).
        domain_id: Business-domain identifier; falls back to $RPA_DOMAIN.
    """
    import requests as _requests

    # _GEMMA4_PORT is the module-level default; $GEMMA4_PORT overrides it.
    gemma4_port = os.environ.get("GEMMA4_PORT", _GEMMA4_PORT)
    gemma4_url = f"http://localhost:{gemma4_port}/api/chat"

    # Load the business context (system prompt injected into every call).
    from .domain_context import get_domain_context
    domain = get_domain_context(domain_id or os.environ.get("RPA_DOMAIN", "generic"))
    domain_prompt = domain.system_prompt

    # Probe gemma4 availability; silently skip enrichment when unreachable.
    try:
        _requests.get(f"http://localhost:{gemma4_port}/api/tags", timeout=3)
    except Exception:
        logger.info("gemma4 non disponible — enrichissement intentions désactivé")
        return

    logger.info(f"Enrichissement intentions avec contexte métier : {domain.name}")
    shots_dir = session_dir / "shots"
    total = len(actions)

    # Build a one-line-per-action summary of the whole workflow (context
    # given to the model on every call).
    action_summaries = []
    for i, a in enumerate(actions):
        a_type = a.get("type", "?")
        if a_type == "click":
            by_text = a.get("target_spec", {}).get("by_text", "")
            window = a.get("target_spec", {}).get("window_title", "")
            desc = f"{i+1}. Clic sur '{by_text or 'élément'}' dans '{window or '?'}'"
        elif a_type == "type":
            text = a.get("text", "")
            desc = f"{i+1}. Saisie de texte : '{text[:30]}'"
        elif a_type == "key_combo":
            keys = a.get("keys", [])
            desc = f"{i+1}. Raccourci clavier : {'+'.join(keys)}"
        elif a_type == "wait":
            desc = f"{i+1}. Attente {a.get('duration_ms', 0)}ms"
        else:
            desc = f"{i+1}. {a_type}"
        action_summaries.append(desc)

    workflow_summary = "\n".join(action_summaries)

    enriched_count = 0
    for i, action in enumerate(actions):
        a_type = action.get("type", "")

        # Only enrich meaningful actions (click, type, key_combo).
        if a_type not in ("click", "type", "key_combo"):
            continue

        # Describe the current action for the prompt.
        if a_type == "click":
            by_text = action.get("target_spec", {}).get("by_text", "")
            window = action.get("target_spec", {}).get("window_title", "")
            action_desc = f"Cliquer sur '{by_text or 'un élément'}' dans la fenêtre '{window or 'inconnue'}'"
        elif a_type == "type":
            text = action.get("text", "")
            action_desc = f"Saisir le texte '{text[:50]}'"
        elif a_type == "key_combo":
            keys = action.get("keys", [])
            action_desc = f"Appuyer sur {'+'.join(keys)}"
        else:
            action_desc = a_type

        # Load the associated "before" screenshot (if available).
        screenshot_b64 = ""
        # Look for the closest screenshot in target_spec or the expected ones.
        if action.get("target_spec", {}).get("anchor_image_base64"):
            # We have the crop — not enough context; the full shot would be
            # needed here (currently not resolved, see below).
            pass

        # Actions and session screenshots are both ordered: use the PREVIOUS
        # action's expected_screenshot as the "before" image for this one.
        if i > 0 and actions[i-1].get("expected_screenshot_b64"):
            screenshot_b64 = actions[i-1]["expected_screenshot_b64"]

        # Prompt enriched with the business context.
        prompt = (
            f"Tu analyses un workflow enregistré ({total} actions).\n\n"
            f"Workflow complet :\n{workflow_summary}\n\n"
            f"Action actuelle ({i+1}/{total}) : {action_desc}\n\n"
            f"Réponds EXACTEMENT dans ce format (3 lignes) :\n"
            f"INTENTION: ce que l'utilisateur veut accomplir avec cette action (1 phrase)\n"
            f"AVANT: description de l'état attendu de l'écran AVANT cette action (1 phrase)\n"
            f"APRÈS: description de l'état attendu de l'écran APRÈS cette action (1 phrase)"
        )

        # Inject the business context (TIM, accounting, etc.).
        messages = []
        if domain_prompt:
            messages.append({"role": "system", "content": domain_prompt})
        messages.append({"role": "user", "content": prompt})
        if screenshot_b64:
            # NOTE(review): the image is attached to messages[0], which is the
            # *system* message whenever domain_prompt is set — confirm the
            # Ollama backend reads images from the system role, otherwise the
            # image should go on the user turn.
            messages[0]["images"] = [screenshot_b64]

        try:
            resp = _requests.post(
                gemma4_url,
                json={
                    "model": "gemma4:e4b",
                    "messages": messages,
                    "stream": False,
                    "think": True,
                    "options": {"temperature": 0.1, "num_predict": 800},
                },
                timeout=20,
            )
            if not resp.ok:
                continue

            content = resp.json().get("message", {}).get("content", "").strip()

            # Parse the three expected reply lines.
            intention = ""
            expected_state = ""
            expected_result = ""

            for line in content.split("\n"):
                line_clean = line.strip()
                upper = line_clean.upper()
                if upper.startswith("INTENTION:"):
                    intention = line_clean.split(":", 1)[1].strip()
                elif upper.startswith("AVANT:"):
                    expected_state = line_clean.split(":", 1)[1].strip()
                elif upper.startswith(("APRÈS:", "APRES:")):
                    expected_result = line_clean.split(":", 1)[1].strip()

            # Store into the action (in-place mutation).
            if intention:
                action["intention"] = intention
            if expected_state:
                action["expected_state"] = expected_state
                # Propagate into target_spec for the Observer.
                if "target_spec" in action:
                    action["target_spec"]["expected_state"] = expected_state
            if expected_result:
                action["expected_result"] = expected_result

            if intention or expected_result:
                enriched_count += 1
                logger.debug(
                    "Action %d/%d enrichie : intention='%s', expected='%s'",
                    i+1, total, intention[:50], expected_result[:50],
                )

        except Exception as e:
            # Best-effort: one failed action must not abort the batch.
            logger.debug("Enrichissement action %d échoué : %s", i+1, e)
            continue

    logger.info(
        "Enrichissement intentions : %d/%d actions enrichies par gemma4",
        enriched_count, total,
    )
|
||
|
||
|
||
def build_replay_from_raw_events(
    events: list,
    session_id: str = "",
    session_dir: Optional[str] = None,
) -> list:
    """Build a clean replay directly from a session's raw events.

    No dependency on the VLM, the GraphBuilder or the workflows.
    Works immediately after capture.

    Processing pipeline:
    1. Filter noise events (heartbeat, focus_change, action_result)
    2. Extract the screen resolution from the metadata
    3. Cut after Alt+F4 or after post-save systray clicks
    4. Merge consecutive text_input events (even <500ms apart)
    5. Convert to normalized actions (coordinates in %, adapted waits)
    6. For clicks: enable visual_mode and attach the reference crop (anchor)
    7. Apply clean_enriched_actions() (combo dedup, sanitize, text merge)
    8. Insert contextual waits after critical shortcuts

    Args:
        events: Raw events loaded from live_events.jsonl.
            Format: [{"session_id": ..., "event": {...}}, ...]
        session_id: Session identifier (for logging).
        session_dir: Session directory (contains shots/). When provided,
            reference crops are attached to the clicks for visual replay.

    Returns:
        List of actions ready for the replay queue.
    """
    import uuid

    if not events:
        return []

    # Resolve the session directory used for the visual crops.
    session_dir_path = Path(session_dir) if session_dir else None
    if session_dir_path and not session_dir_path.is_dir():
        logger.warning(
            "session_dir '%s' n'existe pas — visual replay désactivé", session_dir,
        )
        session_dir_path = None

    # ── 1. Extract the screen resolution ──
    screen_w, screen_h = _extract_screen_resolution(events)
    logger.info(
        "build_replay_from_raw_events(%s) : %d événements bruts, résolution=%dx%d, visual=%s",
        session_id, len(events), screen_w, screen_h,
        bool(session_dir_path),
    )

    # ── 2. Filter and normalize the events ──
    actionable_events = []
    saw_save_combo = False  # Track Ctrl+S / Ctrl+Shift+S for the systray cut
    for raw_evt in events:
        event_data = raw_evt.get("event", raw_evt)
        evt_type = event_data.get("type", "")

        # Drop noise event types.
        if evt_type in _IGNORED_EVENT_TYPES:
            continue

        # Track the save shortcuts (Ctrl+S, Ctrl+Shift+S).
        if evt_type in ("key_combo", "key_press"):
            keys = _sanitize_keys(event_data.get("keys", []))
            keys_lower = {k.lower() for k in keys if k}
            if "s" in keys_lower and ("ctrl" in keys_lower or "ctrl_l" in keys_lower
                                      or "ctrl_r" in keys_lower):
                saw_save_combo = True

        # Check the cut condition BEFORE appending the event.
        # For post-save clicks on non-application windows we do NOT want
        # to include the click that triggers the cut.
        if _should_cut_after_event(event_data, saw_save_combo=saw_save_combo, actions_count=len(actionable_events)):
            # Alt+F4 is an application-level action -> include it.
            is_alt_f4 = False
            if evt_type in ("key_combo", "key_press"):
                _keys_lower = {k.lower() for k in event_data.get("keys", []) if k}
                is_alt_f4 = "f4" in _keys_lower and (
                    "alt" in _keys_lower or "alt_l" in _keys_lower or "alt_r" in _keys_lower
                )
            if is_alt_f4:
                actionable_events.append(event_data)
            # Otherwise it is a stray click -> do NOT include it.
            logger.debug(
                "Coupure du replay (saw_save=%s, type=%s, included=%s)",
                saw_save_combo, evt_type, is_alt_f4,
            )
            break

        actionable_events.append(event_data)

    # ── 3. Merge consecutive text_input events ──
    # All consecutive text_input events are merged into one, regardless of
    # the time gap. The user types letter by letter but the replay wants a
    # single "type" action carrying the whole text.
    # key_combos that produce a printable character (e.g. AltGr+0 -> @)
    # are converted to text_input so they merge with the adjacent text.
    # Only an application change breaks the merge.
    merged_events = []
    _altgr_seq_got_char = False  # True once the AltGr sequence's char was captured
    _in_altgr_seq = False  # True inside an AltGr sequence (after a modifier-only ctrl+alt_gr)
    for evt in actionable_events:
        evt_type = evt.get("type", "")
        evt_ts = float(evt.get("timestamp", 0))

        # Convert key_combos that produce a printable character into
        # text_input so they get merged with the adjacent text.
        # E.g. AltGr+0 captured as ['ctrl', '@'] -> text_input '@'.
        if evt_type in ("key_combo", "key_press"):
            keys = _sanitize_keys(evt.get("keys", []))

            # Skip modifier-only key_combos (e.g. ['ctrl', 'alt_gr'] alone).
            non_mods = [k for k in keys if k.lower() not in _MODIFIER_ONLY_KEYS]
            if not non_mods:
                _in_altgr_seq = True  # Start of an AltGr sequence
                _altgr_seq_got_char = False
                continue

            printable = _key_combo_printable_char(keys)

            # Filter AltGr ghosts: on AZERTY, AltGr+key produces 3 events
            # (ctrl+alt_gr, ctrl+@, ctrl+]). The 3rd is a ghost from the
            # physical key release. Keep ONLY the first character and drop
            # the following ones of the same sequence.
            if _in_altgr_seq and printable:
                if not _altgr_seq_got_char:
                    # First character of the AltGr sequence -> keep it.
                    _altgr_seq_got_char = True
                    evt = dict(evt, type="text_input", text=printable)
                    evt_type = "text_input"
                else:
                    # AltGr ghost -> drop entirely.
                    continue
            elif printable:
                _in_altgr_seq = False
                _altgr_seq_got_char = False
                evt = dict(evt, type="text_input", text=printable)
                evt_type = "text_input"
            else:
                _in_altgr_seq = False
                _altgr_seq_got_char = False

            if not printable and evt_type != "text_input":
                # AltGr alone (AZERTY): the character lives in raw_keys.
                raw_keys = evt.get("raw_keys", [])
                for rk in raw_keys:
                    ch = rk.get("char", "")
                    if ch and len(ch) == 1 and ch.isprintable() and rk.get("action") == "release":
                        printable = ch
                        break
            if printable and evt_type != "text_input":
                # Turn into text_input for merging.
                evt = dict(evt, type="text_input", text=printable)
                evt_type = "text_input"
                # No raw_keys for this character (pasted via clipboard at replay).

        if evt_type == "text_input":
            text = evt.get("text", "")
            if not text:
                continue

            # \n and \t are NOT text — they are Enter/Tab key presses that
            # must become key_combo actions for the replay.
            if text == "\n":
                merged_events.append({
                    "type": "key_combo",
                    "keys": ["enter"],
                    "timestamp": evt_ts,
                })
                continue
            if text == "\t":
                merged_events.append({
                    "type": "key_combo",
                    "keys": ["tab"],
                    "timestamp": evt_ts,
                })
                continue

            # Merge with the previous text_input when in the same application.
            # Compare by app_name (not title — the title changes while typing).
            if merged_events and merged_events[-1].get("type") == "text_input":
                prev_app = merged_events[-1].get("window", {}).get("app_name", "")
                curr_app = evt.get("window", {}).get("app_name", "")
                # Same application (or unknown application) -> merge.
                if not prev_app or not curr_app or prev_app == curr_app:
                    merged_events[-1]["text"] = merged_events[-1].get("text", "") + text
                    merged_events[-1]["_end_ts"] = evt_ts
                    # Also merge the raw_keys (exact replay).
                    if evt.get("raw_keys"):
                        prev_raw = merged_events[-1].get("raw_keys", [])
                        merged_events[-1]["raw_keys"] = prev_raw + evt["raw_keys"]
                    continue

            merged_events.append(dict(evt, _end_ts=evt_ts))
        else:
            merged_events.append(dict(evt))

    # ── 3b. Rebuild the correct text from the raw_keys ──
    # raw_keys carry the exact vk codes (layout-independent) which allow
    # fixing AZERTY capture errors (e.g. numpad / captured as '!' -> '/').
    # CAUTION: only rebuild when the rebuilt text has the same length as
    # the original one. Characters coming from converted key_combos (e.g.
    # @ from AltGr) have no raw_keys and rebuilding would lose them.
    for evt in merged_events:
        if evt.get("type") == "text_input" and evt.get("raw_keys"):
            reconstructed = _reconstruct_text_from_raw_keys(evt["raw_keys"])
            original = evt.get("text", "")
            if reconstructed and len(reconstructed) == len(original):
                # Same length -> safe replacement (fixes the numpad chars).
                evt["text"] = reconstructed
                if reconstructed != original:
                    logger.debug(
                        "Texte reconstruit depuis raw_keys : '%s' → '%s'",
                        original[:50], reconstructed[:50],
                    )
            elif reconstructed and len(reconstructed) < len(original):
                # Different length -> some chars came from converted key_combos
                # (e.g. @ from AltGr merged into the text, absent from raw_keys).
                # Keep the original text AND drop raw_keys to force copy-paste
                # at replay time (the raw_keys are incomplete).
                del evt["raw_keys"]
                logger.debug(
                    "Texte corrigé (key_combo fusionné) : '%s' → raw_keys supprimé, "
                    "replay par copier-coller",
                    original[:50],
                )

    # ── 4. Convert to normalized replay actions ──
    actions = []
    last_ts = 0.0

    for evt in merged_events:
        evt_type = evt.get("type", "")
        evt_ts = float(evt.get("timestamp", 0))

        # Insert a wait on significant pauses (> 2s, capped at 5s).
        if last_ts > 0 and evt_ts > last_ts:
            delta_ms = int((evt_ts - last_ts) * 1000)
            if delta_ms > 2000:
                capped_ms = min(delta_ms, 5000)
                actions.append({
                    "action_id": f"act_raw_{uuid.uuid4().hex[:8]}",
                    "type": "wait",
                    "duration_ms": capped_ms,
                })

        # Advance the timestamp cursor (merged events carry _end_ts).
        end_ts = float(evt.get("_end_ts", evt_ts))
        last_ts = max(last_ts, end_ts if end_ts > 0 else evt_ts)

        action = {"action_id": f"act_raw_{uuid.uuid4().hex[:8]}"}

        if evt_type == "mouse_click":
            pos = evt.get("pos", [])
            if not pos or len(pos) != 2:
                continue
            action["type"] = "click"
            action["x_pct"] = round(pos[0] / screen_w, 6)
            action["y_pct"] = round(pos[1] / screen_h, 6)
            action["button"] = evt.get("button", "left")
            # Enrich with the window title when available.
            window = evt.get("window", {})
            if window.get("title"):
                action["window_title"] = window["title"]

            # ── Visual replay: VLM enrichment of the click ──
            if session_dir_path:
                # Crop strategy: first look for a pre-existing crop
                # (vision_info.crop, screenshot_id_crop.png, focus_XXXX.png),
                # then fall back to the 80x80 crop of the full screenshot.
                anchor_b64_preexist = _load_crop_for_event(
                    evt, session_dir_path, screen_w, screen_h,
                )

                # Check whether a real-time enrichment already exists
                # (computed by SomEngine during recording via api_stream).
                enrichment = evt.get("enrichment")
                if enrichment:
                    logger.debug(
                        "Enrichissement temps réel trouvé pour %s (by_text='%s')",
                        evt.get("screenshot_id", "?"),
                        enrichment.get("by_text", ""),
                    )
                else:
                    # No precomputed enrichment -> classic SomEngine call.
                    screenshot_id = evt.get("screenshot_id", "")
                    full_path = session_dir_path / "shots" / f"{screenshot_id}_full.png" if screenshot_id else None
                    enrichment = enrich_click_from_screenshot(
                        screenshot_path=full_path,
                        click_x=int(pos[0]),
                        click_y=int(pos[1]),
                        screen_w=screen_w,
                        screen_h=screen_h,
                        window_title=window.get("title", ""),
                        vision_info=evt.get("vision_info"),
                        session_dir=session_dir_path,
                        screenshot_id=screenshot_id,
                    )

                # Prefer the pre-existing crop (more reliable: agent crop, focus).
                if anchor_b64_preexist and enrichment:
                    enrichment["anchor_image_base64"] = anchor_b64_preexist
                # enrich_click could not crop but a pre-existing crop exists.
                elif anchor_b64_preexist and not enrichment:
                    enrichment = {"anchor_image_base64": anchor_b64_preexist}

                if enrichment and enrichment.get("anchor_image_base64"):
                    action["visual_mode"] = True
                    action["target_spec"] = {
                        k: v for k, v in enrichment.items()
                        if k != "by_position"  # by_position already lives in x_pct/y_pct
                    }
                    # Add the window metadata for targeted grounding.
                    wc = evt.get("window_capture", {})
                    if wc.get("rect"):
                        action["target_spec"]["window_capture"] = {
                            "rect": wc["rect"],
                            "window_size": wc.get("window_size"),
                            "click_relative": wc.get("click_relative"),
                        }

        elif evt_type == "text_input":
            text = evt.get("text", "")
            if not text:
                continue
            action["type"] = "type"
            action["text"] = text
            # Propagate the raw_keys for an exact replay (AZERTY solution)
            # UNLESS the text contains chars merged from key_combos
            # (e.g. @ from AltGr) — in that case the raw_keys are incomplete
            # and the replay must use copy-paste instead.
            if evt.get("raw_keys"):
                reconstructed = _reconstruct_text_from_raw_keys(evt["raw_keys"])
                if len(reconstructed) >= len(text):
                    action["raw_keys"] = evt["raw_keys"]
                else:
                    logger.debug(
                        "raw_keys incomplets pour '%s' (recon=%d < text=%d) → copier-coller",
                        text[:30], len(reconstructed), len(text),
                    )

        elif evt_type in ("key_press", "key_combo"):
            keys = evt.get("keys", [])
            if not keys:
                # Single-key events may carry "key" instead of "keys".
                key = evt.get("key", "")
                if key:
                    keys = [key]
            if not keys:
                continue
            keys = _sanitize_keys(keys)
            if _is_modifier_only(keys):
                continue
            action["type"] = "key_combo"
            action["keys"] = keys
            # Propagate the raw_keys for an exact replay (AZERTY solution).
            if evt.get("raw_keys"):
                action["raw_keys"] = evt["raw_keys"]

        elif evt_type == "scroll":
            pos = evt.get("pos", [])
            action["type"] = "scroll"
            if pos and len(pos) == 2:
                action["x_pct"] = round(pos[0] / screen_w, 6)
                action["y_pct"] = round(pos[1] / screen_h, 6)
            action["delta"] = evt.get("delta", -3)

        else:
            continue

        actions.append(action)

    # ── 5. Global cleanup (combo dedup, sanitize, text merge, waits) ──
    actions = clean_enriched_actions(actions)

    # ── 6. Insert contextual waits after critical shortcuts ──
    final_actions = []
    for action in actions:
        final_actions.append(action)
        post_wait = _needs_post_wait(action)
        if post_wait > 0:
            # A wait may already follow — consecutive waits collapse in step 7.
            final_actions.append({
                "action_id": f"act_raw_{uuid.uuid4().hex[:8]}",
                "type": "wait",
                "duration_ms": post_wait,
            })

    # ── 7. Final collapse of consecutive waits ──
    result = []
    for a in final_actions:
        if (result
                and a.get("type") == "wait"
                and result[-1].get("type") == "wait"):
            # Keep the longest one.
            if a.get("duration_ms", 0) > result[-1].get("duration_ms", 0):
                result[-1] = a
            continue
        result.append(a)

    # ── 8. Attach the reference screenshots (expected post-action state) ──
    # The res_shot_XXXX.png screenshots captured 1s after each action during
    # recording serve as reference for the visual check.
    if session_dir_path:
        _attach_expected_screenshots(result, events, session_dir_path)

    # ── 9. Enrich with expected_window_title (window title expected after the click) ──
    # For post-action verification: the window title AFTER a click is the
    # window_title of the NEXT click in the sequence.
    click_indices = [i for i, a in enumerate(result) if a.get("type") == "click"]
    for j, ci in enumerate(click_indices):
        if j + 1 < len(click_indices):
            next_ci = click_indices[j + 1]
            next_title = result[next_ci].get("target_spec", {}).get("window_title", "")
            if next_title:
                result[ci]["expected_window_title"] = next_title

    # ── 10. Enrich with intention + expected_result via gemma4 (Critic) ──
    # gemma4 analyses each action in context to produce:
    # - intention: what the user wants to accomplish
    # - expected_result: expected screen state AFTER the action
    # - expected_state: expected screen state BEFORE the action
    # These fields feed the Critic (post-action semantic check) and the
    # Observer (screen pre-analysis).
    # Ref: docs/VISION_RPA_INTELLIGENT.md — VERIFY step of the pipeline
    # Ref: docs/PLAN_ACTEUR_V1.md — Phase 1: workflow as template
    if session_dir_path:
        _enrich_actions_with_intentions(result, session_dir_path)

    # ── 11. Consolidate with past learnings ──
    # Previous replays recorded which methods work for which elements; that
    # knowledge is re-injected into the workflow. This is the learning loop:
    # each replay improves the next ones.
    try:
        from .replay_learner import ReplayLearner
        _learner = ReplayLearner()
        consolidated = _learner.consolidate_workflow(result, session_id)
        if consolidated:
            logger.info(
                "Consolidation apprentissage : %d actions enrichies par l'historique",
                consolidated,
            )
    except Exception as e:
        logger.debug("Consolidation apprentissage échouée : %s", e)

    # Visual-replay stats.
    visual_clicks = sum(
        1 for a in result
        if a.get("type") == "click" and a.get("visual_mode")
    )
    total_clicks = sum(1 for a in result if a.get("type") == "click")
    verified_count = sum(1 for a in result if a.get("expected_screenshot_b64"))
    intention_count = sum(1 for a in result if a.get("intention"))
    logger.info(
        "build_replay_from_raw_events(%s) : %d actions propres produites "
        "(%d/%d clics avec visual_mode, %d avec screenshot de référence, "
        "%d avec intentions)",
        session_id, len(result), visual_clicks, total_clicks,
        verified_count, intention_count,
    )

    # Unload gemma4 from the GPU so qwen2.5vl can load at replay time.
    _unload_gemma4()

    return result
|
||
|
||
|
||
class StreamProcessor:
|
||
"""
|
||
Processeur de streaming qui connecte les données Agent V1 au core pipeline.
|
||
|
||
Cycle de vie :
|
||
1. register_session() — crée l'état mémoire
|
||
2. process_event() — accumule événements, extrait contexte fenêtre
|
||
3. process_screenshot() — analyse via ScreenAnalyzer + CLIP embedding
|
||
4. finalize_session() — construit le Workflow via GraphBuilder (DBSCAN)
|
||
"""
|
||
|
||
def __init__(self, data_dir: str = "data/training"):
|
||
self.data_dir = Path(data_dir)
|
||
persist_dir = str(self.data_dir / "streaming_sessions")
|
||
live_sessions_dir = str(self.data_dir / "live_sessions")
|
||
self.session_manager = LiveSessionManager(
|
||
persist_dir=persist_dir,
|
||
live_sessions_dir=live_sessions_dir,
|
||
)
|
||
self._lock = threading.Lock()
|
||
|
||
# Core components (chargés paresseusement pour éviter les imports lourds au démarrage)
|
||
self._screen_analyzer = None
|
||
self._clip_embedder = None
|
||
self._state_embedding_builder = None # P0-3 : pipeline d'embedding unifié (fusion multi-modale)
|
||
self._faiss_manager = None
|
||
self._initialized = False
|
||
|
||
# Lock pour l'accès concurrent aux données de session (screen_states, embeddings, workflows)
|
||
self._data_lock = threading.Lock()
|
||
|
||
# Lock pour l'accès FAISS (IndexFlat.add() n'est pas thread-safe)
|
||
self._faiss_lock = threading.Lock()
|
||
|
||
# Flag de suspension : quand un replay est actif, le worker se suspend
|
||
# pour libérer le GPU au resolve_target VLM du replay.
|
||
# Settée depuis api_stream.py via set_replay_flag().
|
||
self._replay_active_flag: Optional[threading.Event] = None
|
||
|
||
# Résultats d'analyse par session
|
||
self._screen_states: Dict[str, list] = {} # session_id -> List[ScreenState]
|
||
self._embeddings: Dict[str, list] = {} # session_id -> List[np.ndarray]
|
||
|
||
# Workflows construits (pour le matching)
|
||
self._workflows: Dict[str, Any] = {}
|
||
|
||
# Charger les workflows existants depuis le disque
|
||
self._load_persisted_workflows()
|
||
|
||
def _load_persisted_workflows(self):
|
||
"""Charger les workflows sauvegardés depuis le disque au démarrage.
|
||
|
||
Scanne le dossier workflows/ principal et les sous-dossiers par machine
|
||
(workflows/{machine_id}/) pour la rétrocompatibilité.
|
||
"""
|
||
workflows_dir = self.data_dir / "workflows"
|
||
if not workflows_dir.exists():
|
||
return
|
||
|
||
try:
|
||
from core.models.workflow_graph import Workflow
|
||
|
||
count = 0
|
||
# Charger les workflows du dossier racine (rétrocompatibilité)
|
||
for wf_file in sorted(workflows_dir.glob("*.json")):
|
||
try:
|
||
wf = Workflow.load_from_file(wf_file)
|
||
self._workflows[wf.workflow_id] = wf
|
||
count += 1
|
||
except Exception as e:
|
||
logger.warning(f"Impossible de charger {wf_file.name}: {e}")
|
||
|
||
# Charger les workflows des sous-dossiers par machine
|
||
for machine_dir in sorted(workflows_dir.iterdir()):
|
||
if not machine_dir.is_dir():
|
||
continue
|
||
for wf_file in sorted(machine_dir.glob("*.json")):
|
||
try:
|
||
wf = Workflow.load_from_file(wf_file)
|
||
# Stocker le machine_id dans les métadonnées du workflow
|
||
if not hasattr(wf, '_machine_id'):
|
||
wf._machine_id = machine_dir.name
|
||
self._workflows[wf.workflow_id] = wf
|
||
count += 1
|
||
except Exception as e:
|
||
logger.warning(f"Impossible de charger {wf_file.name}: {e}")
|
||
|
||
if count:
|
||
logger.info(f"{count} workflow(s) chargé(s) depuis {workflows_dir}")
|
||
except ImportError:
|
||
logger.debug("core.models.workflow_graph non disponible, skip chargement")
|
||
|
||
    def set_replay_flag(self, flag: threading.Event):
        """Attach the replay-active flag (provided by api_stream.py).

        While this flag is set(), reprocess_session() suspends itself
        between screenshots to hand the GPU over to the replay
        (VLM resolve_target).

        Args:
            flag: Shared event toggled by the replay orchestrator.
        """
        self._replay_active_flag = flag
        logger.info("Flag de suspension replay configuré sur le StreamProcessor")
def _wait_if_replay_active(self, context: str = "") -> bool:
|
||
"""Suspendre le traitement si un replay est en cours.
|
||
|
||
Vérifie le flag _replay_active_flag et attend qu'il se clear.
|
||
Timeout de sécurité : 60s max pour éviter un blocage si le replay
|
||
plante sans clear le flag.
|
||
|
||
Args:
|
||
context: Description pour les logs (ex: "screenshot 3/10").
|
||
|
||
Returns:
|
||
True si on a dû attendre (replay était actif), False sinon.
|
||
"""
|
||
if not self._replay_active_flag or not self._replay_active_flag.is_set():
|
||
return False
|
||
|
||
import time
|
||
suspend_start = time.time()
|
||
waited = False
|
||
while self._replay_active_flag.is_set():
|
||
elapsed = time.time() - suspend_start
|
||
if elapsed > 60:
|
||
logger.warning(
|
||
f"Worker : timeout suspension (60s), reprise forcée ({context})"
|
||
)
|
||
break
|
||
if not waited:
|
||
logger.info(f"Worker suspendu — replay en cours ({context})")
|
||
waited = True
|
||
time.sleep(2)
|
||
|
||
if waited:
|
||
total_wait = time.time() - suspend_start
|
||
logger.info(
|
||
f"Worker reprend après {total_wait:.1f}s de suspension ({context})"
|
||
)
|
||
return waited
|
||
|
||
    def _ensure_initialized(self):
        """Load the core GPU components if not already done.

        DISABLED in the HTTP server: the GPU components (ScreenAnalyzer,
        CLIP, FAISS) hold the Python GIL and make the server unresponsive.
        They are loaded only by the separate worker (run_worker.py); the
        HTTP server merely stores screenshots and distributes replays.

        NOTE(review): everything after the early `return` below is dead
        code kept on purpose — the original worker-side GPU initialization.
        """
        if self._initialized:
            return
        # Mark as initialized WITHOUT loading the GPU components.
        self._initialized = True
        logger.info("StreamProcessor initialisé en mode LÉGER (pas de GPU, pas de VLM)")
        return

        # --- Unreachable below: original GPU initialization path, disabled ---
        with self._lock:
            # Double-checked locking: another thread may have initialized meanwhile.
            if self._initialized:
                return

            logger.info("Initialisation des composants core (GPU)...")

            try:
                from core.pipeline.screen_analyzer import ScreenAnalyzer
                self._screen_analyzer = ScreenAnalyzer(session_id="stream_server")
                logger.info(" ScreenAnalyzer prêt")
            except Exception as e:
                logger.error(f" Erreur init ScreenAnalyzer: {e}")
                self._screen_analyzer = None

            try:
                from core.embedding.clip_embedder import CLIPEmbedder
                self._clip_embedder = CLIPEmbedder()
                logger.info(" CLIPEmbedder prêt (singleton, ne sera plus rechargé)")
            except Exception as e:
                logger.error(f" Erreur init CLIPEmbedder: {e}")
                self._clip_embedder = None

            # P0-3: initialize the StateEmbeddingBuilder to unify the embedding space.
            # Reuses the same CLIPEmbedder (no model reload) + FusionEngine to
            # produce fused vectors (image+text+title+ui) identical to GraphBuilder.
            try:
                from core.embedding.state_embedding_builder import StateEmbeddingBuilder
                if self._clip_embedder is not None:
                    # Inject the already-loaded CLIPEmbedder to avoid a double load.
                    self._state_embedding_builder = StateEmbeddingBuilder(
                        embedders={
                            "image": self._clip_embedder,
                            "text": self._clip_embedder,
                            "title": self._clip_embedder,
                            "ui": self._clip_embedder,
                        },
                        output_dir=self.data_dir / "embeddings",
                        use_clip=False,  # Not needed: embedders are supplied directly.
                    )
                else:
                    # Fallback: let the builder create its own CLIPEmbedder.
                    self._state_embedding_builder = StateEmbeddingBuilder(
                        output_dir=self.data_dir / "embeddings",
                        use_clip=True,
                    )
                logger.info(" StateEmbeddingBuilder prêt (fusion multi-modale unifiée)")
            except Exception as e:
                logger.warning(f" StateEmbeddingBuilder non disponible, fallback CLIP pur: {e}")
                self._state_embedding_builder = None

            try:
                from core.embedding.faiss_manager import FAISSManager
                self._faiss_manager = FAISSManager(
                    dimensions=512,
                    index_type="Flat",
                    metric="cosine",
                )
                logger.info(" FAISSManager prêt (512 dims, cosine)")
            except Exception as e:
                logger.error(f" Erreur init FAISSManager: {e}")
                self._faiss_manager = None

            self._initialized = True
            logger.info("Composants core initialisés.")
# =========================================================================
|
||
# Événements
|
||
# =========================================================================
|
||
|
||
def process_event(self, session_id: str, event_data: Dict[str, Any]) -> Dict[str, Any]:
|
||
"""Enregistrer un événement dans la session live.
|
||
|
||
Filtre les événements parasites à la réception :
|
||
- key_combo/key_press avec uniquement des modificateurs seuls (ctrl, alt, shift, etc.)
|
||
- key_combo/key_press avec liste de touches vide
|
||
- text_input avec texte vide
|
||
"""
|
||
if _is_parasitic_event(event_data):
|
||
logger.debug(
|
||
f"Événement parasite filtré (session {session_id}): "
|
||
f"type={event_data.get('type')}, data={event_data.get('keys', event_data.get('text', ''))}"
|
||
)
|
||
return {"status": "event_filtered", "session_id": session_id, "reason": "parasitic"}
|
||
self.session_manager.add_event(session_id, event_data)
|
||
return {"status": "event_recorded", "session_id": session_id}
|
||
|
||
# =========================================================================
|
||
# Screenshots
|
||
# =========================================================================
|
||
|
||
    def process_screenshot(self, session_id: str, shot_id: str, file_path: str) -> Dict[str, Any]:
        """Analyze a full screenshot through the core pipeline.

        Steps:
          1. ScreenAnalyzer -> ScreenState (OCR, UI detection)
          2. StateEmbeddingBuilder -> fused 512-d vector (image+text+title+ui),
             same embedding space as GraphBuilder (P0-3);
             fallback: plain CLIP embed_image() if the builder fails
          3. FAISS indexing -> real-time matching
          4. Real-time matching against the known workflows

        Args:
            session_id: Live session the screenshot belongs to.
            shot_id: Identifier of the screenshot (e.g. "shot_0001").
            file_path: Path of the stored image file on disk.

        Returns:
            Dict summary: state_id, ui_elements_count, text_detected,
            embedding_indexed, match, plus optional ui_pattern* keys.
        """
        self._ensure_initialized()
        self.session_manager.add_screenshot(session_id, shot_id, file_path)

        result = {
            "shot_id": shot_id,
            "session_id": session_id,
            "state_id": None,
            "ui_elements_count": 0,
            "text_detected": 0,
            "embedding_indexed": False,
            "match": None,
        }

        # 1. Build the ScreenState.
        if self._screen_analyzer is None:
            logger.warning("ScreenAnalyzer non disponible, skip analyse")
            return result

        session = self.session_manager.get_session(session_id)
        # Use the shot -> window mapping when available (reprocessing case).
        shot_map = getattr(session, '_shot_window_map', None) if session else None
        if shot_map and shot_id in shot_map:
            window_info = shot_map[shot_id]
        else:
            window_info = session.last_window_info if session else {}

        try:
            screen_state = self._screen_analyzer.analyze(
                screenshot_path=file_path,
                window_info=window_info,
            )
            result["state_id"] = screen_state.screen_state_id
            result["ui_elements_count"] = len(screen_state.ui_elements)
            result["text_detected"] = len(
                getattr(screen_state.perception, "detected_text", [])
            )

            # Keep the ScreenState for the final workflow build.
            with self._data_lock:
                if session_id not in self._screen_states:
                    self._screen_states[session_id] = []
                self._screen_states[session_id].append(screen_state)

            # Enrich with known UI patterns (dialog detection for the executor).
            try:
                from core.knowledge.ui_patterns import UIPatternLibrary
                detected_text = getattr(screen_state.perception, "detected_text", [])
                if detected_text:
                    ocr_text = " ".join(str(t) for t in detected_text) if isinstance(detected_text, list) else str(detected_text)
                    lib = UIPatternLibrary()
                    pattern = lib.find_pattern(ocr_text)
                    if pattern:
                        result["ui_pattern"] = pattern["pattern"]
                        result["ui_pattern_action"] = pattern["action"]
                        result["ui_pattern_target"] = pattern["target"]
                        logger.info(f"Pattern UI détecté: {pattern['pattern']} → {pattern['target']}")
            except ImportError:
                # Pattern library is optional; skip silently when absent.
                pass
            except Exception as e:
                logger.debug(f"Pattern check: {e}")

            logger.info(
                f"Screenshot analysé: {shot_id} | "
                f"{result['ui_elements_count']} UI elements, "
                f"{result['text_detected']} textes"
            )
        except Exception as e:
            logger.error(f"Erreur analyse screenshot {shot_id}: {e}")
            return result

        # 2. Build the fused embedding via StateEmbeddingBuilder (P0-3).
        # Same pipeline as GraphBuilder (fusion image+text+title+ui) so the
        # FAISS vectors live in the same embedding space.
        embedding_vector = None

        if self._state_embedding_builder is not None:
            try:
                state_embedding = self._state_embedding_builder.build(screen_state)
                # Retrieve the fused vector from the StateEmbedding.
                fused_vec = state_embedding.get_vector()
                if fused_vec is not None:
                    embedding_vector = fused_vec.astype(np.float32)
                    logger.debug(
                        f"Embedding fusionné multi-modal calculé pour {shot_id} "
                        f"(dim={embedding_vector.shape[0]})"
                    )
            except Exception as e:
                logger.warning(
                    f"StateEmbeddingBuilder échoué pour {shot_id}: {e}, "
                    f"fallback sur CLIP pur"
                )

        # Fallback: use the CLIPEmbedder singleton (image-only embedding).
        if embedding_vector is None and self._clip_embedder is not None:
            try:
                from PIL import Image
                pil_image = Image.open(file_path)
                embedding_vector = self._clip_embedder.embed_image(pil_image)
            except Exception as e:
                logger.debug(f"CLIP embedding échoué: {e}")

        if embedding_vector is not None:
            # Keep the vector for the final workflow build.
            with self._data_lock:
                if session_id not in self._embeddings:
                    self._embeddings[session_id] = []
                self._embeddings[session_id].append(embedding_vector)

            # 3. Index into FAISS (guarded by _faiss_lock: IndexFlat.add is not thread-safe).
            if self._faiss_manager is not None:
                try:
                    with self._faiss_lock:
                        self._faiss_manager.add_embedding(
                            embedding_id=screen_state.screen_state_id,
                            vector=embedding_vector,
                            metadata={
                                "session_id": session_id,
                                "shot_id": shot_id,
                                "window_title": window_info.get("title", ""),
                            },
                        )
                    result["embedding_indexed"] = True
                except Exception as e:
                    logger.error(f"Erreur FAISS indexation: {e}")

        # 4. Real-time matching against the known workflows.
        with self._data_lock:
            has_workflows = bool(self._workflows)
        if embedding_vector is not None and has_workflows:
            result["match"] = self._try_match(embedding_vector)

        return result
def process_crop(self, session_id: str, shot_id: str, file_path: str) -> Dict[str, Any]:
|
||
"""
|
||
Enregistrer un crop (400x400). Pas d'analyse ScreenAnalyzer
|
||
(un crop est un fragment, pas un écran complet).
|
||
"""
|
||
self.session_manager.add_screenshot(session_id, shot_id, file_path)
|
||
return {"status": "crop_stored", "shot_id": shot_id}
|
||
|
||
# =========================================================================
|
||
# Finalisation
|
||
# =========================================================================
|
||
|
||
    def finalize_session(self, session_id: str) -> Dict[str, Any]:
        """Build a Workflow from the data accumulated during streaming.

        Uses the core GraphBuilder with the ScreenStates and embeddings
        collected while the session was live.

        Args:
            session_id: Identifier of the session to finalize.

        Returns:
            Dict with workflow info on success, or an "error"/"status" dict
            when the session is missing or has too few states.
        """
        self._ensure_initialized()

        session = self.session_manager.finalize(session_id)
        if not session:
            return {"error": f"Session {session_id} non trouvée"}

        with self._data_lock:
            states = list(self._screen_states.get(session_id, []))
            embeddings = list(self._embeddings.get(session_id, []))

        # At least two states are needed to form a transition.
        if len(states) < 2:
            logger.warning(
                f"Session {session_id}: seulement {len(states)} states, "
                f"pas assez pour construire un workflow"
            )
            return {
                "session_id": session_id,
                "status": "insufficient_data",
                "states_count": len(states),
                "min_required": 2,
            }

        # Convert to a RawSession for the GraphBuilder.
        raw_dict = self.session_manager.to_raw_session(session_id)
        if not raw_dict:
            return {"error": "Conversion RawSession échouée"}

        try:
            from core.models.raw_session import RawSession
            raw_session = RawSession.from_dict(raw_dict)
        except Exception as e:
            logger.error(f"Erreur construction RawSession: {e}")
            # Fallback: build the RawSession manually.
            try:
                raw_session = self._build_raw_session_fallback(session, raw_dict)
            except Exception as e2:
                return {"error": f"Erreur RawSession: {e2}"}

        # Build the workflow with the GraphBuilder.
        try:
            from core.graph.graph_builder import GraphBuilder

            n = len(states)
            # Scale the minimum pattern repetitions with the session size.
            min_reps = 1 if n < 6 else 2 if n <= 30 else min(3, n // 10)

            builder = GraphBuilder(
                embedding_builder=self._state_embedding_builder,  # Reuse the same CLIP model
                min_pattern_repetitions=min_reps,
                clustering_eps=0.08,
                clustering_min_samples=2,
            )

            # Derive a meaningful workflow name from the window titles.
            workflow_name = self._generate_workflow_name(session_id)

            # Reuse the embeddings pre-computed during streaming.
            with self._data_lock:
                precomputed_embs = list(self._embeddings.get(session_id, []))

            # Enrich the ScreenStates with event timestamps.
            # Required so that _find_transition_events() can associate
            # the user actions with the right transitions.
            session_state = self.session_manager.get_session(session_id)
            shot_ts_map = getattr(session_state, '_shot_ts_map', {}) if session_state else {}
            if shot_ts_map:
                self._enrich_states_with_timestamps(states, shot_ts_map)

            # Sequential mode: for single-pass recordings each screenshot is a
            # distinct workflow step. DBSCAN clustering would merge similar
            # screenshots (e.g. several Notepad views) into one node and lose
            # actions; sequential mode preserves every step.
            use_sequential = len(raw_session.events) > 0

            # Inject the pre-computed ScreenStates and embeddings to avoid
            # re-analyzing and re-embedding everything (triple computation).
            workflow = builder.build_from_session(
                raw_session,
                workflow_name=workflow_name,
                precomputed_states=states,
                precomputed_embeddings=precomputed_embs if len(precomputed_embs) == len(states) else None,
                sequential=use_sequential,
            )

            with self._data_lock:
                self._workflows[workflow.workflow_id] = workflow

            # Persist to disk (inside the source machine's folder).
            machine_id = session.machine_id if hasattr(session, 'machine_id') else "default"
            saved_path = self._persist_workflow(workflow, session_id, machine_id=machine_id)
            # Keep the machine_id on the workflow for later filtering.
            workflow._machine_id = machine_id

            # Collect the session's application metadata.
            session_state = self.session_manager.get_session(session_id)
            app_context = {}
            if session_state:
                app_context = {
                    "window_titles": dict(session_state.window_titles_seen),
                    "app_names": dict(session_state.app_names_seen),
                    "primary_app": sorted(
                        session_state.app_names_seen.items(),
                        key=lambda x: -x[1]
                    )[0][0] if session_state.app_names_seen else None,
                    "multi_app": len(session_state.app_names_seen) >= 3,
                }

            result = {
                "session_id": session_id,
                "machine_id": machine_id,
                "status": "workflow_built",
                "workflow_id": workflow.workflow_id,
                "workflow_name": workflow_name,
                "nodes": len(workflow.nodes),
                "edges": len(workflow.edges),
                "states_analyzed": len(states),
                "embeddings_indexed": len(embeddings),
                "saved_path": str(saved_path) if saved_path else None,
                "app_context": app_context,
            }

            logger.info(
                f"Workflow construit: '{workflow_name}' ({workflow.workflow_id}) | "
                f"{result['nodes']} nodes, {result['edges']} edges"
                + (f" | apps: {list(app_context.get('app_names', {}).keys())}" if app_context.get('app_names') else "")
            )

            # Free the session data memory (these structures can be large).
            self._cleanup_session_data(session_id)

            return result

        except Exception as e:
            logger.error(f"Erreur construction workflow: {e}")
            return {"error": f"GraphBuilder: {e}", "session_id": session_id}
# =========================================================================
|
||
# Matching
|
||
# =========================================================================
|
||
|
||
def _try_match(self, embedding_vector: np.ndarray) -> Optional[Dict[str, Any]]:
|
||
"""Matcher un embedding contre les workflows connus."""
|
||
if self._faiss_manager is None or self._faiss_manager.index.ntotal == 0:
|
||
return None
|
||
|
||
try:
|
||
results = self._faiss_manager.search_similar(
|
||
query_vector=embedding_vector,
|
||
k=1,
|
||
min_similarity=0.85,
|
||
)
|
||
if results:
|
||
best = results[0]
|
||
return {
|
||
"matched_id": best.embedding_id,
|
||
"similarity": round(best.similarity, 4),
|
||
"metadata": best.metadata,
|
||
}
|
||
except Exception as e:
|
||
logger.debug(f"Erreur matching: {e}")
|
||
|
||
return None
|
||
|
||
# =========================================================================
|
||
# Enrichissement VLM des workflows (target_spec sur chaque edge)
|
||
# =========================================================================
|
||
|
||
    def _enrich_workflow_targets(
        self,
        workflow,
        session_dir: Path,
    ) -> int:
        """Enrich the target_spec of a workflow's edges with VLM data.

        For each edge whose action is a click (mouse_click, or compound
        containing a mouse_click step), this method:
          1. Finds the matching screenshot (from_node -> shot_XXXX_full.png)
          2. Extracts the click position from action.parameters.position
          3. Calls enrich_click_from_screenshot() to obtain:
             - anchor_image_base64 (80x80 crop)
             - by_text (OCR text of the element)
             - by_role (element type)
             - vlm_description (positional description)
             - som_element (SomEngine detection)
          4. Updates the edge's TargetSpec and stores the extra data in
             context_hints and edge.metadata

        Args:
            workflow: Workflow object whose edges are to be enriched.
            session_dir: Session directory (contains shots/).

        Returns:
            Number of enriched edges.
        """
        shots_dir = session_dir / "shots"
        if not shots_dir.is_dir():
            logger.warning(
                "enrich_workflow_targets: dossier shots/ introuvable dans %s",
                session_dir,
            )
            return 0

        # -- Screen resolution from the session events (with fallback) --
        screen_w, screen_h = self._get_screen_resolution_from_session(session_dir)

        # -- Mapping node_id -> screenshot path --
        # In sequential mode, node_000 is the first screenshot, node_001 the
        # second, and so on. Rebuild that mapping from the sorted
        # shot_XXXX_full.png files.
        all_shots = sorted(shots_dir.glob("shot_*_full.png"))
        node_to_shot: Dict[str, Path] = {}
        for i, shot_path in enumerate(all_shots):
            node_id = f"node_{i:03d}"
            node_to_shot[node_id] = shot_path

        # -- Enrich each edge --
        enriched_count = 0

        for edge in workflow.edges:
            # Screenshot of the source node (where the click happens).
            shot_path = node_to_shot.get(edge.from_node)
            if not shot_path or not shot_path.is_file():
                logger.debug(
                    "enrich: pas de screenshot pour node %s (edge %s)",
                    edge.from_node, edge.edge_id,
                )
                continue

            # Screenshot id from the file name:
            # shot_0001_full.png -> shot_0001
            screenshot_id = shot_path.stem.replace("_full", "")

            # -- Click position(s) carried by the edge --
            click_positions = self._extract_click_positions(edge)
            if not click_positions:
                continue

            # Only the first click (main action) is enriched.
            click_x, click_y = click_positions[0]

            # Window title taken from the source node when available.
            window_title = ""
            source_node = workflow.get_node(edge.from_node)
            if source_node and hasattr(source_node, 'template'):
                tpl = source_node.template
                # WindowConstraint embedded in the template.
                if hasattr(tpl, 'window') and tpl.window:
                    if tpl.window.title_pattern:
                        window_title = tpl.window.title_pattern
                    elif tpl.window.title_contains:
                        window_title = tpl.window.title_contains
                    elif tpl.window.process_name:
                        window_title = tpl.window.process_name
            # Fallback: edge-level constraints.
            if not window_title and edge.constraints:
                window_title = (
                    edge.constraints.required_window_title
                    or edge.constraints.required_app_name
                    or ""
                )

            # -- Call the shared enrichment helper --
            enrichment = enrich_click_from_screenshot(
                screenshot_path=shot_path,
                click_x=int(click_x),
                click_y=int(click_y),
                screen_w=screen_w,
                screen_h=screen_h,
                window_title=window_title,
                session_dir=session_dir,
                screenshot_id=screenshot_id,
            )

            if not enrichment:
                continue

            # -- Update the edge's TargetSpec --
            target = edge.action.target

            # First-class TargetSpec fields.
            if enrichment.get("by_text"):
                target.by_text = enrichment["by_text"]
            if enrichment.get("by_role"):
                target.by_role = enrichment["by_role"]
            if enrichment.get("by_position"):
                target.by_position = tuple(enrichment["by_position"])

            # Additional fields go into context_hints.
            if not target.context_hints:
                target.context_hints = {}
            if enrichment.get("vlm_description"):
                target.context_hints["vlm_description"] = enrichment["vlm_description"]
            if enrichment.get("window_title"):
                target.context_hints["window_title"] = enrichment["window_title"]
            if enrichment.get("original_position"):
                target.context_hints["original_position"] = enrichment["original_position"]
            if enrichment.get("anchor_image_base64"):
                target.context_hints["anchor_image_base64"] = enrichment["anchor_image_base64"]
            if enrichment.get("by_text_source"):
                target.context_hints["by_text_source"] = enrichment["by_text_source"]
            if enrichment.get("som_element"):
                target.context_hints["som_element"] = enrichment["som_element"]

            # Flag the edge as enriched in its metadata.
            edge.metadata["vlm_enriched"] = True
            edge.metadata["enrichment_source"] = "reprocess_session"

            enriched_count += 1
            logger.debug(
                "Edge %s enrichi : by_text='%s', by_role='%s', anchor=%s",
                edge.edge_id,
                enrichment.get("by_text", ""),
                enrichment.get("by_role", ""),
                "oui" if enrichment.get("anchor_image_base64") else "non",
            )

        logger.info(
            "Enrichissement VLM terminé : %d/%d edges enrichis pour workflow '%s'",
            enriched_count, len(workflow.edges), workflow.name,
        )
        return enriched_count
def _extract_click_positions(self, edge) -> List[tuple]:
|
||
"""Extraire les positions de clic depuis un edge du workflow.
|
||
|
||
Supporte les actions simples (mouse_click) et compound (steps).
|
||
|
||
Returns:
|
||
Liste de tuples (x, y) en pixels. Peut être vide si pas de clic.
|
||
"""
|
||
action = edge.action
|
||
positions = []
|
||
|
||
if action.type == "mouse_click":
|
||
pos = action.parameters.get("position", [])
|
||
if pos and len(pos) == 2:
|
||
positions.append((pos[0], pos[1]))
|
||
|
||
elif action.type == "compound":
|
||
# Chercher les steps de type mouse_click
|
||
for step in action.parameters.get("steps", []):
|
||
if step.get("type") == "mouse_click":
|
||
pos = step.get("position", [])
|
||
if pos and len(pos) == 2:
|
||
positions.append((pos[0], pos[1]))
|
||
|
||
return positions
|
||
|
||
def _get_screen_resolution_from_session(
|
||
self, session_dir: Path,
|
||
) -> tuple:
|
||
"""Extraire la résolution d'écran depuis les événements d'une session.
|
||
|
||
Lit live_events.jsonl et cherche screen_metadata.screen_resolution
|
||
ou infère depuis les positions des clics.
|
||
|
||
Returns:
|
||
Tuple (width, height). Fallback: (1920, 1080).
|
||
"""
|
||
import json as _json
|
||
|
||
events_file = session_dir / "live_events.jsonl"
|
||
if not events_file.exists():
|
||
return (1920, 1080)
|
||
|
||
events = []
|
||
try:
|
||
for line in events_file.read_text().splitlines():
|
||
if not line.strip():
|
||
continue
|
||
try:
|
||
events.append(_json.loads(line))
|
||
except _json.JSONDecodeError:
|
||
continue
|
||
except Exception:
|
||
return (1920, 1080)
|
||
|
||
if events:
|
||
return _extract_screen_resolution(events)
|
||
return (1920, 1080)
|
||
|
||
# =========================================================================
|
||
# Retraitement (appelé par le SessionWorker)
|
||
# =========================================================================
|
||
|
||
def reprocess_session(
    self,
    session_id: str,
    progress_callback=None,
) -> Dict[str, Any]:
    """Reprocess a finalized session: analyze all screenshots, then build the workflow.

    Used by the SessionWorker to process sessions in the background.
    Looks for shot_*_full.png files on disk, analyzes them one by one
    via process_screenshot(), then calls finalize_session() to build
    the workflow.

    Args:
        session_id: Identifier of the session to reprocess.
        progress_callback: Callable(session_id, current, total, shot_id) for progress.

    Returns:
        Dict with the result of finalize_session() or an error dict.
    """
    logger.info(f"Retraitement de la session {session_id}")

    # Locate the session directory on disk.
    # Screenshots may live in:
    # - data/training/live_sessions/{session_id}/shots/
    # - data/training/live_sessions/{machine_id}/{session_id}/shots/
    session_dir = self._find_session_dir(session_id)
    if not session_dir:
        return {"error": f"Dossier session {session_id} introuvable sur disque"}

    shots_dir = session_dir / "shots"
    if not shots_dir.exists():
        return {"error": f"Dossier shots/ introuvable pour {session_id}"}

    # List the full screenshots (shot_XXXX_full.png), sorted by name.
    all_shots = sorted(shots_dir.glob("shot_*_full.png"))
    if not all_shots:
        return {
            "error": f"Aucun screenshot shot_*_full.png trouvé dans {shots_dir}",
            "session_id": session_id,
        }

    # Smart selection: keep only the significant screenshots to avoid
    # analyzing (near-)identical, redundant captures.
    key_shots = self._select_key_screenshots(session_id, all_shots)
    total_all = len(all_shots)
    total = len(key_shots)
    logger.info(
        f"Screenshots sélectionnés : {total}/{total_all} "
        f"(déduplication perceptuelle) dans {shots_dir}"
    )

    # Make sure the session is registered in the session_manager.
    self.session_manager.get_or_create(session_id)

    # Restore window_info from live_events.jsonl so that ScreenAnalyzer
    # creates ScreenStates with the correct window titles.
    self._restore_window_events(session_id, session_dir)

    # Restore user events (mouse_click, text_input, key_press) from
    # live_events.jsonl into session.events, so that to_raw_session()
    # can pass them to the GraphBuilder (edge/action construction).
    self._restore_user_events(session_id, session_dir)

    # Clear in-memory data (in case a previous processing attempt failed).
    with self._data_lock:
        self._screen_states.pop(session_id, None)
        self._embeddings.pop(session_id, None)

    # Analyze screenshots in parallel (2 workers max, for VRAM headroom).
    # Each process_screenshot() call is independent: ScreenAnalyzer + CLIP + FAISS.
    # Shared structures (_screen_states, _embeddings) are protected by _data_lock.
    max_parallel = min(2, total)
    errors = 0
    processed_count = 0

    def _analyze_one(shot_file, index=0):
        """Analyze one screenshot; return (shot_id, result_or_None, error_or_None).

        Before each analysis, checks whether a replay is running (GPU flag).
        If so, waits for the replay to finish before launching the VLM.
        """
        shot_id = shot_file.stem
        # Suspend while a replay is running (release the GPU).
        self._wait_if_replay_active(
            context=f"screenshot {index + 1}/{total} ({shot_id})"
        )
        try:
            res = self.process_screenshot(session_id, shot_id, str(shot_file))
            return (shot_id, res, None)
        except Exception as e:
            return (shot_id, None, str(e))

    if total <= 1:
        # Single screenshot — no need for a pool.
        for shot_file in key_shots:
            shot_id, res, err = _analyze_one(shot_file, index=0)
            processed_count += 1
            if progress_callback:
                try:
                    progress_callback(session_id, processed_count, total, shot_id)
                except Exception:
                    pass
            if err:
                logger.error(f"Erreur analyse screenshot {shot_id}: {err}")
                errors += 1
            elif res and res.get("state_id") is None:
                logger.warning(f"Screenshot {shot_id} : analyse échouée (pas de state_id)")
                errors += 1
    else:
        # Parallel processing — 2 screenshots at a time.
        # Note: _analyze_one checks the replay flag before each VLM call,
        # so workers suspend automatically while a replay is running.
        logger.info(f"Traitement parallèle : {max_parallel} workers pour {total} screenshots")
        with ThreadPoolExecutor(max_workers=max_parallel, thread_name_prefix="vlm") as pool:
            futures = {pool.submit(_analyze_one, sf, i): sf for i, sf in enumerate(key_shots)}
            for future in as_completed(futures):
                shot_id, res, err = future.result()
                processed_count += 1
                if progress_callback:
                    try:
                        progress_callback(session_id, processed_count, total, shot_id)
                    except Exception:
                        pass
                if err:
                    logger.error(f"Erreur analyse screenshot {shot_id}: {err}")
                    errors += 1
                elif res and res.get("state_id") is None:
                    logger.warning(f"Screenshot {shot_id} : analyse échouée (pas de state_id)")
                    errors += 1

    # Count how many states were produced.
    with self._data_lock:
        states_count = len(self._screen_states.get(session_id, []))

    logger.info(
        f"Session {session_id} : {states_count}/{total} screenshots analysés "
        f"({errors} erreurs, {total_all - total} skippés par dédup)"
    )

    # Build the workflow via finalize_session().
    # Note: the session_manager's finalize() was already called when the
    # session was marked finalized; no need to call it again.
    # finalize_session() uses the accumulated screen_states.
    result = self.finalize_session(session_id)

    # -- VLM enrichment of the workflow's target_specs --
    # After the workflow is built, enrich each edge with the visual
    # information (by_text, by_role, vlm_description, anchor_image)
    # extracted from the screenshots via SomEngine.
    if result.get("status") == "workflow_built" and result.get("workflow_id"):
        workflow_id = result["workflow_id"]
        with self._data_lock:
            workflow = self._workflows.get(workflow_id)
        if workflow and session_dir:
            try:
                enriched_count = self._enrich_workflow_targets(workflow, session_dir)
                result["enriched_edges"] = enriched_count
                result["total_edges"] = len(workflow.edges)

                # Re-save the enriched workflow to disk.
                if enriched_count > 0 and result.get("saved_path"):
                    saved_path = Path(result["saved_path"])
                    workflow.save_to_file(saved_path)
                    logger.info(
                        "Workflow enrichi re-sauvegardé : %s "
                        "(%d/%d edges enrichis)",
                        saved_path, enriched_count, len(workflow.edges),
                    )
            except Exception as e:
                logger.error(
                    "Erreur enrichissement VLM du workflow %s : %s",
                    workflow_id, e,
                )
                result["enrichment_error"] = str(e)

    return result
|
||
|
||
def _select_key_screenshots(self, session_id: str, shot_paths: List[Path]) -> List[Path]:
|
||
"""Sélectionner uniquement les screenshots significatifs pour éviter les analyses redondantes.
|
||
|
||
Critères :
|
||
1. Les screenshots d'action (shot_*_full) sont TOUJOURS conservés
|
||
car chacun correspond à une action utilisateur et est nécessaire
|
||
pour le mode séquentiel du GraphBuilder.
|
||
2. Pour les heartbeats ou autres, comparer au précédent via hash perceptuel.
|
||
3. Garder le premier et le dernier screenshot (toujours).
|
||
"""
|
||
if len(shot_paths) <= 2:
|
||
return list(shot_paths)
|
||
|
||
from PIL import Image
|
||
|
||
selected = []
|
||
last_hash = None
|
||
|
||
for path in shot_paths:
|
||
basename = os.path.basename(str(path))
|
||
|
||
# Les screenshots d'action (shot_*_full) sont systématiquement gardés.
|
||
# Chacun correspond à un clic ou une saisie clavier et constitue
|
||
# une étape distincte du workflow — ne pas les dédupliquer.
|
||
is_action = 'shot_' in basename and '_full' in basename
|
||
if is_action:
|
||
selected.append(path)
|
||
# Mettre à jour le hash pour la comparaison des heartbeats suivants
|
||
try:
|
||
img = Image.open(str(path)).resize((32, 32)).convert('L')
|
||
last_hash = hashlib.md5(img.tobytes()).hexdigest()
|
||
except Exception:
|
||
pass
|
||
continue
|
||
|
||
# Hash perceptuel pour les non-actions (heartbeats, etc.)
|
||
try:
|
||
img = Image.open(str(path)).resize((32, 32)).convert('L')
|
||
current_hash = hashlib.md5(img.tobytes()).hexdigest()
|
||
except Exception as e:
|
||
logger.debug(f"Impossible de hasher {basename}: {e}")
|
||
selected.append(path)
|
||
continue
|
||
|
||
if last_hash is None or current_hash != last_hash:
|
||
selected.append(path)
|
||
last_hash = current_hash
|
||
|
||
# Garantir que le premier et le dernier sont toujours inclus
|
||
if shot_paths[0] not in selected:
|
||
selected.insert(0, shot_paths[0])
|
||
if shot_paths[-1] not in selected:
|
||
selected.append(shot_paths[-1])
|
||
|
||
return selected
|
||
|
||
def _restore_window_events(self, session_id: str, session_dir: Path) -> None:
    """Restore window_info from live_events.jsonl during reprocessing.

    Builds a chronological mapping timestamp → window_info so each
    screenshot gets associated with the correct window title. Stored in
    session._shot_window_map : {shot_stem → window_info}.

    The mapping is used by process_screenshot() via last_window_info,
    which is updated before each shot.

    Args:
        session_id: Session whose events are being restored.
        session_dir: On-disk directory containing live_events.jsonl and shots/.

    Note: best-effort — any failure is logged as a warning, never raised.
    """
    import json as _json
    # Fix: removed an unused local `import re` that was never referenced.

    events_file = session_dir / "live_events.jsonl"
    if not events_file.exists():
        logger.debug(f"Pas de live_events.jsonl pour {session_id}")
        return

    try:
        # Collect every window_focus_change with its timestamp.
        window_changes = []  # [(timestamp, {"title": ..., "app_name": ...})]

        for line in events_file.read_text().splitlines():
            if not line.strip():
                continue
            try:
                evt = _json.loads(line)
            except _json.JSONDecodeError:
                continue

            event_data = evt.get("event", evt)
            evt_type = event_data.get("type", "")
            ts = float(event_data.get("timestamp", evt.get("timestamp", 0)))

            if evt_type == "window_focus_change":
                to_info = event_data.get("to") or event_data.get("window") or {}
                title = to_info.get("title", "")
                if title:
                    window_changes.append((ts, {
                        "title": title,
                        "app_name": to_info.get("app_name", "unknown"),
                    }))

        if not window_changes:
            logger.debug(f"Pas de window_focus_change dans {session_id}")
            return

        window_changes.sort(key=lambda x: x[0])

        # Map each shot_XXXX to window_info by timestamp.
        shots_dir = session_dir / "shots"
        shot_files = sorted(shots_dir.glob("shot_*_full.png"))

        # The shot timestamp is taken from the file mtime — an approximation
        # of the capture time (NOTE: assumes files were not touched since).
        shot_window_map = {}
        for shot_file in shot_files:
            shot_ts = shot_file.stat().st_mtime
            # Find the last window change that happened before this shot.
            best_window = window_changes[0][1]  # First one by default
            for wc_ts, wc_info in window_changes:
                if wc_ts <= shot_ts:
                    best_window = wc_info
                else:
                    break
            shot_window_map[shot_file.stem] = best_window

        # Store on the session so process_screenshot() can use it.
        session = self.session_manager.get_session(session_id)
        if session:
            session._shot_window_map = shot_window_map
            # Use the last known title as a fallback.
            session.last_window_info = window_changes[-1][1]

        titles_seen = {w.get("title", "") for w in shot_window_map.values()}
        logger.info(
            f"Window events restaurés pour {session_id}: "
            f"{len(shot_window_map)} shots mappés, "
            f"{len(titles_seen)} titres uniques: {titles_seen}"
        )

    except Exception as e:
        logger.warning(f"Erreur restauration window events pour {session_id}: {e}")
|
||
|
||
def _restore_user_events(self, session_id: str, session_dir: Path):
    """Restore user events from live_events.jsonl.

    Loads the action events (mouse_click, text_input, key_press)
    into session.events via session_manager.add_event().
    Without this, to_raw_session() returns an empty event list,
    and the GraphBuilder cannot build the edges' actions.

    Also builds a shot_id → timestamp mapping to enrich the
    ScreenStates with the correct event_time (needed by
    _find_transition_events).

    Note: clears the session's existing events first to avoid
    duplicates (the session may have been restored from a
    persistence file at startup).
    """
    import json as _json

    events_file = session_dir / "live_events.jsonl"
    if not events_file.exists():
        logger.debug(f"Pas de live_events.jsonl pour {session_id}")
        return

    try:
        # Clear existing events to avoid duplicates
        # (the session may have been pre-loaded from persistence).
        session = self.session_manager.get_session(session_id)
        if session and session.events:
            logger.debug(
                f"Nettoyage de {len(session.events)} events "
                f"pré-existants pour {session_id}"
            )
            session.events.clear()

        action_count = 0
        shot_ts_map = {}  # shot_id → timestamp of the action event

        for line in events_file.read_text().splitlines():
            if not line.strip():
                continue
            try:
                raw = _json.loads(line)
            except _json.JSONDecodeError:
                continue

            event_data = raw.get("event", raw)
            evt_type = event_data.get("type", "")
            ts = float(event_data.get("timestamp", raw.get("timestamp", 0)))

            if evt_type not in ("mouse_click", "text_input", "key_press"):
                continue

            # Build the event dict for add_event().
            evt_dict = {
                "type": evt_type,
                "timestamp": ts,
            }

            # Copy the type-specific payload.
            if evt_type == "mouse_click":
                evt_dict["pos"] = event_data.get("pos", [0, 0])
                evt_dict["button"] = event_data.get("button", "left")
            elif evt_type == "text_input":
                evt_dict["text"] = event_data.get("text", "")
            elif evt_type == "key_press":
                evt_dict["keys"] = event_data.get("keys", [])

            # Copy window info when available.
            window = event_data.get("window")
            if window:
                evt_dict["window"] = window

            # Copy screenshot_id when available.
            shot_id = event_data.get("screenshot_id")
            if shot_id:
                evt_dict["screenshot_id"] = shot_id
                # Map shot_id → timestamp to enrich the ScreenStates.
                shot_ts_map[shot_id] = ts

            # Copy screen_metadata when available.
            screen_meta = event_data.get("screen_metadata")
            if screen_meta:
                # Propagate the resolution for to_raw_session().
                if "screen_resolution" in screen_meta:
                    evt_dict["screen_resolution"] = screen_meta["screen_resolution"]

            self.session_manager.add_event(session_id, evt_dict)
            action_count += 1

        # Store the shot → timestamp mapping on the session
        # to enrich the ScreenStates in finalize_session().
        session = self.session_manager.get_session(session_id)
        if session:
            session._shot_ts_map = shot_ts_map

        logger.info(
            f"User events restaurés pour {session_id}: "
            f"{action_count} actions chargées, "
            f"{len(shot_ts_map)} shots avec timestamp"
        )

    except Exception as e:
        logger.warning(f"Erreur restauration user events pour {session_id}: {e}")
|
||
|
||
def _find_session_dir(self, session_id: str) -> Optional[Path]:
|
||
"""Trouver le dossier d'une session sur disque.
|
||
|
||
Cherche dans (par ordre de priorité) :
|
||
1. data/training/{session_id}/
|
||
2. data/training/{subdir}/{session_id}/ (ex: live_sessions)
|
||
3. data/training/{subdir}/{machine_id}/{session_id}/ (multi-machine)
|
||
"""
|
||
# Chemin direct
|
||
direct = self.data_dir / session_id
|
||
if direct.is_dir() and (direct / "shots").exists():
|
||
return direct
|
||
|
||
# Chercher dans les sous-dossiers (1 niveau : live_sessions/{session_id})
|
||
parent = self.data_dir
|
||
if parent.exists():
|
||
for subdir in parent.iterdir():
|
||
if subdir.is_dir():
|
||
candidate = subdir / session_id
|
||
if candidate.is_dir() and (candidate / "shots").exists():
|
||
return candidate
|
||
# Chercher 1 niveau plus profond (live_sessions/{machine_id}/{session_id})
|
||
if subdir.is_dir():
|
||
for machine_dir in subdir.iterdir():
|
||
if machine_dir.is_dir():
|
||
candidate = machine_dir / session_id
|
||
if candidate.is_dir() and (candidate / "shots").exists():
|
||
return candidate
|
||
|
||
# Chercher aussi dans le parent du data_dir (cas où data_dir = streaming_sessions)
|
||
parent_parent = self.data_dir.parent
|
||
if parent_parent.exists() and parent_parent != self.data_dir:
|
||
direct2 = parent_parent / session_id
|
||
if direct2.is_dir() and (direct2 / "shots").exists():
|
||
return direct2
|
||
for subdir in parent_parent.iterdir():
|
||
if subdir.is_dir() and subdir.name != self.data_dir.name:
|
||
candidate = subdir / session_id
|
||
if candidate.is_dir() and (candidate / "shots").exists():
|
||
return candidate
|
||
|
||
return None
|
||
|
||
def find_pending_sessions(self) -> List[str]:
    """Return the finalized sessions that still need processing.

    A session is "pending" when:
    - it is marked finalized in the session_manager,
    - it has zero ScreenStates in memory (never analyzed, or analysis lost),
    - full screenshots exist for it on disk.

    Returns:
        List of session_ids to process.
    """
    to_process: List[str] = []
    for sid in self.session_manager.session_ids:
        sess = self.session_manager.get_session(sid)
        if sess is None or not sess.finalized:
            continue

        # Skip sessions whose analysis already produced states.
        with self._data_lock:
            if len(self._screen_states.get(sid, [])) > 0:
                continue

        # Skip sessions that already have a workflow in memory
        # (check the _source_session attribute)…
        with self._data_lock:
            already_built = any(
                getattr(wf, '_source_session', None) == sid
                for wf in self._workflows.values()
            )
        if already_built:
            continue

        # …or on disk (metadata.source_session_id in the JSON files).
        if self._workflow_exists_on_disk(sid):
            continue

        # Finally, require full screenshots on disk.
        sess_dir = self._find_session_dir(sid)
        if not sess_dir:
            continue
        shots_dir = sess_dir / "shots"
        if not shots_dir.exists():
            continue
        full_shots = list(shots_dir.glob("shot_*_full.png"))
        if not full_shots:
            continue

        logger.info(
            f"Session pending trouvée : {sid} "
            f"({len(full_shots)} screenshots full)"
        )
        to_process.append(sid)

    return to_process
|
||
|
||
def _workflow_exists_on_disk(self, session_id: str) -> bool:
|
||
"""Vérifier si un workflow a déjà été produit pour cette session.
|
||
|
||
Parcourt les fichiers JSON dans data/training/workflows/ et cherche
|
||
source_session_id dans les métadonnées.
|
||
Utilise un cache pour éviter de re-lire les fichiers à chaque appel.
|
||
"""
|
||
import json as _json
|
||
|
||
if not hasattr(self, '_processed_sessions_cache'):
|
||
# Construire le cache au premier appel
|
||
self._processed_sessions_cache = set()
|
||
workflows_dir = self.data_dir / "workflows"
|
||
if workflows_dir.exists():
|
||
for wf_file in workflows_dir.rglob("*.json"):
|
||
try:
|
||
with open(wf_file, 'r') as f:
|
||
data = _json.load(f)
|
||
src = data.get('metadata', {}).get('source_session_id', '')
|
||
if src:
|
||
self._processed_sessions_cache.add(src)
|
||
except Exception:
|
||
continue
|
||
logger.info(
|
||
f"Cache sessions traitées : {len(self._processed_sessions_cache)} workflows existants"
|
||
)
|
||
|
||
return session_id in self._processed_sessions_cache
|
||
|
||
def _enrich_states_with_timestamps(
|
||
self,
|
||
states: List,
|
||
shot_ts_map: Dict[str, float],
|
||
):
|
||
"""Enrichir les ScreenStates avec les timestamps des événements d'action.
|
||
|
||
Le GraphBuilder utilise metadata['shot_timestamp'] (ou 'event_time')
|
||
pour associer les événements utilisateur aux transitions entre states.
|
||
Sans cette info, _find_transition_events() ne sait pas quels événements
|
||
appartiennent à quelle transition.
|
||
|
||
On fait un mapping par nom de shot :
|
||
- shot_XXXX → ScreenState dont le screen_state_id contient "XXXX"
|
||
ou par index séquentiel (shot_0001 → state[0], etc.)
|
||
|
||
Args:
|
||
states: Liste de ScreenStates à enrichir (modifiés in-place)
|
||
shot_ts_map: Mapping {shot_id: timestamp_epoch}
|
||
"""
|
||
if not shot_ts_map:
|
||
return
|
||
|
||
import re
|
||
|
||
# Trier les shot_ids par numéro pour correspondre à l'ordre des states
|
||
sorted_shots = sorted(
|
||
shot_ts_map.items(),
|
||
key=lambda x: x[0],
|
||
)
|
||
|
||
# Stratégie 1 : mapping par index séquentiel
|
||
# shot_0001 → state[0], shot_0002 → state[1], etc.
|
||
enriched = 0
|
||
for i, (shot_id, ts) in enumerate(sorted_shots):
|
||
if i < len(states):
|
||
state = states[i]
|
||
if state.metadata is None:
|
||
state.metadata = {}
|
||
state.metadata["shot_timestamp"] = ts
|
||
state.metadata["shot_id"] = shot_id
|
||
enriched += 1
|
||
|
||
# Pour les states restants (sans shot correspondant),
|
||
# interpoler entre les timestamps voisins
|
||
for i, state in enumerate(states):
|
||
if state.metadata and state.metadata.get("shot_timestamp"):
|
||
continue
|
||
if state.metadata is None:
|
||
state.metadata = {}
|
||
# Chercher les timestamps voisins
|
||
prev_ts = 0
|
||
next_ts = float('inf')
|
||
for j in range(i - 1, -1, -1):
|
||
t = states[j].metadata.get("shot_timestamp") if states[j].metadata else None
|
||
if t:
|
||
prev_ts = t
|
||
break
|
||
for j in range(i + 1, len(states)):
|
||
t = states[j].metadata.get("shot_timestamp") if states[j].metadata else None
|
||
if t:
|
||
next_ts = t
|
||
break
|
||
if prev_ts > 0 and next_ts < float('inf'):
|
||
state.metadata["shot_timestamp"] = (prev_ts + next_ts) / 2
|
||
elif prev_ts > 0:
|
||
state.metadata["shot_timestamp"] = prev_ts + 1.0
|
||
elif next_ts < float('inf'):
|
||
state.metadata["shot_timestamp"] = next_ts - 1.0
|
||
|
||
logger.debug(
|
||
f"Timestamps enrichis: {enriched}/{len(states)} states "
|
||
f"depuis {len(shot_ts_map)} shots"
|
||
)
|
||
|
||
def _cleanup_session_data(self, session_id: str):
    """Release the in-memory ScreenStates and embeddings after finalization."""
    with self._data_lock:
        dropped_states = self._screen_states.pop(session_id, [])
        dropped_embeddings = self._embeddings.pop(session_id, [])
    logger.info(
        f"Mémoire libérée pour {session_id}: "
        f"{len(dropped_states)} states, {len(dropped_embeddings)} embeddings"
    )
|
||
|
||
# =========================================================================
|
||
# Helpers
|
||
# =========================================================================
|
||
|
||
def _generate_workflow_name(self, session_id: str) -> str:
    """
    Generate a readable, human-friendly task name from the window titles.

    Analyzes the titles seen during the session to extract:
    - The main application (the most frequent one)
    - The document context (the part after the dash in the title)
    - An action description deduced from the context

    Example results:
        "Chrome - Facturation DPI" → "Chrome — Facturation DPI"
        "Excel - Budget_2026.xlsx" → "Excel — Budget 2026"
        3 apps → "Chrome, Excel et Word"
        No context → "Tâche du 17 mars à 14h"
    """
    import re

    session = self.session_manager.get_session(session_id)
    if not session:
        return self._fallback_task_name()

    titles = session.window_titles_seen
    apps = session.app_names_seen

    if not titles and not apps:
        return self._fallback_task_name()

    # Sort by decreasing frequency.
    sorted_titles = sorted(titles.items(), key=lambda x: -x[1])
    sorted_apps = sorted(apps.items(), key=lambda x: -x[1])

    # Pick the app name from the most frequent title.
    primary_title = sorted_titles[0][0] if sorted_titles else ""
    primary_app = sorted_apps[0][0] if sorted_apps else ""

    # Clean the application name for human display.
    app_display = self._humanize_app_name(primary_app) if primary_app else ""

    # Extract the contextual part of the title (before/after the separator).
    context_part = ""
    for sep in [" - ", " — ", " – ", " | ", ": "]:
        if sep in primary_title:
            parts = primary_title.split(sep)
            if len(parts) >= 2:
                candidates = [p.strip() for p in parts]
                app_lower = primary_app.lower()
                # Prefer a part that is not just the app name itself.
                context_candidates = [
                    c for c in candidates
                    if app_lower not in c.lower()
                    and c.lower() not in app_lower
                ]
                if context_candidates:
                    context_part = context_candidates[0]
                else:
                    context_part = candidates[0]
            break

    # Build the readable name.
    distinct_apps = [a for a, _ in sorted_apps if a.lower() not in ("unknown", "explorer")]

    if len(distinct_apps) >= 3:
        # Multi-app: "Chrome, Excel et Word"
        app_names = [self._humanize_app_name(a) for a in distinct_apps[:3]]
        if len(app_names) == 3:
            name = f"{app_names[0]}, {app_names[1]} et {app_names[2]}"
        else:
            name = " et ".join(app_names)
    elif context_part:
        # Clean the context to make it readable.
        clean_context = re.sub(r'[<>:"/\\|?*\[\]]', '', context_part)
        # Strip common file extensions.
        clean_context = re.sub(r'\.(xlsx?|csv|docx?|pdf|txt)$', '', clean_context, flags=re.IGNORECASE)
        # Replace underscores with spaces (and cap the length at 40 chars).
        clean_context = clean_context.replace('_', ' ').strip()[:40]
        if app_display:
            name = f"{app_display} \u2014 {clean_context}"
        else:
            name = clean_context
    elif app_display:
        name = f"{app_display} \u2014 session"
    else:
        name = self._fallback_task_name()

    # Deduplicate if a task with this name already exists.
    base_name = name
    counter = 1
    with self._data_lock:
        existing_names = {
            getattr(w, 'name', '') for w in self._workflows.values()
        }
        while name in existing_names:
            counter += 1
            name = f"{base_name} ({counter})"

    return name
|
||
|
||
@staticmethod
|
||
def _fallback_task_name() -> str:
|
||
"""Générer un nom de tâche par défaut basé sur la date et l'heure."""
|
||
now = datetime.now()
|
||
# Noms de mois en français
|
||
mois = [
|
||
"", "janvier", "février", "mars", "avril", "mai", "juin",
|
||
"juillet", "août", "septembre", "octobre", "novembre", "décembre"
|
||
]
|
||
return f"Tâche du {now.day} {mois[now.month]} à {now.hour}h{now.minute:02d}"
|
||
|
||
@staticmethod
|
||
def _humanize_app_name(app_name: str) -> str:
|
||
"""Convertir un nom d'application technique en nom lisible.
|
||
|
||
Exemples :
|
||
"notepad.exe" → "Bloc-notes"
|
||
"chrome.exe" → "Chrome"
|
||
"WindowsTerminal" → "Terminal"
|
||
"""
|
||
import re
|
||
# Supprimer l'extension .exe et les chemins
|
||
name = app_name.split("\\")[-1].split("/")[-1]
|
||
name = re.sub(r'\.exe$', '', name, flags=re.IGNORECASE).strip()
|
||
|
||
# Dictionnaire de noms humains pour les applications courantes
|
||
app_human_names = {
|
||
"notepad": "Bloc-notes",
|
||
"notepad++": "Notepad++",
|
||
"chrome": "Chrome",
|
||
"msedge": "Edge",
|
||
"firefox": "Firefox",
|
||
"explorer": "Explorateur",
|
||
"windowsterminal": "Terminal",
|
||
"cmd": "Invite de commandes",
|
||
"powershell": "PowerShell",
|
||
"excel": "Excel",
|
||
"winword": "Word",
|
||
"powerpnt": "PowerPoint",
|
||
"outlook": "Outlook",
|
||
"teams": "Teams",
|
||
"code": "VS Code",
|
||
"searchhost": "Recherche",
|
||
"applicationframehost": "Application",
|
||
"calc": "Calculatrice",
|
||
"mspaint": "Paint",
|
||
"snippingtool": "Capture d'écran",
|
||
}
|
||
|
||
name_lower = name.lower()
|
||
if name_lower in app_human_names:
|
||
return app_human_names[name_lower]
|
||
|
||
# Capitaliser le nom si pas dans le dictionnaire
|
||
return name.capitalize() if name else "Application"
|
||
|
||
@staticmethod
|
||
def _clean_app_name(app_name: str) -> str:
|
||
"""Nettoyer un nom d'application pour l'utiliser dans un nom de workflow."""
|
||
import re
|
||
# Supprimer l'extension .exe et les chemins
|
||
name = app_name.split("\\")[-1].split("/")[-1]
|
||
name = re.sub(r'\.exe$', '', name, flags=re.IGNORECASE)
|
||
# Capitaliser
|
||
name = name.strip().capitalize()
|
||
# Supprimer les caractères spéciaux
|
||
name = re.sub(r'[^a-zA-Z0-9àâäéèêëïîôùûüÿçÀÂÄÉÈÊËÏÎÔÙÛÜŸÇ_]', '', name)
|
||
return name or "App"
|
||
|
||
def _persist_workflow(self, workflow, session_id: str, machine_id: str = "default") -> Optional[Path]:
    """Save the workflow JSON to disk.

    Workflows are saved in a per-machine sub-directory:
        data/training/workflows/{machine_id}/wf_xxx.json

    This keeps workflows learned on different machines apart.

    Returns:
        The path written, or None on failure (logged, never raised).
    """
    try:
        # Per-machine directory (root for "default").
        if machine_id and machine_id != "default":
            target_dir = self.data_dir / "workflows" / machine_id
        else:
            target_dir = self.data_dir / "workflows"
        target_dir.mkdir(parents=True, exist_ok=True)
        out_path = target_dir / f"{workflow.workflow_id}.json"

        # Record the source session_id and machine_id in the metadata.
        if hasattr(workflow, 'metadata') and isinstance(workflow.metadata, dict):
            workflow.metadata['source_session_id'] = session_id
            workflow.metadata['machine_id'] = machine_id
        if not hasattr(workflow, '_machine_id'):
            workflow._machine_id = machine_id
        workflow._source_session = session_id

        workflow.save_to_file(out_path)
        logger.info(f"Workflow sauvegardé: {out_path} (session={session_id}, machine={machine_id})")
        return out_path
    except Exception as e:
        logger.error(f"Erreur sauvegarde workflow {session_id}: {e}")
        return None
|
||
|
||
def _build_raw_session_fallback(self, session, raw_dict):
    """Manually assemble a RawSession when from_dict fails."""
    from core.models.raw_session import RawSession, Event, Screenshot, RawWindowContext

    # Keys that are promoted to Event fields rather than kept in `data`.
    _meta_keys = ("t", "type", "window", "screenshot_id")

    def _to_event(payload):
        # Rebuild one Event, defaulting any missing window info.
        win = payload.get("window", {"title": "", "app_name": "unknown"})
        return Event(
            t=payload.get("t", 0.0),
            type=payload.get("type", "unknown"),
            window=RawWindowContext(
                title=win.get("title", ""),
                app_name=win.get("app_name", "unknown"),
            ),
            data={k: v for k, v in payload.items() if k not in _meta_keys},
            screenshot_id=payload.get("screenshot_id"),
        )

    def _to_screenshot(payload):
        # `relative_path` falls back to the legacy `path` key.
        return Screenshot(
            screenshot_id=payload["screenshot_id"],
            relative_path=payload.get("relative_path", payload.get("path", "")),
            captured_at=payload.get("captured_at", datetime.now().isoformat()),
        )

    return RawSession(
        session_id=session.session_id,
        agent_version="agent_v1_stream",
        environment=raw_dict.get("environment", {}),
        user=raw_dict.get("user", {"id": "remote_agent"}),
        context=raw_dict.get("context", {}),
        started_at=session.created_at,
        ended_at=datetime.now(),
        events=[_to_event(e) for e in raw_dict.get("events", [])],
        screenshots=[_to_screenshot(s) for s in raw_dict.get("screenshots", [])],
    )
|
||
|
||
def list_sessions(self, machine_id: Optional[str] = None) -> List[Dict[str, Any]]:
    """List known sessions with their processing state.

    Args:
        machine_id: When given, only sessions belonging to that machine are
            returned; when None, every session is listed.
    """
    result: List[Dict[str, Any]] = []
    for sid in self.session_manager.session_ids:
        sess = self.session_manager.get_session(sid)
        if sess is None:
            continue
        # Optional per-machine filter.
        if machine_id and sess.machine_id != machine_id:
            continue
        # Counters live in shared dicts mutated by worker threads; snapshot
        # them under the data lock.
        with self._data_lock:
            n_states = len(self._screen_states.get(sid, []))
            n_embeddings = len(self._embeddings.get(sid, []))
        result.append({
            "session_id": sess.session_id,
            "machine_id": sess.machine_id,
            "events_count": len(sess.events),
            "screenshots_count": len(sess.shot_paths),
            "states_count": n_states,
            "embeddings_count": n_embeddings,
            "last_window": sess.last_window_info,
            "created_at": sess.created_at.isoformat(),
            "last_activity": sess.last_activity.isoformat(),
            "finalized": sess.finalized,
        })
    return result
|
||
|
||
def list_workflows(self, machine_id: Optional[str] = None) -> List[Dict[str, Any]]:
    """List the workflows built so far.

    Args:
        machine_id: When given, only workflows tagged with that machine are
            returned; when None, every workflow is listed.
    """
    # Copy the mapping under the lock, then build summaries lock-free.
    with self._data_lock:
        snapshot = list(self._workflows.items())

    summaries: List[Dict[str, Any]] = []
    for wf_id, wf in snapshot:
        owner = getattr(wf, '_machine_id', 'default')
        # Optional per-machine filter.
        if machine_id and owner != machine_id:
            continue
        summaries.append({
            "workflow_id": wf_id,
            "machine_id": owner,
            "nodes": len(wf.nodes) if hasattr(wf, "nodes") else 0,
            "edges": len(wf.edges) if hasattr(wf, "edges") else 0,
            "name": getattr(wf, "name", wf_id),
        })
    return summaries
|
||
|
||
def reload_workflows(self) -> int:
    """Reload the workflows from disk.

    Useful after a new workflow has been exported from the VWB or learned
    by the streaming pipeline.

    Returns:
        The number of workflows loaded.
    """
    # Clear under the lock, but run the load outside it: the loader is
    # presumably taking the same (non-reentrant) lock itself — TODO confirm.
    with self._data_lock:
        self._workflows.clear()
    self._load_persisted_workflows()
    # Re-acquire to read a consistent count after the reload.
    with self._data_lock:
        count = len(self._workflows)
    logger.info("Workflows rechargés depuis le disque : %d", count)
    return count
|
||
|
||
# =========================================================================
|
||
# Extraction d'actions enrichies depuis un workflow appris
|
||
# =========================================================================
|
||
|
||
def extract_enriched_actions(
    self,
    workflow,
    params: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Extract enriched actions from a learned workflow (nodes + edges + events).

    Walks the graph in BFS order from the entry nodes and builds, for each
    edge, an enriched action containing:
      - normalized coordinates (x_pct, y_pct) from the original events
      - visual-targeting info (by_text, by_role, window_title)
      - node identifiers (from_node, to_node) for pre- and post-checks
      - the ``visual_mode=True`` flag enabling agent-side visual resolution

    Args:
        workflow: Workflow object or raw dict with nodes/edges.
        params: Substitution parameters (``${var}`` variables).

    Returns:
        List of enriched actions ready for the replay queue, or an empty
        list when the workflow has no usable edges.
    """
    # Fix: dropped the unused ``import uuid`` (uuid is only needed by
    # _edge_to_enriched_action, which imports it itself).
    from collections import defaultdict

    params = params or {}

    # Access the workflow data (object or dict).
    if hasattr(workflow, 'edges'):
        edges = workflow.edges
        entry_nodes = workflow.entry_nodes if hasattr(workflow, 'entry_nodes') else []
        nodes_list = workflow.nodes if hasattr(workflow, 'nodes') else []
    elif isinstance(workflow, dict):
        edges = workflow.get('edges', [])
        entry_nodes = workflow.get('entry_nodes', [])
        nodes_list = workflow.get('nodes', [])
    else:
        return []

    if not edges:
        return []

    # Index the nodes by ID.
    node_index = {}
    for n in nodes_list:
        nid = n.node_id if hasattr(n, 'node_id') else n.get('node_id', '')
        node_index[nid] = n

    # Index the outgoing edges per node.
    outgoing: Dict[str, list] = defaultdict(list)
    for edge in edges:
        fn = edge.from_node if hasattr(edge, 'from_node') else edge.get('from_node', '')
        outgoing[fn].append(edge)

    # Recover the original events of the source session.
    original_events = self._load_original_events_for_workflow(workflow)

    # Locate the source session folder (for the crops/anchors).
    source_session_dir = self._find_session_dir_for_workflow(workflow)

    # Infer the screen resolution from the maximum event positions.
    inferred_resolution = self._infer_screen_resolution(original_events)

    # BFS from the entry nodes.
    if not entry_nodes:
        # Fallback: first node of the edge list.
        first_edge = edges[0]
        fn = first_edge.from_node if hasattr(first_edge, 'from_node') else first_edge.get('from_node', '')
        entry_nodes = [fn]

    visited = set()
    queue = list(entry_nodes)
    ordered_edges = []

    while queue:
        node_id = queue.pop(0)
        if node_id in visited:
            continue
        visited.add(node_id)

        for edge in outgoing.get(node_id, []):
            ordered_edges.append(edge)
            tn = edge.to_node if hasattr(edge, 'to_node') else edge.get('to_node', '')
            if tn not in visited:
                queue.append(tn)

    # Build the enriched actions from the BFS-ordered edges.
    actions = []
    for edge in ordered_edges:
        enriched = self._edge_to_enriched_action(
            edge, node_index, original_events, params, inferred_resolution,
            source_session_dir,
        )
        if enriched:
            actions.extend(enriched)

    # Global cleanup: drop spurious actions, merge consecutive text_input,
    # deduplicate key_combo, etc.
    raw_count = len(actions)
    actions = clean_enriched_actions(actions)

    logger.info(
        "Actions enrichies extraites : %d actions (nettoyées depuis %d brutes) "
        "depuis %d edges (events originaux : %d)",
        len(actions), raw_count, len(ordered_edges), len(original_events),
    )
    return actions
|
||
|
||
def _load_original_events_for_workflow(self, workflow) -> List[Dict[str, Any]]:
|
||
"""Charger les événements originaux (live_events.jsonl) liés à un workflow.
|
||
|
||
Stratégie de recherche :
|
||
1. metadata.source_session_id (si le workflow le stocke)
|
||
2. Parcourir les sessions existantes pour trouver un match temporel
|
||
3. Utiliser le workflow_id comme hint (parfois contient le session_id)
|
||
|
||
Returns:
|
||
Liste d'events bruts (dicts), ou liste vide si introuvable.
|
||
"""
|
||
import json
|
||
|
||
# Stratégie 1 : metadata.source_session_id
|
||
metadata = workflow.metadata if hasattr(workflow, 'metadata') else (
|
||
workflow.get('metadata', {}) if isinstance(workflow, dict) else {}
|
||
)
|
||
source_sid = metadata.get('source_session_id', '')
|
||
|
||
if source_sid:
|
||
events = self._load_events_from_session(source_sid)
|
||
if events:
|
||
return events
|
||
|
||
# Stratégie 2 : workflow_id peut contenir ou être un session_id
|
||
wf_id = workflow.workflow_id if hasattr(workflow, 'workflow_id') else (
|
||
workflow.get('workflow_id', '') if isinstance(workflow, dict) else ''
|
||
)
|
||
if wf_id.startswith('sess_'):
|
||
events = self._load_events_from_session(wf_id)
|
||
if events:
|
||
return events
|
||
|
||
# Stratégie 3 : chercher les sessions les plus proches temporellement
|
||
created_at = None
|
||
if hasattr(workflow, 'created_at'):
|
||
created_at = workflow.created_at
|
||
elif isinstance(workflow, dict) and 'created_at' in workflow:
|
||
try:
|
||
from datetime import datetime
|
||
created_at = datetime.fromisoformat(workflow['created_at'])
|
||
except (ValueError, TypeError):
|
||
pass
|
||
|
||
if created_at:
|
||
events = self._find_closest_session_events(created_at)
|
||
if events:
|
||
return events
|
||
|
||
return []
|
||
|
||
def _find_session_dir_for_workflow(self, workflow) -> Optional[Path]:
|
||
"""Trouver le dossier de la session source associée à un workflow.
|
||
|
||
Utilise la même logique de recherche que _load_original_events_for_workflow
|
||
mais retourne le chemin du dossier au lieu des événements. Nécessaire pour
|
||
accéder aux crops (anchor images) stockés dans {session_dir}/shots/.
|
||
|
||
Returns:
|
||
Path du dossier session, ou None si introuvable.
|
||
"""
|
||
# Stratégie 1 : metadata.source_session_id
|
||
metadata = workflow.metadata if hasattr(workflow, 'metadata') else (
|
||
workflow.get('metadata', {}) if isinstance(workflow, dict) else {}
|
||
)
|
||
source_sid = metadata.get('source_session_id', '')
|
||
|
||
if source_sid:
|
||
session_dir = self._find_session_dir(source_sid)
|
||
if session_dir:
|
||
return session_dir
|
||
|
||
# Stratégie 2 : workflow_id peut contenir ou être un session_id
|
||
wf_id = workflow.workflow_id if hasattr(workflow, 'workflow_id') else (
|
||
workflow.get('workflow_id', '') if isinstance(workflow, dict) else ''
|
||
)
|
||
if wf_id.startswith('sess_'):
|
||
session_dir = self._find_session_dir(wf_id)
|
||
if session_dir:
|
||
return session_dir
|
||
|
||
# Stratégie 3 : chercher la session la plus proche temporellement
|
||
created_at = None
|
||
if hasattr(workflow, 'created_at'):
|
||
created_at = workflow.created_at
|
||
elif isinstance(workflow, dict) and 'created_at' in workflow:
|
||
try:
|
||
from datetime import datetime as _dt
|
||
created_at = _dt.fromisoformat(workflow['created_at'])
|
||
except (ValueError, TypeError):
|
||
pass
|
||
|
||
if created_at:
|
||
session_dir = self._find_closest_session_dir(created_at)
|
||
if session_dir:
|
||
return session_dir
|
||
|
||
return None
|
||
|
||
def _find_closest_session_dir(self, workflow_created_at) -> Optional[Path]:
|
||
"""Trouver le dossier de la session la plus proche temporellement.
|
||
|
||
Même logique que _find_closest_session_events mais retourne le Path
|
||
du dossier au lieu de charger les événements.
|
||
|
||
Returns:
|
||
Path du dossier session, ou None si aucun match dans les 10 minutes.
|
||
"""
|
||
from datetime import datetime as _dt
|
||
|
||
best_dir = None
|
||
best_delta = float('inf')
|
||
|
||
search_dirs = [self.data_dir]
|
||
if self.data_dir.exists():
|
||
for subdir in self.data_dir.iterdir():
|
||
if subdir.is_dir() and not subdir.name.startswith('.'):
|
||
search_dirs.append(subdir)
|
||
|
||
for search_dir in search_dirs:
|
||
if not search_dir.exists():
|
||
continue
|
||
for session_dir in search_dir.iterdir():
|
||
if not session_dir.is_dir():
|
||
continue
|
||
name = session_dir.name
|
||
if not name.startswith('sess_'):
|
||
continue
|
||
try:
|
||
ts_part = name.split('_')[1]
|
||
session_dt = _dt.strptime(ts_part, '%Y%m%dT%H%M%S')
|
||
delta = abs((workflow_created_at - session_dt).total_seconds())
|
||
if delta < best_delta:
|
||
best_delta = delta
|
||
best_dir = session_dir
|
||
except (IndexError, ValueError):
|
||
continue
|
||
|
||
if best_dir and best_delta < 600:
|
||
return best_dir
|
||
|
||
return None
|
||
|
||
def _load_anchor_crop(
|
||
self,
|
||
matched_event: Dict[str, Any],
|
||
session_dir: Path,
|
||
) -> Optional[str]:
|
||
"""Charger le crop de référence (anchor image) pour un événement clic.
|
||
|
||
Cherche le crop dans le dossier shots/ de la session source, en utilisant
|
||
le screenshot_id de l'événement original. Si le crop n'existe pas, tente
|
||
de le recréer à partir du screenshot full en croppant autour de la position
|
||
du clic.
|
||
|
||
Args:
|
||
matched_event: Événement original (dict avec screenshot_id et pos)
|
||
session_dir: Dossier de la session source
|
||
|
||
Returns:
|
||
Image crop encodée en base64, ou None si introuvable.
|
||
"""
|
||
import base64
|
||
|
||
screenshot_id = matched_event.get('screenshot_id', '')
|
||
if not screenshot_id:
|
||
return None
|
||
|
||
shots_dir = session_dir / "shots"
|
||
if not shots_dir.exists():
|
||
return None
|
||
|
||
# Stratégie 1 : crop déjà capturé par l'agent (shot_XXXX_crop.png)
|
||
crop_path = shots_dir / f"{screenshot_id}_crop.png"
|
||
if crop_path.exists():
|
||
try:
|
||
crop_b64 = base64.b64encode(crop_path.read_bytes()).decode()
|
||
logger.debug("Anchor crop chargé : %s", crop_path.name)
|
||
return crop_b64
|
||
except Exception as e:
|
||
logger.warning("Erreur lecture crop %s : %s", crop_path, e)
|
||
|
||
# Stratégie 1b : crop en JPEG (compression possible côté agent)
|
||
crop_jpg = shots_dir / f"{screenshot_id}_crop.jpg"
|
||
if crop_jpg.exists():
|
||
try:
|
||
crop_b64 = base64.b64encode(crop_jpg.read_bytes()).decode()
|
||
logger.debug("Anchor crop JPEG chargé : %s", crop_jpg.name)
|
||
return crop_b64
|
||
except Exception as e:
|
||
logger.warning("Erreur lecture crop JPEG %s : %s", crop_jpg, e)
|
||
|
||
# Stratégie 2 : cropper le full screenshot autour de la position du clic
|
||
full_path = shots_dir / f"{screenshot_id}_full.png"
|
||
if not full_path.exists():
|
||
full_path = shots_dir / f"{screenshot_id}_full.jpg"
|
||
if not full_path.exists():
|
||
return None
|
||
|
||
pos = matched_event.get('pos', [])
|
||
if not pos or len(pos) < 2:
|
||
return None
|
||
|
||
try:
|
||
from PIL import Image
|
||
import io
|
||
|
||
img = Image.open(full_path)
|
||
x, y = int(pos[0]), int(pos[1])
|
||
|
||
# Crop 400x400 centré sur le clic (même taille que le captor)
|
||
crop_size = 200 # demi-côté
|
||
left = max(0, x - crop_size)
|
||
top = max(0, y - crop_size)
|
||
right = min(img.width, x + crop_size)
|
||
bottom = min(img.height, y + crop_size)
|
||
|
||
crop = img.crop((left, top, right, bottom))
|
||
|
||
buf = io.BytesIO()
|
||
crop.save(buf, format="PNG", optimize=True)
|
||
crop_b64 = base64.b64encode(buf.getvalue()).decode()
|
||
|
||
logger.debug(
|
||
"Anchor crop généré depuis %s (pos=%s, crop=%dx%d)",
|
||
full_path.name, pos, crop.width, crop.height,
|
||
)
|
||
return crop_b64
|
||
except Exception as e:
|
||
logger.warning("Erreur génération crop depuis %s : %s", full_path, e)
|
||
return None
|
||
|
||
def _load_events_from_session(self, session_id: str) -> List[Dict[str, Any]]:
|
||
"""Charger les événements depuis le live_events.jsonl d'une session."""
|
||
import json
|
||
|
||
session_dir = self._find_session_dir(session_id)
|
||
if not session_dir:
|
||
return []
|
||
|
||
events_file = session_dir / "live_events.jsonl"
|
||
if not events_file.exists():
|
||
return []
|
||
|
||
events = []
|
||
try:
|
||
with open(events_file, 'r', encoding='utf-8') as f:
|
||
for line in f:
|
||
line = line.strip()
|
||
if not line:
|
||
continue
|
||
try:
|
||
data = json.loads(line)
|
||
event = data.get('event', data)
|
||
events.append(event)
|
||
except json.JSONDecodeError:
|
||
continue
|
||
except Exception as e:
|
||
logger.warning("Erreur lecture events %s : %s", events_file, e)
|
||
|
||
return events
|
||
|
||
def _find_closest_session_events(self, workflow_created_at) -> List[Dict[str, Any]]:
|
||
"""Trouver la session la plus proche temporellement du workflow.
|
||
|
||
Parcourt les dossiers de sessions sur disque et compare les dates
|
||
de création (encodées dans le nom du dossier sess_YYYYMMDDTHHMMSS_xxx).
|
||
"""
|
||
import json
|
||
from datetime import datetime
|
||
|
||
best_dir = None
|
||
best_delta = float('inf')
|
||
|
||
# Chercher dans data_dir et ses sous-dossiers (machine_id)
|
||
search_dirs = [self.data_dir]
|
||
if self.data_dir.exists():
|
||
for subdir in self.data_dir.iterdir():
|
||
if subdir.is_dir() and not subdir.name.startswith('.'):
|
||
search_dirs.append(subdir)
|
||
|
||
for search_dir in search_dirs:
|
||
for session_dir in search_dir.iterdir():
|
||
if not session_dir.is_dir():
|
||
continue
|
||
name = session_dir.name
|
||
if not name.startswith('sess_'):
|
||
continue
|
||
# Extraire le timestamp du nom : sess_YYYYMMDDTHHMMSS_xxx
|
||
try:
|
||
ts_part = name.split('_')[1] # YYYYMMDDTHHMMSS
|
||
session_dt = datetime.strptime(ts_part, '%Y%m%dT%H%M%S')
|
||
delta = abs((workflow_created_at - session_dt).total_seconds())
|
||
if delta < best_delta:
|
||
best_delta = delta
|
||
best_dir = session_dir
|
||
except (IndexError, ValueError):
|
||
continue
|
||
|
||
if best_dir and best_delta < 600: # Max 10 minutes d'écart
|
||
events_file = best_dir / "live_events.jsonl"
|
||
if events_file.exists():
|
||
events = []
|
||
try:
|
||
with open(events_file, 'r', encoding='utf-8') as f:
|
||
for line in f:
|
||
line = line.strip()
|
||
if not line:
|
||
continue
|
||
try:
|
||
data = json.loads(line)
|
||
event = data.get('event', data)
|
||
events.append(event)
|
||
except json.JSONDecodeError:
|
||
continue
|
||
if events:
|
||
logger.info(
|
||
"Events originaux trouvés dans %s (delta=%ds, %d events)",
|
||
best_dir.name, int(best_delta), len(events),
|
||
)
|
||
return events
|
||
except Exception as e:
|
||
logger.warning("Erreur lecture events %s : %s", events_file, e)
|
||
|
||
return []
|
||
|
||
def _find_session_wide_search(
    self,
    workflow,
    return_dir: bool = False,
):
    """Widened search for the source session of a workflow.

    Used when ``_load_original_events_for_workflow`` fails (no
    ``source_session_id`` and the 10-minute temporal window was too tight).

    Widens the window to 3 hours and uses the workflow's ``_machine_id``
    to filter candidate session folders.

    Args:
        workflow: Workflow object or dict.
        return_dir: Currently unused — the same ``(events, dir)`` tuple is
            returned regardless. NOTE(review): kept for API compatibility;
            confirm whether any caller still passes it.

    Returns:
        Tuple ``(events_list, session_dir_path)``; ``([], None)`` when no
        candidate session is found within the window.
    """
    import json as _json
    from datetime import datetime as _dt

    # Resolve the workflow's creation time (attribute or ISO string).
    created_at = None
    if hasattr(workflow, 'created_at'):
        created_at = workflow.created_at
    elif isinstance(workflow, dict) and 'created_at' in workflow:
        try:
            created_at = _dt.fromisoformat(workflow['created_at'])
        except (ValueError, TypeError):
            pass

    if not created_at:
        return ([], None)

    # Machine_id of the workflow (private attribute, or metadata fallback).
    machine_id = getattr(workflow, '_machine_id', None)
    if not machine_id:
        metadata = workflow.metadata if hasattr(workflow, 'metadata') else (
            workflow.get('metadata', {}) if isinstance(workflow, dict) else {}
        )
        machine_id = metadata.get('machine_id', '')

    best_dir = None
    best_delta = float('inf')
    max_delta = 10800  # 3 hours

    # Search data_dir plus its first-level sub-folders (one per machine).
    search_dirs = [self.data_dir]
    if self.data_dir.exists():
        for subdir in self.data_dir.iterdir():
            if subdir.is_dir() and not subdir.name.startswith('.'):
                search_dirs.append(subdir)

    for search_dir in search_dirs:
        if not search_dir.exists():
            continue
        for session_dir in search_dir.iterdir():
            if not session_dir.is_dir():
                continue
            name = session_dir.name
            if not name.startswith('sess_'):
                continue

            # Filter by machine_id when known: the session's parent folder
            # must be the machine_id, or the session must sit directly in
            # data_dir.
            if machine_id and machine_id != "default":
                parent_name = search_dir.name
                # Accept if the parent is the machine_id or the root data_dir
                if parent_name != machine_id and search_dir != self.data_dir:
                    continue

            # Folder names encode the creation time: sess_YYYYMMDDTHHMMSS_xxx
            try:
                ts_part = name.split('_')[1]
                session_dt = _dt.strptime(ts_part, '%Y%m%dT%H%M%S')
                delta = abs((created_at - session_dt).total_seconds())
                if delta < best_delta:
                    best_delta = delta
                    best_dir = session_dir
            except (IndexError, ValueError):
                continue

    if best_dir and best_delta < max_delta:
        events_file = best_dir / "live_events.jsonl"
        if events_file.exists():
            events = []
            try:
                with open(events_file, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            data = _json.loads(line)
                            # Lines are {"event": {...}} wrappers or bare events.
                            event = data.get('event', data)
                            events.append(event)
                        except _json.JSONDecodeError:
                            continue
                if events:
                    logger.info(
                        "Recherche élargie : session trouvée dans %s "
                        "(delta=%ds, %d events, machine=%s)",
                        best_dir.name, int(best_delta), len(events),
                        machine_id or "?",
                    )
                    return (events, best_dir)
            except Exception as e:
                logger.warning("Erreur lecture events %s : %s", events_file, e)

    return ([], None)
|
||
|
||
@staticmethod
|
||
def _infer_screen_resolution(events: List[Dict[str, Any]]) -> tuple:
|
||
"""Inférer la résolution d'écran depuis les positions maximales des events.
|
||
|
||
Analyse les coordonnées de tous les clics pour estimer la résolution
|
||
de l'écran source. Si un clic a x=1800 ou y=1500, la résolution
|
||
est au moins 1800+marge x 1500+marge.
|
||
|
||
Utilise une heuristique : arrondir vers la résolution standard la plus
|
||
proche parmi les plus courantes (1920x1080, 2560x1440, 2560x1600,
|
||
3840x2160, 1366x768, 1280x720).
|
||
|
||
Returns:
|
||
Tuple (width, height) de la résolution inférée, ou (1920, 1080) par défaut.
|
||
"""
|
||
# Résolutions standard connues
|
||
STANDARD_RESOLUTIONS = [
|
||
(1280, 720), (1366, 768), (1440, 900), (1600, 900),
|
||
(1920, 1080), (1920, 1200), (2560, 1440), (2560, 1600),
|
||
(3440, 1440), (3840, 2160),
|
||
]
|
||
|
||
max_x = 0
|
||
max_y = 0
|
||
for evt in events:
|
||
pos = evt.get('pos', [])
|
||
if pos and len(pos) == 2:
|
||
max_x = max(max_x, pos[0])
|
||
max_y = max(max_y, pos[1])
|
||
|
||
if max_x == 0 and max_y == 0:
|
||
return (1920, 1080)
|
||
|
||
# Trouver la résolution standard minimale qui contient tous les clics
|
||
for w, h in STANDARD_RESOLUTIONS:
|
||
if w >= max_x and h >= max_y:
|
||
return (w, h)
|
||
|
||
# Si aucune résolution standard ne convient, arrondir vers le haut
|
||
# par paliers de 100 pixels
|
||
inferred_w = ((max_x // 100) + 1) * 100
|
||
inferred_h = ((max_y // 100) + 1) * 100
|
||
return (inferred_w, inferred_h)
|
||
|
||
def _edge_to_enriched_action(
|
||
self,
|
||
edge,
|
||
node_index: Dict[str, Any],
|
||
original_events: List[Dict[str, Any]],
|
||
params: Dict[str, Any],
|
||
inferred_resolution: tuple = (1920, 1080),
|
||
source_session_dir: Optional[Path] = None,
|
||
) -> List[Dict[str, Any]]:
|
||
"""Convertir un edge de workflow en action(s) enrichie(s).
|
||
|
||
Enrichit chaque action avec :
|
||
- Coordonnées normalisées depuis les événements originaux
|
||
- Infos de ciblage visuel (by_text, by_role, window_title)
|
||
- Anchor image (crop) pour le template matching visuel
|
||
- Flag visual_mode pour la résolution visuelle côté agent
|
||
- Identifiants from_node/to_node pour pre-check et post-conditions
|
||
|
||
Args:
|
||
edge: WorkflowEdge (objet ou dict)
|
||
node_index: Index des nodes par ID
|
||
original_events: Événements originaux de la session
|
||
params: Variables de substitution
|
||
inferred_resolution: Résolution écran inférée
|
||
source_session_dir: Dossier de la session source (pour les crops/anchors)
|
||
|
||
Returns:
|
||
Liste d'actions enrichies (1 pour un edge simple, N pour un compound)
|
||
"""
|
||
import uuid
|
||
|
||
# Extraire les données de l'edge (objet ou dict)
|
||
if hasattr(edge, 'edge_id'):
|
||
edge_id = edge.edge_id
|
||
from_node = edge.from_node
|
||
to_node = edge.to_node
|
||
action_obj = edge.action
|
||
edge_metadata = edge.metadata if hasattr(edge, 'metadata') else {}
|
||
else:
|
||
edge_id = edge.get('edge_id', '')
|
||
from_node = edge.get('from_node', '')
|
||
to_node = edge.get('to_node', '')
|
||
action_obj = edge.get('action', {})
|
||
edge_metadata = edge.get('metadata', {})
|
||
|
||
# Extraire les données de l'action
|
||
if hasattr(action_obj, 'type'):
|
||
action_type = action_obj.type
|
||
target = action_obj.target
|
||
action_params = action_obj.parameters or {}
|
||
elif isinstance(action_obj, dict):
|
||
action_type = action_obj.get('type', 'unknown')
|
||
target = action_obj.get('target', {})
|
||
action_params = action_obj.get('parameters', {})
|
||
else:
|
||
return []
|
||
|
||
# Extraire les infos du target
|
||
if hasattr(target, 'by_role'):
|
||
by_role = target.by_role or ''
|
||
by_text = target.by_text or ''
|
||
by_position = target.by_position
|
||
elif isinstance(target, dict):
|
||
by_role = target.get('by_role', '') or ''
|
||
by_text = target.get('by_text', '') or ''
|
||
by_position = target.get('by_position')
|
||
else:
|
||
by_role = ''
|
||
by_text = ''
|
||
by_position = None
|
||
|
||
# Données du node source (pour pre-check et window_title)
|
||
source_node = node_index.get(from_node)
|
||
target_node = node_index.get(to_node)
|
||
|
||
window_title = self._extract_window_title(source_node)
|
||
target_window_title = self._extract_window_title(target_node)
|
||
|
||
# Chercher l'événement original correspondant à cet edge
|
||
matched_event = self._match_edge_to_event(
|
||
edge_metadata, action_type, action_params, original_events
|
||
)
|
||
|
||
# Construire les coordonnées par ordre de priorité :
|
||
# 1. by_position du target (explicite, fiable)
|
||
# 2. position dans action_params (set par GraphBuilder depuis l'event original)
|
||
# 3. matched_event (recherche dans les events de la session - moins fiable)
|
||
# 4. (0, 0) → sera résolu visuellement par l'agent
|
||
x_pct = 0.0
|
||
y_pct = 0.0
|
||
text = ''
|
||
keys = []
|
||
button = action_params.get('button', 'left') if isinstance(action_params, dict) else 'left'
|
||
|
||
# Priorité 1 : by_position explicite du target
|
||
if by_position and isinstance(by_position, (list, tuple)) and len(by_position) == 2:
|
||
px, py = by_position
|
||
if px <= 1.0 and py <= 1.0:
|
||
x_pct = px
|
||
y_pct = py
|
||
elif px > 0 or py > 0:
|
||
rw = (action_params.get('ref_width', 1920) or 1920) if isinstance(action_params, dict) else 1920
|
||
rh = (action_params.get('ref_height', 1080) or 1080) if isinstance(action_params, dict) else 1080
|
||
x_pct = round(px / rw, 6)
|
||
y_pct = round(py / rh, 6)
|
||
|
||
# Priorité 2 : position dans action_params (de GraphBuilder)
|
||
if x_pct == 0.0 and y_pct == 0.0 and isinstance(action_params, dict):
|
||
pos = action_params.get('position', [])
|
||
if pos and len(pos) == 2 and (pos[0] > 0 or pos[1] > 0):
|
||
rw = (action_params.get('ref_width') or inferred_resolution[0]) if isinstance(action_params, dict) else inferred_resolution[0]
|
||
rh = (action_params.get('ref_height') or inferred_resolution[1]) if isinstance(action_params, dict) else inferred_resolution[1]
|
||
x_pct = round(pos[0] / rw, 6)
|
||
y_pct = round(pos[1] / rh, 6)
|
||
|
||
# Priorité 3 : matched_event (session la plus proche)
|
||
if x_pct == 0.0 and y_pct == 0.0 and matched_event:
|
||
pos = matched_event.get('pos', [])
|
||
if pos and len(pos) == 2:
|
||
ref_width = matched_event.get('screen_width') or inferred_resolution[0]
|
||
ref_height = matched_event.get('screen_height') or inferred_resolution[1]
|
||
x_pct = round(pos[0] / ref_width, 6)
|
||
y_pct = round(pos[1] / ref_height, 6)
|
||
|
||
# Sécurité : clamper à [0, 1]
|
||
x_pct = max(0.0, min(1.0, x_pct))
|
||
y_pct = max(0.0, min(1.0, y_pct))
|
||
|
||
# Texte et touches : action_params d'abord, matched_event en complément
|
||
if isinstance(action_params, dict):
|
||
text = action_params.get('text', '')
|
||
keys = action_params.get('keys', [])
|
||
if not text and matched_event:
|
||
text = matched_event.get('text', '')
|
||
if not keys and matched_event:
|
||
keys = matched_event.get('keys', [])
|
||
if matched_event:
|
||
button = matched_event.get('button', button)
|
||
# Enrichir le window_title si absent
|
||
event_window = matched_event.get('window', {})
|
||
if not window_title and event_window:
|
||
window_title = event_window.get('title', '')
|
||
|
||
# Sanitiser les touches : convertir les caractères de contrôle
|
||
if keys:
|
||
keys = _sanitize_keys(keys)
|
||
# Si ne reste que des modificateurs seuls → action parasite, skip
|
||
if _is_modifier_only(keys):
|
||
return []
|
||
|
||
# Substitution de variables dans le texte
|
||
if text and params:
|
||
text = self._substitute_vars(text, params, action_params)
|
||
|
||
# Déterminer le type d'action normalisé
|
||
if action_type == 'mouse_click':
|
||
norm_type = 'click'
|
||
elif action_type == 'text_input':
|
||
norm_type = 'type'
|
||
elif action_type == 'key_press':
|
||
norm_type = 'key_combo'
|
||
elif action_type == 'compound':
|
||
# Décomposer les compound en sous-actions
|
||
steps = action_params.get('steps', []) if isinstance(action_params, dict) else []
|
||
return self._expand_compound_enriched(
|
||
steps, edge_id, from_node, to_node, window_title, params
|
||
)
|
||
elif action_type in ('unknown', 'unknown_element'):
|
||
# Actions "unknown" : essayer de deviner depuis l'événement original
|
||
if matched_event:
|
||
evt_type = matched_event.get('type', '')
|
||
if evt_type == 'mouse_click':
|
||
norm_type = 'click'
|
||
elif evt_type == 'text_input':
|
||
norm_type = 'type'
|
||
elif evt_type == 'key_press':
|
||
norm_type = 'key_combo'
|
||
else:
|
||
# Event trouvé mais type non reconnu : défaut click
|
||
norm_type = 'click'
|
||
logger.debug(
|
||
"Edge %s : action unknown, event type=%s -> défaut click",
|
||
edge_id, evt_type,
|
||
)
|
||
else:
|
||
# Pas d'événement original : défaut click (la transition entre
|
||
# deux états est presque toujours causée par un clic)
|
||
norm_type = 'click'
|
||
logger.debug(
|
||
"Edge %s : action unknown, pas d'event original -> défaut click",
|
||
edge_id,
|
||
)
|
||
else:
|
||
norm_type = action_type
|
||
|
||
# Construire le target_spec pour la résolution visuelle
|
||
target_spec = {}
|
||
if by_text and by_text not in ('', 'null', 'None'):
|
||
target_spec['by_text'] = by_text
|
||
if by_role and by_role not in ('', 'unknown', 'unknown_element', 'null'):
|
||
target_spec['by_role'] = by_role
|
||
if window_title:
|
||
target_spec['window_title'] = window_title
|
||
|
||
# Enrichir le target_spec avec les textes du node source (OCR)
|
||
source_texts = self._extract_required_texts(source_node)
|
||
if source_texts:
|
||
target_spec['context_hints'] = {'screen_texts': source_texts[:3]}
|
||
|
||
# Enrichir avec l'anchor image (crop de référence) pour les clics
|
||
if norm_type == 'click' and matched_event and source_session_dir:
|
||
anchor_b64 = self._load_anchor_crop(matched_event, source_session_dir)
|
||
if anchor_b64:
|
||
target_spec['anchor_image_base64'] = anchor_b64
|
||
logger.debug(
|
||
"Anchor image chargée pour edge %s (screenshot_id=%s)",
|
||
edge_id, matched_event.get('screenshot_id', '?'),
|
||
)
|
||
|
||
# Construire l'action enrichie
|
||
action = {
|
||
'action_id': f'act_{uuid.uuid4().hex[:8]}',
|
||
'type': norm_type,
|
||
'edge_id': edge_id,
|
||
'from_node': from_node,
|
||
'to_node': to_node,
|
||
'x_pct': x_pct,
|
||
'y_pct': y_pct,
|
||
'window_title': window_title,
|
||
}
|
||
|
||
# Ajouter les champs spécifiques au type d'action
|
||
if norm_type == 'click':
|
||
action['button'] = button
|
||
elif norm_type == 'type':
|
||
action['text'] = text
|
||
elif norm_type == 'key_combo':
|
||
action['keys'] = keys
|
||
|
||
# Activer la résolution visuelle si on a des critères sémantiques
|
||
# OU si les coordonnées sont nulles (nécessite une résolution)
|
||
if target_spec or (x_pct == 0.0 and y_pct == 0.0 and norm_type == 'click'):
|
||
action['visual_mode'] = True
|
||
action['target_spec'] = target_spec
|
||
|
||
return [action]
|
||
|
||
def _match_edge_to_event(
|
||
self,
|
||
edge_metadata: Dict[str, Any],
|
||
action_type: str,
|
||
action_params: Dict[str, Any],
|
||
original_events: List[Dict[str, Any]],
|
||
) -> Optional[Dict[str, Any]]:
|
||
"""Trouver l'événement original correspondant à un edge.
|
||
|
||
Stratégie de matching :
|
||
1. Par type d'action (mouse_click, text_input, key_press)
|
||
2. Par position approximative (si position dans action_params)
|
||
3. Par ordre chronologique (premier événement non-matché du bon type)
|
||
|
||
Returns:
|
||
L'événement matché ou None.
|
||
"""
|
||
if not original_events:
|
||
return None
|
||
|
||
# Type d'événement attendu
|
||
expected_types = set()
|
||
if action_type in ('mouse_click', 'click', 'unknown'):
|
||
expected_types.add('mouse_click')
|
||
if action_type in ('text_input', 'type', 'unknown'):
|
||
expected_types.add('text_input')
|
||
if action_type in ('key_press', 'key_combo', 'unknown'):
|
||
expected_types.add('key_press')
|
||
|
||
if not expected_types:
|
||
expected_types = {'mouse_click', 'text_input', 'key_press'}
|
||
|
||
# Filtrer les événements du bon type
|
||
candidates = [
|
||
e for e in original_events
|
||
if e.get('type', '') in expected_types
|
||
]
|
||
|
||
if not candidates:
|
||
return None
|
||
|
||
# Si on a une position dans action_params, chercher l'événement le plus proche
|
||
ref_pos = (action_params.get('position', []) if isinstance(action_params, dict) else [])
|
||
if ref_pos and len(ref_pos) == 2 and ref_pos[0] > 0:
|
||
best_event = None
|
||
best_dist = float('inf')
|
||
for evt in candidates:
|
||
evt_pos = evt.get('pos', [])
|
||
if evt_pos and len(evt_pos) == 2:
|
||
dx = ref_pos[0] - evt_pos[0]
|
||
dy = ref_pos[1] - evt_pos[1]
|
||
dist = (dx * dx + dy * dy) ** 0.5
|
||
if dist < best_dist:
|
||
best_dist = dist
|
||
best_event = evt
|
||
if best_event and best_dist < 200: # Max 200px d'écart
|
||
return best_event
|
||
|
||
# Fallback : le premier événement du bon type
|
||
# On utilise created_from_event du edge metadata comme hint
|
||
created_from = edge_metadata.get('created_from_event', '')
|
||
if created_from:
|
||
for evt in candidates:
|
||
if evt.get('type') == created_from:
|
||
return evt
|
||
|
||
return candidates[0] if candidates else None
|
||
|
||
def _extract_window_title(self, node) -> str:
|
||
"""Extraire le titre de fenêtre depuis un node (objet ou dict)."""
|
||
if node is None:
|
||
return ''
|
||
if hasattr(node, 'template'):
|
||
tpl = node.template
|
||
if tpl and hasattr(tpl, 'window') and tpl.window:
|
||
return tpl.window.title_contains or tpl.window.title_pattern or ''
|
||
elif isinstance(node, dict):
|
||
template = node.get('template', {})
|
||
if isinstance(template, dict):
|
||
window = template.get('window', {})
|
||
if isinstance(window, dict):
|
||
return window.get('title_contains', '') or window.get('title_pattern', '') or ''
|
||
return ''
|
||
|
||
def _extract_required_texts(self, node) -> List[str]:
|
||
"""Extraire les textes requis depuis le template d'un node."""
|
||
if node is None:
|
||
return []
|
||
if hasattr(node, 'template'):
|
||
tpl = node.template
|
||
if tpl and hasattr(tpl, 'text') and tpl.text:
|
||
texts = tpl.text.required_texts or []
|
||
# Filtrer les textes trop courts ou trop longs
|
||
return [t for t in texts if 3 <= len(t) <= 80]
|
||
elif isinstance(node, dict):
|
||
template = node.get('template', {})
|
||
if isinstance(template, dict):
|
||
text_spec = template.get('text', {})
|
||
if isinstance(text_spec, dict):
|
||
texts = text_spec.get('required_texts', [])
|
||
return [t for t in texts if isinstance(t, str) and 3 <= len(t) <= 80]
|
||
return []
|
||
|
||
@staticmethod
|
||
def _substitute_vars(text: str, params: Dict[str, Any], action_params: Dict[str, Any]) -> str:
|
||
"""Substituer les variables ${var} dans un texte."""
|
||
import re
|
||
defaults = action_params.get('defaults', {}) if isinstance(action_params, dict) else {}
|
||
|
||
def replacer(match):
|
||
var_name = match.group(1)
|
||
return str(params.get(var_name, defaults.get(var_name, match.group(0))))
|
||
|
||
return re.sub(r'\$\{(\w+)\}', replacer, text)
|
||
|
||
def _expand_compound_enriched(
|
||
self,
|
||
steps: List[Dict[str, Any]],
|
||
edge_id: str,
|
||
from_node: str,
|
||
to_node: str,
|
||
window_title: str,
|
||
params: Dict[str, Any],
|
||
) -> List[Dict[str, Any]]:
|
||
"""Décomposer un compound en actions enrichies individuelles.
|
||
|
||
Applique le nettoyage des steps parasites avant expansion :
|
||
- Suppression des modificateurs seuls (ctrl, alt, shift, etc.)
|
||
- Fusion des text_input consécutifs
|
||
- Déduplication des key_combo consécutifs identiques
|
||
|
||
Supporte les types de steps produits par GraphBuilder._build_compound_action() :
|
||
- mouse_click / click : clic souris avec x_pct/y_pct ou position/ref_*
|
||
- text_input / type : saisie de texte
|
||
- key_press / key_combo : combinaison de touches
|
||
- wait : pause entre actions
|
||
"""
|
||
import uuid
|
||
|
||
# Nettoyage des steps parasites avant expansion
|
||
steps = clean_compound_steps(steps)
|
||
|
||
actions = []
|
||
for step in steps:
|
||
step_type = step.get('type', 'unknown')
|
||
action = {
|
||
'action_id': f'act_{uuid.uuid4().hex[:8]}',
|
||
'edge_id': edge_id,
|
||
'from_node': from_node,
|
||
'to_node': to_node,
|
||
'window_title': window_title,
|
||
}
|
||
if step_type in ('key_press', 'key_combo'):
|
||
action['type'] = 'key_combo'
|
||
keys = step.get('keys', [])
|
||
if not keys and step.get('key'):
|
||
keys = [step['key']]
|
||
action['keys'] = keys
|
||
elif step_type in ('text_input', 'type'):
|
||
action['type'] = 'type'
|
||
text = step.get('text', '')
|
||
text = self._substitute_vars(text, params, step)
|
||
action['text'] = text
|
||
elif step_type == 'wait':
|
||
action['type'] = 'wait'
|
||
action['duration_ms'] = step.get('duration_ms', 500)
|
||
elif step_type in ('mouse_click', 'click'):
|
||
action['type'] = 'click'
|
||
# Coordonnées normalisées directes (x_pct/y_pct)
|
||
x_pct = step.get('x_pct', 0.0)
|
||
y_pct = step.get('y_pct', 0.0)
|
||
# Fallback : calculer depuis position absolue + résolution de référence
|
||
if x_pct == 0.0 and y_pct == 0.0:
|
||
pos = step.get('position', [])
|
||
if pos and len(pos) == 2 and (pos[0] > 0 or pos[1] > 0):
|
||
rw = step.get('ref_width', 1920) or 1920
|
||
rh = step.get('ref_height', 1080) or 1080
|
||
x_pct = round(pos[0] / rw, 6)
|
||
y_pct = round(pos[1] / rh, 6)
|
||
action['x_pct'] = x_pct
|
||
action['y_pct'] = y_pct
|
||
action['button'] = step.get('button', 'left')
|
||
else:
|
||
continue
|
||
actions.append(action)
|
||
|
||
# Nettoyage post-expansion des actions enrichies
|
||
return clean_enriched_actions(actions)
|
||
|
||
# =========================================================================
|
||
# Replay hybride : événements bruts + structure workflow
|
||
# =========================================================================
|
||
|
||
    def build_hybrid_replay(
        self,
        workflow,
        session_id: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """Build a hybrid replay combining raw events and workflow structure.

        The hybrid replay uses:
        - the raw events (live_events.jsonl) as the SOURCE OF ACTIONS,
        - the workflow structure (nodes) as the VERIFICATION STRUCTURE.

        Events are grouped by node transition. Between each group, a
        ``verify_screen`` action is inserted to check that the screen matches
        the expected node before continuing.

        Args:
            workflow: Workflow object or raw dict with nodes/edges.
            session_id: Explicit session identifier (optional; otherwise
                derived from the workflow metadata).

        Returns:
            List of clean actions ready for the replay queue, with
            ``verify_screen`` steps interleaved between transition groups.
        """
        import uuid

        # 1. Load the raw events captured during the session.
        original_events = self._load_original_events_for_workflow(workflow)

        # If the standard lookup fails, widen the time window.
        # The SessionWorker may build the workflow 1-2h after the capture
        # (long VLM analysis), so the 10-minute window is too tight.
        if not original_events:
            original_events, session_dir_hint = self._find_session_wide_search(workflow)
        else:
            session_dir_hint = None

        if not original_events:
            logger.warning("build_hybrid_replay : aucun événement brut trouvé")
            return []

        # 2. Locate the source session directory.
        session_dir = session_dir_hint or self._find_session_dir_for_workflow(workflow)
        if not session_dir:
            # Fallback: search again with a wide time window.
            if not session_dir_hint:
                _, session_dir = self._find_session_wide_search(workflow, return_dir=True)
            if not session_dir:
                logger.warning("build_hybrid_replay : dossier session introuvable")
                return []

        # 3. Map the session screenshots onto the workflow nodes.
        node_timeline = self._map_screenshots_to_nodes(session_dir, workflow)
        if not node_timeline:
            logger.warning(
                "build_hybrid_replay : impossible de mapper les screenshots aux nodes, "
                "fallback sur extract_enriched_actions"
            )
            return []

        # 4. Group the raw events by node transition.
        groups = self._group_events_by_transition(original_events, node_timeline)
        if not groups:
            logger.warning("build_hybrid_replay : aucun groupe de transition construit")
            return []

        # 5. Infer the source screen resolution (needed for x_pct/y_pct).
        screen_w, screen_h = self._infer_screen_resolution(original_events)

        # 6. Build the final action list, one transition group at a time.
        actions = []
        for group_idx, group in enumerate(groups):
            group_events = group["events"]
            to_node = group["to_node"]

            # Convert the raw events into replay actions.
            group_actions = self._events_to_replay_actions(
                group_events, screen_w, screen_h, group_idx,
            )

            # Clean the group (filter parasites, merge text, dedupe combos, waits).
            group_actions = clean_enriched_actions(group_actions)

            actions.extend(group_actions)

            # Insert a visual verification after each group.
            if to_node:
                actions.append({
                    "action_id": f"act_verify_{uuid.uuid4().hex[:8]}",
                    "type": "verify_screen",
                    "expected_node": to_node,
                    "timeout_ms": 5000,
                    "group": group_idx,
                })

        logger.info(
            "Replay hybride construit : %d actions (%d groupes, %d events bruts, "
            "résolution=%dx%d)",
            len(actions), len(groups), len(original_events), screen_w, screen_h,
        )
        return actions
|
||
|
||
    def _map_screenshots_to_nodes(
        self,
        session_dir: Path,
        workflow,
    ) -> List[tuple]:
        """Map each session screenshot to the matching workflow node.

        Uses window_focus_change events to associate a timestamp with a
        window_title, then the window_title with a node (via
        node.template.window.title_contains).

        Returns:
            List of (timestamp, node_id) sorted chronologically, or an empty
            list when the mapping is impossible.
        """
        import json as _json

        # Extract the window titles declared on the workflow nodes.
        if hasattr(workflow, 'nodes'):
            nodes = workflow.nodes
        elif isinstance(workflow, dict):
            nodes = workflow.get('nodes', [])
        else:
            return []

        # Build an index title_fragment -> node_id.
        # A node may declare a title_contains like "Bloc-notes" or "Sans titre".
        title_to_node = {}
        node_order = []  # Topological order (by index)
        for node in nodes:
            nid = node.node_id if hasattr(node, 'node_id') else node.get('node_id', '')
            node_order.append(nid)
            win_title = self._extract_window_title(node)
            if win_title:
                title_to_node[win_title.lower()] = nid

        if not title_to_node:
            logger.debug("Aucun titre de fenêtre dans les nodes du workflow")
            return []

        # Load the window focus changes from live_events.jsonl.
        events_file = session_dir / "live_events.jsonl"
        if not events_file.exists():
            return []

        window_changes = []  # [(timestamp, window_title)]
        try:
            for line in events_file.read_text(encoding="utf-8").splitlines():
                if not line.strip():
                    continue
                try:
                    evt = _json.loads(line)
                except _json.JSONDecodeError:
                    # Skip malformed JSONL lines instead of aborting.
                    continue

                # Events may be wrapped in an {"event": ...} envelope.
                event_data = evt.get("event", evt)
                evt_type = event_data.get("type", "")
                ts = float(event_data.get("timestamp", evt.get("timestamp", 0)))

                if evt_type == "window_focus_change":
                    to_info = event_data.get("to") or event_data.get("window") or {}
                    title = to_info.get("title", "")
                    if title:
                        window_changes.append((ts, title))
        except Exception as e:
            logger.warning("Erreur lecture events pour mapping screenshots : %s", e)
            return []

        if not window_changes:
            # Fallback: use the screenshot timestamps (mtime) and map them to
            # the nodes in workflow order.
            return self._map_screenshots_by_order(session_dir, node_order)

        # Associate each window change with the matching node; consecutive
        # duplicates are collapsed so the timeline only records transitions.
        timeline = []
        last_node_id = None
        for ts, title in sorted(window_changes, key=lambda x: x[0]):
            matched_node = self._match_title_to_node(title, title_to_node)
            if matched_node and matched_node != last_node_id:
                timeline.append((ts, matched_node))
                last_node_id = matched_node

        # If the timeline is empty, try a looser mapping via the screenshots.
        if not timeline:
            return self._map_screenshots_by_order(session_dir, node_order)

        logger.info(
            "Timeline node mappée : %d transitions (%s)",
            len(timeline),
            " → ".join(nid for _, nid in timeline),
        )
        return timeline
|
||
|
||
def _match_title_to_node(self, window_title: str, title_to_node: dict) -> Optional[str]:
|
||
"""Matcher un titre de fenêtre à un node via les fragments de titre.
|
||
|
||
Le matching est insensible à la casse et cherche si le fragment du node
|
||
est contenu dans le titre de la fenêtre.
|
||
|
||
Returns:
|
||
Le node_id correspondant, ou None.
|
||
"""
|
||
title_lower = window_title.lower()
|
||
best_match = None
|
||
best_len = 0
|
||
for fragment, node_id in title_to_node.items():
|
||
if fragment in title_lower and len(fragment) > best_len:
|
||
best_match = node_id
|
||
best_len = len(fragment)
|
||
return best_match
|
||
|
||
def _map_screenshots_by_order(
|
||
self,
|
||
session_dir: Path,
|
||
node_order: List[str],
|
||
) -> List[tuple]:
|
||
"""Fallback : mapper les screenshots aux nodes dans l'ordre topologique.
|
||
|
||
Utilisé quand les window_focus_change ne sont pas disponibles. Distribue
|
||
les screenshots uniformément entre les nodes.
|
||
|
||
Returns:
|
||
Liste de (timestamp, node_id).
|
||
"""
|
||
shots_dir = session_dir / "shots"
|
||
if not shots_dir.exists() or not node_order:
|
||
return []
|
||
|
||
shot_files = sorted(shots_dir.glob("shot_*_full.png"))
|
||
if not shot_files:
|
||
return []
|
||
|
||
timeline = []
|
||
shots_per_node = max(1, len(shot_files) // max(1, len(node_order)))
|
||
|
||
for i, node_id in enumerate(node_order):
|
||
shot_idx = min(i * shots_per_node, len(shot_files) - 1)
|
||
ts = shot_files[shot_idx].stat().st_mtime
|
||
timeline.append((ts, node_id))
|
||
|
||
return timeline
|
||
|
||
def _group_events_by_transition(
|
||
self,
|
||
events: List[Dict[str, Any]],
|
||
node_timeline: List[tuple],
|
||
) -> List[Dict[str, Any]]:
|
||
"""Grouper les événements bruts par transition de node.
|
||
|
||
Pour chaque événement, détermine dans quel intervalle de transition il
|
||
se situe (entre quel changement de node et le suivant).
|
||
|
||
Args:
|
||
events: Liste d'événements bruts (dicts avec ``timestamp`` et ``type``).
|
||
node_timeline: Liste de (timestamp, node_id) triée chronologiquement.
|
||
|
||
Returns:
|
||
Liste de groupes :
|
||
``[{"from_node": "node_000", "to_node": "node_001", "events": [...]}, ...]``
|
||
"""
|
||
if not node_timeline or not events:
|
||
return []
|
||
|
||
# Construire les intervalles de transition
|
||
# Chaque transition va du node_timeline[i] au node_timeline[i+1]
|
||
groups = []
|
||
for i in range(len(node_timeline)):
|
||
from_node = node_timeline[i][1]
|
||
to_node = node_timeline[i + 1][1] if i + 1 < len(node_timeline) else ""
|
||
start_ts = node_timeline[i][0]
|
||
end_ts = node_timeline[i + 1][0] if i + 1 < len(node_timeline) else float("inf")
|
||
|
||
# Collecter les événements dans cet intervalle
|
||
group_events = []
|
||
for evt in events:
|
||
evt_ts = float(evt.get("timestamp", 0))
|
||
evt_type = evt.get("type", "")
|
||
|
||
# Ignorer les événements non-actionnables
|
||
if evt_type in _PARASITIC_ACTION_TYPES:
|
||
continue
|
||
if evt_type in ("window_focus_change", "screenshot", "heartbeat"):
|
||
continue
|
||
|
||
if start_ts <= evt_ts < end_ts:
|
||
group_events.append(evt)
|
||
|
||
if group_events:
|
||
groups.append({
|
||
"from_node": from_node,
|
||
"to_node": to_node,
|
||
"events": group_events,
|
||
})
|
||
|
||
# Si aucun événement n'a de timestamp ou tout est tombé dans les parasites,
|
||
# essayer un groupement séquentiel sans timestamp
|
||
if not groups:
|
||
groups = self._group_events_sequential(events, node_timeline)
|
||
|
||
return groups
|
||
|
||
def _group_events_sequential(
|
||
self,
|
||
events: List[Dict[str, Any]],
|
||
node_timeline: List[tuple],
|
||
) -> List[Dict[str, Any]]:
|
||
"""Groupement séquentiel des événements quand les timestamps ne matchent pas.
|
||
|
||
Distribue les événements actionnables entre les transitions du workflow
|
||
de manière proportionnelle.
|
||
|
||
Returns:
|
||
Liste de groupes comme ``_group_events_by_transition``.
|
||
"""
|
||
# Filtrer les événements actionnables
|
||
actionable_types = {
|
||
"mouse_click", "text_input", "key_press", "key_combo", "scroll",
|
||
}
|
||
actionable = [
|
||
e for e in events
|
||
if e.get("type", "") in actionable_types
|
||
]
|
||
|
||
if not actionable or len(node_timeline) < 2:
|
||
# Un seul node → tout dans un seul groupe
|
||
if actionable and node_timeline:
|
||
return [{
|
||
"from_node": node_timeline[0][1],
|
||
"to_node": node_timeline[-1][1] if len(node_timeline) > 1 else "",
|
||
"events": actionable,
|
||
}]
|
||
return []
|
||
|
||
# Distribuer proportionnellement
|
||
n_transitions = len(node_timeline) - 1
|
||
events_per_group = max(1, len(actionable) // n_transitions)
|
||
groups = []
|
||
|
||
for i in range(n_transitions):
|
||
start_idx = i * events_per_group
|
||
end_idx = (i + 1) * events_per_group if i < n_transitions - 1 else len(actionable)
|
||
group_events = actionable[start_idx:end_idx]
|
||
if group_events:
|
||
groups.append({
|
||
"from_node": node_timeline[i][1],
|
||
"to_node": node_timeline[i + 1][1],
|
||
"events": group_events,
|
||
})
|
||
|
||
return groups
|
||
|
||
    def _events_to_replay_actions(
        self,
        events: List[Dict[str, Any]],
        screen_w: int,
        screen_h: int,
        group_idx: int,
    ) -> List[Dict[str, Any]]:
        """Convert a list of raw events into normalized replay actions.

        Pre-merges consecutive text_input events (individual keystrokes) into
        a single text block before converting them into timed actions. This
        avoids dozens of single-character ``type`` actions.

        Adds a ``wait`` between actions of different types when the natural
        delay is significant (> 2s = a user thinking pause).

        Args:
            events: Raw events of one group.
            screen_w: Source screen width (pixels).
            screen_h: Source screen height (pixels).
            group_idx: Index of the transition group.

        Returns:
            List of normalized actions for the replay.
        """
        import uuid

        # ── Phase 1: pre-merge consecutive text_input events ──
        # Keystrokes produce one text_input per character. They are merged
        # into a single event carrying the full text and the timestamp of the
        # first character.
        merged_events = []
        for evt in events:
            evt_type = evt.get("type", "")

            # Drop non-actionable events.
            if evt_type in _PARASITIC_ACTION_TYPES:
                continue
            if evt_type in ("window_focus_change", "screenshot", "heartbeat"):
                continue

            if evt_type == "text_input":
                text = evt.get("text", "")
                if not text:
                    continue
                # Merge with the previous event if it is also a text_input.
                if merged_events and merged_events[-1].get("type") == "text_input":
                    merged_events[-1]["text"] = merged_events[-1].get("text", "") + text
                    # Keep the latest timestamp for the next delta computation.
                    merged_events[-1]["_end_ts"] = float(evt.get("timestamp", 0))
                    continue

            merged_events.append(dict(evt))

        # ── Phase 2: convert the merged events into replay actions ──
        actions = []
        last_action_end_ts = 0.0

        for evt in merged_events:
            evt_type = evt.get("type", "")
            evt_ts = float(evt.get("timestamp", 0))

            # Compute the delay between the end of the previous action and the
            # start of this one. Waits are only inserted for significant
            # pauses (> 2s), not between every keystroke.
            if last_action_end_ts > 0 and evt_ts > last_action_end_ts:
                delta_ms = int((evt_ts - last_action_end_ts) * 1000)
                if delta_ms > 2000:
                    # Cap replayed pauses at 5s so long idle gaps don't stall
                    # the replay.
                    capped_ms = min(delta_ms, 5000)
                    actions.append({
                        "action_id": f"act_wait_{uuid.uuid4().hex[:8]}",
                        "type": "wait",
                        "duration_ms": capped_ms,
                        "group": group_idx,
                    })

            # Update the end timestamp (merged text blocks carry _end_ts).
            end_ts = float(evt.get("_end_ts", evt_ts))
            if end_ts > 0:
                last_action_end_ts = end_ts
            elif evt_ts > 0:
                last_action_end_ts = evt_ts

            action = {
                "action_id": f"act_hyb_{uuid.uuid4().hex[:8]}",
                "group": group_idx,
            }

            if evt_type == "mouse_click":
                # Clicks are stored as normalized percentages of the source
                # resolution, so they replay correctly on another screen.
                pos = evt.get("pos", [])
                if pos and len(pos) == 2:
                    action["type"] = "click"
                    action["x_pct"] = round(pos[0] / screen_w, 6)
                    action["y_pct"] = round(pos[1] / screen_h, 6)
                    action["button"] = evt.get("button", "left")
                else:
                    continue

            elif evt_type == "text_input":
                text = evt.get("text", "")
                if not text:
                    continue
                action["type"] = "type"
                action["text"] = text

            elif evt_type in ("key_press", "key_combo"):
                keys = evt.get("keys", [])
                if not keys:
                    key = evt.get("key", "")
                    if key:
                        keys = [key]
                if not keys:
                    continue
                keys = _sanitize_keys(keys)
                # Lone modifier presses (ctrl, shift, ...) are noise; skip them.
                if _is_modifier_only(keys):
                    continue
                action["type"] = "key_combo"
                action["keys"] = keys

            elif evt_type == "scroll":
                pos = evt.get("pos", [])
                action["type"] = "scroll"
                if pos and len(pos) == 2:
                    action["x_pct"] = round(pos[0] / screen_w, 6)
                    action["y_pct"] = round(pos[1] / screen_h, 6)
                action["delta"] = evt.get("delta", -3)

            else:
                continue

            actions.append(action)

        return actions
|
||
|
||
@property
|
||
def stats(self) -> Dict[str, Any]:
|
||
"""Statistiques du processeur."""
|
||
with self._data_lock:
|
||
total_workflows = len(self._workflows)
|
||
return {
|
||
"active_sessions": self.session_manager.active_session_count,
|
||
"total_sessions": len(self.session_manager.session_ids),
|
||
"total_workflows": total_workflows,
|
||
"faiss_vectors": self._faiss_manager.index.ntotal if self._faiss_manager else 0,
|
||
"initialized": self._initialized,
|
||
}
|