1. Le grounding se déclenche pour by_text_source="vlm" (pas juste "ocr") Les textes lus par gemma4 (onglets, labels) sont du texte visible, le grounding doit les chercher comme n'importe quel texte OCR. 2. gemma4 est automatiquement déchargé après le build_replay pour libérer la VRAM et permettre à qwen2.5vl de charger au replay. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
5662 lines
219 KiB
Python
5662 lines
219 KiB
Python
# agent_v0/server_v1/api_stream.py
|
|
"""
|
|
API de Streaming Temps Réel pour RPA Vision V3.
|
|
|
|
Connecte l'Agent V1 au core pipeline via StreamProcessor.
|
|
Tous les calculs GPU (ScreenAnalyzer, CLIP, FAISS) tournent ici sur le serveur.
|
|
|
|
Inclut les endpoints de replay pour renvoyer des ordres d'exécution à l'Agent V1.
|
|
"""
|
|
|
|
import atexit
|
|
import json
|
|
import logging
|
|
import os
|
|
import secrets
|
|
import signal
|
|
import threading
|
|
import time
|
|
import uuid
|
|
from collections import defaultdict
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from fastapi import BackgroundTasks, Depends, FastAPI, File, HTTPException, Request, UploadFile
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from pydantic import BaseModel
|
|
|
|
from .replay_failure_logger import log_replay_failure
|
|
from .replay_verifier import ReplayVerifier, VerificationResult
|
|
from .stream_processor import StreamProcessor, build_replay_from_raw_events, enrich_click_from_screenshot
|
|
from .worker_stream import StreamWorker
|
|
|
|
# Global replay verifier instance (before/after screenshot comparison)
_replay_verifier = ReplayVerifier()

# Maximum number of retries per action before declaring a failure
MAX_RETRIES_PER_ACTION = 3

# Safety limits for the replay queues
MAX_ACTIONS_PER_REPLAY = 500  # Max actions per replay request
MAX_REPLAY_STATES = 1000  # Max entries in _replay_states
REPLAY_STATE_TTL_SECONDS = 3600  # Auto-cleanup of finished replays after 1h

# Actions currently being retried: action_id -> {"action": ..., "retry_count": N, "replay_id": ...}
_retry_pending: Dict[str, Dict[str, Any]] = {}

# Error callbacks per replay_id: replay_id -> callback_url
_error_callbacks: Dict[str, str] = {}

# Optional optimisation of replay actions into primitive gestures.
# Degrades gracefully to None when the agent_chat package is absent.
try:
    from agent_chat.gesture_catalog import get_gesture_catalog
    _gesture_catalog = get_gesture_catalog()
except ImportError:
    _gesture_catalog = None

# Automatic authentication (optional) — detection of auth screens during the replay.
# Requires a vault configured via the RPA_AUTH_VAULT_PATH + RPA_AUTH_VAULT_PASSWORD
# environment variables; any failure silently disables the feature.
_auth_handler = None
try:
    _vault_path = os.environ.get("RPA_AUTH_VAULT_PATH")
    _vault_password = os.environ.get("RPA_AUTH_VAULT_PASSWORD")
    if _vault_path and _vault_password:
        from core.auth.credential_vault import CredentialVault
        from core.auth.auth_handler import AuthHandler
        _auth_vault = CredentialVault(_vault_path, _vault_password)
        _auth_handler = AuthHandler(_auth_vault)
except Exception:
    _auth_handler = None

logger = logging.getLogger("api_stream")

# =========================================================================
# Bearer-token authentication (HIGH security)
# =========================================================================
# The token is read from the environment or generated at startup.
# All endpoints require the Authorization: Bearer <token> header,
# except /health, /docs and /openapi.json (public).
# NOTE(review): when RPA_API_TOKEN is unset, a fresh random token is
# generated per process, so clients lose access across restarts — confirm
# this is intended.
API_TOKEN = os.environ.get("RPA_API_TOKEN", secrets.token_hex(32))

# Public endpoints (no token required).
# In production, /docs and /redoc are disabled (see FastAPI app creation below).
# /replay/next is public because the legacy Rust agent sends no token
# and it is a read-only endpoint (polling, no writes).
_PUBLIC_PATHS = {
    "/health", "/docs", "/openapi.json", "/redoc",
    "/api/v1/traces/stream/replay/next",
    "/api/v1/traces/stream/image",
}
|
|
|
|
|
|
async def _verify_token(request: Request):
    """FastAPI dependency enforcing Bearer-token authentication.

    Paths listed in _PUBLIC_PATHS are exempt; every other request must
    carry an ``Authorization: Bearer <token>`` header whose token matches
    the module-level API_TOKEN, otherwise a 401 is raised.
    """
    if request.url.path in _PUBLIC_PATHS:
        return  # open endpoint, no credential required
    header = request.headers.get("Authorization", "")
    prefix = "Bearer "
    valid = header.startswith(prefix) and header[len(prefix):] == API_TOKEN
    if not valid:
        raise HTTPException(status_code=401, detail="Token API invalide")
|
|
|
|
|
|
# =========================================================================
# In-memory rate limiting (HIGH security)
# =========================================================================
# Sliding-window request timestamps keyed by "endpoint:client_ip"
# (maintained by _check_rate_limit below).
_rate_limits: Dict[str, list] = defaultdict(list)
_RATE_LIMIT_WINDOW = 60  # seconds
# Per-endpoint request quotas within the window above.
_RATE_LIMITS = {
    "/api/v1/traces/stream/replay": 10,  # 10 replays per minute
    "/api/v1/traces/stream/replay/raw": 10,
    "/api/v1/traces/stream/replay-session": 10,  # 10 session replays per minute
    "/api/v1/traces/stream/replay/single": 30,  # 30 Copilot actions per minute
    "/api/v1/traces/stream/finalize": 5,
    "/api/v1/traces/stream/image": 200,  # 200 images per minute (heartbeats)
}
|
|
|
|
|
|
def _check_rate_limit(endpoint: str, client_ip: str) -> bool:
    """Return True when the client is still within its per-window quota.

    Maintains a sliding window of request timestamps per
    (endpoint, client_ip) pair in the module-level _rate_limits dict.
    The current request is recorded only when it is allowed.
    """
    bucket = f"{endpoint}:{client_ip}"
    now = time.time()
    # Drop timestamps that have fallen out of the sliding window.
    recent = [ts for ts in _rate_limits[bucket] if now - ts < _RATE_LIMIT_WINDOW]
    _rate_limits[bucket] = recent
    quota = _RATE_LIMITS.get(endpoint, 100)  # default quota for unlisted endpoints
    if len(recent) >= quota:
        return False
    recent.append(now)
    return True
|
|
|
|
|
|
# =========================================================================
# Replay action validation (HIGH security)
# =========================================================================
# Whitelist of action types accepted by the replay endpoints.
_ALLOWED_ACTION_TYPES = {
    "click", "type", "key_combo", "scroll", "wait",
    "file_open", "file_save", "file_close", "file_new", "file_dialog",
    "double_click", "right_click", "drag",
    "verify_screen",  # Hybrid replay: visual verification between groups
}
_MAX_ACTION_TEXT_LENGTH = 10000
_MAX_KEYS_PER_COMBO = 10
# Keys allowed in key_combo actions (modifiers + special keys + single characters)
_KNOWN_KEY_NAMES = {
    "enter", "return", "tab", "escape", "esc", "backspace", "delete", "space",
    "up", "down", "left", "right", "home", "end", "page_up", "page_down",
    "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12",
    "ctrl", "ctrl_l", "ctrl_r", "alt", "alt_l", "alt_r",
    "shift", "shift_l", "shift_r",
    "cmd", "win", "super", "super_l", "super_r", "windows", "meta",
    "insert", "print_screen", "caps_lock", "num_lock",
}
|
|
|
|
|
|
# =========================================================================
# Environment setup — automatic preparation before the replay
# =========================================================================
# Mapping of common Windows executable names to their launch command.
# Used as a fallback for the search text typed into the Start menu.
# Format: "processname.exe" (lowercase) -> shell command
_APP_LAUNCH_COMMANDS: Dict[str, str] = {
    "notepad.exe": "notepad",
    "explorer.exe": "explorer",
    "calc.exe": "calc",
    "mspaint.exe": "mspaint",
    "cmd.exe": "cmd",
    "powershell.exe": "powershell",
    "wordpad.exe": "wordpad",
    "charmap.exe": "charmap",
    "snippingtool.exe": "snippingtool",
    "taskmgr.exe": "taskmgr",
    "regedit.exe": "regedit",
    "mstsc.exe": "mstsc",
    "winword.exe": "winword",
    "excel.exe": "excel",
    "powerpnt.exe": "powerpnt",
    "outlook.exe": "outlook",
    "msedge.exe": "msedge",
    "chrome.exe": "chrome",
    "firefox.exe": "firefox",
    "code.exe": "code",
}
|
|
|
|
# Mapping from executables to the visual name to search for in the Start menu.
# Holds the search text (often the French display name) and a description
# for the VLM so it can identify the icon among the search results.
# Format: "processname.exe" -> {"search_text": ..., "display_name": ..., "vlm_description": ...}
_APP_VISUAL_SEARCH: Dict[str, Dict[str, str]] = {
    "notepad.exe": {
        "search_text": "Bloc-notes",
        "display_name": "Bloc-notes",
        "vlm_description": "L'application Bloc-notes (Notepad) dans les résultats de recherche",
    },
    "calc.exe": {
        "search_text": "Calculatrice",
        "display_name": "Calculatrice",
        "vlm_description": "L'application Calculatrice dans les résultats de recherche",
    },
    "mspaint.exe": {
        "search_text": "Paint",
        "display_name": "Paint",
        "vlm_description": "L'application Paint dans les résultats de recherche",
    },
    "cmd.exe": {
        "search_text": "Invite de commandes",
        "display_name": "Invite de commandes",
        "vlm_description": "L'Invite de commandes (Command Prompt) dans les résultats",
    },
    "powershell.exe": {
        "search_text": "PowerShell",
        "display_name": "PowerShell",
        "vlm_description": "Windows PowerShell dans les résultats de recherche",
    },
    "wordpad.exe": {
        "search_text": "WordPad",
        "display_name": "WordPad",
        "vlm_description": "L'application WordPad dans les résultats de recherche",
    },
    "winword.exe": {
        "search_text": "Word",
        "display_name": "Microsoft Word",
        "vlm_description": "Microsoft Word dans les résultats de recherche",
    },
    "excel.exe": {
        "search_text": "Excel",
        "display_name": "Microsoft Excel",
        "vlm_description": "Microsoft Excel dans les résultats de recherche",
    },
    "powerpnt.exe": {
        "search_text": "PowerPoint",
        "display_name": "Microsoft PowerPoint",
        "vlm_description": "Microsoft PowerPoint dans les résultats de recherche",
    },
    "outlook.exe": {
        "search_text": "Outlook",
        "display_name": "Microsoft Outlook",
        "vlm_description": "Microsoft Outlook dans les résultats de recherche",
    },
    "msedge.exe": {
        "search_text": "Edge",
        "display_name": "Microsoft Edge",
        "vlm_description": "Microsoft Edge dans les résultats de recherche",
    },
    "chrome.exe": {
        "search_text": "Chrome",
        "display_name": "Google Chrome",
        "vlm_description": "Google Chrome dans les résultats de recherche",
    },
    "firefox.exe": {
        "search_text": "Firefox",
        "display_name": "Mozilla Firefox",
        "vlm_description": "Mozilla Firefox dans les résultats de recherche",
    },
    "code.exe": {
        "search_text": "Visual Studio Code",
        "display_name": "Visual Studio Code",
        "vlm_description": "Visual Studio Code dans les résultats de recherche",
    },
    "taskmgr.exe": {
        "search_text": "Gestionnaire des tâches",
        "display_name": "Gestionnaire des tâches",
        "vlm_description": "Le Gestionnaire des tâches dans les résultats de recherche",
    },
    "snippingtool.exe": {
        "search_text": "Outil Capture",
        "display_name": "Outil Capture d'écran",
        "vlm_description": "L'Outil Capture d'écran dans les résultats de recherche",
    },
    "mstsc.exe": {
        "search_text": "Connexion Bureau à distance",
        "display_name": "Bureau à distance",
        "vlm_description": "La Connexion Bureau à distance dans les résultats",
    },
}
|
|
|
|
# Windows applications to ignore during setup (system processes, agents, etc.)
_SETUP_IGNORE_APPS = {
    "searchhost.exe",  # Windows search bar
    "explorer.exe",  # Explorer is always running (Windows shell)
    "pythonw.exe",  # Python agent (our own agent)
    "python.exe",  # Same
    "shellexperiencehost.exe",
    "startmenuexperiencehost.exe",
    "applicationframehost.exe",
    "systemsettings.exe",
    "textinputhost.exe",
    "runtimebroker.exe",
}
|
|
|
|
|
|
def _extract_required_apps_from_events(raw_events: list) -> Dict[str, Any]:
    """Derive the applications a session needs from its raw event stream.

    Scans window_focus_change events to identify:
    - the primary application (most frequently focused, excluding system apps)
    - the first application window that received focus (for initial setup)
    mouse_click events carrying a window dict also contribute to the counts.

    Args:
        raw_events: Raw events loaded from live_events.jsonl.

    Returns:
        Dict with keys:
        - primary_app: str (main executable name, e.g. "Notepad.exe")
        - primary_launch_cmd: str (Win+R command, e.g. "notepad")
        - first_window_title: str (title of the first application window)
        - apps: dict[str, int] (app_name -> occurrence count)
        Empty dict when no (non-system) application was observed.
    """
    counts: Dict[str, int] = defaultdict(int)
    first_app: Optional[str] = None
    first_title: Optional[str] = None

    for entry in raw_events:
        payload = entry.get("event", entry)
        kind = payload.get("type", "")

        if kind == "window_focus_change":
            target = payload.get("to", {})
            if not target:
                continue
            name = target.get("app_name", "")
            if not name:
                continue
            counts[name] += 1
            # Remember the first non-system app focused, and its title.
            if first_app is None and name.lower() not in _SETUP_IGNORE_APPS:
                first_app = name
                first_title = target.get("title", "")

        # mouse_click events may also carry the focused window.
        elif kind == "mouse_click":
            win = payload.get("window", {})
            if isinstance(win, dict):
                name = win.get("app_name", "")
                if name:
                    counts[name] += 1

    if not counts:
        return {}

    # Primary application = most frequent app, excluding ignored system apps.
    candidates = {
        name: n for name, n in counts.items()
        if name.lower() not in _SETUP_IGNORE_APPS
    }
    if not candidates:
        return {}

    primary = max(candidates, key=candidates.get)

    return {
        "primary_app": primary,
        "primary_launch_cmd": _resolve_launch_command(primary),
        "first_window_title": first_title or "",
        "apps": dict(counts),
    }
|
|
|
|
|
|
def _extract_required_apps_from_workflow(workflow) -> Dict[str, Any]:
    """Extract the required applications from a structured workflow.

    Walks the workflow nodes to collect the required window titles,
    then infers the primary application from those titles.

    Args:
        workflow: Workflow object or raw dict.

    Returns:
        Same format as _extract_required_apps_from_events, plus
        source_session_id and machine_id pulled from the metadata.
        Empty dict when no application could be inferred.
    """
    # Access the data (object or dict) — workflows may arrive either way.
    if hasattr(workflow, 'nodes'):
        nodes = workflow.nodes
        metadata = workflow.metadata if hasattr(workflow, 'metadata') else {}
    elif isinstance(workflow, dict):
        nodes = workflow.get('nodes', [])
        metadata = workflow.get('metadata', {})
    else:
        return {}

    if not nodes:
        return {}

    # Collect window titles from the nodes; each node's template and its
    # window field can independently be an object or a dict.
    window_titles = []
    for node in nodes:
        template = node.template if hasattr(node, 'template') else node.get('template', {})
        if isinstance(template, dict):
            window = template.get('window', {})
        elif hasattr(template, 'window'):
            # NOTE(review): when template.window has a __dict__ the object
            # itself is kept and handled by the attribute branch below;
            # otherwise it is dropped — confirm this is intended.
            window = template.window if hasattr(template.window, '__dict__') else {}
        else:
            window = {}

        if isinstance(window, dict):
            title = window.get('title_pattern', '') or window.get('title_contains', '')
        elif hasattr(window, 'title_pattern'):
            title = getattr(window, 'title_pattern', '') or ''
        else:
            title = ''

        if title:
            window_titles.append(title)

    # Infer the primary app from the collected window titles.
    primary_app, primary_launch_cmd, matched_title = _infer_app_from_window_titles(window_titles)
    # Use the title that matched the app (not the first node, which may be "Rechercher").
    first_title = matched_title or (window_titles[0] if window_titles else "")

    if not primary_app:
        return {}

    source_session_id = metadata.get("source_session_id", "") if isinstance(metadata, dict) else ""
    machine_id = metadata.get("machine_id", "") if isinstance(metadata, dict) else ""

    return {
        "primary_app": primary_app,
        "primary_launch_cmd": primary_launch_cmd,
        "first_window_title": first_title,
        "apps": {},
        "source_session_id": source_session_id,
        "machine_id": machine_id,
    }
|
|
|
|
|
|
def _resolve_launch_command(app_name: str) -> str:
    """Resolve the Win+R command that launches an application.

    Known executables are looked up in _APP_LAUNCH_COMMANDS; anything else
    falls back to the executable name with a trailing ".exe" stripped,
    which works for most applications on the PATH.
    """
    lowered = app_name.lower()
    mapped = _APP_LAUNCH_COMMANDS.get(lowered)
    if mapped is not None:
        return mapped
    # Fallback: use the name without the .exe extension.
    return app_name[:-4] if lowered.endswith(".exe") else app_name
|
|
|
|
|
|
def _infer_app_from_window_titles(titles: list) -> tuple:
    """Infer the application name and launch command from window titles.

    Uses heuristics based on common Windows title patterns. Patterns are
    tested in order, so a more specific substring must be listed before
    any generic substring it contains (e.g. "wordpad" before "word").

    Args:
        titles: Window titles collected from a workflow or session.

    Returns:
        Tuple (app_name, launch_command, matched_title).
        ("", "", "") when no title matches a known application.
    """
    # (title substring, executable name, Win+R launch command) — order matters.
    _TITLE_APP_PATTERNS = [
        ("bloc-notes", "Notepad.exe", "notepad"),
        ("notepad", "Notepad.exe", "notepad"),
        # FIX: "wordpad" must be tested before "word"; otherwise a WordPad
        # window (supported via _APP_VISUAL_SEARCH["wordpad.exe"]) would be
        # misidentified as Microsoft Word.
        ("wordpad", "wordpad.exe", "wordpad"),
        ("word", "winword.exe", "winword"),
        ("excel", "excel.exe", "excel"),
        ("powerpoint", "powerpnt.exe", "powerpnt"),
        ("outlook", "outlook.exe", "outlook"),
        ("paint", "mspaint.exe", "mspaint"),
        ("calculatrice", "calc.exe", "calc"),
        ("calculator", "calc.exe", "calc"),
        ("explorateur de fichiers", "explorer.exe", "explorer"),
        ("file explorer", "explorer.exe", "explorer"),
        ("invite de commandes", "cmd.exe", "cmd"),
        ("command prompt", "cmd.exe", "cmd"),
        ("powershell", "powershell.exe", "powershell"),
        ("visual studio code", "code.exe", "code"),
        ("edge", "msedge.exe", "msedge"),
        ("chrome", "chrome.exe", "chrome"),
        ("firefox", "firefox.exe", "firefox"),
    ]

    for title in titles:
        title_lower = title.lower()
        for pattern, app_name, launch_cmd in _TITLE_APP_PATTERNS:
            if pattern in title_lower:
                # Skip system apps (explorer, etc.) — always running anyway.
                if app_name.lower() in _SETUP_IGNORE_APPS:
                    continue
                return (app_name, launch_cmd, title)

    return ("", "", "")
|
|
|
|
|
|
def _get_visual_search_info(app_name: str) -> Dict[str, str]:
    """Return the visual-search metadata for an application.

    Looks the executable up in _APP_VISUAL_SEARCH; otherwise builds a
    fallback from the executable name (e.g. "MonApp.exe" -> "MonApp").

    Args:
        app_name: Executable name (e.g. "Notepad.exe").

    Returns:
        Dict with search_text, display_name and vlm_description keys.
    """
    lowered = app_name.lower()
    known = _APP_VISUAL_SEARCH.get(lowered)
    if known is not None:
        # Copy so callers cannot mutate the shared mapping.
        return dict(known)

    # Fallback: use the name without the .exe extension.
    stem = app_name[:-4] if lowered.endswith(".exe") else app_name
    return {
        "search_text": stem,
        "display_name": stem,
        "vlm_description": f"L'application {stem} dans les résultats de recherche",
    }
|
|
|
|
|
|
def _generate_setup_actions(
    app_info: Dict[str, Any],
    setup_id_prefix: str = "setup",
) -> List[Dict[str, Any]]:
    """Generate the 100% visual actions that open the application before replay.

    Fully visual approach — NEVER keyboard shortcuts (Win, Win+R, Ctrl+X,
    etc.) that the user did not record. Everything goes through visual
    clicks resolved by the VLM (Qwen2.5-VL).

    The sequence is:
        1. Visual click on the Start button (bottom-left corner of the screen)
        2. Wait for the Start menu to open (1s)
        3. Visual click on the Start menu search bar
        4. Wait for the search bar to become active (500ms)
        5. Type the application name (French text, e.g. "Bloc-notes")
        6. Wait for the search results (1.2s)
        7. Visual click on the matching application result
        8. Wait for the application to open (2-3s depending on weight)
        9. verify_screen: check that the expected window appeared

    Args:
        app_info: Dict returned by _extract_required_apps_from_events or
            _extract_required_apps_from_workflow.
        setup_id_prefix: Prefix for the generated action_id values.

    Returns:
        List of normalized actions, ready to inject into the queue.
        Empty list when no preparation is needed.
    """
    if not app_info:
        return []

    launch_cmd = app_info.get("primary_launch_cmd", "")
    primary_app = app_info.get("primary_app", "")
    first_title = app_info.get("first_window_title", "")

    if not launch_cmd:
        logger.debug(
            "setup_actions : pas de commande de lancement pour '%s', skip",
            primary_app,
        )
        return []

    # Do not launch system apps (always present)
    if primary_app.lower() in _SETUP_IGNORE_APPS:
        logger.debug("setup_actions : app '%s' ignorée (système)", primary_app)
        return []

    # Fetch the visual-search metadata for this app
    visual_info = _get_visual_search_info(primary_app)
    search_text = visual_info["search_text"]
    display_name = visual_info["display_name"]
    vlm_description = visual_info["vlm_description"]

    actions = []

    logger.info(
        "Génération setup env 100%% visuel : lancement de '%s' via clic "
        "Démarrer → recherche visuelle '%s' (fenêtre attendue : '%s')",
        primary_app, search_text, first_title,
    )

    # 1. Visual click on the Start button (always visible, bottom-left).
    # The VLM resolves the exact position; x_pct/y_pct are fallbacks.
    actions.append({
        "action_id": f"act_{setup_id_prefix}_click_start",
        "type": "click",
        "x_pct": 0.02,
        "y_pct": 0.98,
        "button": "left",
        "visual_mode": True,
        "target_spec": {
            "by_text": "Démarrer",
            "by_role": "start_button",
            "vlm_description": (
                "Le bouton Démarrer de Windows (icône Windows), "
                "en bas à gauche de la barre des tâches"
            ),
        },
        "_setup_phase": True,
        "_setup_step": "click_start_menu",
    })

    # 2. Wait for the Start menu to open
    actions.append({
        "action_id": f"act_{setup_id_prefix}_wait_start",
        "type": "wait",
        "duration_ms": 1000,
        "_setup_phase": True,
        "_setup_step": "wait_start_menu",
    })

    # 3. Visual click on the Start menu search bar.
    # On Windows 10/11 the search bar is embedded in the Start menu
    # or visible in the taskbar. The VLM locates it visually.
    actions.append({
        "action_id": f"act_{setup_id_prefix}_click_search",
        "type": "click",
        "x_pct": 0.20,
        "y_pct": 0.92,
        "button": "left",
        "visual_mode": True,
        "target_spec": {
            "by_text": "Rechercher",
            "by_role": "search_box",
            "vlm_description": (
                "La barre ou le champ de recherche dans le menu Démarrer "
                "de Windows, souvent intitulé 'Tapez ici pour rechercher' "
                "ou 'Rechercher'"
            ),
        },
        "_setup_phase": True,
        "_setup_step": "click_search_box",
    })

    # 4. Wait for the search bar to be active and ready
    actions.append({
        "action_id": f"act_{setup_id_prefix}_wait_search_ready",
        "type": "wait",
        "duration_ms": 500,
        "_setup_phase": True,
        "_setup_step": "wait_search_ready",
    })

    # 5. Type the application's visual name (French text).
    # The search field was clicked visually at step 3,
    # so the type runs in the active field.
    actions.append({
        "action_id": f"act_{setup_id_prefix}_type_search",
        "type": "type",
        "text": search_text,
        "_setup_phase": True,
        "_setup_step": "type_app_name",
    })

    # 6. Wait for Windows search to find the application
    actions.append({
        "action_id": f"act_{setup_id_prefix}_wait_results",
        "type": "wait",
        "duration_ms": 1200,
        "_setup_phase": True,
        "_setup_step": "wait_search_results",
    })

    # 7. Visual click on the application result in the list.
    # The VLM identifies the app's icon/text among the results.
    actions.append({
        "action_id": f"act_{setup_id_prefix}_click_result",
        "type": "click",
        "x_pct": 0.20,
        "y_pct": 0.50,
        "button": "left",
        "visual_mode": True,
        "target_spec": {
            "by_text": display_name,
            "by_role": "app_icon",
            "vlm_description": vlm_description,
        },
        "_setup_phase": True,
        "_setup_step": "click_app_result",
    })

    # 8. Wait for the application to open.
    # Variable duration: 3s for heavy apps (Office, VS Code), 2s otherwise
    heavy_apps = {"winword.exe", "excel.exe", "powerpnt.exe", "outlook.exe", "code.exe"}
    wait_ms = 3000 if primary_app.lower() in heavy_apps else 2000
    actions.append({
        "action_id": f"act_{setup_id_prefix}_wait_launch",
        "type": "wait",
        "duration_ms": wait_ms,
        "_setup_phase": True,
        "_setup_step": "wait_app_launch",
    })

    # 9. Visual verification that the expected window appeared
    if first_title:
        actions.append({
            "action_id": f"act_{setup_id_prefix}_verify",
            "type": "verify_screen",
            "expected_node": "setup_initial",
            "timeout_ms": 5000,
            "_setup_phase": True,
            "_setup_step": "verify_app_ready",
            "_expected_title": first_title,
        })

    logger.info(
        "Setup env visuel généré : %d actions pour lancer '%s' "
        "(recherche visuelle : '%s')",
        len(actions), primary_app, search_text,
    )

    return actions
|
|
|
|
|
|
def _validate_replay_action(action: dict) -> Optional[str]:
    """Validate one replay action; return an error message, or None when valid."""
    action_type = action.get("type", "")

    # Action type must be whitelisted.
    if action_type not in _ALLOWED_ACTION_TYPES:
        return f"Type d'action non autorisé : '{action_type}'. Autorisés : {sorted(_ALLOWED_ACTION_TYPES)}"

    # Text payload length cap.
    text = action.get("text", "")
    if isinstance(text, str) and len(text) > _MAX_ACTION_TEXT_LENGTH:
        return f"Texte trop long ({len(text)} > {_MAX_ACTION_TEXT_LENGTH} caractères)"

    # Key combos: bounded size, every key either a single character
    # or a known special/modifier key name.
    keys = action.get("keys", [])
    if isinstance(keys, list):
        if len(keys) > _MAX_KEYS_PER_COMBO:
            return f"Trop de touches ({len(keys)} > {_MAX_KEYS_PER_COMBO})"
        for key in keys:
            name = str(key)
            if len(name) != 1 and name.lower() not in _KNOWN_KEY_NAMES:
                return f"Touche inconnue : '{key}'"

    # Normalized coordinates must be floats in [0, 1].
    for coord_name in ("x_pct", "y_pct"):
        val = action.get(coord_name)
        if val is None:
            continue
        try:
            val_f = float(val)
        except (TypeError, ValueError):
            return f"Coordonnée {coord_name} invalide : {val}"
        if not (0.0 <= val_f <= 1.0):
            return f"Coordonnée {coord_name}={val_f} hors limites [0.0, 1.0]"

    return None  # Valid
|
|
|
|
|
|
# In production (ENVIRONMENT != development), disable the Swagger docs
_is_production = os.environ.get("ENVIRONMENT", "development") != "development"

# Bearer-token auth is enforced on every route via the _verify_token dependency.
app = FastAPI(
    title="RPA Vision V3 - Streaming API v1",
    dependencies=[Depends(_verify_token)],
    docs_url=None if _is_production else "/docs",
    redoc_url=None if _is_production else "/redoc",
    openapi_url=None if _is_production else "/openapi.json",
)

# CORS — allowed origins (VWB frontend, Agent Chat, Dashboard)
# Configurable via the CORS_ORIGINS environment variable (comma-separated)
# Includes the public domain for internet access through the NPM reverse proxy
_DEFAULT_CORS_ORIGINS = (
    "http://localhost:3002,"  # VWB Frontend (Vite/React)
    "http://localhost:5002,"  # VWB Backend (Flask)
    "http://localhost:5004,"  # Agent Chat
    "http://localhost:5001,"  # Web Dashboard
    "http://192.168.1.40:3002,"  # VWB Frontend from the local network
    "http://192.168.1.40:5004,"  # Agent Chat from the local network
    "https://lea.labs.laurinebazin.design,"  # Public HTTPS domain
    "https://vwb.labs.laurinebazin.design"  # Public VWB HTTPS
)
CORS_ORIGINS = os.environ.get("CORS_ORIGINS", _DEFAULT_CORS_ORIGINS).split(",")
CORS_ORIGINS = [o.strip() for o in CORS_ORIGINS if o.strip()]

app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["GET", "POST"],
    allow_headers=["Content-Type", "Authorization"],
)
|
|
|
|
|
|
@app.middleware("http")
async def security_headers_middleware(request: Request, call_next):
    """Attach standard security headers to every response.

    HSTS is added only when the request arrived over HTTPS (directly or
    via an X-Forwarded-Proto header set by the reverse proxy).
    """
    response = await call_next(request)
    hardening = {
        "X-Content-Type-Options": "nosniff",
        "X-Frame-Options": "DENY",
        "X-XSS-Protection": "1; mode=block",
        "Referrer-Policy": "strict-origin-when-cross-origin",
    }
    for header, value in hardening.items():
        response.headers[header] = value
    forwarded_https = request.headers.get("X-Forwarded-Proto") == "https"
    if request.url.scheme == "https" or forwarded_https:
        response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
    return response
|
|
|
|
|
|
@app.middleware("http")
async def rate_limit_middleware(request: Request, call_next):
    """Reject requests exceeding the per-endpoint quota with HTTP 429."""
    path = request.url.path
    if path not in _RATE_LIMITS:
        # Unlisted endpoints are not rate limited.
        return await call_next(request)
    client_ip = request.client.host if request.client else "unknown"
    if _check_rate_limit(path, client_ip):
        return await call_next(request)
    from fastapi.responses import JSONResponse
    logger.warning(f"Rate limit dépassé : {path} par {client_ip}")
    return JSONResponse(
        status_code=429,
        content={"detail": f"Trop de requêtes. Limite : {_RATE_LIMITS[path]}/{_RATE_LIMIT_WINDOW}s"},
    )
|
|
|
|
|
|
# Live sessions directory
ROOT_DIR = Path(__file__).parent.parent.parent
LIVE_SESSIONS_DIR = ROOT_DIR / "data" / "training" / "live_sessions"
LIVE_SESSIONS_DIR.mkdir(parents=True, exist_ok=True)

# =========================================================================
# Communication with the VLM worker (separate process)
# The HTTP server NEVER runs the VLM — it writes to files on disk
# that the VLM worker (run_worker.py) reads in its own process.
# =========================================================================
_DATA_DIR = ROOT_DIR / "data" / "training"
WORKER_QUEUE_FILE = _DATA_DIR / "_worker_queue.txt"
REPLAY_LOCK_FILE = _DATA_DIR / "_replay_active.lock"

# Shared global instances (the StreamProcessor stays in the HTTP server
# for CLIP, FAISS indexing, session management and replay — but does NOT
# run VLM/reprocess_session; the separate worker handles that)
processor = StreamProcessor(data_dir=str(LIVE_SESSIONS_DIR))
worker = StreamWorker(live_dir=str(LIVE_SESSIONS_DIR), processor=processor)
|
|
|
|
|
|
# =========================================================================
|
|
# Flush garanti à l'arrêt — signal handler + atexit (ceinture et bretelles)
|
|
# =========================================================================
|
|
# Le shutdown handler FastAPI (@app.on_event("shutdown")) fait déjà un flush,
|
|
# mais si le serveur est tué par SIGTERM (systemd) ou SIGINT (Ctrl+C) avant
|
|
# que uvicorn ait le temps de déclencher le shutdown propre, le flush n'a pas
|
|
# lieu. On ajoute donc un signal handler ET un atexit comme filets de sécurité.
|
|
|
|
def _emergency_flush(signum=None, frame=None):
    """Flush dirty sessions to disk before the process exits.

    Invoked by SIGTERM/SIGINT or atexit. Idempotent (flush() is thread-safe).
    """
    origin = "atexit" if signum is None else signal.Signals(signum).name
    logger.info(f"Flush d'urgence des sessions en cours ({origin})...")
    try:
        processor.session_manager.flush()
        logger.info("Flush d'urgence terminé — données persistées.")
    except Exception as exc:
        logger.error(f"Erreur pendant le flush d'urgence : {exc}")
    # When triggered by a signal, let the default handler terminate the process
    if signum is not None:
        # Restore the default handler, then re-deliver the signal so the
        # process exits with the conventional signal status.
        signal.signal(signum, signal.SIG_DFL)
        os.kill(os.getpid(), signum)
|
|
|
|
# Enregistrer les handlers uniquement quand le module est exécuté comme serveur
|
|
# (pas lors d'un simple import depuis un autre process comme le retraitement batch)
|
|
def _register_shutdown_handlers():
    """Install SIGTERM/SIGINT handlers plus an atexit flush as safety nets."""
    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, _emergency_flush)
    atexit.register(processor.session_manager.flush)
    logger.info("Handlers de shutdown enregistrés (SIGTERM, SIGINT, atexit)")
|
|
|
|
|
|
def _enqueue_to_worker(session_id: str):
    """Append a session_id to the VLM worker queue (file on disk).

    The VLM worker (separate process) reads this file and processes the
    sessions. Duplicates are avoided: the current queue content is checked
    before appending.
    """
    try:
        WORKER_QUEUE_FILE.parent.mkdir(parents=True, exist_ok=True)

        # Read the current queue so the same session is never enqueued twice
        queued = set()
        if WORKER_QUEUE_FILE.exists():
            lines = WORKER_QUEUE_FILE.read_text(encoding="utf-8").splitlines()
            queued = {ln.strip() for ln in lines if ln.strip()}

        if session_id in queued:
            logger.info(f"Session {session_id} déjà dans la queue worker, skip")
            return

        # Append the new entry at the end of the queue file
        with open(WORKER_QUEUE_FILE, "a", encoding="utf-8") as fh:
            fh.write(session_id + "\n")

        logger.info(f"Session {session_id} ajoutée à la queue worker ({WORKER_QUEUE_FILE})")
    except Exception as exc:
        logger.error(f"Erreur écriture queue worker : {exc}")
|
|
|
|
|
|
def _set_replay_lock(replay_id: str = ""):
    """Create the replay lock file (signals the VLM worker to suspend)."""
    try:
        REPLAY_LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
        payload = f"replay_id={replay_id}\ntimestamp={time.time()}\n"
        REPLAY_LOCK_FILE.write_text(payload, encoding="utf-8")
        logger.info(f"Replay lock créé : {REPLAY_LOCK_FILE} (replay={replay_id})")
    except Exception as exc:
        logger.error(f"Erreur création replay lock : {exc}")
|
|
|
|
|
|
def _clear_replay_lock():
    """Delete the replay lock file (the VLM worker may resume)."""
    try:
        REPLAY_LOCK_FILE.unlink(missing_ok=True)
        logger.info("Replay lock supprimé, worker VLM autorisé à reprendre")
    except Exception as exc:
        logger.error(f"Erreur suppression replay lock : {exc}")
|
|
|
|
|
|
def _get_worker_queue_status() -> Dict[str, Any]:
    """Report the state of the VLM worker queue (for monitoring)."""
    entries: List[str] = []
    if WORKER_QUEUE_FILE.exists():
        try:
            raw = WORKER_QUEUE_FILE.read_text(encoding="utf-8")
            entries = [ln.strip() for ln in raw.splitlines() if ln.strip()]
        except Exception:
            pass

    return {
        # Queue existence does not prove the worker process is alive
        "running": True,
        "queue_length": len(entries),
        "queue": entries,
        "replay_lock_active": REPLAY_LOCK_FILE.exists(),
        "queue_file": str(WORKER_QUEUE_FILE),
        "note": "Le worker VLM tourne dans un process séparé (run_worker.py)",
    }
|
|
|
|
|
|
# =========================================================================
# In-flight analysis count per session (awaited before finalize)
# =========================================================================
# session_id -> number of analyses still running
_pending_analyses: Dict[str, int] = defaultdict(int)
_pending_lock = threading.Lock()

# =========================================================================
# Per-session replay queue
# Each session has a queue of actions to execute plus a replay state
# =========================================================================
_replay_lock = threading.Lock()
# session_id -> list of pending actions (FIFO)
_replay_queues: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
# machine_id -> session_id (mapping for machine-targeted replay)
_machine_replay_target: Dict[str, str] = {}
# replay_id -> replay state (workflow_id, session_id, status, progress)
_replay_states: Dict[str, Dict[str, Any]] = {}
|
|
|
|
|
|
class StreamEvent(BaseModel):
    """One input event streamed by Agent V1 into a live session."""
    session_id: str
    timestamp: float
    event: Dict[str, Any]  # Raw event payload (type, pos, screenshot_id, ...)
    machine_id: str = "default"  # Machine identifier (multi-machine, backward-compatible)
|
|
|
|
|
|
class ReplayRequest(BaseModel):
    """Request to launch the replay of a workflow."""
    workflow_id: str
    session_id: str
    machine_id: Optional[str] = None  # Target machine for the replay (multi-machine)
    params: Optional[Dict[str, Any]] = None
|
|
|
|
|
|
class RawReplayRequest(BaseModel):
    """Replay request carrying raw actions (Free Agent mode)."""
    actions: List[Dict[str, Any]]
    session_id: str = ""
    machine_id: Optional[str] = None  # Target machine (multi-machine)
    task_description: str = ""
|
|
|
|
|
|
class SingleActionRequest(BaseModel):
    """Request to execute a single action (Copilot mode)."""
    action: Dict[str, Any]
    session_id: str = ""
    machine_id: Optional[str] = None  # Target machine (multi-machine)
|
|
|
|
|
|
class ReplayResultReport(BaseModel):
    """Result report for one action executed by Agent V1."""
    session_id: str
    action_id: str
    success: bool
    error: Optional[str] = None
    warning: Optional[str] = None  # "no_screen_change", "popup_handled", "visual_resolve_failed"
    screenshot: Optional[str] = None  # Path or base64 of the post-action screenshot
    screenshot_after: Optional[str] = None  # Path or base64 of the screenshot AFTER the action
    actual_position: Optional[Dict[str, float]] = None  # {"x": px, "y": py} actual click position
    # Visual-resolution metrics
    resolution_method: Optional[str] = None  # som_text_match, som_vlm, vlm_quick_find, etc.
    resolution_score: Optional[float] = None
    resolution_elapsed_ms: Optional[float] = None
    # Enriched fields for target_not_found (supervised pause)
    target_description: Optional[str] = None  # Human description of the target
    target_spec: Optional[Dict[str, Any]] = None  # Full spec of the target
|
|
|
|
|
|
class ErrorCallbackConfig(BaseModel):
    """Error-callback configuration for a replay."""
    replay_id: str
    callback_url: str  # URL called on a non-recoverable error
|
|
|
|
|
|
# Background thread periodically cleaning finished replays and expired sessions
_cleanup_thread: Optional[threading.Thread] = None
# Flag polled by _cleanup_loop; set to False at shutdown to stop the thread
_cleanup_running = False
|
|
|
|
|
|
def _cleanup_loop():
    """Periodic cleanup of finished replay states and expired sessions.

    Runs in the background every 10 minutes:
    - drops replay states completed/error/failed older than REPLAY_STATE_TTL_SECONDS
    - evicts in-memory sessions via LiveSessionManager.cleanup_old_sessions()
    - caps _replay_states at MAX_REPLAY_STATES entries
    """
    while _cleanup_running:
        time.sleep(600)  # 10-minute cycle
        if not _cleanup_running:
            break
        try:
            _cleanup_replay_states()
            # Evict in-memory sessions older than 24h (checked every cycle)
            processor.session_manager.cleanup_old_sessions(max_age_hours=24)
        except Exception as exc:
            logger.error(f"Erreur dans la boucle de nettoyage : {exc}")
|
|
|
|
|
|
def _cleanup_replay_states():
    """Drop finished replay states (completed/error/failed) older than the TTL."""
    now = time.time()
    expired: List[str] = []

    with _replay_lock:
        for rid, state in _replay_states.items():
            if state["status"] not in ("completed", "error", "failed"):
                continue
            # Age comes from the last result timestamp, falling back to the
            # last error_log entry.
            results = state.get("results", [])
            last_ts = results[-1].get("timestamp", 0) if results else 0
            if not last_ts:
                errors = state.get("error_log", [])
                last_ts = errors[-1].get("timestamp", 0) if errors else 0
            if not last_ts:
                # No timestamp anywhere: orphan entry, drop it
                expired.append(rid)
                continue
            if now - last_ts > REPLAY_STATE_TTL_SECONDS:
                expired.append(rid)

        # Remove the expired entries (still under the lock)
        for rid in expired:
            del _replay_states[rid]
            _error_callbacks.pop(rid, None)

        # Safety bound: if the table is still too large, drop finished entries
        # in dict insertion order (oldest entries first).
        if len(_replay_states) > MAX_REPLAY_STATES:
            finished_ids = [
                rid for rid, st in _replay_states.items()
                if st["status"] in ("completed", "error", "failed")
            ]
            overflow = len(_replay_states) - MAX_REPLAY_STATES
            for rid in finished_ids[:overflow]:
                del _replay_states[rid]
                _error_callbacks.pop(rid, None)

    if expired:
        logger.info(f"Nettoyage replay states : {len(expired)} entrées supprimées")
|
|
|
|
|
|
@app.get("/health")
async def health_check():
    """Liveness probe (public — no auth token required)."""
    payload = {"status": "healthy", "version": "1.0.0"}
    return payload
|
|
|
|
|
|
def _check_gpu_ready():
    """Check that the GPU has enough free VRAM for the pipeline.

    At least 6 GB are required for the VLM (gemma4:e4b ~10 GB) and the
    CLIP/FAISS models. Logs a warning when below the floor, info otherwise.
    """
    try:
        import subprocess
        proc = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5,
        )
        if proc.returncode != 0:
            logger.debug(f"nvidia-smi retour non-zéro : {proc.stderr.strip()}")
            return
        # nvidia-smi may print one line per GPU — keep only the first one
        first_line = proc.stdout.strip().split("\n")[0].strip()
        free_mb = int(first_line)
        if free_mb >= 6000:  # 6 GB floor for VLM + CLIP
            logger.info(f"GPU OK : {free_mb} MB VRAM libres")
            return
        logger.warning(
            f"VRAM insuffisante : {free_mb} MB libres (minimum 6000 MB). "
            f"Vérifier les process GPU avec nvidia-smi."
        )
        print(
            f"\n [GPU WARNING] VRAM insuffisante : {free_mb} MB libres "
            f"(minimum 6000 MB)\n"
        )
    except FileNotFoundError:
        logger.debug("nvidia-smi non trouvé — pas de GPU NVIDIA détecté")
    except Exception as e:
        logger.debug(f"GPU check échoué : {e}")
|
|
|
|
|
|
@app.on_event("startup")
async def startup():
    """Start the streaming worker and load existing workflows.

    NOTE: the VLM (SessionWorker) now runs in a separate process
    (run_worker.py). This HTTP server no longer does any VLM work — it
    stays responsive for replays, events and images at all times.
    """
    global _cleanup_running, _cleanup_thread

    # Check available GPU VRAM at startup
    _check_gpu_ready()

    # Register the shutdown handlers (SIGTERM, SIGINT, atexit)
    _register_shutdown_handlers()

    # Resolve and display the VLM model in use
    from core.detection.vlm_config import get_vlm_model
    _vlm_model_name = get_vlm_model()
    logger.info("VLM model: %s", _vlm_model_name)
    print(f"\n VLM model: {_vlm_model_name}")

    # Print the API token so the user can configure the agent side
    _token_source = "env RPA_API_TOKEN" if os.environ.get("RPA_API_TOKEN") else "auto-généré"
    logger.info(f"API Token ({_token_source}): {API_TOKEN}")
    print(f"\n{'='*60}")
    print(f" API Token ({_token_source}):")
    print(f" {API_TOKEN}")
    print(f" Configurer l'agent : export RPA_API_TOKEN={API_TOKEN}")
    print(f"{'='*60}\n")

    worker.start(blocking=False)

    # Load existing workflows from disk
    _load_existing_workflows()

    # Make sure the replay lock is cleared at startup (leftover from a crash)
    _clear_replay_lock()

    # Start the periodic cleanup thread
    _cleanup_running = True
    _cleanup_thread = threading.Thread(target=_cleanup_loop, daemon=True, name="replay_cleanup")
    _cleanup_thread.start()

    logger.info(
        "API Streaming démarrée — StreamProcessor, Worker et Cleanup prêts. "
        "VLM Worker dans un process séparé (run_worker.py)."
    )
|
|
|
|
|
|
def _load_existing_workflows():
    """Load existing workflow JSON files into processor._workflows.

    Two formats are supported:
    - Workflow.load_from_file (full format with a workflow_id)
    - raw JSON with a 'name' key (simplified VWB/manual format)
    """
    from core.models.workflow_graph import Workflow

    candidate_dirs = [
        ROOT_DIR / "data" / "workflows",
        ROOT_DIR / "data" / "training" / "workflows",
        LIVE_SESSIONS_DIR / "workflows",
    ]

    count = 0
    for directory in candidate_dirs:
        if not directory.exists():
            continue
        for path in directory.glob("*.json"):
            # First attempt: the canonical Workflow format
            try:
                wf = Workflow.load_from_file(str(path))
                if wf and hasattr(wf, 'workflow_id'):
                    with processor._data_lock:
                        processor._workflows[wf.workflow_id] = wf
                    count += 1
                    continue
            except Exception:
                pass

            # Fallback: raw JSON dict with an injected workflow_id
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                wf_id = data.get("workflow_id") or path.stem
                # The raw dict is enough for _workflow_to_actions
                with processor._data_lock:
                    processor._workflows[wf_id] = data
                count += 1
            except Exception as exc:
                logger.debug(f"Skip workflow {path.name}: {exc}")

    logger.info(f"Workflows chargés depuis disque: {count}")
|
|
|
|
|
|
@app.on_event("shutdown")
async def shutdown():
    """Stop the cleanup thread and worker, then flush sessions to disk."""
    global _cleanup_running
    _cleanup_running = False
    worker.stop()
    # Clear the replay lock at shutdown (otherwise the VLM worker would stay blocked)
    _clear_replay_lock()
    processor.session_manager.flush()
    logger.info("API Streaming arrêtée.")
|
|
|
|
|
|
# =========================================================================
|
|
# Session management
|
|
# =========================================================================
|
|
|
|
@app.post("/api/v1/traces/stream/register")
async def register_session(session_id: str, machine_id: str = "default"):
    """Register a new streaming session.

    Args:
        session_id: Unique identifier for the session.
        machine_id: Source machine identifier (multi-machine, default: "default").
    """
    processor.session_manager.register_session(session_id, machine_id=machine_id)
    # Reset per-session counters (avoids leftovers from a previous session)
    with _pending_lock:
        _pending_analyses[session_id] = 0
        _analyzed_shots[session_id] = set()
    logger.info(f"Session {session_id} enregistrée (machine={machine_id}, compteurs réinitialisés)")
    return {"status": "session_registered", "session_id": session_id, "machine_id": machine_id}
|
|
|
|
|
|
def _ensure_session_registered(session_id: str, machine_id: str = "default"):
    """Auto-register a session if it does not exist yet.

    Robustness against server restarts: Agent V1 never re-registers its
    session but keeps sending events/images, so the session is registered
    automatically on first reception.

    Args:
        session_id: Session identifier.
        machine_id: Machine identifier (propagated from the agent).
    """
    existing = processor.session_manager.get_session(session_id)
    if existing is None:
        logger.info(f"Auto-enregistrement de la session {session_id} (machine={machine_id})")
        processor.session_manager.register_session(session_id, machine_id=machine_id)
        with _pending_lock:
            _pending_analyses[session_id] = 0
            _analyzed_shots[session_id] = set()
        return
    # Upgrade the machine_id when the agent provides one we did not have yet
    if machine_id != "default" and existing.machine_id == "default":
        existing.machine_id = machine_id
|
|
|
|
|
|
# =========================================================================
|
|
# Événements
|
|
# =========================================================================
|
|
|
|
@app.post("/api/v1/traces/stream/event")
async def stream_event(data: StreamEvent):
    """Receive an event and record it into the session."""
    session_id = data.session_id
    machine_id = data.machine_id or "default"

    # Auto-register the session if unknown (robust to server restarts)
    _ensure_session_registered(session_id, machine_id=machine_id)

    # Persist to disk (JSONL journal, in a per-machine subfolder when multi-machine)
    if machine_id and machine_id != "default":
        session_path = LIVE_SESSIONS_DIR / machine_id / session_id
    else:
        session_path = LIVE_SESSIONS_DIR / session_id
    session_path.mkdir(parents=True, exist_ok=True)
    event_file = session_path / "live_events.jsonl"
    with open(event_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(data.dict()) + "\n")

    # Direct processing through the StreamProcessor
    result = worker.process_event_direct(session_id, data.event)

    # ── Real-time SomEngine enrichment for mouse_click events ──
    # After the event is recorded, try to enrich it if the screenshot has
    # already arrived. Otherwise the event is parked and will be enriched
    # when the screenshot arrives (see stream_image).
    event = data.event
    if event.get("type") == "mouse_click" and event.get("screenshot_id"):
        session = processor.session_manager.get_session(session_id)
        if session:
            # Index of the last recorded event (assumed to be the one just
            # appended by process_event_direct — TODO confirm)
            event_index = len(session.events) - 1
            submitted = _try_enrich_click_event(
                session_id, event, event_index, machine_id,
            )
            result["som_enrichment"] = "submitted" if submitted else "pending_screenshot"

    return {"status": "event_synced", "session_id": session_id, **result}
|
|
|
|
|
|
# =========================================================================
|
|
# Images
|
|
# =========================================================================
|
|
|
|
# Screenshots already analyzed per session (avoids duplicate work on retries)
_analyzed_shots: Dict[str, set] = defaultdict(set)

# Hash of the last analyzed screenshot per session (similarity-based dedup)
_last_screenshot_hash: Dict[str, str] = {}

# Last heartbeat received per session: {session_id: {"path": str, "timestamp": float}}
# Used by the replay pre-check to verify the screen state before acting
_last_heartbeat: Dict[str, Dict[str, Any]] = {}
# Max heartbeat age (seconds) — beyond this the pre-check is skipped
_HEARTBEAT_MAX_AGE_SECONDS = 10.0
# Cosine similarity threshold used to validate the pre-check
_PRECHECK_SIMILARITY_THRESHOLD = 0.85

# ThreadPool for GPU analysis (keeps the async event loop unblocked)
_gpu_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="gpu_analysis")
|
|
|
|
# =========================================================================
# Real-time SomEngine enrichment
# When a mouse_click arrives with a screenshot_id, SomEngine is launched
# to identify the clicked UI element. The result is stored on the session
# event, ready for replay without VLM reprocessing.
# =========================================================================

# Dedicated SomEngine thread pool (a single worker so the GPU is not saturated)
_som_enrichment_executor = ThreadPoolExecutor(
    max_workers=1, thread_name_prefix="som_enrich",
)

# Clicks waiting for enrichment (their screenshot has not arrived yet)
# Key: (session_id, screenshot_id) → dict with the required info
_pending_click_enrichments: Dict[tuple, Dict[str, Any]] = {}
_enrichment_lock = threading.Lock()

# Action screenshots that arrived (to match against waiting events)
# Key: (session_id, screenshot_id) → file path
_arrived_action_screenshots: Dict[tuple, str] = {}
|
|
|
|
|
|
def _get_session_dir(session_id: str, machine_id: str = "default") -> Path:
    """Resolve the directory of a live session on disk."""
    uses_machine_subdir = bool(machine_id) and machine_id != "default"
    if uses_machine_subdir:
        return LIVE_SESSIONS_DIR / machine_id / session_id
    return LIVE_SESSIONS_DIR / session_id
|
|
|
|
|
|
def _get_screen_resolution_for_session(session_id: str) -> tuple:
    """Fetch the screen resolution from the in-memory session (default 1920x1080)."""
    sess = processor.session_manager.get_session(session_id)
    if sess and sess.last_window_info:
        resolution = sess.last_window_info.get("screen_resolution", [1920, 1080])
        if isinstance(resolution, list) and len(resolution) == 2:
            return (int(resolution[0]), int(resolution[1]))
    return (1920, 1080)
|
|
|
|
|
|
def _submit_click_enrichment(
    session_id: str,
    event_data: dict,
    screenshot_path: str,
    event_index: int,
    machine_id: str = "default",
) -> None:
    """Submit a click's SomEngine enrichment to the dedicated thread pool.

    Does not block the HTTP handler — the result is stored on the session
    event once SomEngine finishes (~1-2 seconds).

    Args:
        session_id: Session identifier.
        event_data: mouse_click event data (pos, window, etc.).
        screenshot_path: Path to the full screenshot (PNG).
        event_index: Index of the event in session.events.
        machine_id: Machine identifier.
    """
    _som_enrichment_executor.submit(
        _enrich_click_background,
        session_id, event_data, screenshot_path, event_index, machine_id,
    )
|
|
|
|
|
|
def _enrich_click_background(
    session_id: str,
    event_data: dict,
    screenshot_path: str,
    event_index: int,
    machine_id: str = "default",
) -> None:
    """Enrich a click with SomEngine in the background (separate thread).

    Calls enrich_click_from_screenshot() and stores the result directly
    into the session event (under the "enrichment" key).
    """
    try:
        pos = event_data.get("pos", [0, 0])
        if not pos or len(pos) < 2:
            return

        click_x, click_y = int(pos[0]), int(pos[1])
        screen_w, screen_h = _get_screen_resolution_for_session(session_id)

        # Extract the window title (dict form preferred, flat field as fallback)
        window = event_data.get("window", {})
        if isinstance(window, dict):
            window_title = window.get("title", "")
        else:
            window_title = event_data.get("window_title", "")

        # Extract vision_info when available (agent-side OCR)
        vision_info = event_data.get("vision_info")

        # Derive session_dir and screenshot_id for the SomEngine cache
        session_dir = _get_session_dir(session_id, machine_id)
        screenshot_id = event_data.get("screenshot_id", "")

        logger.info(
            "[SoM-RT] Enrichissement clic (%d,%d) pour %s/%s",
            click_x, click_y, session_id, screenshot_id,
        )

        enrichment = enrich_click_from_screenshot(
            screenshot_path=Path(screenshot_path),
            click_x=click_x,
            click_y=click_y,
            screen_w=screen_w,
            screen_h=screen_h,
            window_title=window_title,
            vision_info=vision_info,
            session_dir=session_dir,
            screenshot_id=screenshot_id,
        )

        if not enrichment:
            logger.debug(
                "[SoM-RT] Enrichissement vide pour %s/%s (screenshot illisible ?)",
                session_id, screenshot_id,
            )
            return

        # Store the result on the session event
        session = processor.session_manager.get_session(session_id)
        if session and 0 <= event_index < len(session.events):
            session.events[event_index]["enrichment"] = enrichment
            # Force persistence so the enrichment is written to disk
            processor.session_manager._maybe_persist(session_id)
            logger.info(
                "[SoM-RT] Clic enrichi : %s/%s → by_text='%s', by_role='%s', som=%s",
                session_id, screenshot_id,
                enrichment.get("by_text", ""),
                enrichment.get("by_role", ""),
                bool(enrichment.get("som_element")),
            )
        else:
            logger.warning(
                "[SoM-RT] Session %s introuvable ou event_index %d invalide",
                session_id, event_index,
            )

    except Exception as e:
        logger.error(
            "[SoM-RT] Erreur enrichissement clic %s : %s",
            session_id, e, exc_info=True,
        )
|
|
|
|
|
|
def _try_enrich_click_event(
    session_id: str,
    event_data: dict,
    event_index: int,
    machine_id: str = "default",
) -> bool:
    """Try to enrich a mouse_click event with SomEngine.

    If the screenshot has already arrived, submits the enrichment right
    away; otherwise parks the event until the screenshot shows up.

    Returns:
        True when the enrichment was submitted, False when waiting for
        the screenshot.
    """
    screenshot_id = event_data.get("screenshot_id", "")
    if not screenshot_id:
        return False

    key = (session_id, screenshot_id)

    with _enrichment_lock:
        path = _arrived_action_screenshots.get(key)
        if not path:
            # Screenshot not here yet — park the click for later
            _pending_click_enrichments[key] = {
                "event_data": event_data,
                "event_index": event_index,
                "machine_id": machine_id,
            }
            logger.debug(
                "[SoM-RT] Clic en attente du screenshot %s/%s",
                session_id, screenshot_id,
            )
            return False
        # Screenshot available — submit immediately and drop it from the cache
        _submit_click_enrichment(
            session_id, event_data, path, event_index, machine_id,
        )
        _arrived_action_screenshots.pop(key, None)
        return True
|
|
|
|
|
|
def _on_action_screenshot_arrived(
    session_id: str,
    shot_id: str,
    file_path: str,
    machine_id: str = "default",
) -> bool:
    """Called when an action screenshot (shot_XXXX_full) arrives.

    Checks whether a click is waiting for this screenshot's enrichment.
    If so, submits the enrichment to the thread pool.

    Args:
        session_id: Session identifier.
        shot_id: Screenshot identifier (e.g. "shot_0003_full").
        file_path: Full path to the PNG file.
        machine_id: Machine identifier.

    Returns:
        True when an enrichment was submitted, False otherwise.
    """
    # Derive the screenshot_id: "shot_0003_full" → "shot_0003"
    screenshot_id = shot_id.replace("_full", "")
    key = (session_id, screenshot_id)

    with _enrichment_lock:
        waiting = _pending_click_enrichments.pop(key, None)
        if waiting:
            # A click was waiting for this screenshot — enrich it now
            _submit_click_enrichment(
                session_id,
                waiting["event_data"],
                file_path,
                waiting["event_index"],
                waiting.get("machine_id", machine_id),
            )
            return True

        # No pending click — remember the screenshot for a later event.
        _arrived_action_screenshots[key] = file_path
        # Bound the cache: old screenshots whose event never arrives would
        # otherwise accumulate (one FIFO eviction via insertion order).
        if len(_arrived_action_screenshots) > 200:
            oldest_key = next(iter(_arrived_action_screenshots))
            _arrived_action_screenshots.pop(oldest_key, None)
        return False
|
|
|
|
|
|
def _merge_enrichments_into_raw_events(
|
|
raw_events: List[Dict[str, Any]],
|
|
session_events: List[Dict[str, Any]],
|
|
) -> int:
|
|
"""Fusionner les enrichissements SomEngine temps réel dans les events JSONL.
|
|
|
|
Les events JSONL (raw_events) sont écrits AVANT l'enrichissement SomEngine.
|
|
Les events en mémoire (session_events) contiennent l'enrichissement dans
|
|
le champ "enrichment". On les fusionne par correspondance screenshot_id.
|
|
|
|
Args:
|
|
raw_events: Events chargés depuis live_events.jsonl (structure
|
|
{"session_id": ..., "event": {...}} ou directement {...}).
|
|
session_events: Events en mémoire depuis LiveSessionState.events
|
|
(contiennent potentiellement le champ "enrichment").
|
|
|
|
Returns:
|
|
Nombre d'enrichissements fusionnés.
|
|
"""
|
|
# Construire un index screenshot_id → enrichment depuis les events mémoire
|
|
enrichment_by_shot: Dict[str, dict] = {}
|
|
for evt in session_events:
|
|
enr = evt.get("enrichment")
|
|
shot_id = evt.get("screenshot_id", "")
|
|
if enr and shot_id:
|
|
enrichment_by_shot[shot_id] = enr
|
|
|
|
if not enrichment_by_shot:
|
|
return 0
|
|
|
|
merged = 0
|
|
for raw_evt in raw_events:
|
|
inner = raw_evt.get("event", raw_evt)
|
|
if inner.get("type") != "mouse_click":
|
|
continue
|
|
shot_id = inner.get("screenshot_id", "")
|
|
if not shot_id:
|
|
continue
|
|
enr = enrichment_by_shot.get(shot_id)
|
|
if enr and "enrichment" not in inner:
|
|
inner["enrichment"] = enr
|
|
merged += 1
|
|
|
|
if merged:
|
|
logger.info(
|
|
"[SoM-RT] %d enrichissement(s) temps réel fusionné(s) dans les events JSONL",
|
|
merged,
|
|
)
|
|
return merged
|
|
|
|
|
|
def _image_hash(file_path: str) -> str:
|
|
"""Hash rapide d'une image pour détecter les doublons (~identiques).
|
|
|
|
Utilise 32x32 au lieu de 16x16 pour une meilleure discrimination
|
|
entre screenshots similaires mais pas identiques (ex: texte modifié
|
|
dans un champ, curseur déplacé, etc.).
|
|
"""
|
|
try:
|
|
from PIL import Image
|
|
import hashlib
|
|
img = Image.open(file_path)
|
|
# Réduire à 32x32 et convertir en niveaux de gris pour un hash perceptuel
|
|
thumb = img.resize((32, 32)).convert('L')
|
|
return hashlib.md5(thumb.tobytes()).hexdigest()
|
|
except Exception:
|
|
return ""
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/image")
|
|
async def stream_image(
|
|
session_id: str,
|
|
shot_id: str,
|
|
machine_id: str = "default",
|
|
file: UploadFile = File(...),
|
|
background_tasks: BackgroundTasks = None,
|
|
):
|
|
"""Reçoit une image et déclenche l'analyse via le core pipeline."""
|
|
# Auto-enregistrer la session si inconnue (robustesse au redémarrage serveur)
|
|
_ensure_session_registered(session_id, machine_id=machine_id)
|
|
|
|
# Sauvegarder sur disque (dans un sous-dossier par machine si multi-machine)
|
|
if machine_id and machine_id != "default":
|
|
session_path = LIVE_SESSIONS_DIR / machine_id / session_id
|
|
else:
|
|
session_path = LIVE_SESSIONS_DIR / session_id
|
|
shots_dir = session_path / "shots"
|
|
shots_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
file_path = shots_dir / f"{shot_id}.png"
|
|
content = await file.read()
|
|
with open(file_path, "wb") as f:
|
|
f.write(content)
|
|
|
|
file_path_str = str(file_path)
|
|
|
|
# Crops : traitement léger (pas d'analyse ScreenAnalyzer)
|
|
if "_crop" in shot_id:
|
|
result = worker.process_crop_direct(session_id, shot_id, file_path_str)
|
|
return {"status": "crop_stored", "shot_id": shot_id, **result}
|
|
|
|
# Filtrer les screenshots qui ne nécessitent PAS d'analyse GPU.
|
|
# Seuls les shot_XXXX_full (screenshots d'action) sont analysés.
|
|
# Les autres (heartbeat, focus, res_shot) sont stockés sur disque
|
|
# mais pas envoyés au GPU — sinon le ThreadPool (1 worker, ~10-30s/analyse)
|
|
# est submergé et la finalisation timeout avec 0 states.
|
|
if shot_id.startswith("heartbeat_"):
|
|
# Mémoriser le dernier heartbeat pour le pre-check de replay
|
|
_last_heartbeat[session_id] = {
|
|
"path": file_path_str,
|
|
"timestamp": time.time(),
|
|
}
|
|
return {"status": "heartbeat_stored", "shot_id": shot_id}
|
|
if shot_id.startswith("focus_"):
|
|
return {"status": "focus_stored", "shot_id": shot_id}
|
|
if shot_id.startswith("res_shot_"):
|
|
return {"status": "res_stored", "shot_id": shot_id}
|
|
if not shot_id.startswith("shot_") or "_full" not in shot_id:
|
|
# Tout ce qui n'est pas shot_XXXX_full → stocker sans analyser
|
|
logger.debug(f"Screenshot {shot_id} stocké sans analyse GPU")
|
|
return {"status": "stored_no_analysis", "shot_id": shot_id}
|
|
|
|
# Enrichissement SomEngine temps réel (léger, ~1-2s en background)
|
|
# Lancé AVANT la déduplication VLM car c'est un traitement indépendant.
|
|
# Si un event mouse_click attend ce screenshot, on lance SomEngine en background.
|
|
# Sinon, on enregistre le screenshot pour le matcher quand l'event arrivera.
|
|
_on_action_screenshot_arrived(session_id, shot_id, file_path_str, machine_id)
|
|
|
|
# Déduplication par ID : ne pas réanalyser un screenshot déjà traité
|
|
with _pending_lock:
|
|
if shot_id in _analyzed_shots[session_id]:
|
|
logger.debug(f"Screenshot {shot_id} déjà analysé, skip")
|
|
return {"status": "already_analyzed", "shot_id": shot_id}
|
|
|
|
# Déduplication par similarité : si l'image est quasi identique à la précédente, skip
|
|
img_hash = _image_hash(file_path_str)
|
|
if img_hash and img_hash == _last_screenshot_hash.get(session_id):
|
|
logger.info(f"Screenshot {shot_id} identique au précédent, skip analyse GPU")
|
|
with _pending_lock:
|
|
_analyzed_shots[session_id].add(shot_id)
|
|
return {"status": "duplicate_skipped", "shot_id": shot_id}
|
|
if img_hash:
|
|
_last_screenshot_hash[session_id] = img_hash
|
|
|
|
with _pending_lock:
|
|
_analyzed_shots[session_id].add(shot_id)
|
|
|
|
# Screenshots full : STOCKAGE UNIQUEMENT (pas d'analyse VLM lourde en temps réel)
|
|
# L'analyse VLM complète (ScreenAnalyzer + CLIP + FAISS) est faite par le
|
|
# worker séparé (run_worker.py) après finalisation de la session.
|
|
logger.debug(f"Screenshot {shot_id} stocké (analyse VLM différée au worker)")
|
|
|
|
return {"status": "image_stored", "shot_id": shot_id}
|
|
|
|
|
|
|
|
def _process_screenshot_thread(session_id: str, shot_id: str, path: str):
    """Run the GPU analysis of one screenshot in a worker thread.

    Does not block FastAPI: meant to be submitted to a thread pool.

    Args:
        session_id: Session owning the screenshot.
        shot_id: Identifier of the screenshot being analyzed.
        path: Path of the image file on disk.

    Side effects:
        Always decrements ``_pending_analyses[session_id]`` (floored at 0)
        in the ``finally`` block, whether the analysis succeeded or failed.
    """
    # Single import instead of the previous duplicate imports inside both
    # the try and except blocks (the try-side import was never used there).
    import traceback

    try:
        logger.info(f"[GPU] Début analyse {shot_id} pour {session_id}")
        result = worker.process_screenshot_direct(session_id, shot_id, path)
        logger.info(
            f"[GPU] Screenshot {shot_id} analysé: "
            f"{result.get('ui_elements_count', 0)} UI, "
            f"{result.get('text_detected', 0)} textes, "
            f"indexed={result.get('embedding_indexed', False)}"
        )
    except Exception as e:
        logger.error(f"[GPU] Erreur analyse {shot_id}: {e}\n{traceback.format_exc()}")
    finally:
        with _pending_lock:
            _pending_analyses[session_id] = max(0, _pending_analyses[session_id] - 1)
|
|
|
|
|
|
# =========================================================================
|
|
# Finalisation
|
|
# =========================================================================
|
|
|
|
@app.post("/api/v1/traces/stream/finalize")
|
|
async def finalize(session_id: str, machine_id: str = "default"):
|
|
"""Clôture la session et place le traitement en file d'attente.
|
|
|
|
Ne bloque plus : marque la session comme finalisée et l'ajoute à la queue
|
|
du worker VLM (process séparé) pour analyse + construction workflow.
|
|
|
|
Le client peut suivre la progression via GET /api/v1/traces/stream/processing/status.
|
|
|
|
Args:
|
|
session_id: Identifiant de la session à finaliser
|
|
machine_id: Identifiant machine (informatif, le machine_id est déjà dans la session)
|
|
"""
|
|
# Vérifier que la session existe
|
|
session = processor.session_manager.get_session(session_id)
|
|
if not session:
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail=f"Session {session_id} non trouvée",
|
|
)
|
|
|
|
# Marquer la session comme finalisée (persistée sur disque)
|
|
processor.session_manager.finalize(session_id)
|
|
logger.info(f"Session {session_id} finalisée, ajout à la queue du worker VLM")
|
|
|
|
# Nettoyer les structures d'enrichissement temps réel pour cette session
|
|
with _enrichment_lock:
|
|
keys_to_remove = [k for k in _pending_click_enrichments if k[0] == session_id]
|
|
for k in keys_to_remove:
|
|
del _pending_click_enrichments[k]
|
|
keys_to_remove = [k for k in _arrived_action_screenshots if k[0] == session_id]
|
|
for k in keys_to_remove:
|
|
del _arrived_action_screenshots[k]
|
|
|
|
# Écrire dans le fichier queue pour le worker VLM (process séparé)
|
|
_enqueue_to_worker(session_id)
|
|
|
|
# Compter les screenshots full disponibles pour donner une estimation
|
|
session_dir = processor._find_session_dir(session_id)
|
|
full_shots_count = 0
|
|
if session_dir:
|
|
shots_dir = session_dir / "shots"
|
|
if shots_dir.exists():
|
|
full_shots_count = len(list(shots_dir.glob("shot_*_full.png")))
|
|
|
|
return {
|
|
"status": "queued_for_processing",
|
|
"session_id": session_id,
|
|
"machine_id": session.machine_id,
|
|
"screenshots_to_analyze": full_shots_count,
|
|
"message": (
|
|
f"Session finalisée. {full_shots_count} screenshots seront analysés "
|
|
"en arrière-plan. Suivez la progression via "
|
|
"GET /api/v1/traces/stream/processing/status"
|
|
),
|
|
}
|
|
|
|
|
|
# =========================================================================
|
|
# Traitement asynchrone — Suivi de la queue de processing
|
|
# =========================================================================
|
|
|
|
@app.get("/api/v1/traces/stream/processing/status")
|
|
async def get_processing_status():
|
|
"""État de la queue de traitement VLM (worker process séparé).
|
|
|
|
Retourne :
|
|
- queue_length : nombre de sessions en attente dans le fichier queue
|
|
- queue : liste des session_ids en attente
|
|
- replay_lock_active : si un replay est en cours (worker suspendu)
|
|
"""
|
|
return _get_worker_queue_status()
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/processing/requeue")
|
|
async def requeue_session(session_id: str):
|
|
"""Relancer le traitement d'une session (manuellement).
|
|
|
|
Utile pour :
|
|
- Relancer une session échouée après correction
|
|
- Forcer le retraitement d'une session déjà traitée
|
|
"""
|
|
session = processor.session_manager.get_session(session_id)
|
|
if not session:
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail=f"Session {session_id} non trouvée",
|
|
)
|
|
|
|
_enqueue_to_worker(session_id)
|
|
|
|
return {
|
|
"status": "requeued",
|
|
"session_id": session_id,
|
|
"queue_status": _get_worker_queue_status(),
|
|
}
|
|
|
|
|
|
# =========================================================================
|
|
# Monitoring
|
|
# =========================================================================
|
|
|
|
@app.get("/api/v1/traces/stream/stats")
|
|
async def get_stats():
|
|
"""Statistiques du serveur de streaming."""
|
|
stats = worker.stats
|
|
# Ajouter les machines connues
|
|
stats["machines"] = processor.session_manager.get_machine_ids()
|
|
return stats
|
|
|
|
|
|
@app.get("/api/v1/traces/stream/machines")
|
|
async def list_machines():
|
|
"""Lister toutes les machines connues avec leurs sessions actives.
|
|
|
|
Utile pour le dashboard et l'agent chat (Léa) pour savoir quelles
|
|
machines sont connectées et cibler un replay spécifique.
|
|
"""
|
|
machine_ids = processor.session_manager.get_machine_ids()
|
|
machines = []
|
|
for mid in machine_ids:
|
|
machine_sessions = processor.session_manager.get_sessions_by_machine(mid)
|
|
active = [s for s in machine_sessions if not s.finalized]
|
|
machines.append({
|
|
"machine_id": mid,
|
|
"total_sessions": len(machine_sessions),
|
|
"active_sessions": len(active),
|
|
"last_activity": max(
|
|
(s.last_activity for s in machine_sessions),
|
|
default=None,
|
|
).isoformat() if machine_sessions else None,
|
|
})
|
|
return {"machines": machines}
|
|
|
|
|
|
@app.get("/api/v1/traces/stream/sessions")
|
|
async def list_sessions(machine_id: Optional[str] = None):
|
|
"""Lister les sessions (actives et finalisées).
|
|
|
|
Args:
|
|
machine_id: Si fourni, filtre par machine. Si absent, retourne toutes les sessions.
|
|
"""
|
|
sessions = processor.list_sessions(machine_id=machine_id)
|
|
result = {"sessions": sessions}
|
|
# Ajouter la liste des machines connues pour l'UI
|
|
result["machines"] = processor.session_manager.get_machine_ids()
|
|
return result
|
|
|
|
|
|
@app.get("/api/v1/traces/stream/workflows")
|
|
async def list_workflows(machine_id: Optional[str] = None):
|
|
"""Lister les workflows construits.
|
|
|
|
Args:
|
|
machine_id: Si fourni, filtre par machine. Si absent, retourne tous les workflows.
|
|
"""
|
|
workflows = processor.list_workflows(machine_id=machine_id)
|
|
result = {"workflows": workflows}
|
|
# Ajouter la liste des machines connues pour l'UI
|
|
result["machines"] = processor.session_manager.get_machine_ids()
|
|
return result
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/reload-workflows")
|
|
async def reload_workflows():
|
|
"""Recharger les workflows depuis le disque.
|
|
|
|
Appelé par le VWB après un export-for-lea pour que le streaming server
|
|
voie immédiatement les nouveaux workflows sans redémarrage.
|
|
"""
|
|
count = processor.reload_workflows()
|
|
return {"success": True, "workflows_count": count}
|
|
|
|
|
|
@app.get("/api/v1/traces/stream/workflow/{workflow_id}")
|
|
async def get_workflow_detail(workflow_id: str):
|
|
"""Retourne le détail complet d'un workflow (format core JSON).
|
|
|
|
Utilisé par le VWB pour importer un workflow appris qui n'est pas
|
|
encore sur disque (seulement en mémoire dans le streaming server).
|
|
"""
|
|
with processor._data_lock:
|
|
wf = processor._workflows.get(workflow_id)
|
|
|
|
if not wf:
|
|
raise HTTPException(status_code=404, detail=f"Workflow '{workflow_id}' non trouvé")
|
|
|
|
return wf.to_dict()
|
|
|
|
|
|
@app.get("/api/v1/traces/stream/session/{session_id}")
|
|
async def get_session(session_id: str):
|
|
"""État d'une session."""
|
|
session = processor.session_manager.get_session(session_id)
|
|
if not session:
|
|
raise HTTPException(status_code=404, detail=f"Session {session_id} non trouvée")
|
|
return {
|
|
"session_id": session.session_id,
|
|
"machine_id": session.machine_id,
|
|
"events_count": len(session.events),
|
|
"screenshots_count": len(session.shot_paths),
|
|
"last_window": session.last_window_info,
|
|
"created_at": session.created_at.isoformat(),
|
|
"last_activity": session.last_activity.isoformat(),
|
|
"finalized": session.finalized,
|
|
}
|
|
|
|
|
|
# =========================================================================
|
|
# Replay — Exécution de workflows sur l'Agent V1
|
|
# =========================================================================
|
|
|
|
|
|
def _find_active_agent_session(machine_id: Optional[str] = None) -> Optional[str]:
    """Locate the latest Agent V1 session to target for a replay.

    Two-pass strategy:
    1. Prefer a non-finalized session (Agent V1 still active).
    2. Otherwise fall back to the most recent one, even finalized (the
       agent may have restarted and opened a new session, or the session
       was finalized by timeout while the agent is still around).

    Both passes only consider 'sess_*' sessions (Agent V1).

    Args:
        machine_id: When given, restrict the search to that machine's
            sessions; when None, search across all machines (backward
            compatible).
    """
    with processor.session_manager._lock:
        candidates = [
            s for s in processor.session_manager._sessions.values()
            if s.session_id.startswith("sess_")
            and (machine_id is None or s.machine_id == machine_id)
        ]

    if not candidates:
        return None

    # Session ids embed a timestamp — sort newest first.
    candidates.sort(key=lambda s: s.session_id, reverse=True)

    # Pass 1: a live (non-finalized) session wins.
    live = next((s for s in candidates if not s.finalized), None)
    if live is not None:
        return live.session_id

    # Pass 2: newest session regardless of the finalized flag. Agent V1
    # polls /replay/next independently of that flag anyway.
    return candidates[0].session_id
|
|
|
|
|
|
def _workflow_to_actions(workflow, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """Convert a workflow (ordered nodes + edges) into normalized actions.

    Walks the graph from the entry_nodes following the edges; each edge
    yields one or more normalized actions with percentage coordinates.

    Smart mode (workflows learned by Léa):
        When the workflow carries prototype_vectors, first try a hybrid
        replay built from the raw session events (far more faithful than
        the GraphBuilder compound actions), then fall back to the classic
        enrichment via StreamProcessor.extract_enriched_actions() (visual
        targeting + CLIP pre/post-checks).

    Classic mode (VWB/manual workflows, or final fallback):
        Plain BFS traversal through _edge_to_normalized_actions().
    """
    params = params or {}

    # Learned workflow (nodes carry prototype_vectors + structured edges).
    if _is_learned_workflow(workflow):
        # Priority 1: hybrid replay (raw events + workflow structure).
        hybrid = processor.build_hybrid_replay(workflow)
        if hybrid:
            logger.info(
                "Replay hybride : %d actions depuis events bruts + structure workflow",
                len(hybrid),
            )
            # Substitute keyboard gestures when a catalog is available.
            if _gesture_catalog and hybrid:
                hybrid = _gesture_catalog.optimize_replay_actions(hybrid)
            return hybrid

        # Priority 2: classic enrichment (fallback when hybrid fails).
        enriched = processor.extract_enriched_actions(workflow, params)
        if enriched:
            logger.info(
                "Replay intelligent : %d actions enrichies depuis le workflow appris",
                len(enriched),
            )
            if _gesture_catalog and enriched:
                enriched = _gesture_catalog.optimize_replay_actions(enriched)
            return enriched

        # Enrichment failed too — fall through to the classic mode.
        logger.warning(
            "Enrichissement échoué pour le workflow appris, fallback mode classique"
        )

    # Classic mode (VWB/manual, or fallback): index outgoing edges per node.
    outgoing: Dict[str, list] = defaultdict(list)
    for edge in workflow.edges:
        outgoing[edge.from_node].append(edge)

    # Linear walk starting from the entry nodes; first node as fallback.
    frontier = list(workflow.entry_nodes) if workflow.entry_nodes else []
    if not frontier and workflow.nodes:
        frontier = [workflow.nodes[0].node_id]

    actions: List[Dict[str, Any]] = []
    seen = set()
    while frontier:
        node_id = frontier.pop(0)
        if node_id in seen:
            continue
        seen.add(node_id)

        for edge in outgoing.get(node_id, []):
            actions.extend(_edge_to_normalized_actions(edge, params))
            # Follow the graph to the next node.
            if edge.to_node not in seen:
                frontier.append(edge.to_node)

    # Substitute visual actions with keyboard gestures when possible.
    if _gesture_catalog and actions:
        actions = _gesture_catalog.optimize_replay_actions(actions)

    return actions
|
|
|
|
|
|
def _is_learned_workflow(workflow) -> bool:
|
|
"""Détecter si un workflow est un workflow appris (vs VWB/manuel).
|
|
|
|
Un workflow appris a :
|
|
- Des nodes avec _prototype_vector dans metadata
|
|
- Des edges avec from_node/to_node
|
|
- Un learning_state indicatif (OBSERVATION, COACHING, AUTO_CANDIDATE, etc.)
|
|
|
|
Un workflow VWB/manuel a généralement :
|
|
- Des edges avec des target_spec complets (by_text, by_role remplis)
|
|
- Pas de prototype_vectors
|
|
"""
|
|
# Accéder aux données (objet ou dict)
|
|
if hasattr(workflow, 'nodes'):
|
|
nodes = workflow.nodes
|
|
edges = workflow.edges
|
|
elif isinstance(workflow, dict):
|
|
nodes = workflow.get('nodes', [])
|
|
edges = workflow.get('edges', [])
|
|
else:
|
|
return False
|
|
|
|
if not nodes or not edges:
|
|
return False
|
|
|
|
# Vérifier si au moins un node a un prototype_vector
|
|
has_prototype = False
|
|
for node in nodes:
|
|
metadata = node.metadata if hasattr(node, 'metadata') else node.get('metadata', {})
|
|
if isinstance(metadata, dict) and '_prototype_vector' in metadata:
|
|
has_prototype = True
|
|
break
|
|
|
|
return has_prototype
|
|
|
|
|
|
def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Convertir un WorkflowEdge en liste d'actions normalisées pour l'Agent V1.
|
|
|
|
Un edge simple produit 1 action, un edge compound produit N actions (une par step).
|
|
"""
|
|
action = edge.action
|
|
if action is None:
|
|
logger.warning(f"Edge {edge.edge_id} sans action, skip")
|
|
return []
|
|
action_type = action.type
|
|
target = action.target
|
|
action_params = action.parameters or {}
|
|
|
|
# Extraire les coordonnées normalisées depuis TargetSpec.by_position
|
|
x_pct = 0.0
|
|
y_pct = 0.0
|
|
if target and target.by_position:
|
|
px, py = target.by_position
|
|
if px <= 1.0 and py <= 1.0:
|
|
x_pct = px
|
|
y_pct = py
|
|
else:
|
|
ref_w = action_params.get("ref_width", 1920) or 1920
|
|
ref_h = action_params.get("ref_height", 1080) or 1080
|
|
x_pct = round(px / ref_w, 6)
|
|
y_pct = round(py / ref_h, 6)
|
|
|
|
base = {"edge_id": edge.edge_id, "from_node": edge.from_node, "to_node": edge.to_node}
|
|
|
|
# Compound : décomposer en actions individuelles
|
|
if action_type == "compound":
|
|
return _expand_compound_steps(action_params.get("steps", []), base, params)
|
|
|
|
# Actions simples
|
|
normalized = {**base, "action_id": f"act_{uuid.uuid4().hex[:8]}"}
|
|
|
|
if action_type == "mouse_click":
|
|
normalized["type"] = "click"
|
|
normalized["x_pct"] = x_pct
|
|
normalized["y_pct"] = y_pct
|
|
normalized["button"] = action_params.get("button", "left")
|
|
|
|
elif action_type == "text_input":
|
|
normalized["type"] = "type"
|
|
text = action_params.get("text", "")
|
|
text = _substitute_variables(text, params, action_params.get("defaults", {}))
|
|
normalized["text"] = text
|
|
normalized["x_pct"] = x_pct
|
|
normalized["y_pct"] = y_pct
|
|
|
|
elif action_type == "key_press":
|
|
normalized["type"] = "key_combo"
|
|
keys = action_params.get("keys", [])
|
|
if not keys and action_params.get("key"):
|
|
keys = [action_params["key"]]
|
|
normalized["keys"] = keys
|
|
|
|
else:
|
|
logger.warning(f"Type d'action inconnu : {action_type}")
|
|
return []
|
|
|
|
# Ajouter le target_spec complet pour la résolution visuelle
|
|
target_spec = {}
|
|
if target and target.by_role:
|
|
target_spec["by_role"] = target.by_role
|
|
normalized["target_role"] = target.by_role # Compat debug
|
|
if target and target.by_text:
|
|
target_spec["by_text"] = target.by_text
|
|
normalized["target_text"] = target.by_text # Compat debug
|
|
if target and hasattr(target, 'context_hints') and target.context_hints:
|
|
target_spec["context_hints"] = target.context_hints
|
|
if target_spec:
|
|
normalized["target_spec"] = target_spec
|
|
normalized["visual_mode"] = True # Signal à l'agent d'utiliser la résolution visuelle
|
|
|
|
return [normalized]
|
|
|
|
|
|
def _substitute_variables(text: str, params: Dict[str, Any], defaults: Dict[str, Any]) -> str:
|
|
"""Substituer les variables ${var} dans un texte.
|
|
|
|
Priorité : params utilisateur > defaults du workflow > texte brut inchangé.
|
|
Supporte ${var} dans un texte plus long (ex: "${expression}=").
|
|
"""
|
|
import re
|
|
|
|
def replacer(match):
|
|
var_name = match.group(1)
|
|
return str(params.get(var_name, defaults.get(var_name, match.group(0))))
|
|
|
|
return re.sub(r'\$\{(\w+)\}', replacer, text)
|
|
|
|
|
|
def _expand_compound_steps(
|
|
steps: List[Dict[str, Any]], base: Dict[str, Any], params: Dict[str, Any]
|
|
) -> List[Dict[str, Any]]:
|
|
"""Décomposer les steps d'un compound en actions individuelles."""
|
|
actions = []
|
|
for step in steps:
|
|
step_type = step.get("type", "unknown")
|
|
action = {
|
|
**base,
|
|
"action_id": f"act_{uuid.uuid4().hex[:8]}",
|
|
}
|
|
|
|
if step_type == "key_press":
|
|
action["type"] = "key_combo"
|
|
keys = step.get("keys", [])
|
|
if not keys and step.get("key"):
|
|
keys = [step["key"]]
|
|
action["keys"] = keys
|
|
|
|
elif step_type == "text_input":
|
|
action["type"] = "type"
|
|
text = step.get("text", "")
|
|
text = _substitute_variables(text, params, {})
|
|
action["text"] = text
|
|
|
|
elif step_type == "wait":
|
|
action["type"] = "wait"
|
|
action["duration_ms"] = step.get("duration_ms", 500)
|
|
|
|
elif step_type == "mouse_click":
|
|
action["type"] = "click"
|
|
action["x_pct"] = step.get("x_pct", 0.0)
|
|
action["y_pct"] = step.get("y_pct", 0.0)
|
|
action["button"] = step.get("button", "left")
|
|
|
|
else:
|
|
logger.debug(f"Step compound inconnu : {step_type}")
|
|
continue
|
|
|
|
actions.append(action)
|
|
|
|
return actions
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/replay")
|
|
async def start_replay(request: ReplayRequest):
|
|
"""
|
|
Lancer le replay d'un workflow sur une session Agent V1 active.
|
|
|
|
Le serveur charge le workflow, le convertit en liste d'actions normalisées,
|
|
et les place dans la queue de la session. L'Agent V1 les récupérera
|
|
via GET /replay/next (modèle pull).
|
|
|
|
Si session_id commence par "chat_" ou est vide, on détecte automatiquement
|
|
la dernière session Agent V1 active (non finalisée, préfixe "sess_").
|
|
Si machine_id est fourni, on cible spécifiquement cette machine.
|
|
"""
|
|
workflow_id = request.workflow_id
|
|
session_id = request.session_id
|
|
target_machine_id = request.machine_id
|
|
params = request.params or {}
|
|
|
|
# Auto-détection de la session Agent V1 active (avec filtre machine optionnel)
|
|
if not session_id or session_id.startswith("chat_"):
|
|
active_session = _find_active_agent_session(machine_id=target_machine_id)
|
|
if active_session:
|
|
logger.info(
|
|
f"Auto-détection session Agent V1 : {active_session} "
|
|
f"(demandé: {session_id}, machine={target_machine_id})"
|
|
)
|
|
session_id = active_session
|
|
else:
|
|
machine_hint = f" sur la machine '{target_machine_id}'" if target_machine_id else ""
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail=f"Aucune session Agent V1 active{machine_hint}. "
|
|
"Lancez l'Agent V1 et démarrez une session d'abord."
|
|
)
|
|
|
|
# Vérifier que le workflow existe
|
|
with processor._data_lock:
|
|
workflow = processor._workflows.get(workflow_id)
|
|
|
|
if not workflow:
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail=f"Workflow '{workflow_id}' non trouvé. "
|
|
f"Workflows disponibles : {list(processor._workflows.keys())}"
|
|
)
|
|
|
|
# Convertir le workflow en actions normalisées
|
|
actions = _workflow_to_actions(workflow, params)
|
|
if not actions:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Le workflow '{workflow_id}' ne contient aucune action exécutable."
|
|
)
|
|
|
|
# Limite de sécurité sur le nombre d'actions
|
|
if len(actions) > MAX_ACTIONS_PER_REPLAY:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Trop d'actions ({len(actions)} > {MAX_ACTIONS_PER_REPLAY}). "
|
|
"Découpez le workflow en parties plus petites."
|
|
)
|
|
|
|
# ── Setup environnement — ouvrir les applications nécessaires ──
|
|
setup_actions = []
|
|
app_info = _extract_required_apps_from_workflow(workflow)
|
|
if app_info:
|
|
setup_actions = _generate_setup_actions(app_info, setup_id_prefix="setup_wf")
|
|
if setup_actions:
|
|
actions = setup_actions + actions
|
|
logger.info(
|
|
"replay workflow %s : %d actions de setup injectées "
|
|
"(app=%s, cmd=%s)",
|
|
workflow_id, len(setup_actions),
|
|
app_info.get("primary_app"), app_info.get("primary_launch_cmd"),
|
|
)
|
|
|
|
# Créer l'identifiant de replay
|
|
replay_id = f"replay_{uuid.uuid4().hex[:8]}"
|
|
|
|
# Résoudre le machine_id de la session cible
|
|
session_obj = processor.session_manager.get_session(session_id)
|
|
resolved_machine_id = target_machine_id or (session_obj.machine_id if session_obj else "default")
|
|
|
|
# Injecter les actions dans la queue de la session
|
|
with _replay_lock:
|
|
_replay_queues[session_id] = list(actions) # Remplacer la queue existante
|
|
_replay_states[replay_id] = _create_replay_state(
|
|
replay_id=replay_id,
|
|
workflow_id=workflow_id,
|
|
session_id=session_id,
|
|
total_actions=len(actions),
|
|
params=params,
|
|
machine_id=resolved_machine_id,
|
|
)
|
|
# Enregistrer le mapping machine -> session pour le replay ciblé
|
|
if resolved_machine_id and resolved_machine_id != "default":
|
|
_machine_replay_target[resolved_machine_id] = session_id
|
|
|
|
# Signaler au worker VLM (process séparé) qu'un replay est actif → se suspendre
|
|
_set_replay_lock(replay_id)
|
|
|
|
logger.info(
|
|
f"Replay démarré : {replay_id} | workflow={workflow_id} | "
|
|
f"session={session_id} | machine={resolved_machine_id} | "
|
|
f"{len(actions)} actions ({len(setup_actions)} setup + "
|
|
f"{len(actions) - len(setup_actions)} replay) (worker suspendu)"
|
|
)
|
|
|
|
return {
|
|
"replay_id": replay_id,
|
|
"status": "running",
|
|
"workflow_id": workflow_id,
|
|
"session_id": session_id,
|
|
"machine_id": resolved_machine_id,
|
|
"total_actions": len(actions),
|
|
"setup_actions": len(setup_actions),
|
|
"setup_app": app_info.get("primary_app", "") if app_info else "",
|
|
}
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/replay/raw")
|
|
async def start_raw_replay(request: RawReplayRequest):
|
|
"""
|
|
Lancer un replay avec des actions brutes (mode Agent Libre).
|
|
|
|
Au lieu de charger un workflow, accepte directement une liste d'actions
|
|
normalisées générées par le LLM planner. Les actions sont injectées
|
|
dans la queue de replay de l'Agent V1.
|
|
"""
|
|
session_id = request.session_id
|
|
actions = request.actions
|
|
target_machine_id = request.machine_id
|
|
task = request.task_description or "Tâche libre"
|
|
|
|
if not actions:
|
|
raise HTTPException(status_code=400, detail="Aucune action fournie.")
|
|
|
|
# Limite de sécurité sur le nombre d'actions
|
|
if len(actions) > MAX_ACTIONS_PER_REPLAY:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Trop d'actions ({len(actions)} > {MAX_ACTIONS_PER_REPLAY}). "
|
|
"Réduisez le plan d'exécution."
|
|
)
|
|
|
|
# Validation de chaque action (sécurité HIGH)
|
|
for i, action in enumerate(actions):
|
|
error = _validate_replay_action(action)
|
|
if error:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Action #{i} invalide : {error}"
|
|
)
|
|
|
|
# Auto-détection de la session Agent V1 (avec filtre machine optionnel)
|
|
if not session_id or session_id.startswith("chat_"):
|
|
active_session = _find_active_agent_session(machine_id=target_machine_id)
|
|
if active_session:
|
|
session_id = active_session
|
|
else:
|
|
machine_hint = f" sur la machine '{target_machine_id}'" if target_machine_id else ""
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail=f"Aucune session Agent V1 active{machine_hint}. "
|
|
"Lancez l'Agent V1 sur le PC cible."
|
|
)
|
|
|
|
# Assigner des action_id si manquants
|
|
for i, action in enumerate(actions):
|
|
if "action_id" not in action:
|
|
action["action_id"] = f"act_free_{uuid.uuid4().hex[:6]}"
|
|
|
|
replay_id = f"replay_free_{uuid.uuid4().hex[:8]}"
|
|
|
|
# Résoudre le machine_id de la session cible
|
|
session_obj = processor.session_manager.get_session(session_id)
|
|
resolved_machine_id = target_machine_id or (session_obj.machine_id if session_obj else "default")
|
|
|
|
with _replay_lock:
|
|
_replay_queues[session_id] = list(actions)
|
|
_replay_states[replay_id] = _create_replay_state(
|
|
replay_id=replay_id,
|
|
workflow_id=f"free_task:{task[:50]}",
|
|
session_id=session_id,
|
|
total_actions=len(actions),
|
|
params={},
|
|
machine_id=resolved_machine_id,
|
|
)
|
|
# Enregistrer le mapping machine -> session pour le replay ciblé
|
|
if resolved_machine_id and resolved_machine_id != "default":
|
|
_machine_replay_target[resolved_machine_id] = session_id
|
|
|
|
# Signaler au worker VLM (process séparé) qu'un replay est actif → se suspendre
|
|
_set_replay_lock(replay_id)
|
|
|
|
logger.info(
|
|
f"Replay libre démarré : {replay_id} | task='{task}' | "
|
|
f"session={session_id} | machine={resolved_machine_id} | {len(actions)} actions (worker suspendu)"
|
|
)
|
|
|
|
return {
|
|
"replay_id": replay_id,
|
|
"status": "running",
|
|
"task": task,
|
|
"session_id": session_id,
|
|
"machine_id": resolved_machine_id,
|
|
"total_actions": len(actions),
|
|
}
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/replay-session")
|
|
async def replay_from_session(
|
|
session_id: str,
|
|
machine_id: str = "default",
|
|
):
|
|
"""Rejouer une session directement depuis ses événements bruts.
|
|
|
|
Pas besoin d'attendre le traitement VLM/GraphBuilder.
|
|
Construit le replay propre automatiquement depuis live_events.jsonl.
|
|
|
|
Pipeline :
|
|
1. Charge les events bruts de la session
|
|
2. Filtre les parasites (heartbeat, focus_change, action_result)
|
|
3. Fusionne les text_input consécutifs
|
|
4. Normalise les coordonnées en pourcentage
|
|
5. Ajoute des waits contextuels (après Win+R, Ctrl+S, Alt+F4, Enter)
|
|
6. Coupe après Alt+F4
|
|
7. Injecte dans la queue de replay
|
|
|
|
Résultat typique : ~15-20 actions propres, prêtes à exécuter immédiatement.
|
|
"""
|
|
if not session_id:
|
|
raise HTTPException(status_code=400, detail="session_id requis")
|
|
|
|
# ── 1. Trouver le fichier live_events.jsonl de la session ──
|
|
events_file = None
|
|
|
|
# Chercher dans le sous-dossier machine_id (format standard)
|
|
if machine_id and machine_id != "default":
|
|
candidate = LIVE_SESSIONS_DIR / machine_id / session_id / "live_events.jsonl"
|
|
if candidate.exists():
|
|
events_file = candidate
|
|
|
|
# Fallback : chercher dans tous les sous-dossiers machine
|
|
if not events_file:
|
|
for machine_dir in LIVE_SESSIONS_DIR.iterdir():
|
|
if not machine_dir.is_dir():
|
|
continue
|
|
candidate = machine_dir / session_id / "live_events.jsonl"
|
|
if candidate.exists():
|
|
events_file = candidate
|
|
# Résoudre le machine_id depuis le dossier
|
|
if machine_id == "default":
|
|
machine_id = machine_dir.name
|
|
break
|
|
|
|
# Dernier fallback : dossier session directement sous LIVE_SESSIONS_DIR
|
|
if not events_file:
|
|
candidate = LIVE_SESSIONS_DIR / session_id / "live_events.jsonl"
|
|
if candidate.exists():
|
|
events_file = candidate
|
|
|
|
if not events_file:
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail=f"Session '{session_id}' introuvable. "
|
|
f"Fichier live_events.jsonl non trouvé dans "
|
|
f"{LIVE_SESSIONS_DIR}/{machine_id}/{session_id}/"
|
|
)
|
|
|
|
# ── 2. Charger les événements bruts ──
|
|
raw_events = []
|
|
try:
|
|
for line in events_file.read_text(encoding="utf-8").splitlines():
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
try:
|
|
raw_events.append(json.loads(line))
|
|
except json.JSONDecodeError:
|
|
continue
|
|
except Exception as e:
|
|
raise HTTPException(
|
|
status_code=500,
|
|
detail=f"Erreur lecture events de la session : {e}"
|
|
)
|
|
|
|
if not raw_events:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Session '{session_id}' : aucun événement trouvé dans live_events.jsonl"
|
|
)
|
|
|
|
# ── 2b. Fusionner les enrichissements temps réel depuis la session en mémoire ──
|
|
# Le JSONL ne contient pas les enrichissements SomEngine calculés pendant
|
|
# l'enregistrement (ils sont ajoutés en mémoire après écriture JSONL).
|
|
# On les injecte ici pour que build_replay_from_raw_events puisse les réutiliser.
|
|
session_mem = processor.session_manager.get_session(session_id)
|
|
if session_mem and session_mem.events:
|
|
_merge_enrichments_into_raw_events(raw_events, session_mem.events)
|
|
|
|
# ── 3. Construire le replay propre depuis les events bruts ──
|
|
# Passer le répertoire de session pour activer le visual replay (crops de référence)
|
|
session_dir = str(events_file.parent)
|
|
actions = build_replay_from_raw_events(
|
|
raw_events, session_id=session_id, session_dir=session_dir,
|
|
)
|
|
|
|
if not actions:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Session '{session_id}' : aucune action exploitable après nettoyage "
|
|
f"({len(raw_events)} événements bruts)"
|
|
)
|
|
|
|
# Limite de sécurité
|
|
if len(actions) > MAX_ACTIONS_PER_REPLAY:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Trop d'actions ({len(actions)} > {MAX_ACTIONS_PER_REPLAY}). "
|
|
"La session est trop longue pour un replay direct."
|
|
)
|
|
|
|
# Validation de chaque action (sécurité HIGH)
|
|
for i, action in enumerate(actions):
|
|
error = _validate_replay_action(action)
|
|
if error:
|
|
logger.warning(
|
|
"replay-session : action #%d invalide (%s), suppression", i, error
|
|
)
|
|
# Supprimer les actions invalides plutôt que rejeter tout le replay
|
|
actions[i] = None
|
|
actions = [a for a in actions if a is not None]
|
|
|
|
if not actions:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Session '{session_id}' : toutes les actions ont été rejetées par la validation"
|
|
)
|
|
|
|
# Optimisation par gestes clavier si disponible
|
|
if _gesture_catalog and actions:
|
|
actions = _gesture_catalog.optimize_replay_actions(actions)
|
|
|
|
# ── 3b. Setup environnement — ouvrir les applications nécessaires ──
|
|
# Analyser les événements bruts pour détecter quelles applications sont requises
|
|
# et injecter des actions de setup en tête de la queue de replay.
|
|
setup_actions = []
|
|
app_info = _extract_required_apps_from_events(raw_events)
|
|
if app_info:
|
|
setup_actions = _generate_setup_actions(app_info, setup_id_prefix="setup_sess")
|
|
if setup_actions:
|
|
actions = setup_actions + actions
|
|
logger.info(
|
|
"replay-session %s : %d actions de setup injectées avant le replay "
|
|
"(app=%s, cmd=%s)",
|
|
session_id, len(setup_actions),
|
|
app_info.get("primary_app"), app_info.get("primary_launch_cmd"),
|
|
)
|
|
|
|
# ── 4. Trouver la session de replay cible (Agent V1 actif) ──
|
|
# L'agent actif peut avoir une session différente de la session source
|
|
target_session_id = _find_active_agent_session(machine_id=machine_id)
|
|
if not target_session_id:
|
|
# Fallback : utiliser la session source si c'est une session Agent V1
|
|
if session_id.startswith("sess_"):
|
|
target_session_id = session_id
|
|
else:
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail=f"Aucune session Agent V1 active sur la machine '{machine_id}'. "
|
|
"Lancez l'Agent V1 sur le PC cible."
|
|
)
|
|
|
|
# ── 5. Injecter dans la queue de replay ──
|
|
replay_id = f"replay_sess_{uuid.uuid4().hex[:8]}"
|
|
|
|
with _replay_lock:
|
|
_replay_queues[target_session_id] = list(actions)
|
|
_replay_states[replay_id] = _create_replay_state(
|
|
replay_id=replay_id,
|
|
workflow_id=f"session_replay:{session_id}",
|
|
session_id=target_session_id,
|
|
total_actions=len(actions),
|
|
params={},
|
|
machine_id=machine_id,
|
|
)
|
|
# Enregistrer le mapping machine -> session pour le replay ciblé
|
|
if machine_id and machine_id != "default":
|
|
_machine_replay_target[machine_id] = target_session_id
|
|
|
|
# Signaler au worker VLM (process séparé) qu'un replay est actif → se suspendre
|
|
_set_replay_lock(replay_id)
|
|
|
|
logger.info(
|
|
"Replay session démarré : %s | source=%s | target=%s | machine=%s | "
|
|
"%d actions (%d setup + %d replay) (worker suspendu)",
|
|
replay_id, session_id, target_session_id, machine_id,
|
|
len(actions), len(setup_actions), len(actions) - len(setup_actions),
|
|
)
|
|
|
|
return {
|
|
"replay_id": replay_id,
|
|
"status": "running",
|
|
"source_session_id": session_id,
|
|
"target_session_id": target_session_id,
|
|
"machine_id": machine_id,
|
|
"total_actions": len(actions),
|
|
"setup_actions": len(setup_actions),
|
|
"replay_actions": len(actions) - len(setup_actions),
|
|
"total_raw_events": len(raw_events),
|
|
"setup_app": app_info.get("primary_app", "") if app_info else "",
|
|
"actions_preview": [
|
|
{
|
|
k: (
|
|
# Ne pas sérialiser l'image base64 dans le preview
|
|
{kk: ("..." if kk == "anchor_image_base64" else vv) for kk, vv in v.items()}
|
|
if k == "target_spec" and isinstance(v, dict)
|
|
else v
|
|
)
|
|
for k, v in a.items()
|
|
if k != "action_id"
|
|
}
|
|
for a in actions[:8] # Montrer plus d'actions pour inclure le setup
|
|
],
|
|
}
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/replay/single")
|
|
async def enqueue_single_action(request: SingleActionRequest):
|
|
"""
|
|
Enqueue une seule action pour exécution (mode Copilot).
|
|
|
|
Contrairement à /replay et /replay/raw qui injectent toute une liste,
|
|
cet endpoint n'enqueue qu'UNE action à la fois. L'agent chat Copilot
|
|
appelle cet endpoint étape par étape après validation utilisateur.
|
|
|
|
Retourne un action_id pour le tracking du résultat via /replay/result.
|
|
"""
|
|
session_id = request.session_id
|
|
action = dict(request.action)
|
|
target_machine_id = request.machine_id
|
|
|
|
# Validation de l'action (sécurité HIGH)
|
|
error = _validate_replay_action(action)
|
|
if error:
|
|
raise HTTPException(status_code=400, detail=f"Action invalide : {error}")
|
|
|
|
# Auto-détection de la session Agent V1 (avec filtre machine optionnel)
|
|
if not session_id or session_id.startswith("chat_"):
|
|
active_session = _find_active_agent_session(machine_id=target_machine_id)
|
|
if active_session:
|
|
session_id = active_session
|
|
else:
|
|
machine_hint = f" sur la machine '{target_machine_id}'" if target_machine_id else ""
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail=f"Aucune session Agent V1 active{machine_hint}. "
|
|
"Lancez l'Agent V1 sur le PC cible."
|
|
)
|
|
|
|
# Assigner un action_id si manquant
|
|
if "action_id" not in action:
|
|
action["action_id"] = f"act_copilot_{uuid.uuid4().hex[:8]}"
|
|
|
|
action_id = action["action_id"]
|
|
|
|
with _replay_lock:
|
|
_replay_queues[session_id].append(action)
|
|
|
|
logger.info(
|
|
f"Action Copilot enqueued: {action_id} | type={action.get('type')} | "
|
|
f"session={session_id} | machine={target_machine_id}"
|
|
)
|
|
|
|
return {
|
|
"action_id": action_id,
|
|
"session_id": session_id,
|
|
"machine_id": target_machine_id,
|
|
"status": "enqueued",
|
|
}
|
|
|
|
|
|
# =========================================================================
|
|
# Pre-check écran — Vérification pré-action par embedding CLIP
|
|
# =========================================================================
|
|
|
|
|
|
def _pre_check_screen_state(
    session_id: str,
    expected_node_id: str,
    current_screenshot_path: str,
    active_processor: StreamProcessor,
) -> Dict[str, Any]:
    """Verify that the current screen matches the node's expected state.

    Compares the current screenshot against the expected node's prototype
    via CLIP embedding similarity (fast, ~200ms).

    Args:
        session_id: ID of the replay session
        expected_node_id: ID of the action's source node (from_node)
        current_screenshot_path: Path of the recent heartbeat screenshot
        active_processor: StreamProcessor instance with the CLIPEmbedder loaded

    Returns:
        {"match": True/False, "similarity": float, "expected_node": str,
         "reason": str (on mismatch), "popup_detected": bool}

    Note:
        Any internal error is swallowed and reported permissively
        (match=True, reason="precheck_error: ...") so the replay is
        never blocked by the pre-check itself.
    """
    # Optimistic defaults: missing data paths below return this as-is
    # (with a "reason") rather than blocking the replay.
    result: Dict[str, Any] = {
        "match": True,
        "similarity": 1.0,
        "expected_node": expected_node_id,
        "popup_detected": False,
    }

    try:
        # 1. Find the active (running) replay for this session
        replay_state = None
        workflow = None
        with _replay_lock:
            for state in _replay_states.values():
                if state["session_id"] == session_id and state["status"] == "running":
                    replay_state = state
                    break

        if not replay_state:
            result["reason"] = "no_active_replay"
            return result

        workflow_id = replay_state.get("workflow_id", "")
        # NOTE(review): reaches into processor private state (_data_lock,
        # _workflows) — assumes this module is allowed to do so.
        with active_processor._data_lock:
            workflow = active_processor._workflows.get(workflow_id)

        if workflow is None:
            result["reason"] = "workflow_not_found"
            return result

        # 2. Fetch the expected node's prototype
        # Support both Workflow objects and raw dicts
        node = None
        if hasattr(workflow, "get_node"):
            node = workflow.get_node(expected_node_id)
        elif isinstance(workflow, dict):
            # Raw dict format (VWB/manual workflows)
            for n in workflow.get("nodes", []):
                if n.get("node_id") == expected_node_id:
                    node = n
                    break

        if node is None:
            result["reason"] = "node_not_found"
            return result

        # Extract the prototype vector from node metadata
        metadata = node.metadata if hasattr(node, "metadata") else node.get("metadata", {})
        proto_list = metadata.get("_prototype_vector")
        if not proto_list or not isinstance(proto_list, (list, tuple)):
            result["reason"] = "no_prototype_vector"
            return result

        import numpy as np
        prototype_vector = np.array(proto_list, dtype=np.float32)

        # 3. Compute the CLIP embedding of the current screenshot
        active_processor._ensure_initialized()
        if active_processor._clip_embedder is None:
            result["reason"] = "clip_embedder_unavailable"
            return result

        from PIL import Image
        pil_image = Image.open(current_screenshot_path)
        current_vector = active_processor._clip_embedder.embed_image(pil_image)

        if current_vector is None or len(current_vector) == 0:
            result["reason"] = "embedding_failed"
            return result

        # 4. Cosine similarity between current screen and prototype
        current_vector = current_vector.flatten().astype(np.float32)
        prototype_vector = prototype_vector.flatten().astype(np.float32)

        norm_current = np.linalg.norm(current_vector)
        norm_proto = np.linalg.norm(prototype_vector)
        # Degenerate (near-zero) vectors cannot be compared — treat as mismatch
        if norm_current < 1e-8 or norm_proto < 1e-8:
            result["reason"] = "zero_norm_vector"
            result["match"] = False
            result["similarity"] = 0.0
            return result

        similarity = float(
            np.dot(current_vector, prototype_vector) / (norm_current * norm_proto)
        )
        result["similarity"] = round(similarity, 4)
        result["match"] = similarity >= _PRECHECK_SIMILARITY_THRESHOLD

        if not result["match"]:
            result["reason"] = "screen_mismatch"
            logger.warning(
                f"Pre-check MISMATCH pour session={session_id} "
                f"node={expected_node_id}: similarity={similarity:.4f} "
                f"< seuil={_PRECHECK_SIMILARITY_THRESHOLD}"
            )

        # 5. Popup detection via window-title change heuristic
        result["popup_detected"] = _detect_popup_hint(
            session_id, workflow, expected_node_id
        )

    except Exception as e:
        # Never block the replay on a pre-check error
        logger.error(f"Pre-check échoué (non bloquant): {e}")
        result["match"] = True  # Permissive fallback
        result["reason"] = f"precheck_error: {e}"

    return result
|
|
|
|
|
|
def _detect_popup_hint(
    session_id: str,
    workflow: Any,
    expected_node_id: str,
) -> bool:
    """Heuristically detect whether a popup or modal dialog is likely open.

    Compares the current window title (from the session's last_window_info)
    with the title expected by the workflow node. A title that no longer
    contains the expected fragment suggests an unexpected popup/dialog.

    Args:
        session_id: Session ID
        workflow: Workflow object or dict
        expected_node_id: ID of the expected node

    Returns:
        True when the title change suggests a popup; False otherwise
        (including on any internal error, which is only logged at debug).
    """
    try:
        # Current title, as last reported by the session.
        sess = processor.session_manager.get_session(session_id)
        if not sess:
            return False
        title_now = sess.last_window_info.get("title", "").strip().lower()
        if not title_now or title_now == "unknown":
            return False

        # Expected title fragment, read from the workflow node definition.
        wanted = ""
        if hasattr(workflow, "get_node"):
            node = workflow.get_node(expected_node_id)
            if node and hasattr(node, "template") and hasattr(node.template, "window"):
                spec = node.template.window
                if hasattr(spec, "title_contains") and spec.title_contains:
                    wanted = spec.title_contains.strip().lower()
        elif isinstance(workflow, dict):
            matching = (
                n for n in workflow.get("nodes", [])
                if n.get("node_id") == expected_node_id
            )
            for entry in matching:
                win = entry.get("template", {}).get("window", {})
                wanted = (win.get("title_contains") or "").strip().lower()
                break

        if not wanted:
            return False

        # Title drifted away from the expected fragment -> popup is likely.
        if wanted not in title_now:
            logger.info(
                f"Popup détectée: titre actuel='{title_now}' "
                f"ne contient pas '{wanted}'"
            )
            return True

    except Exception as e:
        logger.debug(f"Détection popup échouée: {e}")

    return False
|
|
|
|
|
|
@app.get("/api/v1/traces/stream/replay/next")
|
|
async def get_next_action(session_id: str, machine_id: str = "default"):
|
|
"""
|
|
L'Agent V1 poll cet endpoint pour récupérer la prochaine action à exécuter.
|
|
|
|
Retourne la prochaine action de la queue ou {"action": null} si rien.
|
|
Modèle pull : l'agent demande, pas de WebSocket nécessaire.
|
|
|
|
Inclut un pre-check optionnel : si un heartbeat récent est disponible,
|
|
compare l'écran actuel avec le node attendu via similarité CLIP.
|
|
En cas de mismatch, retourne une action "wait" au lieu de l'action réelle,
|
|
laissant le client le temps de retrouver le bon état.
|
|
|
|
Multi-machine : si machine_id est fourni, ne retourne que les actions
|
|
destinées à cette machine (évite les fuites cross-machine).
|
|
|
|
Si la session de l'agent n'a pas d'actions en attente, cherche dans les
|
|
autres queues de la MÊME machine (pas cross-machine).
|
|
"""
|
|
with _replay_lock:
|
|
# Verifier si le replay est en pause supervisee (target_not_found).
|
|
# Dans ce cas, NE PAS envoyer d'action — attendre l'intervention utilisateur.
|
|
for state in _replay_states.values():
|
|
if (state["session_id"] == session_id
|
|
and state["status"] == "paused_need_help"):
|
|
logger.debug(
|
|
f"Replay {state['replay_id']} en pause supervisee "
|
|
f"pour session {session_id} — pas d'action envoyee"
|
|
)
|
|
return {
|
|
"action": None,
|
|
"session_id": session_id,
|
|
"machine_id": machine_id,
|
|
"replay_paused": True,
|
|
"pause_message": state.get("pause_message", "Replay en pause"),
|
|
"replay_id": state["replay_id"],
|
|
}
|
|
|
|
queue = _replay_queues.get(session_id, [])
|
|
# Log seulement quand il y a des actions à distribuer
|
|
if queue:
|
|
logger.info(f"[REPLAY-QUEUE] session={session_id}, actions_en_attente={len(queue)}")
|
|
|
|
if not queue and machine_id != "default":
|
|
# Lookup 1 : machine_replay_target (mapping explicite POST /replay)
|
|
target_sid = _machine_replay_target.get(machine_id)
|
|
if target_sid and target_sid != session_id:
|
|
target_queue = _replay_queues.get(target_sid, [])
|
|
if target_queue:
|
|
queue = target_queue
|
|
_replay_queues[session_id] = target_queue
|
|
del _replay_queues[target_sid]
|
|
for state in _replay_states.values():
|
|
if state["session_id"] == target_sid and state["status"] == "running":
|
|
state["session_id"] = session_id
|
|
_machine_replay_target[machine_id] = session_id
|
|
logger.info(f"Replay machine-target: {machine_id} -> {target_sid} -> {session_id}")
|
|
|
|
# Lookup 2 : chercher dans les replay_states actifs pour cette machine
|
|
if not queue:
|
|
for state in _replay_states.values():
|
|
if (state.get("machine_id") == machine_id
|
|
and state["status"] == "running"
|
|
and state["session_id"] != session_id):
|
|
other_sid = state["session_id"]
|
|
other_queue = _replay_queues.get(other_sid, [])
|
|
if other_queue:
|
|
queue = other_queue
|
|
_replay_queues[session_id] = other_queue
|
|
del _replay_queues[other_sid]
|
|
state["session_id"] = session_id
|
|
_machine_replay_target[machine_id] = session_id
|
|
logger.info(f"Replay machine-state: {machine_id} -> {other_sid} -> {session_id}")
|
|
break
|
|
|
|
if not queue:
|
|
return {"action": None, "session_id": session_id, "machine_id": machine_id}
|
|
|
|
# Peek à la prochaine action SANS la retirer (pour le pre-check)
|
|
action = queue[0]
|
|
|
|
# ---- Pre-check écran (optionnel, non bloquant) ----
|
|
# Ne s'applique qu'aux actions qui ont un from_node (actions de workflow,
|
|
# pas les wait/retry auto-injectés ni les actions Copilot/Agent Libre)
|
|
from_node = action.get("from_node")
|
|
precheck_result = None
|
|
if from_node and action.get("type") not in ("wait",):
|
|
heartbeat = _last_heartbeat.get(session_id)
|
|
if heartbeat:
|
|
age = time.time() - heartbeat["timestamp"]
|
|
if age <= _HEARTBEAT_MAX_AGE_SECONDS:
|
|
try:
|
|
import asyncio
|
|
loop = asyncio.get_event_loop()
|
|
# Exécuter le pre-check dans un thread séparé pour ne pas
|
|
# bloquer l'event loop async (CLIP embed ~200ms)
|
|
precheck_result = await asyncio.wait_for(
|
|
loop.run_in_executor(
|
|
None, # ThreadPool par défaut
|
|
_pre_check_screen_state,
|
|
session_id,
|
|
from_node,
|
|
heartbeat["path"],
|
|
processor,
|
|
),
|
|
timeout=0.5, # Max 500ms pour le pre-check
|
|
)
|
|
except asyncio.TimeoutError:
|
|
logger.warning(
|
|
f"Pre-check timeout (>500ms) pour session={session_id} "
|
|
f"node={from_node}, skip"
|
|
)
|
|
precheck_result = None
|
|
except Exception as e:
|
|
logger.error(f"Pre-check exception (non bloquant): {e}")
|
|
precheck_result = None
|
|
else:
|
|
logger.debug(
|
|
f"Pre-check skip: heartbeat trop ancien ({age:.1f}s "
|
|
f"> {_HEARTBEAT_MAX_AGE_SECONDS}s)"
|
|
)
|
|
|
|
# Si le pre-check détecte un mismatch, ne pas retirer l'action de la queue
|
|
# et retourner une action "wait" pour que le client attende et ré-essaie
|
|
if precheck_result and not precheck_result["match"]:
|
|
# ---- Auth auto : détecter un écran d'authentification (optionnel) ----
|
|
# Si le mismatch est dû à un écran d'auth, injecter les actions d'auth
|
|
# en tête de queue pour que l'agent s'authentifie automatiquement.
|
|
if _auth_handler and not precheck_result.get("popup_detected"):
|
|
try:
|
|
# Construire un ScreenState minimal depuis le heartbeat
|
|
heartbeat = _last_heartbeat.get(session_id, {})
|
|
_auth_screen_state = {
|
|
"perception": {"detected_text": heartbeat.get("detected_text", [])},
|
|
"ui_elements": heartbeat.get("ui_elements", []),
|
|
"window": heartbeat.get("window_info", {}),
|
|
"ocr_text": heartbeat.get("ocr_text", ""),
|
|
}
|
|
auth_request = _auth_handler.detect_auth_screen(_auth_screen_state)
|
|
if auth_request and auth_request.confidence >= 0.5:
|
|
auth_actions = _auth_handler.get_auth_actions(auth_request)
|
|
if auth_actions:
|
|
# Injecter les actions d'auth en tête de queue (avant l'action bloquée)
|
|
with _replay_lock:
|
|
current_q = _replay_queues.get(session_id, [])
|
|
_replay_queues[session_id] = auth_actions + current_q
|
|
logger.info(
|
|
f"Auth auto : {len(auth_actions)} actions injectées pour "
|
|
f"session={session_id} app={auth_request.app_name} "
|
|
f"type={auth_request.auth_type} (confiance={auth_request.confidence:.2f})"
|
|
)
|
|
# Retourner la première action d'auth immédiatement
|
|
with _replay_lock:
|
|
first_auth = _replay_queues[session_id].pop(0)
|
|
return {
|
|
"action": first_auth,
|
|
"session_id": session_id,
|
|
"machine_id": machine_id,
|
|
"precheck": precheck_result,
|
|
"auth_detected": True,
|
|
}
|
|
except Exception as e:
|
|
logger.warning(f"Auth auto : détection échouée (non bloquant) : {e}")
|
|
|
|
if precheck_result.get("popup_detected"):
|
|
wait_action = {
|
|
"action_id": f"precheck_wait_{uuid.uuid4().hex[:6]}",
|
|
"type": "wait",
|
|
"reason": "popup_detected",
|
|
"suggestion": "press_escape_or_click_close",
|
|
"expected_node": from_node,
|
|
"similarity": precheck_result["similarity"],
|
|
"duration_ms": 2000,
|
|
}
|
|
logger.warning(
|
|
f"Pre-check: popup détectée pour session={session_id} "
|
|
f"node={from_node}, envoi wait+suggestion"
|
|
)
|
|
else:
|
|
wait_action = {
|
|
"action_id": f"precheck_wait_{uuid.uuid4().hex[:6]}",
|
|
"type": "wait",
|
|
"reason": "screen_mismatch",
|
|
"expected_node": from_node,
|
|
"similarity": precheck_result["similarity"],
|
|
"threshold": _PRECHECK_SIMILARITY_THRESHOLD,
|
|
"duration_ms": 1500,
|
|
}
|
|
logger.warning(
|
|
f"Pre-check: mismatch écran pour session={session_id} "
|
|
f"node={from_node} (sim={precheck_result['similarity']:.4f}), envoi wait"
|
|
)
|
|
return {
|
|
"action": wait_action,
|
|
"session_id": session_id,
|
|
"machine_id": machine_id,
|
|
"precheck": precheck_result,
|
|
}
|
|
|
|
# Pre-check OK (ou skip) : retirer l'action de la queue et l'envoyer
|
|
with _replay_lock:
|
|
current_queue = _replay_queues.get(session_id, [])
|
|
if current_queue and current_queue[0].get("action_id") == action.get("action_id"):
|
|
current_queue.pop(0)
|
|
# Else: queue a changé entre temps (race condition bénigne), on envoie quand même
|
|
|
|
# Sauvegarder l'action envoyée pour le retry (si la vérification échoue)
|
|
# NE PAS écraser si _schedule_retry a déjà mis le bon retry_count
|
|
action_id_sent = action.get("action_id", "")
|
|
if action_id_sent and action_id_sent not in _retry_pending:
|
|
_retry_pending[action_id_sent] = {
|
|
"action": dict(action),
|
|
"retry_count": 0,
|
|
"replay_id": "",
|
|
}
|
|
|
|
logger.info(
|
|
f"Action envoyée à {session_id} (machine={machine_id}) : "
|
|
f"{action.get('type')} (id={action.get('action_id')})"
|
|
f"{' [precheck OK sim=' + str(precheck_result['similarity']) + ']' if precheck_result else ''}"
|
|
)
|
|
|
|
response: Dict[str, Any] = {
|
|
"action": action,
|
|
"session_id": session_id,
|
|
"machine_id": machine_id,
|
|
}
|
|
if precheck_result:
|
|
response["precheck"] = precheck_result
|
|
return response
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/replay/result")
async def report_action_result(report: ReplayResultReport):
    """
    Receive the execution result of one action from Agent V1.

    Lets the server track progress and detect failures. Integrates the
    post-action verification (screenshot before/after comparison) and the
    automatic retry (max 3 attempts) before declaring a failure.

    Retry strategy:
    - Retry 1: re-resolve the target visually and re-inject the action
    - Retry 2: wait 2s (possible loading in progress) then re-inject
    - Retry 3: last identical attempt; on failure → non-recoverable error
    """
    session_id = report.session_id
    action_id = report.action_id

    # Locate the replay attached to this session (only "running" ones match).
    with _replay_lock:
        replay_state = None
        for state in _replay_states.values():
            if state["session_id"] == session_id and state["status"] == "running":
                replay_state = state
                break

    if not replay_state:
        logger.warning(
            f"Résultat reçu pour session {session_id} mais aucun replay actif"
        )
        return {"status": "no_active_replay", "session_id": session_id}

    # If this result is for a retried action, recover its retry bookkeeping.
    retry_info = _retry_pending.pop(action_id, None)
    retry_count = retry_info["retry_count"] if retry_info else 0
    original_action = retry_info["action"] if retry_info else None

    # Safety guard: recover retry_count from the action_id suffixes when the
    # pending map is out of sync — prevents an infinite retry loop.
    if retry_count == 0 and "_retry" in action_id:
        import re
        retry_suffixes = re.findall(r"_retry\d+", action_id)
        retry_count = max(retry_count, len(retry_suffixes))
        if retry_count > 0:
            logger.warning(
                f"retry_count corrigé par action_id : {retry_count} "
                f"(action_id contient {len(retry_suffixes)} suffixes _retry)"
            )

    # Record the most recent screenshot received for this replay.
    screenshot_after = report.screenshot_after or report.screenshot
    if screenshot_after:
        with _replay_lock:
            replay_state["last_screenshot"] = screenshot_after

    # === Post-action verification ===
    # Only "click" actions are verified — "type" and "key_combo" are always
    # considered successful when the agent reports success (no position to
    # check, and the screen barely changes for a keystroke).
    #
    # If the agent sent a "no_screen_change" or "popup_handled" warning, it
    # already tried to handle the situation (popup handler). Do NOT start a
    # server-side retry — continue to the next action.
    agent_warning = report.warning or ""
    agent_handled_popup = agent_warning in ("no_screen_change", "popup_handled")
    if agent_handled_popup:
        logger.info(
            f"Action {action_id} : agent warning='{agent_warning}' — "
            f"popup déjà gérée côté agent, pas de retry serveur"
        )

    action_type_for_verify = (original_action or {}).get("type", "unknown")
    skip_verify = action_type_for_verify in ("type", "key_combo", "wait")
    # Also skip server verification when the agent already handled the popup.
    skip_verify = skip_verify or agent_handled_popup
    verification = None
    if report.success and screenshot_after and not skip_verify:
        # Compare against the last known "before" screenshot of the session.
        screenshot_before = replay_state.get("_last_screenshot_before")
        if screenshot_before:
            try:
                action_dict = original_action or {"type": "unknown", "action_id": action_id}
                result_dict = {
                    "success": report.success,
                    "error": report.error,
                }
                verification = _replay_verifier.verify_action(
                    action=action_dict,
                    result=result_dict,
                    screenshot_before=screenshot_before,
                    screenshot_after=screenshot_after,
                )
            except Exception as e:
                logger.warning(f"Vérification post-action échouée: {e}")

    # Store the current screenshot as the "before" of the next action.
    if screenshot_after:
        with _replay_lock:
            replay_state["_last_screenshot_before"] = screenshot_after

    # === Record the result ===
    with _replay_lock:
        result_entry = {
            "action_id": action_id,
            "success": report.success,
            "error": report.error,
            "warning": report.warning,
            "has_screenshot": bool(screenshot_after),
            "actual_position": report.actual_position,
            "retry_count": retry_count,
            "verification": verification.to_dict() if verification else None,
            "resolution_method": report.resolution_method,
            "resolution_score": report.resolution_score,
            "resolution_elapsed_ms": report.resolution_elapsed_ms,
        }
        replay_state["results"].append(result_entry)

        # === Retry / success / failure logic ===
        if report.success and (verification is None or verification.verified):
            # Action succeeded (verification OK, or no verification performed).
            replay_state["completed_actions"] += 1
            replay_state["current_action_index"] += 1

        elif report.success and verification and not verification.verified:
            # Agent says "success" but verification failed (nothing changed).
            replay_state["unverified_actions"] += 1
            logger.warning(
                f"Action {action_id} marquée success mais non vérifiée: "
                f"{verification.detail}"
            )
            if verification.suggestion == "retry" and retry_count < MAX_RETRIES_PER_ACTION:
                # Re-inject the action for a retry.
                _schedule_retry(
                    session_id, replay_state, original_action or {"action_id": action_id},
                    retry_count, "verification_failed"
                )
            else:
                # Continue anyway (action counted, but unverified).
                replay_state["completed_actions"] += 1
                replay_state["current_action_index"] += 1

        elif not report.success and agent_warning == "no_screen_change":
            # The action ran but the screen did not change.
            # NO retry — log the failure and continue to the next action.
            # More honest than reporting "success", and avoids retry loops.
            replay_state["unverified_actions"] += 1
            replay_state["completed_actions"] += 1
            replay_state["current_action_index"] += 1
            logger.warning(
                f"Action {action_id} : écran inchangé (no_screen_change) — "
                f"action sans effet visible, on continue"
            )

        elif not report.success and (report.error or "") == "target_not_found":
            # Target not found visually — supervised PAUSE, NOT a fatal error.
            # The user must intervene (navigate to the right screen, close a
            # popup, etc.). The queue is NOT cleared: remaining actions resume
            # after the intervention.
            target_desc = report.target_description or "élément inconnu"
            replay_state["status"] = "paused_need_help"
            replay_state["failed_action"] = {
                "action_id": action_id,
                "type": (original_action or {}).get("type", "unknown"),
                "target_description": target_desc,
                "screenshot_b64": screenshot_after or report.screenshot,
                "target_spec": report.target_spec,
            }
            replay_state["pause_message"] = f"Je ne vois pas '{target_desc}' à l'écran"
            error_entry = {
                "action_id": action_id,
                "error": f"target_not_found: {target_desc}",
                "retry_count": 0,
                "timestamp": time.time(),
            }
            replay_state["error_log"].append(error_entry)
            logger.warning(
                f"Replay PAUSE supervisée : cible '{target_desc}' non trouvée "
                f"pour {action_id} — en attente d'intervention utilisateur"
            )
            # Log the failure for future learning.
            log_replay_failure(
                replay_id=replay_state["replay_id"],
                action_id=action_id,
                target_spec=report.target_spec,
                screenshot_b64=screenshot_after or report.screenshot,
                resolution_attempts=[
                    r for r in replay_state["results"]
                    if r.get("action_id") == action_id and r.get("resolution_method")
                ],
                error="target_not_found",
                extra={
                    "target_description": target_desc,
                    "actions_completed": replay_state["completed_actions"],
                    "actions_remaining": len(_replay_queues.get(session_id, [])),
                },
            )

        elif not report.success and "visual resolve" in (report.error or "").lower():
            # Visual resolve failed (legacy error format) — supervised pause
            # as well. Compatibility with agents that do not yet send
            # "target_not_found".
            target_desc = report.target_description or (report.error or "Visual resolve échoué")
            replay_state["status"] = "paused_need_help"
            replay_state["failed_action"] = {
                "action_id": action_id,
                "type": (original_action or {}).get("type", "unknown"),
                "target_description": target_desc,
                "screenshot_b64": screenshot_after or report.screenshot,
                "target_spec": report.target_spec,
            }
            replay_state["pause_message"] = f"Je ne vois pas '{target_desc}' à l'écran"
            error_entry = {
                "action_id": action_id,
                "error": report.error or "Visual resolve échoué",
                "retry_count": 0,
                "timestamp": time.time(),
            }
            replay_state["error_log"].append(error_entry)
            logger.warning(
                f"Replay PAUSE supervisée (compat) : visual resolve échoué pour {action_id} — "
                f"{report.error}"
            )
            # Log the failure for future learning.
            log_replay_failure(
                replay_id=replay_state["replay_id"],
                action_id=action_id,
                target_spec=report.target_spec,
                screenshot_b64=screenshot_after or report.screenshot,
                error="visual_resolve_failed",
            )

        elif not report.success and retry_count < MAX_RETRIES_PER_ACTION:
            # Real failure (not just an unchanged screen or a visual resolve
            # miss) — schedule a retry.
            action_to_retry = original_action or {"action_id": action_id, "type": "unknown"}
            _schedule_retry(
                session_id, replay_state, action_to_retry,
                retry_count, report.error or "unknown_error"
            )

        else:
            # Definitive failure (retries exhausted).
            replay_state["failed_actions"] += 1
            error_entry = {
                "action_id": action_id,
                "error": report.error or "Retries épuisés",
                "retry_count": retry_count,
                "timestamp": time.time(),
            }
            replay_state["error_log"].append(error_entry)

            # Mark the replay as failed and drop the remaining queue.
            replay_state["status"] = "error"
            _replay_queues[session_id] = []
            logger.error(
                f"Replay {replay_state['replay_id']} échoué à l'action {action_id} "
                f"après {retry_count} retries: {report.error}"
            )

            # Notify via callback if one is configured.
            _notify_error_callback(replay_state, action_id, report.error)

        # The replay completes when the queue is empty and the last action
        # succeeded (status still "running").
        remaining = len(_replay_queues.get(session_id, []))
        if remaining == 0 and replay_state["status"] == "running":
            replay_state["status"] = "completed"
            logger.info(
                f"Replay {replay_state['replay_id']} terminé avec succès : "
                f"{replay_state['completed_actions']}/{replay_state['total_actions']} actions"
                f" ({replay_state['retried_actions']} retries, "
                f"{replay_state['unverified_actions']} non vérifiées)"
            )
            # Summary of the visual-resolution metrics.
            results_with_method = [
                r for r in replay_state["results"]
                if r.get("resolution_method")
            ]
            if results_with_method:
                methods_count = {}
                total_elapsed = 0.0
                total_score = 0.0
                for r in results_with_method:
                    m = r["resolution_method"]
                    methods_count[m] = methods_count.get(m, 0) + 1
                    total_elapsed += r.get("resolution_elapsed_ms") or 0
                    total_score += r.get("resolution_score") or 0
                avg_elapsed = total_elapsed / len(results_with_method)
                avg_score = total_score / len(results_with_method)
                methods_str = ", ".join(
                    f"{m}={c}" for m, c in sorted(methods_count.items())
                )
                logger.info(
                    f"Replay {replay_state['replay_id']} métriques résolution : "
                    f"{len(results_with_method)} resolves [{methods_str}] "
                    f"score_moy={avg_score:.2f} temps_moy={avg_elapsed:.0f}ms"
                )

        # Free the GPU for the VLM worker when the replay finished or errored.
        if replay_state["status"] in ("completed", "error"):
            _clear_replay_lock()
            logger.info(
                f"Replay {replay_state['replay_id']} terminé (status={replay_state['status']}) "
                f"— worker VLM autorisé à reprendre"
            )

    return {
        "status": "recorded",
        "action_id": action_id,
        "success": report.success,
        "replay_status": replay_state["status"],
        "remaining_actions": remaining,
        "retry_count": retry_count,
        "verification": verification.to_dict() if verification else None,
    }
|
|
|
|
|
|
def _create_replay_state(
|
|
replay_id: str,
|
|
workflow_id: str,
|
|
session_id: str,
|
|
total_actions: int,
|
|
params: Optional[Dict[str, Any]] = None,
|
|
machine_id: Optional[str] = None,
|
|
) -> Dict[str, Any]:
|
|
"""Créer un état de replay enrichi avec les champs de suivi d'erreur."""
|
|
return {
|
|
"replay_id": replay_id,
|
|
"workflow_id": workflow_id,
|
|
"session_id": session_id,
|
|
"machine_id": machine_id or "default", # Machine cible du replay
|
|
"status": "running",
|
|
"total_actions": total_actions,
|
|
"completed_actions": 0,
|
|
"failed_actions": 0,
|
|
"current_action_index": 0,
|
|
"params": params or {},
|
|
"results": [], # Historique des résultats action par action
|
|
# Champs enrichis pour le suivi d'erreur (#7)
|
|
"retried_actions": 0,
|
|
"unverified_actions": 0,
|
|
"error_log": [], # Liste des erreurs rencontrées
|
|
"last_screenshot": None, # Path du dernier screenshot reçu
|
|
"_last_screenshot_before": None, # Interne: screenshot avant la dernière action
|
|
# Champs pour pause supervisée (target_not_found)
|
|
"failed_action": None, # Contexte de l'action en echec (quand paused_need_help)
|
|
"pause_message": None, # Message a afficher a l'utilisateur
|
|
}
|
|
|
|
|
|
def _schedule_retry(
    session_id: str,
    replay_state: Dict[str, Any],
    action: Dict[str, Any],
    current_retry: int,
    reason: str,
):
    """Queue a retry for a failed action.

    Strategy:
    - Retry 1: re-inject the action directly (agent re-resolves visually)
    - Retry 2: inject a 2s wait before the action (loading may be in progress)
    - Retry 3: last direct attempt

    The action is pushed to the head of the queue so it is executed next.
    _replay_lock must be held by the caller.
    """
    attempt = current_retry + 1
    replay_state["retried_actions"] += 1

    # Clone the action under a new action_id so the retry can be tracked.
    retried = dict(action)
    retried_id = f"{action.get('action_id', 'unknown')}_retry{attempt}"
    retried["action_id"] = retried_id

    # Remember the retry context for the next report_action_result call.
    _retry_pending[retried_id] = {
        "action": action,
        "retry_count": attempt,
        "replay_id": replay_state["replay_id"],
        "reason": reason,
    }

    # Build the head of the queue according to the retry number.
    head: List[Dict[str, Any]] = []
    if attempt == 2:
        # Second attempt: give the UI 2 seconds to settle first.
        head.append({
            "action_id": f"wait_retry_{uuid.uuid4().hex[:6]}",
            "type": "wait",
            "duration_ms": 2000,
        })
    head.append(retried)

    # Prepend so these run before anything already queued.
    _replay_queues[session_id] = head + _replay_queues.get(session_id, [])

    logger.info(
        f"Retry {attempt}/{MAX_RETRIES_PER_ACTION} programmé pour {action.get('action_id')} "
        f"(raison: {reason}) | nouveau id: {retried_id}"
    )
|
|
|
|
|
|
def _notify_error_callback(
    replay_state: Dict[str, Any],
    action_id: str,
    error: Optional[str],
):
    """Fire the error callback configured for this replay, if any.

    Sends a non-blocking HTTP POST to the registered callback URL from a
    daemon thread. Notification failures are logged, never raised.
    """
    replay_id = replay_state["replay_id"]
    callback_url = _error_callbacks.get(replay_id)
    if not callback_url:
        return

    def _post_notification():
        try:
            import urllib.request
            body = {
                "replay_id": replay_id,
                "workflow_id": replay_state.get("workflow_id"),
                "session_id": replay_state.get("session_id"),
                "action_id": action_id,
                "error": error or "Erreur inconnue",
                "retried_actions": replay_state.get("retried_actions", 0),
                "error_log": replay_state.get("error_log", []),
                "status": replay_state.get("status"),
            }
            http_request = urllib.request.Request(
                callback_url,
                data=json.dumps(body).encode("utf-8"),
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with urllib.request.urlopen(http_request, timeout=5) as resp:
                logger.info(
                    f"Error callback envoyé à {callback_url}: {resp.status}"
                )
        except Exception as e:
            logger.warning(
                f"Échec envoi error callback à {callback_url}: {e}"
            )

    # Deliver in the background so the caller is never blocked.
    threading.Thread(target=_post_notification, daemon=True).start()
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/replay/error_callback")
async def register_error_callback(config: ErrorCallbackConfig):
    """
    Register a callback URL for the non-recoverable errors of a replay.

    The chat server configures this URL when launching the replay. When a
    non-recoverable error occurs (retries exhausted), the server POSTs the
    error details to this URL.

    Raises:
        HTTPException: 404 when the replay_id is unknown.
    """
    replay_id = config.replay_id
    callback_url = config.callback_url

    with _replay_lock:
        # Refuse to register a callback for a replay that does not exist.
        if replay_id not in _replay_states:
            raise HTTPException(
                status_code=404,
                detail=f"Replay '{replay_id}' non trouvé"
            )

        _error_callbacks[replay_id] = callback_url
        logger.info(f"Error callback enregistré pour {replay_id}: {callback_url}")

    return {
        "status": "callback_registered",
        "replay_id": replay_id,
        "callback_url": callback_url,
    }
|
|
|
|
|
|
@app.get("/api/v1/traces/stream/replay/{replay_id}")
async def get_replay_status(replay_id: str):
    """Inspect the state of a running or finished replay.

    When the replay is in supervised pause (paused_need_help), the response
    includes the full failure context: failed action, screenshot,
    target_spec, and a user-facing message.

    Raises:
        HTTPException: 404 when the replay_id is unknown.
    """
    with _replay_lock:
        state = _replay_states.get(replay_id)

        if not state:
            raise HTTPException(
                status_code=404, detail=f"Replay '{replay_id}' non trouvé"
            )

        # Expose only public fields (internal ones are prefixed with _).
        payload = {key: value for key, value in state.items() if not key.startswith("_")}

        # Enrich with the pause context when user intervention is required.
        if state["status"] == "paused_need_help":
            session_id = state["session_id"]
            pending = len(_replay_queues.get(session_id, []))
            payload["actions_completed"] = state["completed_actions"]
            payload["actions_remaining"] = pending
            payload["message"] = state.get("pause_message", "Replay en pause")
            # failed_action already carries screenshot_b64 and target_spec.

        return payload
|
|
|
|
|
|
@app.get("/api/v1/traces/stream/replays")
async def list_replays():
    """List every replay (active, completed, or failed)."""
    with _replay_lock:
        public_states = []
        for state in _replay_states.values():
            # Strip internal fields (prefixed with _) before exposing.
            public_states.append(
                {key: value for key, value in state.items() if not key.startswith("_")}
            )
        return {"replays": public_states}
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/replay/{replay_id}/resume")
async def resume_replay(replay_id: str):
    """Resume a replay from supervised pause (paused_need_help).

    The user intervened manually (navigated to the right screen, closed a
    popup, etc.) and wants to relaunch the replay. The failed action is
    re-injected at the head of the queue so it is retried first.

    Raises:
        HTTPException: 404 when the replay is unknown; 409 (conflict)
            when the replay is not currently paused.
    """
    with _replay_lock:
        state = _replay_states.get(replay_id)

        if not state:
            raise HTTPException(
                status_code=404, detail=f"Replay '{replay_id}' non trouvé"
            )

        if state["status"] != "paused_need_help":
            raise HTTPException(
                status_code=409,
                detail=(
                    f"Replay '{replay_id}' n'est pas en pause "
                    f"(status actuel: {state['status']})"
                ),
            )

        # Grab the failed action so it can be re-injected below.
        failed_action = state.get("failed_action")
        session_id = state["session_id"]

        # Put the replay back into running mode and clear the pause context.
        state["status"] = "running"
        state["failed_action"] = None
        state["pause_message"] = None

        # Re-inject the failed action at the head of the queue (re-attempted).
        if failed_action and failed_action.get("action_id"):
            # Rebuild the action from retry_pending or from the original.
            original_action_id = failed_action["action_id"]
            # Look for the original action among the pending retries.
            original = _retry_pending.pop(original_action_id, {}).get("action")
            if not original:
                # Reconstruct a minimal action from the failed_action context.
                original = {
                    "action_id": original_action_id,
                    "type": failed_action.get("type", "click"),
                    "target_spec": failed_action.get("target_spec"),
                    "visual_mode": True,
                }
            # Create a new action_id for tracking.
            resume_id = f"{original_action_id}_resume"
            resume_action = dict(original)
            resume_action["action_id"] = resume_id
            # Store in retry_pending so the result can be correlated later.
            _retry_pending[resume_id] = {
                "action": original,
                "retry_count": 0,
                "replay_id": replay_id,
                "reason": "resume_after_pause",
            }
            queue = _replay_queues.get(session_id, [])
            _replay_queues[session_id] = [resume_action] + queue

        remaining = len(_replay_queues.get(session_id, []))
        logger.info(
            f"Replay {replay_id} repris apres pause supervisee — "
            f"{remaining} actions en attente"
        )

    return {
        "status": "resumed",
        "replay_id": replay_id,
        "session_id": session_id,
        "remaining_actions": remaining,
    }
|
|
|
|
|
|
# =========================================================================
|
|
# Visual Replay — Résolution visuelle des cibles
|
|
# =========================================================================
|
|
|
|
|
|
class ResolveTargetRequest(BaseModel):
    """Request payload for the visual resolution of a UI target."""
    session_id: str
    screenshot_b64: str  # JPEG screenshot, base64-encoded
    target_spec: Dict[str, Any]  # {by_role, by_text, by_position, ...}
    fallback_x_pct: float = 0.0  # Fallback coordinates (normalised 0.0-1.0)
    fallback_y_pct: float = 0.0
    screen_width: int = 1920
    screen_height: int = 1080
    strict_mode: bool = False  # True for replay sessions (template threshold 0.90 + YOLO)
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/replay/resolve_target")
async def resolve_target(request: ResolveTargetRequest):
    """
    Visually resolve a UI target from a screenshot.

    Agent V1 sends a screenshot + target_spec BEFORE executing the action.
    The server analyses the image with UIDetector/OCR and returns the
    coordinates of the element it found.

    Matching strategy (by priority):
    1. OpenCV template matching (~100ms) — if anchor_image_base64 is provided
    2. VLM Quick Find (~5-10s) — one VLM call to locate the element
    3. Full semantic matching (~15-20s) — ScreenAnalyzer + OCR + UI detection
    4. Fallback — static coordinates
    """
    import base64
    import io
    import tempfile

    from PIL import Image

    # Decode the incoming screenshot.
    try:
        img_bytes = base64.b64decode(request.screenshot_b64)
        img = Image.open(io.BytesIO(img_bytes))
    except Exception as e:
        logger.error(f"Décodage screenshot échoué: {e}")
        return _fallback_response(request, "decode_error", str(e))

    # Persist to a temp file: the analyzers expect a file path.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        img.save(tmp, format="JPEG", quality=90)
        tmp_path = tmp.name

    try:
        # Run the visual resolution in a SEPARATE thread (not the GPU
        # executor) — template matching is CPU-only.
        import asyncio
        # Fix: get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() is deprecated in that context since Python 3.10.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,  # default ThreadPool (not _gpu_executor)
            _resolve_target_sync,
            tmp_path,
            request.target_spec,
            request.screen_width,
            request.screen_height,
            request.fallback_x_pct,
            request.fallback_y_pct,
            request.strict_mode,
        )
        return result
    except Exception as e:
        logger.error(f"Résolution visuelle échouée: {e}")
        return _fallback_response(request, "analysis_error", str(e))
    finally:
        # Best-effort temp-file cleanup ("os" is imported at module level).
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
|
|
|
|
|
|
def _resolve_by_template_matching(
    screenshot_path: str,
    anchor_image_b64: str,
    screen_width: int,
    screen_height: int,
    confidence_threshold: float = 0.7,
) -> Optional[Dict[str, Any]]:
    """Resolve an anchor's position via OpenCV template matching.

    Compares the anchor image (crop) against the current screenshot to find
    the best correspondence, using cv2.matchTemplate with TM_CCOEFF_NORMED.
    Matching is multi-scale (0.5x-2.0x) to tolerate resolution changes
    between the learning machine and the replay machine.

    Args:
        screenshot_path: Path of the current screen's screenshot.
        anchor_image_b64: Anchor image encoded in base64 (PNG).
        screen_width: Screen width in pixels (informational; the actual
            screenshot dimensions are used for normalisation).
        screen_height: Screen height in pixels (informational).
        confidence_threshold: Minimum confidence score (0.0 to 1.0).

    Returns:
        Dict with resolved=True and normalised coordinates, or None when
        there is no acceptable match (or OpenCV is unavailable).
    """
    import base64

    try:
        import cv2
        import numpy as np
    except ImportError:
        logger.warning("OpenCV non disponible pour template matching")
        return None

    try:
        # Load the screenshot from disk.
        screenshot = cv2.imread(screenshot_path)
        if screenshot is None:
            logger.warning("Impossible de lire le screenshot : %s", screenshot_path)
            return None

        # Decode the anchor image from base64.
        anchor_bytes = base64.b64decode(anchor_image_b64)
        anchor_array = np.frombuffer(anchor_bytes, dtype=np.uint8)
        anchor_img = cv2.imdecode(anchor_array, cv2.IMREAD_COLOR)
        if anchor_img is None:
            logger.warning("Impossible de décoder l'image de l'ancre")
            return None

        # Grayscale both images for the matching pass.
        screenshot_gray = cv2.cvtColor(screenshot, cv2.COLOR_BGR2GRAY)
        anchor_gray = cv2.cvtColor(anchor_img, cv2.COLOR_BGR2GRAY)

        # The anchor must fit inside the screenshot.
        sh, sw = screenshot_gray.shape[:2]
        ah, aw = anchor_gray.shape[:2]
        if ah > sh or aw > sw:
            logger.warning(
                "Ancre (%dx%d) plus grande que le screenshot (%dx%d)",
                aw, ah, sw, sh,
            )
            return None

        # Multi-scale template matching: try scale 1.0 first, then a few
        # variations in case the resolution changed.
        # Extended 0.5x-2.0x range to cover large gaps
        # (e.g. learning at 2560x1600 → replay at 1280x720 = ratio ~0.5x).
        best_val = -1.0
        best_loc = None
        best_scale = 1.0
        best_anchor_size = (aw, ah)

        for scale in [1.0, 0.9, 1.1, 0.8, 1.2, 0.75, 1.25, 0.6, 1.5, 0.5, 1.75, 2.0]:
            if scale != 1.0:
                new_w = int(aw * scale)
                new_h = int(ah * scale)
                # Skip degenerate or oversized scaled anchors.
                if new_w < 10 or new_h < 10 or new_w > sw or new_h > sh:
                    continue
                scaled_anchor = cv2.resize(anchor_gray, (new_w, new_h))
            else:
                scaled_anchor = anchor_gray
                new_w, new_h = aw, ah

            result = cv2.matchTemplate(screenshot_gray, scaled_anchor, cv2.TM_CCOEFF_NORMED)
            _, max_val, _, max_loc = cv2.minMaxLoc(result)

            if max_val > best_val:
                best_val = max_val
                best_loc = max_loc
                best_scale = scale
                best_anchor_size = (new_w, new_h)

            # Early exit: with a near-perfect match there is no need to
            # try the remaining scales.
            if best_val >= 0.95:
                break

        if best_val < confidence_threshold:
            logger.info(
                "Template matching : meilleur score=%.3f < seuil=%.3f (ancre %dx%d, écran %dx%d)",
                best_val, confidence_threshold, aw, ah, sw, sh,
            )
            return None

        # Centre of the best match, in screenshot pixels.
        match_w, match_h = best_anchor_size
        cx = best_loc[0] + match_w / 2.0
        cy = best_loc[1] + match_h / 2.0

        # Normalise to 0.0-1.0 proportions of the screenshot.
        x_pct = round(cx / sw, 6) if sw > 0 else 0.0
        y_pct = round(cy / sh, 6) if sh > 0 else 0.0

        logger.info(
            "Template matching OK : score=%.3f, échelle=%.2f, "
            "centre=(%d, %d) → (%.4f, %.4f) sur %dx%d",
            best_val, best_scale, int(cx), int(cy), x_pct, y_pct, sw, sh,
        )

        return {
            "resolved": True,
            "method": "template_matching",
            "x_pct": x_pct,
            "y_pct": y_pct,
            "matched_element": {
                # Fixed: was a pointless f-string with no placeholder (F541).
                "label": "anchor_template",
                "type": "visual_anchor",
                "role": "anchor",
                "center": [int(cx), int(cy)],
                "confidence": best_val,
            },
            "score": best_val,
            "scale": best_scale,
            "match_box": {
                "x": best_loc[0],
                "y": best_loc[1],
                "width": match_w,
                "height": match_h,
            },
        }

    except Exception as e:
        logger.error("Erreur template matching : %s", e)
        return None
|
|
|
|
|
|
def _validate_match_context(
|
|
result: Dict[str, Any],
|
|
original_x_pct: float,
|
|
original_y_pct: float,
|
|
target_spec: Dict[str, Any],
|
|
max_distance: float = 0.35,
|
|
) -> bool:
|
|
"""Vérifier que la position trouvée est dans la même zone que l'originale.
|
|
|
|
Évite les faux positifs du template matching : un bouton similaire visuellement
|
|
mais situé dans une zone très différente de l'écran.
|
|
|
|
Args:
|
|
result: Résultat du template matching (contient x_pct, y_pct).
|
|
original_x_pct: Position X originale (pourcentage, 0.0-1.0).
|
|
original_y_pct: Position Y originale (pourcentage, 0.0-1.0).
|
|
target_spec: Spécification de la cible (non utilisé pour l'instant,
|
|
mais disponible pour des règles contextuelles futures).
|
|
max_distance: Distance euclidienne maximum acceptée (en pourcentage de l'écran).
|
|
Défaut 0.35 = ~35% de la diagonale, assez permissif pour les UI dynamiques.
|
|
|
|
Returns:
|
|
True si la position est valide (même zone), False sinon.
|
|
"""
|
|
found_x = result.get("x_pct", 0.0)
|
|
found_y = result.get("y_pct", 0.0)
|
|
|
|
# Distance euclidienne en pourcentage de l'écran
|
|
dx = found_x - original_x_pct
|
|
dy = found_y - original_y_pct
|
|
distance = (dx ** 2 + dy ** 2) ** 0.5
|
|
|
|
if distance > max_distance:
|
|
logger.debug(
|
|
"Context validation : distance=%.3f > max=%.3f "
|
|
"(found=(%.3f, %.3f), original=(%.3f, %.3f))",
|
|
distance, max_distance, found_x, found_y, original_x_pct, original_y_pct,
|
|
)
|
|
return False
|
|
|
|
return True
|
|
|
|
|
|
# =========================================================================
# YOLO/OmniParser — Resolution via UI element detection
# =========================================================================

# Lazy-loaded OmniParser singleton (GPU-backed), guarded by _omniparser_lock.
_omniparser_available: Optional[bool] = None  # None = not yet checked
_omniparser_instance = None
_omniparser_lock = threading.Lock()
|
|
|
|
|
|
def _get_omniparser():
    """Return the OmniParser instance (lazy loading, thread-safe).

    Uses double-checked locking: a lock-free fast path on the cached
    singleton, then a re-check under _omniparser_lock before construction.
    A failed initialisation (missing models, import error) is remembered in
    _omniparser_available so it is never retried.

    Returns:
        OmniParserAdapter or None if unavailable.
    """
    global _omniparser_available, _omniparser_instance
    # Fast path: a previous attempt already failed, or succeeded.
    if _omniparser_available is False:
        return None
    if _omniparser_instance is not None:
        return _omniparser_instance

    with _omniparser_lock:
        # Re-check under the lock: another thread may have finished init.
        if _omniparser_available is False:
            return None
        if _omniparser_instance is not None:
            return _omniparser_instance
        try:
            from core.detection.omniparser_adapter import OmniParserAdapter
            adapter = OmniParserAdapter()
            if adapter.available:
                _omniparser_instance = adapter
                _omniparser_available = True
                logger.info("OmniParser disponible pour la résolution YOLO")
                return adapter
            else:
                # Adapter constructed but its model files were not found.
                _omniparser_available = False
                logger.info("OmniParser : modèles non trouvés, YOLO désactivé")
                return None
        except ImportError:
            _omniparser_available = False
            logger.info("OmniParser non installé, YOLO désactivé")
            return None
        except Exception as e:
            _omniparser_available = False
            logger.warning("OmniParser init échouée : %s", e)
            return None
|
|
|
|
|
|
def _resolve_by_yolo(
    screenshot_path: str,
    anchor_image_b64: str,
    screen_width: int,
    screen_height: int,
    target_spec: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
    """Resolve a target via YOLO/OmniParser: detect every UI element,
    then match the reference crop against the detected elements.

    Strategy:
    1. OmniParser detects all UI elements on the screenshot (~0.6-0.8s)
    2. For each detected element, local template matching against the anchor
    3. If exactly one good match (score >= 0.50) -> accept it
    4. If 2+ ambiguous matches -> return None (the VLM will decide)

    Args:
        screenshot_path: Path to the JPEG screenshot
        anchor_image_b64: Anchor image encoded in base64
        screen_width: Screen width (unused here; kept for a uniform resolver
            signature — coordinates are normalized against the screenshot size)
        screen_height: Screen height (same note as screen_width)
        target_spec: Target specification (unused in this resolver)

    Returns:
        Dict with resolved=True/False, x_pct, y_pct, score,
        or None if OmniParser is unavailable or no match was found.
    """
    import base64

    # cv2/numpy are optional dependencies: bail out silently if absent.
    try:
        import cv2
        import numpy as np
    except ImportError:
        return None

    omniparser = _get_omniparser()
    if omniparser is None:
        return None

    t0 = time.time()

    try:
        from PIL import Image as PILImage

        # Load the screenshot as PIL (OmniParser input) and grab its size
        # for percentage normalization of the result.
        screenshot_pil = PILImage.open(screenshot_path)
        sw, sh = screenshot_pil.size

        # Load the screenshot as numpy/OpenCV for template matching.
        screenshot_np = np.array(screenshot_pil)
        if len(screenshot_np.shape) == 3 and screenshot_np.shape[2] == 3:
            # PIL is RGB; convert to BGR for OpenCV.
            screenshot_bgr = cv2.cvtColor(screenshot_np, cv2.COLOR_RGB2BGR)
        else:
            screenshot_bgr = screenshot_np
        screenshot_gray = cv2.cvtColor(screenshot_bgr, cv2.COLOR_BGR2GRAY)

        # Decode the anchor (reference crop) from base64.
        anchor_bytes = base64.b64decode(anchor_image_b64)
        anchor_array = np.frombuffer(anchor_bytes, dtype=np.uint8)
        anchor_img = cv2.imdecode(anchor_array, cv2.IMREAD_COLOR)
        if anchor_img is None:
            logger.warning("YOLO resolve : impossible de décoder l'anchor")
            return None
        anchor_gray = cv2.cvtColor(anchor_img, cv2.COLOR_BGR2GRAY)
        anchor_h, anchor_w = anchor_gray.shape[:2]

        # Detect all UI elements with OmniParser.
        elements = omniparser.detect(screenshot_pil)
        if not elements:
            elapsed = time.time() - t0
            logger.info("YOLO resolve : 0 éléments détectés (%.1fs)", elapsed)
            return None

        logger.info(
            "YOLO resolve : %d éléments détectés, matching anchor %dx%d...",
            len(elements), anchor_w, anchor_h,
        )

        # Match the anchor against each detected element.
        YOLO_MATCH_THRESHOLD = 0.50
        matches = []

        for elem in elements:
            x1, y1, x2, y2 = elem.bbox
            elem_w = x2 - x1
            elem_h = y2 - y1

            # Skip elements too small to be meaningful.
            if elem_w < 5 or elem_h < 5:
                continue

            # Extract the element's crop from the screenshot.
            elem_crop = screenshot_gray[y1:y2, x1:x2]
            if elem_crop.size == 0:
                continue

            # Local template matching: resize the anchor to the element size,
            # or the other way around, depending on relative dimensions.
            try:
                # Approach: resize the anchor to the crop size and compare.
                if elem_w > 0 and elem_h > 0:
                    anchor_resized = cv2.resize(anchor_gray, (elem_w, elem_h))
                    result = cv2.matchTemplate(
                        elem_crop, anchor_resized, cv2.TM_CCOEFF_NORMED
                    )
                    _, max_val, _, _ = cv2.minMaxLoc(result)
                else:
                    continue

                # Also try the unresized anchor when the crop is larger:
                # keep the best of the two scores.
                if elem_w >= anchor_w and elem_h >= anchor_h:
                    result2 = cv2.matchTemplate(
                        elem_crop, anchor_gray, cv2.TM_CCOEFF_NORMED
                    )
                    _, max_val2, _, _ = cv2.minMaxLoc(result2)
                    max_val = max(max_val, max_val2)

                if max_val >= YOLO_MATCH_THRESHOLD:
                    matches.append((elem, max_val))

            except cv2.error:
                # Degenerate crop/anchor combination: just skip this element.
                continue

        elapsed = time.time() - t0

        if not matches:
            logger.info(
                "YOLO resolve : aucun match >= %.2f parmi %d éléments (%.1fs)",
                YOLO_MATCH_THRESHOLD, len(elements), elapsed,
            )
            return None

        # Sort by descending score.
        matches.sort(key=lambda m: m[1], reverse=True)
        best_elem, best_score = matches[0]

        # If 2+ matches have close scores (< 0.10 gap), it is ambiguous
        # -> let the VLM decide instead.
        if len(matches) >= 2:
            second_score = matches[1][1]
            if best_score - second_score < 0.10:
                logger.info(
                    "YOLO resolve : %d matchs ambigus (best=%.3f, second=%.3f, "
                    "écart=%.3f < 0.10), VLM requis (%.1fs)",
                    len(matches), best_score, second_score,
                    best_score - second_score, elapsed,
                )
                return None

        # Single clear match -> accept. Coordinates are normalized against
        # the screenshot size (sw, sh), not screen_width/screen_height.
        cx, cy = best_elem.center
        x_pct = round(cx / sw, 6) if sw > 0 else 0.0
        y_pct = round(cy / sh, 6) if sh > 0 else 0.0

        logger.info(
            "YOLO resolve OK : '%s' (%s) score=%.3f → (%.4f, %.4f) "
            "parmi %d éléments, %d matchs (%.1fs)",
            best_elem.label, best_elem.element_type, best_score,
            x_pct, y_pct, len(elements), len(matches), elapsed,
        )

        return {
            "resolved": True,
            "method": "yolo_omniparser",
            "x_pct": x_pct,
            "y_pct": y_pct,
            "matched_element": {
                "label": best_elem.label,
                "type": best_elem.element_type,
                "role": "yolo_detected",
                "center": [cx, cy],
                "confidence": best_score,
            },
            "score": best_score,
            "yolo_elements_count": len(elements),
            "yolo_matches_count": len(matches),
        }

    except Exception as e:
        # Best-effort resolver: any unexpected failure falls back to None
        # so the caller can try the next resolution strategy.
        elapsed = time.time() - t0
        logger.warning("YOLO resolve : exception (%.1fs) — %s", elapsed, e)
        return None
|
|
|
|
|
|
# =========================================================================
# VLM Quick Find — lightweight fallback when template matching fails
# =========================================================================

# Singleton Ollama client (initialized lazily on first call, not at startup).
# Guarded by _vlm_client_lock for thread-safe creation.
_vlm_client = None
_vlm_client_lock = threading.Lock()

# Dedicated timeout for VLM Quick Find (shorter than the default client timeout)
_VLM_QUICK_FIND_TIMEOUT = 30  # seconds
|
|
|
|
|
|
def _get_vlm_client():
    """Get or create the singleton Ollama client for VLM Quick Find.

    Lazy initialization: the client is only created on the first call,
    not at server startup (avoids blocking if Ollama is down).
    The model is resolved automatically via vlm_config (RPA_VLM_MODEL).

    Thread-safe via double-checked locking on ``_vlm_client_lock``.

    Returns:
        The shared OllamaClient instance, or None if initialization failed.
    """
    global _vlm_client
    # Fast path: already initialized, no lock needed.
    if _vlm_client is not None:
        return _vlm_client
    with _vlm_client_lock:
        # Re-check under the lock: another thread may have won the race.
        if _vlm_client is not None:
            return _vlm_client
        try:
            from core.detection.ollama_client import OllamaClient
            from core.detection.vlm_config import get_vlm_model
            _model = get_vlm_model()
            _vlm_client = OllamaClient(
                endpoint="http://localhost:11434",
                model=_model,
                timeout=_VLM_QUICK_FIND_TIMEOUT,
            )
            logger.info("VLM Quick Find : client Ollama initialisé (%s)", _model)
        except Exception as e:
            # Lazy %-style args for consistency with the rest of the module
            # (the original used an eagerly-formatted f-string here).
            logger.warning(
                "VLM Quick Find : impossible d'initialiser le client Ollama : %s", e
            )
            return None
    return _vlm_client
|
|
|
|
|
|
def _build_target_description(target_spec: Dict[str, Any]) -> str:
|
|
"""Construire une description textuelle de l'élément à trouver.
|
|
|
|
Utilisé par le VLM Quick Find pour savoir quoi chercher sur le screenshot.
|
|
|
|
Args:
|
|
target_spec: Spécification de la cible (by_text, by_role, etc.)
|
|
|
|
Returns:
|
|
Description en langage naturel, ex: "un bouton contenant 'Valider'"
|
|
"""
|
|
by_text = target_spec.get("by_text", "").strip()
|
|
by_role = target_spec.get("by_role", "").strip()
|
|
|
|
if by_text and by_role:
|
|
return f"un {by_role} contenant '{by_text}'"
|
|
elif by_text:
|
|
return f"élément contenant le texte '{by_text}'"
|
|
elif by_role:
|
|
return f"un {by_role}"
|
|
else:
|
|
return "l'élément interactif principal"
|
|
|
|
|
|
def _vlm_quick_find(
    screenshot_path: str,
    target_description: str,
    anchor_image_b64: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Ask the VLM to locate an element on the screenshot.

    VLM-first strategy for replay: the VLM understands the screen context
    and can find an element even when its appearance has changed.

    Operating modes:
    - With anchor_image_b64 + description: multi-image (screenshot + reference
      crop). The VLM sees the screenshot AND the crop, which is much more precise.
    - With description only: single-image, the VLM searches by textual description.
    - With anchor_image_b64 only (no description): multi-image with a purely
      visual prompt.

    Args:
        screenshot_path: Path to the current screenshot
        target_description: Rich description of the element to find.
            E.g. "In the 'Run' window, the element clicked at bottom center"
        anchor_image_b64: Reference image (crop) in base64 (optional).
            When provided, it is sent as a second image to the VLM for
            visual comparison.

    Returns:
        {"x_pct": float, "y_pct": float, "confidence": float, "method": "vlm_quick_find"}
        or None if the element is not found or on error.
    """
    client = _get_vlm_client()
    if client is None:
        logger.debug("VLM Quick Find : client Ollama non disponible, skip")
        return None

    t0 = time.time()

    # Pick the prompt variant according to the available inputs.
    has_anchor = bool(anchor_image_b64)
    has_description = bool(target_description and target_description.strip())

    if has_anchor and has_description:
        # Optimal mode: screenshot + reference crop + textual description.
        prompt = (
            "The first image is the current screen. "
            "The second image shows the element I want to click.\n\n"
            f"Context: {target_description}\n\n"
            "Find this exact element on the screen and return its CENTER coordinates "
            "as percentage of the screen dimensions.\n"
            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
        )
    elif has_anchor:
        # Purely visual mode: screenshot + crop, no description.
        prompt = (
            "The first image is the current screen. "
            "The second image shows the element I want to click.\n\n"
            "Find this exact element on the screen and return its CENTER coordinates "
            "as percentage of the screen dimensions.\n"
            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
        )
    else:
        # Description-only mode.
        prompt = (
            "Look at this screenshot carefully.\n\n"
            f"{target_description}\n\n"
            "Find this element and return its CENTER coordinates "
            "as percentage of the image dimensions.\n"
            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
        )

    system_prompt = "You are a UI element locator. Output raw JSON only. No explanation."

    try:
        # Extra images (anchor crop) are passed alongside the main screenshot.
        extra_images = [anchor_image_b64] if has_anchor else None

        result = client.generate(
            prompt=prompt,
            image_path=screenshot_path,
            system_prompt=system_prompt,
            temperature=0.1,
            max_tokens=200,
            force_json=False,
            extra_images_b64=extra_images,
        )

        elapsed = time.time() - t0

        if not result.get("success"):
            logger.info(
                "VLM Quick Find : échec appel VLM (%.1fs) — %s",
                elapsed, result.get("error", "?"),
            )
            return None

        response_text = result.get("response", "").strip()
        if not response_text:
            logger.info("VLM Quick Find : réponse vide du VLM (%.1fs)", elapsed)
            return None

        # Parse the JSON response (reuse OllamaClient's robust parser).
        # NOTE(review): relies on the private _extract_json_from_response
        # helper of the client — confirm it stays available across versions.
        parsed = client._extract_json_from_response(response_text)
        if parsed is None:
            logger.info(
                "VLM Quick Find : réponse non-JSON (%.1fs) — %.80s",
                elapsed, response_text,
            )
            return None

        # Validate the coordinates and the confidence floor (0.3).
        x_pct = parsed.get("x_pct")
        y_pct = parsed.get("y_pct")
        confidence = float(parsed.get("confidence", 0.0))

        if x_pct is None or y_pct is None or confidence < 0.3:
            logger.info(
                "VLM Quick Find : élément non trouvé ou confiance trop basse "
                "(%.1fs, confidence=%.2f) pour '%s'",
                elapsed, confidence,
                target_description[:80] if target_description else "(anchor only)",
            )
            return None

        x_pct = float(x_pct)
        y_pct = float(y_pct)

        # Coordinates must land inside [0, 1].
        if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
            logger.info(
                "VLM Quick Find : coordonnées hors bornes (%.4f, %.4f), ignoré",
                x_pct, y_pct,
            )
            return None

        mode_str = "multi-image" if has_anchor else "description"
        desc_short = (target_description[:60] + "...") if target_description and len(target_description) > 60 else (target_description or "(anchor)")
        logger.info(
            "VLM Quick Find OK [%s] : '%s' → (%.4f, %.4f) confidence=%.2f en %.1fs",
            mode_str, desc_short, x_pct, y_pct, confidence, elapsed,
        )

        return {
            "resolved": True,
            "method": "vlm_quick_find",
            "x_pct": round(x_pct, 6),
            "y_pct": round(y_pct, 6),
            "matched_element": {
                "label": target_description or "anchor_visual",
                "type": "vlm_located",
                "role": "vlm_quick_find",
                "confidence": confidence,
            },
            "score": confidence,
        }

    except Exception as e:
        # Best-effort fallback: never let the VLM path crash the resolver chain.
        elapsed = time.time() - t0
        logger.warning(
            "VLM Quick Find : exception (%.1fs) — %s", elapsed, e,
        )
        return None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Résolution par VLM Grounding Direct (configurable via RPA_VLM_MODEL)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _resolve_by_grounding(
    screenshot_path: str,
    target_spec: Dict[str, Any],
    screen_width: int,
    screen_height: int,
) -> Optional[Dict[str, Any]]:
    """Resolve a target via direct VLM grounding.

    The VLM receives the screenshot + a textual description and returns
    the element's coordinates directly. No SomEngine, no numbering —
    the VLM does native UI grounding.

    More reliable than SomEngine+VLM for icons and visual elements
    without text (Windows logo, floppy-disk icon, close button).

    Backends, in order: vLLM (OpenAI-compatible, GPU) then Ollama fallback
    (model from RPA_GROUNDING_MODEL, default qwen2.5vl:7b).

    Args:
        screenshot_path: Path to the current full-screen screenshot
        target_spec: Target specification (by_text / vlm_description,
            optional window_capture.rect and original_position hints,
            optional anchor_image_base64 for the multi-image fallback)
        screen_width: Screen width in pixels (used to convert
            window-relative coordinates back to screen percentages)
        screen_height: Screen height in pixels (same)

    Returns:
        Dict with resolved=True, x_pct/y_pct in [0, 1] and a fixed 0.85
        score, or None when no backend produced parsable coordinates.
    """
    import base64
    import io
    import re

    t0 = time.time()

    # Build the target description (text label preferred over VLM description).
    by_text = target_spec.get("by_text", "").strip()
    vlm_desc = target_spec.get("vlm_description", "").strip()
    # NOTE(review): window_title is read but never used below — dead local?
    window_title = target_spec.get("window_title", "").strip()

    if by_text:
        description = by_text
    elif vlm_desc:
        description = vlm_desc
    else:
        return None

    # Use the window capture if available (more focused, less noise),
    # otherwise fall back to the full screen.
    window_capture = target_spec.get("window_capture", {})
    window_rect = window_capture.get("rect")  # [x1, y1, x2, y2] screen coords

    try:
        from PIL import Image as PILImage
        from pathlib import Path

        # Use the active window: crop it out of the full screenshot
        # via window_rect (works at replay time as well as at recording time).
        img = PILImage.open(screenshot_path)

        if window_rect:
            x1, y1, x2, y2 = window_rect
            img = img.crop((x1, y1, x2, y2))
            using_window = True
            logger.debug("Grounding : crop fenêtre (%d,%d,%d,%d) → %dx%d", x1, y1, x2, y2, *img.size)
        else:
            using_window = False

        orig_w, orig_h = img.size
        small_w, small_h = orig_w, orig_h  # no resizing

        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=80)
        shot_b64 = base64.b64encode(buf.getvalue()).decode()
    except Exception as e:
        logger.warning("Grounding : erreur chargement image — %s", e)
        return None

    # Native Qwen2.5-VL prompt — bbox_2d format (the only reliable one).
    # Add the relative position to disambiguate (e.g. two "Search" labels on screen).
    original_pos = target_spec.get("original_position", {})
    pos_hint = ""
    y_rel = original_pos.get("y_relative", "")
    x_rel = original_pos.get("x_relative", "")
    if y_rel or x_rel:
        # NOTE(review): .strip() removes the leading space, so the prompt
        # becomes "Detect '<desc>'located ..." with no separator — likely
        # a missing space; confirm intended prompt wording.
        pos_hint = f" located {y_rel} {x_rel} of the screen".strip()
    prompt = f"Detect '{description}'{pos_hint} in this image with a bounding box."

    # Grounding needs a model trained for coordinates (bbox_2d).
    # Qwen2.5-VL is the only one returning precise positions.
    # gemma4 understands images but cannot localize in coordinates.
    _grounding_model = os.environ.get("RPA_GROUNDING_MODEL", "qwen2.5vl:7b")

    # VLM call — vLLM (GPU, fast) first, Ollama as fallback.
    import requests as _requests
    content = ""

    # vLLM port configurable via env.
    _vllm_port = os.environ.get("VLLM_PORT", "8100")
    _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")

    # Attempt 1: vLLM (OpenAI-compatible API, GPU).
    try:
        vllm_resp = _requests.post(
            f"http://localhost:{_vllm_port}/v1/chat/completions",
            json={
                "model": _vllm_model,
                "messages": [
                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
                    {"role": "user", "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
                    ]},
                ],
                "temperature": 0.1,
                "max_tokens": 80,
            },
            timeout=30,
        )
        if vllm_resp.ok:
            content = vllm_resp.json().get("choices", [{}])[0].get("message", {}).get("content", "")
            if content:
                logger.debug("Grounding via vLLM OK")
    except Exception as e:
        logger.debug("vLLM non disponible (%s), fallback Ollama", e)

    # Attempt 2: Ollama (qwen2.5vl:7b for grounding — native bbox_2d format).
    if not content:
        try:
            resp = _requests.post("http://localhost:11434/api/chat", json={
                "model": _grounding_model,
                "messages": [
                    {"role": "user", "content": prompt, "images": [shot_b64]},
                ],
                "stream": False,
                "options": {"temperature": 0.1, "num_predict": 100},
            }, timeout=60)
            content = resp.json().get("message", {}).get("content", "")
        except Exception as e:
            logger.info("Grounding VLM timeout/erreur : %s", e)
            return None

    elapsed = time.time() - t0

    # Parse the response — supports bbox_2d in pixels, JSON %, raw arrays.
    x_pct, y_pct = None, None

    # Format 1: bbox_2d in pixels [x, y] or [x1, y1, x2, y2].
    bbox_match = re.search(r'"bbox_2d"\s*:\s*\[([^\]]+)\]', content)
    if bbox_match:
        coords = [float(v.strip()) for v in bbox_match.group(1).split(",")]
        if len(coords) == 2:
            x_pct = coords[0] / small_w
            y_pct = coords[1] / small_h
        elif len(coords) >= 4:
            # Center of the bounding box, normalized to the image size.
            x_pct = (coords[0] + coords[2]) / 2 / small_w
            y_pct = (coords[1] + coords[3]) / 2 / small_h

    # Format 2: JSON {"x": 0.XX, "y": 0.YY}.
    if x_pct is None:
        json_match = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content)
        if json_match:
            x_val, y_val = float(json_match.group(1)), float(json_match.group(2))
            # Values > 1 are assumed to be pixels, not percentages.
            if x_val > 1:
                x_pct = x_val / small_w
                y_pct = y_val / small_h
            else:
                x_pct = x_val
                y_pct = y_val

    # Format 3: {"x_pct": 0.XX, "y_pct": 0.YY}.
    if x_pct is None:
        pct_match = re.search(r'"x_pct"\s*:\s*([\d.]+).*?"y_pct"\s*:\s*([\d.]+)', content)
        if pct_match:
            x_pct = float(pct_match.group(1))
            y_pct = float(pct_match.group(2))

    # Format 4: raw array [x1, y1, x2, y2] or [x, y].
    if x_pct is None:
        arr_match = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content)
        if arr_match:
            vals = [float(v) for v in arr_match.groups() if v is not None]
            if len(vals) >= 4:
                x_pct = (vals[0] + vals[2]) / 2 / small_w
                y_pct = (vals[1] + vals[3]) / 2 / small_h
            elif len(vals) == 2:
                x_pct = vals[0] / small_w
                y_pct = vals[1] / small_h

    if x_pct is None or y_pct is None:
        # Multi-image fallback: screenshot + crop -> grounding without description.
        anchor_b64 = target_spec.get("anchor_image_base64", "")
        if anchor_b64:
            try:
                prompt_mi = (
                    "Image 1 is a screenshot. Image 2 shows a UI element.\n"
                    "Find where Image 2 appears on Image 1.\n"
                    'Return position: {"x": NNN, "y": NNN} in pixels of Image 1.'
                )
                resp2 = _requests.post("http://localhost:11434/api/chat", json={
                    "model": _grounding_model,
                    "messages": [
                        {"role": "user", "content": prompt_mi, "images": [shot_b64, anchor_b64]},
                    ],
                    "stream": False,
                    "options": {"temperature": 0.1, "num_predict": 50},
                }, timeout=60)
                content2 = resp2.json().get("message", {}).get("content", "")
                elapsed = time.time() - t0

                # Parse all supported formats (raw array first, then JSON x/y).
                arr2 = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content2)
                if arr2:
                    vals = [float(v) for v in arr2.groups() if v is not None]
                    if len(vals) >= 4:
                        x_pct = (vals[0] + vals[2]) / 2 / small_w
                        y_pct = (vals[1] + vals[3]) / 2 / small_h
                    elif len(vals) == 2:
                        x_pct = vals[0] / small_w
                        y_pct = vals[1] / small_h
                if x_pct is None:
                    json2 = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content2)
                    if json2:
                        x_pct = float(json2.group(1)) / small_w
                        y_pct = float(json2.group(2)) / small_h
                if x_pct is not None:
                    logger.info("Grounding multi-image OK (%.1fs)", elapsed)
            except Exception as e:
                logger.debug("Grounding multi-image erreur: %s", e)

    if x_pct is None or y_pct is None:
        logger.info(
            "Grounding : réponse non parsable (%.1fs) — %s",
            elapsed, content[:120],
        )
        return None

    # Validate bounds: normalized coordinates must be in [0, 1].
    if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
        logger.info("Grounding : coordonnées hors bornes (%.3f, %.3f)", x_pct, y_pct)
        return None

    # Convert window-relative coordinates back to screen-relative ones.
    if using_window and window_rect:
        win_x1, win_y1, win_x2, win_y2 = window_rect
        win_w = win_x2 - win_x1
        win_h = win_y2 - win_y1
        # x_pct/y_pct are relative to the window; make them screen-relative.
        abs_x = win_x1 + x_pct * win_w
        abs_y = win_y1 + y_pct * win_h
        x_pct = abs_x / screen_width
        y_pct = abs_y / screen_height
        logger.info(
            "Grounding OK [%s/window] : '%s' → (%.4f, %.4f) en %.1fs",
            _grounding_model, description[:50], x_pct, y_pct, elapsed,
        )
    else:
        logger.info(
            "Grounding OK [%s/full] : '%s' → (%.4f, %.4f) en %.1fs",
            _grounding_model, description[:50], x_pct, y_pct, elapsed,
        )

    return {
        "resolved": True,
        "method": "grounding_vlm",
        "x_pct": round(x_pct, 6),
        "y_pct": round(y_pct, 6),
        "matched_element": {
            "label": description[:60],
            "type": "grounding",
            "role": "grounding_vlm",
            # Fixed confidence: the VLM gives no calibrated score here.
            "confidence": 0.85,
        },
        "score": 0.85,
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Résolution Set-of-Mark : SomEngine (détection) + VLM (identification)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _get_som_engine_api():
|
|
"""Singleton SomEngine partagé."""
|
|
try:
|
|
from core.detection.som_engine import get_shared_engine
|
|
return get_shared_engine()
|
|
except ImportError:
|
|
return None
|
|
|
|
|
|
def _resolve_by_som(
|
|
screenshot_path: str,
|
|
target_spec: Dict[str, Any],
|
|
screen_width: int,
|
|
screen_height: int,
|
|
) -> Optional[Dict[str, Any]]:
|
|
"""Résoudre une cible UI via Set-of-Mark + VLM.
|
|
|
|
Pipeline :
|
|
1. SomEngine détecte tous les éléments et les numérote sur le screenshot
|
|
2. VLM reçoit l'image annotée + description de la cible
|
|
3. VLM identifie le numéro du mark → coordonnées précises
|
|
|
|
Avantages vs VLM direct :
|
|
- Le VLM n'a qu'à identifier (son point fort), pas localiser
|
|
- Les coordonnées viennent de SomEngine (pixel-perfect)
|
|
- Question simple "quel numéro ?" → réponse simple
|
|
|
|
Args:
|
|
screenshot_path: Chemin du screenshot actuel
|
|
target_spec: Spécification de la cible (vlm_description, som_element, etc.)
|
|
screen_width: Largeur écran en pixels
|
|
screen_height: Hauteur écran en pixels
|
|
|
|
Returns:
|
|
Dict avec resolved=True et coordonnées, ou None si indisponible.
|
|
"""
|
|
engine = _get_som_engine_api()
|
|
if engine is None:
|
|
return None
|
|
|
|
client = _get_vlm_client()
|
|
if client is None:
|
|
return None
|
|
|
|
t0 = time.time()
|
|
|
|
# ── 1. Lancer SomEngine sur le screenshot actuel ──
|
|
try:
|
|
from PIL import Image as PILImage
|
|
img = PILImage.open(screenshot_path).convert("RGB")
|
|
som_result = engine.analyze(img)
|
|
except Exception as e:
|
|
logger.warning("SoM resolve : erreur analyse — %s", e)
|
|
return None
|
|
|
|
if not som_result.elements:
|
|
logger.info("SoM resolve : 0 éléments détectés")
|
|
return None
|
|
|
|
# ── 2. Construire la description de la cible ──
|
|
som_element = target_spec.get("som_element", {})
|
|
vlm_description = target_spec.get("vlm_description", "")
|
|
anchor_label = som_element.get("label", "")
|
|
|
|
# Construire un prompt riche
|
|
target_parts = []
|
|
if anchor_label:
|
|
target_parts.append(f"texte '{anchor_label}'")
|
|
if vlm_description:
|
|
target_parts.append(vlm_description)
|
|
if not target_parts:
|
|
# Sans description, SoM resolve ne peut pas fonctionner
|
|
logger.debug("SoM resolve : pas de description pour identifier l'élément")
|
|
return None
|
|
|
|
target_desc = ", ".join(target_parts)
|
|
|
|
# ── 2.5. Raccourci : si le label est connu, chercher par texte directement ──
|
|
# Pas besoin du VLM si on connaît le texte exact de l'élément !
|
|
if anchor_label and len(anchor_label) >= 2:
|
|
label_lower = anchor_label.lower()
|
|
# Match exact d'abord, puis partiel
|
|
exact_matches = [
|
|
e for e in som_result.elements
|
|
if e.label and e.label.lower() == label_lower
|
|
]
|
|
if not exact_matches:
|
|
exact_matches = [
|
|
e for e in som_result.elements
|
|
if e.label and len(e.label) >= 3 and (
|
|
label_lower in e.label.lower()
|
|
or e.label.lower() in label_lower
|
|
)
|
|
]
|
|
|
|
if len(exact_matches) == 1:
|
|
# Match unique par texte → pas besoin du VLM
|
|
elem = exact_matches[0]
|
|
elapsed = time.time() - t0
|
|
cx_norm, cy_norm = elem.center_norm
|
|
logger.info(
|
|
"SoM resolve FAST : match texte unique '#%d %s' → (%.4f, %.4f) en %.1fs",
|
|
elem.id, elem.label, cx_norm, cy_norm, elapsed,
|
|
)
|
|
return {
|
|
"resolved": True,
|
|
"method": "som_text_match",
|
|
"x_pct": round(cx_norm, 6),
|
|
"y_pct": round(cy_norm, 6),
|
|
"matched_element": {
|
|
"label": elem.label,
|
|
"type": elem.source,
|
|
"role": "som_text_match",
|
|
"confidence": max(elem.confidence, 0.85),
|
|
"som_id": elem.id,
|
|
},
|
|
"score": max(elem.confidence, 0.85),
|
|
}
|
|
elif len(exact_matches) > 1:
|
|
# Plusieurs matchs texte → disambiguïser par proximité à la position originale
|
|
ref_center = som_element.get("center_norm", [])
|
|
if ref_center and len(ref_center) == 2:
|
|
ref_x, ref_y = ref_center
|
|
best = min(
|
|
exact_matches,
|
|
key=lambda e: (
|
|
(e.center_norm[0] - ref_x) ** 2
|
|
+ (e.center_norm[1] - ref_y) ** 2
|
|
),
|
|
)
|
|
elapsed = time.time() - t0
|
|
cx_norm, cy_norm = best.center_norm
|
|
dist = ((cx_norm - ref_x) ** 2 + (cy_norm - ref_y) ** 2) ** 0.5
|
|
if dist < 0.15: # Tolérance 15% de l'écran
|
|
logger.info(
|
|
"SoM resolve FAST : match texte proximité '#%d %s' (dist=%.3f) "
|
|
"→ (%.4f, %.4f) en %.1fs",
|
|
best.id, best.label, dist, cx_norm, cy_norm, elapsed,
|
|
)
|
|
return {
|
|
"resolved": True,
|
|
"method": "som_text_match",
|
|
"x_pct": round(cx_norm, 6),
|
|
"y_pct": round(cy_norm, 6),
|
|
"matched_element": {
|
|
"label": best.label,
|
|
"type": best.source,
|
|
"role": "som_text_match_proximity",
|
|
"confidence": max(best.confidence, 0.80),
|
|
"som_id": best.id,
|
|
},
|
|
"score": max(best.confidence, 0.80),
|
|
}
|
|
logger.info(
|
|
"SoM resolve : %d matchs texte pour '%s', VLM nécessaire",
|
|
len(exact_matches), anchor_label,
|
|
)
|
|
|
|
# ── 2.7. Fallback : template matching anchor vs éléments SomEngine ──
|
|
# Pour les icônes sans texte : comparer le crop de référence contre
|
|
# chaque région YOLO détectée par SomEngine.
|
|
anchor_b64 = target_spec.get("anchor_image_base64", "")
|
|
by_text = target_spec.get("by_text", "").strip()
|
|
if anchor_b64 and (not anchor_label or not by_text):
|
|
try:
|
|
import cv2
|
|
import numpy as np
|
|
|
|
# Décoder l'anchor
|
|
anc_bytes = base64.b64decode(anchor_b64)
|
|
anc_array = np.frombuffer(anc_bytes, dtype=np.uint8)
|
|
anc_img = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE)
|
|
|
|
# Charger le screenshot en OpenCV
|
|
screenshot_cv = cv2.imread(screenshot_path, cv2.IMREAD_GRAYSCALE)
|
|
|
|
if anc_img is not None and screenshot_cv is not None:
|
|
# Template matching de l'anchor sur le SCREENSHOT ENTIER
|
|
# (pas sur les régions individuelles — l'anchor est souvent plus grand)
|
|
anc_h, anc_w = anc_img.shape[:2]
|
|
if screenshot_cv.shape[0] >= anc_h and screenshot_cv.shape[1] >= anc_w:
|
|
res = cv2.matchTemplate(screenshot_cv, anc_img, cv2.TM_CCOEFF_NORMED)
|
|
_, max_score, _, max_loc = cv2.minMaxLoc(res)
|
|
|
|
if max_score >= 0.5:
|
|
# Centre du match
|
|
match_cx = max_loc[0] + anc_w // 2
|
|
match_cy = max_loc[1] + anc_h // 2
|
|
|
|
# Trouver l'élément SomEngine le plus proche du centre du match
|
|
best_elem = None
|
|
best_dist = float("inf")
|
|
for elem in som_result.elements:
|
|
cx, cy = elem.center
|
|
dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5
|
|
if dist < best_dist:
|
|
best_dist = dist
|
|
best_elem = elem
|
|
|
|
if best_elem and best_dist < 100: # Max 100px de distance
|
|
elapsed = time.time() - t0
|
|
cx_norm, cy_norm = best_elem.center_norm
|
|
logger.info(
|
|
"SoM resolve ANCHOR : match crop score=%.3f → "
|
|
"elem '#%d %s' (dist=%.0fpx) → (%.4f, %.4f) en %.1fs",
|
|
max_score, best_elem.id, best_elem.label,
|
|
best_dist, cx_norm, cy_norm, elapsed,
|
|
)
|
|
return {
|
|
"resolved": True,
|
|
"method": "som_anchor_match",
|
|
"x_pct": round(cx_norm, 6),
|
|
"y_pct": round(cy_norm, 6),
|
|
"matched_element": {
|
|
"label": best_elem.label or f"icon #{best_elem.id}",
|
|
"type": best_elem.source,
|
|
"role": "som_anchor_match",
|
|
"confidence": max_score,
|
|
"som_id": best_elem.id,
|
|
},
|
|
"score": max_score,
|
|
}
|
|
except ImportError:
|
|
pass
|
|
except Exception as e:
|
|
logger.debug("SoM anchor match erreur : %s", e)
|
|
|
|
# ── 3. Sauvegarder l'image annotée SoM temporairement ──
|
|
if som_result.som_image is None:
|
|
logger.debug("SoM resolve : pas d'image annotée, skip VLM")
|
|
return None
|
|
|
|
import tempfile
|
|
try:
|
|
with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
|
|
som_result.som_image.save(tmp, format="JPEG", quality=85)
|
|
som_img_path = tmp.name
|
|
except Exception as e:
|
|
logger.warning("SoM resolve : erreur sauvegarde image annotée — %s", e)
|
|
return None
|
|
|
|
# ── 4. VLM : identifier le numéro du mark ──
|
|
# Lister uniquement les éléments avec un label (plus concis pour le VLM)
|
|
labeled_elements = [e for e in som_result.elements if e.label][:30]
|
|
elements_list = "\n".join(
|
|
f" #{e.id}: '{e.label}'"
|
|
for e in labeled_elements
|
|
)
|
|
|
|
# Multi-image : SoM annotée + anchor crop (si disponible)
|
|
anchor_b64 = target_spec.get("anchor_image_base64", "")
|
|
extra_images = [anchor_b64] if anchor_b64 else None
|
|
|
|
if extra_images:
|
|
prompt = (
|
|
"Image 1 shows the screen with numbered marks on each UI element.\n"
|
|
"Image 2 shows the element I'm looking for.\n\n"
|
|
f"Target: {target_desc}\n\n"
|
|
f"Detected elements:\n{elements_list}\n\n"
|
|
"Which mark number matches the target element in Image 2?\n"
|
|
'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
|
|
)
|
|
else:
|
|
prompt = (
|
|
f"I'm looking for: {target_desc}\n\n"
|
|
f"Detected elements:\n{elements_list}\n\n"
|
|
"Which number is the correct element?\n"
|
|
'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
|
|
)
|
|
|
|
system_prompt = "You identify UI elements by number. Output JSON only, no explanation."
|
|
|
|
try:
|
|
result = client.generate(
|
|
prompt=prompt,
|
|
image_path=som_img_path,
|
|
system_prompt=system_prompt,
|
|
temperature=0.1,
|
|
max_tokens=50,
|
|
force_json=False,
|
|
extra_images_b64=extra_images,
|
|
)
|
|
except Exception as e:
|
|
logger.warning("SoM resolve : erreur VLM — %s", e)
|
|
return None
|
|
finally:
|
|
import os
|
|
try:
|
|
os.unlink(som_img_path)
|
|
except OSError:
|
|
pass
|
|
|
|
elapsed = time.time() - t0
|
|
|
|
if not result.get("success"):
|
|
logger.info("SoM resolve : VLM échoué (%.1fs)", elapsed)
|
|
return None
|
|
|
|
# ── 5. Parser la réponse et retourner les coordonnées ──
|
|
response_text = result.get("response", "").strip()
|
|
|
|
# Tenter d'abord l'extraction JSON standard
|
|
parsed = client._extract_json_from_response(response_text)
|
|
|
|
# Fallback : extraire un nombre simple de la réponse
|
|
if parsed is None:
|
|
import re
|
|
numbers = re.findall(r'\b(\d+)\b', response_text)
|
|
if numbers:
|
|
candidate = int(numbers[0])
|
|
if som_result.get_element_by_id(candidate) is not None:
|
|
parsed = {"mark_id": candidate, "confidence": 0.7}
|
|
logger.debug("SoM resolve : extraction numéro fallback → #%d", candidate)
|
|
|
|
if parsed is None:
|
|
logger.info("SoM resolve : réponse non-JSON (%.1fs) — %.80s", elapsed, response_text)
|
|
return None
|
|
|
|
mark_id = parsed.get("mark_id")
|
|
confidence = float(parsed.get("confidence", 0.0))
|
|
|
|
if mark_id is None or confidence < 0.3:
|
|
logger.info(
|
|
"SoM resolve : mark non trouvé ou confiance trop basse (mark=%s, conf=%.2f, %.1fs)",
|
|
mark_id, confidence, elapsed,
|
|
)
|
|
return None
|
|
|
|
mark_id = int(mark_id)
|
|
elem = som_result.get_element_by_id(mark_id)
|
|
if elem is None:
|
|
logger.warning("SoM resolve : mark #%d inexistant (%.1fs)", mark_id, elapsed)
|
|
return None
|
|
|
|
cx_norm, cy_norm = elem.center_norm
|
|
logger.info(
|
|
"SoM resolve OK : mark #%d '%s' → (%.4f, %.4f) conf=%.2f en %.1fs (%d éléments)",
|
|
mark_id, elem.label, cx_norm, cy_norm, confidence, elapsed, len(som_result.elements),
|
|
)
|
|
|
|
return {
|
|
"resolved": True,
|
|
"method": "som_vlm",
|
|
"x_pct": round(cx_norm, 6),
|
|
"y_pct": round(cy_norm, 6),
|
|
"matched_element": {
|
|
"label": elem.label or f"mark #{mark_id}",
|
|
"type": elem.source,
|
|
"role": "som_identified",
|
|
"confidence": confidence,
|
|
"som_id": mark_id,
|
|
},
|
|
"score": confidence,
|
|
}
|
|
|
|
|
|
def _resolve_target_sync(
    screenshot_path: str,
    target_spec: Dict[str, Any],
    screen_width: int,
    screen_height: int,
    fallback_x_pct: float,
    fallback_y_pct: float,
    strict_mode: bool = False,
) -> Dict[str, Any]:
    """Resolve the click target visually (runs in a separate thread).

    Resolution hierarchy (strict_mode=True, replay sessions) — VLM-FIRST:
      1. VLM Quick Find (~3-8s) — semantic understanding of the screen, multi-image
         (screenshot + reference crop + rich description)
      1.5. SoM + VLM (~5-15s) — SomEngine numbers the elements, the VLM identifies the right one
      2. OpenCV template matching (~100ms) — pixel fallback, STRICT threshold 0.90
      3. resolved=False → STOP the replay

    The VLM understands context (window title, element type, position)
    and can find an element even when the screen differs from the recording.
    Template matching only compares pixels and produces false positives.

    Classic hierarchy (strict_mode=False, VWB and others) — UNCHANGED:
      1. OpenCV template matching (~100ms) — threshold 0.70
      1.5. VLM Quick Find if template matching fails and by_text/by_role available
      2. by_text/by_role → VLM Quick Find then ScreenAnalyzer
      3. fallback to static coordinates

    Args:
        screenshot_path: path to the full-screen capture to analyze.
        target_spec: target description dict (anchor_image_base64, by_text,
            by_role, vlm_description, som_element, window_capture, ...).
        screen_width: screen width in pixels.
        screen_height: screen height in pixels.
        fallback_x_pct: normalized X returned when resolution fails.
        fallback_y_pct: normalized Y returned when resolution fails.
        strict_mode: True for replay sessions (VLM-first, strict thresholds).

    Returns:
        Dict with at least "resolved", "method", "x_pct", "y_pct"; on success
        also "matched_element" and "score".
    """
    anchor_image_b64 = target_spec.get("anchor_image_base64", "")

    # ===================================================================
    # STRICT MODE (replay sessions) — VLM-FIRST strategy
    # ===================================================================
    if strict_mode and anchor_image_b64:
        vlm_description = target_spec.get("vlm_description", "")
        by_text_strict = target_spec.get("by_text", "").strip()

        # Fallback: build the VLM description from by_text/by_role
        if not vlm_description:
            by_role = target_spec.get("by_role", "").strip()
            if by_text_strict or by_role:
                vlm_description = _build_target_description(target_spec)

        # ---------------------------------------------------------------
        # Step 0: choose the strategy based on the element type
        # - Reliable visible text (OCR/VLM) → VLM grounding (textual description)
        # - Icon without text → template matching (crop)
        # ---------------------------------------------------------------
        by_text_source = target_spec.get("by_text_source", "")

        if by_text_strict and by_text_source in ("ocr", "vlm"):
            # Visible text (from OCR or read by the VLM) → direct VLM grounding
            grounding_result = _resolve_by_grounding(
                screenshot_path=screenshot_path,
                target_spec=target_spec,
                screen_width=screen_width,
                screen_height=screen_height,
            )
            if grounding_result and grounding_result.get("resolved"):
                logger.info(
                    "Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
                    grounding_result.get("x_pct", 0),
                    grounding_result.get("y_pct", 0),
                    by_text_strict[:50],
                )
                return grounding_result

        if not by_text_strict or by_text_source not in ("ocr", "vlm"):
            # Template matching for text-less elements (pure icons).
            # Prefer the window-only capture ("*_window.png") when available:
            # a smaller search area reduces false positives.
            window_capture = target_spec.get("window_capture", {})
            # NOTE(review): rect appears to be (left, top, right, bottom) given
            # the width/height math below — confirm against the capture side.
            window_rect = window_capture.get("rect")
            from pathlib import Path as _Path
            _full = _Path(screenshot_path)
            _win = _full.parent / _full.name.replace("_full.png", "_window.png")
            tm_path = str(_win) if _win.is_file() and window_rect else screenshot_path
            tm_screen_w = (window_rect[2] - window_rect[0]) if window_rect and _win.is_file() else screen_width
            tm_screen_h = (window_rect[3] - window_rect[1]) if window_rect and _win.is_file() else screen_height

            result = _resolve_by_template_matching(
                screenshot_path=tm_path,
                anchor_image_b64=anchor_image_b64,
                screen_width=tm_screen_w,
                screen_height=tm_screen_h,
                confidence_threshold=0.90,
            )
            if result and result.get("score", 0) >= 0.90:
                x_tm, y_tm = result["x_pct"], result["y_pct"]
                # Convert window-relative coordinates → screen coordinates
                if window_rect and _win.is_file():
                    abs_x = window_rect[0] + x_tm * tm_screen_w
                    abs_y = window_rect[1] + y_tm * tm_screen_h
                    result["x_pct"] = round(abs_x / screen_width, 6)
                    result["y_pct"] = round(abs_y / screen_height, 6)
                logger.info(
                    "Strict resolve TEMPLATE : icon match (score=%.3f)",
                    result.get("score", 0),
                )
                return result

        # ---------------------------------------------------------------
        # Step 1: VLM Quick Find (fallback, multi-image)
        # ---------------------------------------------------------------
        if vlm_description or anchor_image_b64:
            vlm_result = _vlm_quick_find(
                screenshot_path=screenshot_path,
                target_description=vlm_description,
                anchor_image_b64=anchor_image_b64,
            )
            if vlm_result and vlm_result.get("resolved"):
                # Accept only if confidence is high enough (>= 0.3)
                if vlm_result.get("score", 0) >= 0.3:
                    logger.info(
                        "Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'",
                        vlm_result.get("score", 0),
                        vlm_description[:60] if vlm_description else "(anchor)",
                    )
                    return vlm_result
                else:
                    logger.info(
                        "Strict resolve VLM-first : VLM score=%.2f trop bas, passage template",
                        vlm_result.get("score", 0),
                    )
            else:
                logger.info(
                    "Strict resolve VLM-first : VLM échoué pour '%s', passage template matching",
                    vlm_description[:60] if vlm_description else "(anchor)",
                )

        # ---------------------------------------------------------------
        # Step 1.5: SoM + VLM (Set-of-Mark + identification)
        # SomEngine numbers the elements, the VLM identifies the right number.
        # More reliable than the direct VLM because the VLM only identifies,
        # it does not localize — and the coordinates are pixel-perfect.
        # ---------------------------------------------------------------
        som_element = target_spec.get("som_element", {})
        if som_element or vlm_description:
            som_result = _resolve_by_som(
                screenshot_path=screenshot_path,
                target_spec=target_spec,
                screen_width=screen_width,
                screen_height=screen_height,
            )
            if som_result and som_result.get("resolved"):
                logger.info(
                    "Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)",
                    som_result.get("score", 0),
                    som_result.get("matched_element", {}).get("som_id", "?"),
                )
                return som_result
            else:
                logger.info("Strict resolve SoM+VLM : échoué, passage template matching")

        # ---------------------------------------------------------------
        # Step 2: template matching (pixel fallback) — STRICT threshold 0.90
        # ---------------------------------------------------------------
        result = _resolve_by_template_matching(
            screenshot_path=screenshot_path,
            anchor_image_b64=anchor_image_b64,
            screen_width=screen_width,
            screen_height=screen_height,
            confidence_threshold=0.90,
        )
        if result:
            score = result.get("score", 0)
            # Score >= 0.95: near-perfect match, no need to validate the context
            if score >= 0.95:
                logger.info(
                    "Strict resolve VLM-first : template matching fallback OK "
                    "(score=%.3f >= 0.95, contexte skip — match quasi-parfait)",
                    score,
                )
                return result
            elif _validate_match_context(result, fallback_x_pct, fallback_y_pct, target_spec):
                logger.info(
                    "Strict resolve VLM-first : template matching fallback OK "
                    "(score=%.3f >= 0.90, context OK)",
                    score,
                )
                return result
            else:
                logger.warning(
                    "Strict resolve VLM-first : template score=%.3f MAIS contexte invalide, rejeté",
                    score,
                )

        # ---------------------------------------------------------------
        # Step 3: NOTHING works → resolved=False → STOP the replay
        # ---------------------------------------------------------------
        return {
            "resolved": False,
            "method": "strict_vlm_template_failed",
            "reason": "vlm_and_template_all_failed",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    # ===================================================================
    # CLASSIC MODE (VWB and others) — existing behavior
    # ===================================================================

    # ---------------------------------------------------------------
    # Strategy 1: template matching on the anchor image (threshold 0.70)
    # ---------------------------------------------------------------
    if anchor_image_b64:
        result = _resolve_by_template_matching(
            screenshot_path=screenshot_path,
            anchor_image_b64=anchor_image_b64,
            screen_width=screen_width,
            screen_height=screen_height,
            confidence_threshold=0.7,
        )
        if result:
            return result
        logger.info(
            "Template matching échoué pour ancre '%s', tentative VLM Quick Find",
            target_spec.get("anchor_id", "?"),
        )

        # ---------------------------------------------------------------
        # Strategy 1.5: VLM Quick Find (light fallback after template matching)
        # ---------------------------------------------------------------
        by_text = target_spec.get("by_text", "").strip()
        by_role = target_spec.get("by_role", "").strip()
        if by_text or by_role:
            vlm_desc = _build_target_description(target_spec)
            vlm_result = _vlm_quick_find(
                screenshot_path=screenshot_path,
                target_description=vlm_desc,
                anchor_image_b64=anchor_image_b64,
            )
            if vlm_result:
                return vlm_result
            logger.info(
                "VLM Quick Find échoué pour ancre '%s', fallback coordonnées",
                target_spec.get("anchor_id", "?"),
            )

        return {
            "resolved": False,
            "method": "fallback",
            "reason": "template_matching_failed",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    # ---------------------------------------------------------------
    # Strategy 2: VLM Quick Find (light, ~5-10s)
    # ---------------------------------------------------------------
    by_text = target_spec.get("by_text", "")
    by_role = target_spec.get("by_role", "")

    # No semantic criteria and no anchor → direct fallback
    if not by_text and not by_role and not anchor_image_b64:
        return {
            "resolved": False,
            "method": "fallback",
            "reason": "no_target_criteria",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    # Try the VLM Quick Find BEFORE ScreenAnalyzer (much faster)
    if by_text or by_role:
        vlm_desc = _build_target_description(target_spec)
        vlm_result = _vlm_quick_find(
            screenshot_path=screenshot_path,
            target_description=vlm_desc,
        )
        if vlm_result:
            return vlm_result
        logger.info(
            "VLM Quick Find échoué pour '%s', fallback ScreenAnalyzer",
            vlm_desc,
        )

    # ---------------------------------------------------------------
    # Strategy 3: semantic matching via ScreenAnalyzer (~15-20s)
    # ---------------------------------------------------------------
    processor._ensure_initialized()

    if processor._screen_analyzer is None:
        return {
            "resolved": False,
            "method": "fallback",
            "reason": "screen_analyzer_unavailable",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    # Analyze the screenshot (Levels 1-3: raw, OCR, UI elements)
    try:
        screen_state = processor._screen_analyzer.analyze(screenshot_path)
    except Exception as e:
        logger.warning(f"Analyse screenshot échouée: {e}")
        return {
            "resolved": False,
            "method": "fallback",
            "reason": f"analysis_failed: {e}",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    ui_elements = screen_state.ui_elements or []
    if not ui_elements:
        logger.info("Aucun élément UI détecté, fallback coordonnées")
        return {
            "resolved": False,
            "method": "fallback",
            "reason": "no_ui_elements",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    # Match the target among the detected elements
    candidates = []

    for elem in ui_elements:
        score = 0.0

        # Score by text (label): substring match is strong, fuzzy is weak
        if by_text and elem.label:
            text_lower = by_text.lower()
            label_lower = elem.label.lower()
            if text_lower in label_lower or label_lower in text_lower:
                score += 0.6
            elif _fuzzy_match(text_lower, label_lower):
                score += 0.3

        # Score by role (role and type contribute independently)
        if by_role:
            role_lower = by_role.lower()
            if elem.role and role_lower in elem.role.lower():
                score += 0.3
            if elem.type and role_lower in elem.type.lower():
                score += 0.2

        if score > 0:
            candidates.append((elem, score))

    if not candidates:
        logger.info(
            f"Aucun match visuel pour target(text='{by_text}', role='{by_role}') "
            f"parmi {len(ui_elements)} éléments"
        )
        return {
            "resolved": False,
            "method": "fallback",
            "reason": "no_match",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
            "ui_elements_count": len(ui_elements),
        }

    # Sort by decreasing score and take the best candidate
    candidates.sort(key=lambda c: c[1], reverse=True)
    best_elem, best_score = candidates[0]

    # Convert pixel coordinates to normalized proportions
    cx, cy = best_elem.center
    x_pct = round(cx / screen_width, 6) if screen_width > 0 else 0.0
    y_pct = round(cy / screen_height, 6) if screen_height > 0 else 0.0

    logger.info(
        f"Cible résolue visuellement: '{best_elem.label}' ({best_elem.type}/{best_elem.role}) "
        f"score={best_score:.2f} → ({x_pct:.4f}, {y_pct:.4f})"
    )

    return {
        "resolved": True,
        "method": "visual",
        "x_pct": x_pct,
        "y_pct": y_pct,
        "matched_element": {
            "label": best_elem.label,
            "type": best_elem.type,
            "role": best_elem.role,
            "center": list(best_elem.center),
            "confidence": best_elem.label_confidence,
        },
        "score": best_score,
        "candidates_count": len(candidates),
        "ui_elements_count": len(ui_elements),
    }
|
|
|
|
|
|
def _fuzzy_match(a: str, b: str, threshold: float = 0.6) -> bool:
|
|
"""Match approximatif par ratio de caractères communs."""
|
|
if not a or not b:
|
|
return False
|
|
common = sum(1 for c in a if c in b)
|
|
return (common / max(len(a), len(b))) >= threshold
|
|
|
|
|
|
def _fallback_response(request: ResolveTargetRequest, reason: str, detail: str) -> Dict:
    """Build the fallback payload returned when visual resolution fails.

    The coordinates fall back to the static values carried by the request.
    """
    payload = {
        "resolved": False,
        "method": "fallback",
        "reason": reason,
        "detail": detail,
        "x_pct": request.fallback_x_pct,
        "y_pct": request.fallback_y_pct,
    }
    return payload
|
|
|
|
|
|
# =========================================================================
|
|
# Learning Pack — Export / Import pour la fédération des apprentissages
|
|
# =========================================================================
|
|
|
|
class LearningPackImportRequest(BaseModel):
    """Request body for importing a Learning Pack."""
    # The full pack as JSON (LearningPack.to_dict() structure)
    pack: Dict[str, Any]
|
|
|
|
|
|
@app.get("/api/v1/traces/stream/learning-pack/export")
async def export_learning_pack(client_id: str, request: Request):
    """Export a client's learnings as an anonymized Learning Pack.

    The client_id is hashed (SHA-256) inside the exported pack —
    no identifying data leaves the server.

    Query params:
        client_id: client identifier (required).

    Returns:
        JSON of the anonymized LearningPack.
    """
    # The federation module is optional; report its absence as a 500.
    try:
        from core.federation.learning_pack import LearningPackExporter
        from core.models.workflow_graph import Workflow
    except ImportError as exc:
        raise HTTPException(
            status_code=500,
            detail=f"Module federation non disponible : {exc}",
        )

    if not client_id or not client_id.strip():
        raise HTTPException(status_code=400, detail="client_id requis")

    # Export every workflow currently loaded by the StreamProcessor.
    loaded_workflows = list(processor._workflows.values())
    if not loaded_workflows:
        raise HTTPException(
            status_code=404,
            detail="Aucun workflow trouvé pour l'export",
        )

    pack = LearningPackExporter().export(loaded_workflows, client_id=client_id.strip())

    logger.info(
        "Learning pack exporté pour client_id=%s (hash=%s) : %d workflows, %d prototypes",
        client_id[:8] + "...", pack.source_hash[:16] + "...",
        len(loaded_workflows), len(pack.screen_prototypes),
    )
    return pack.to_dict()
|
|
|
|
|
|
@app.post("/api/v1/traces/stream/learning-pack/import")
async def import_learning_pack(body: LearningPackImportRequest, request: Request):
    """Import a Learning Pack into the global FAISS index.

    Body JSON:
        { "pack": { ... } } — full LearningPack structure

    Returns:
        Import statistics (vectors added, index total, etc.).
    """
    # The federation module is optional; report its absence as a 500.
    try:
        from core.federation.learning_pack import LearningPack
        from core.federation.faiss_global import GlobalFAISSIndex
    except ImportError as exc:
        raise HTTPException(
            status_code=500,
            detail=f"Module federation non disponible : {exc}",
        )

    # Validate the pack payload before touching the index.
    try:
        pack = LearningPack.from_dict(body.pack)
    except Exception as exc:
        raise HTTPException(
            status_code=400,
            detail=f"Format de Learning Pack invalide : {exc}",
        )

    # Lazily create the module-level singleton index on first import.
    global _global_faiss_index
    if _global_faiss_index is None:
        _global_faiss_index = GlobalFAISSIndex()

    vectors_added = _global_faiss_index.add_pack(pack)
    index_stats = _global_faiss_index.get_stats()

    logger.info(
        "Learning pack importé : pack_id=%s, +%d vecteurs (total=%d)",
        pack.pack_id, vectors_added, index_stats["total_vectors"],
    )
    return {
        "status": "ok",
        "pack_id": pack.pack_id,
        "source_hash": pack.source_hash,
        "vectors_added": vectors_added,
        "index_stats": index_stats,
    }
|
|
|
|
|
|
# Global FAISS index (singleton, lazily initialized on the first import call)
_global_faiss_index = None
|
|
|
|
|
|
if __name__ == "__main__":
    # Standalone entry point: run the API with uvicorn on port 5005.
    import uvicorn

    logging.basicConfig(
        format="%(asctime)s [API-STREAM] %(message)s",
        level=logging.INFO,
    )
    uvicorn.run(app, host="0.0.0.0", port=5005)
|