Architecture 3 niveaux implémentée et testée (137 tests unitaires + 21 visuels) : MÉSO (acteur intelligent) : - P0 Critic : vérification sémantique post-action via gemma4 (replay_verifier.py) - P1 Observer : pré-analyse écran avant chaque action (api_stream.py /pre_analyze) - P2 Grounding/Policy : séparation localisation (grounding.py) et décision (policy.py) - P3 Recovery : rollback automatique Ctrl+Z/Escape/Alt+F4 (recovery.py) - P4 Learning : apprentissage runtime avec boucle de consolidation (replay_learner.py) MACRO (planificateur) : - TaskPlanner : comprend les ordres en langage naturel via gemma4 (task_planner.py) - Contexte métier TIM/CIM-10 pour les hôpitaux (domain_context.py) - Endpoint POST /api/v1/task pour l'exécution par instruction Traçabilité : - Audit trail complet avec 18 champs par action (audit_trail.py) - Endpoints GET /audit/history, /audit/summary, /audit/export (CSV) Grounding : - Fix parsing bbox_2d qwen2.5vl (pixels relatifs, pas grille 1000x1000) - Benchmarks visuels sur captures réelles (3 approches : baseline, zoom, Citrix) - Reproductibilité validée : variance < 0.008 sur 10 itérations Sécurité : - Tokens de production retirés du code source → .env.local - Secret key aléatoire si non configuré - Suppression logs qui leakent les tokens Résultats : 80% de replay (vs 12.5% avant), 100% détection visuelle Citrix JPEG Q20 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
6428 lines
248 KiB
Python
6428 lines
248 KiB
Python
# agent_v0/server_v1/api_stream.py
|
||
"""
|
||
API de Streaming Temps Réel pour RPA Vision V3.
|
||
|
||
Connecte l'Agent V1 au core pipeline via StreamProcessor.
|
||
Tous les calculs GPU (ScreenAnalyzer, CLIP, FAISS) tournent ici sur le serveur.
|
||
|
||
Inclut les endpoints de replay pour renvoyer des ordres d'exécution à l'Agent V1.
|
||
"""
|
||
|
||
import atexit
|
||
import json
|
||
import logging
|
||
import os
|
||
import secrets
|
||
import signal
|
||
import threading
|
||
import time
|
||
import uuid
|
||
from collections import defaultdict
|
||
from concurrent.futures import ThreadPoolExecutor
|
||
from pathlib import Path
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
from fastapi import BackgroundTasks, Depends, FastAPI, File, HTTPException, Request, UploadFile
|
||
from fastapi.middleware.cors import CORSMiddleware
|
||
from pydantic import BaseModel
|
||
|
||
from .replay_failure_logger import log_replay_failure
|
||
from .replay_verifier import ReplayVerifier, VerificationResult
|
||
from .replay_learner import ReplayLearner
|
||
from .audit_trail import AuditTrail, AuditEntry
|
||
from .stream_processor import StreamProcessor, build_replay_from_raw_events, enrich_click_from_screenshot
|
||
from .worker_stream import StreamWorker
|
||
|
||
# Global replay-verifier instance (before/after screenshot comparison)
_replay_verifier = ReplayVerifier()
_replay_learner = ReplayLearner()
_audit_trail = AuditTrail()

# Maximum number of retries per action before declaring a failure
MAX_RETRIES_PER_ACTION = 3

# Safety limits for the replay queues
MAX_ACTIONS_PER_REPLAY = 500  # Max actions per replay request
MAX_REPLAY_STATES = 1000  # Max entries in _replay_states
REPLAY_STATE_TTL_SECONDS = 3600  # Auto-cleanup of finished replays after 1h

# Actions currently being retried: action_id -> {"action": ..., "retry_count": N, "replay_id": ...}
_retry_pending: Dict[str, Dict[str, Any]] = {}

# Error callbacks per replay_id: replay_id -> callback_url
_error_callbacks: Dict[str, str] = {}
|
||
|
||
# Optimisation of replay actions via primitive gestures (optional module;
# its absence simply disables the feature)
try:
    from agent_chat.gesture_catalog import get_gesture_catalog
    _gesture_catalog = get_gesture_catalog()
except ImportError:
    _gesture_catalog = None

# Automatic authentication (optional) — detects auth screens during replay.
# Requires a vault configured via the env vars RPA_AUTH_VAULT_PATH +
# RPA_AUTH_VAULT_PASSWORD.
_auth_handler = None
try:
    _vault_path = os.environ.get("RPA_AUTH_VAULT_PATH")
    _vault_password = os.environ.get("RPA_AUTH_VAULT_PASSWORD")
    if _vault_path and _vault_password:
        from core.auth.credential_vault import CredentialVault
        from core.auth.auth_handler import AuthHandler
        _auth_vault = CredentialVault(_vault_path, _vault_password)
        _auth_handler = AuthHandler(_auth_vault)
except Exception:
    # Any failure (missing vault file, bad password, import error)
    # disables auto-auth instead of crashing the server at import time
    _auth_handler = None

logger = logging.getLogger("api_stream")
|
||
|
||
# =========================================================================
# Bearer-token authentication (HIGH security)
# =========================================================================
# The token is read from the environment or randomly generated at startup
# (a fresh random token means no access until RPA_API_TOKEN is configured).
# Every endpoint requires the header Authorization: Bearer <token>,
# except /health, /docs and /openapi.json (public).
API_TOKEN = os.environ.get("RPA_API_TOKEN", secrets.token_hex(32))

# Public endpoints (no token required)
# In production, /docs and /redoc are disabled (see below)
# Public paths: no token required
# /replay/next is public because the legacy Rust agent sends no token
# and it is a read-only endpoint (polling, no writes)
_PUBLIC_PATHS = {
    "/health", "/docs", "/openapi.json", "/redoc",
    "/api/v1/traces/stream/replay/next",
    "/api/v1/traces/stream/image",
}
|
||
|
||
|
||
async def _verify_token(request: Request):
    """FastAPI dependency enforcing the Bearer API token on every endpoint.

    Paths listed in _PUBLIC_PATHS (health check, docs, legacy agent
    polling) are exempt.

    Raises:
        HTTPException: 401 when the Authorization header is missing,
            malformed, or carries a token different from API_TOKEN.
    """
    if request.url.path in _PUBLIC_PATHS:
        return
    auth = request.headers.get("Authorization", "")
    # secrets.compare_digest: constant-time comparison so the token cannot
    # be recovered byte-by-byte through response-timing differences.
    if not auth.startswith("Bearer ") or not secrets.compare_digest(auth[7:], API_TOKEN):
        raise HTTPException(status_code=401, detail="Token API invalide")
|
||
|
||
|
||
# =========================================================================
# In-memory rate limiting (HIGH security)
# =========================================================================
# Sliding-window request timestamps per "endpoint:client_ip" key
_rate_limits: Dict[str, list] = defaultdict(list)
_RATE_LIMIT_WINDOW = 60  # seconds
# Per-endpoint limits (requests per window)
_RATE_LIMITS = {
    "/api/v1/traces/stream/replay": 10,  # 10 replays per minute
    "/api/v1/traces/stream/replay/raw": 10,
    "/api/v1/traces/stream/replay-session": 10,  # 10 session replays per minute
    "/api/v1/traces/stream/replay/single": 30,  # 30 Copilot actions per minute
    "/api/v1/traces/stream/finalize": 5,
    "/api/v1/traces/stream/image": 200,  # 200 images per minute (heartbeats)
}
|
||
|
||
|
||
def _check_rate_limit(endpoint: str, client_ip: str) -> bool:
    """Return True if the request is within the rate limit (and record it).

    Sliding-window limiter: keeps the timestamps of recent requests per
    "endpoint:client_ip" key and rejects once the per-endpoint limit is
    reached within _RATE_LIMIT_WINDOW seconds.

    Args:
        endpoint: Request path, looked up in _RATE_LIMITS.
        client_ip: Client address used to scope the quota.

    Returns:
        True when the request is allowed (its timestamp is recorded),
        False when the limit is exceeded.
    """
    key = f"{endpoint}:{client_ip}"
    now = time.time()
    # Drop timestamps that have aged out of the window
    _rate_limits[key] = [t for t in _rate_limits[key] if now - t < _RATE_LIMIT_WINDOW]
    # Fix: opportunistically purge keys whose entire window has expired.
    # Without this, the defaultdict grows without bound as distinct
    # clients come and go (slow memory leak).
    if len(_rate_limits) > 10_000:
        stale = [
            k for k, ts in _rate_limits.items()
            if not ts or now - ts[-1] >= _RATE_LIMIT_WINDOW
        ]
        for k in stale:
            del _rate_limits[k]
    limit = _RATE_LIMITS.get(endpoint, 100)  # default 100/min for unlisted paths
    if len(_rate_limits[key]) >= limit:
        return False
    _rate_limits[key].append(now)
    return True
|
||
|
||
|
||
# =========================================================================
# Replay-action validation (HIGH security)
# =========================================================================
# Whitelist of action types the agent is allowed to execute
_ALLOWED_ACTION_TYPES = {
    "click", "type", "key_combo", "scroll", "wait",
    "file_open", "file_save", "file_close", "file_new", "file_dialog",
    "double_click", "right_click", "drag",
    "verify_screen",  # Hybrid replay: visual check between groups
}
# Upper bound on the text payload of a "type" action
_MAX_ACTION_TEXT_LENGTH = 10000
# Upper bound on the number of keys in a single key_combo
_MAX_KEYS_PER_COMBO = 10
# Keys allowed in key_combo actions (modifiers + special keys + single characters)
_KNOWN_KEY_NAMES = {
    "enter", "return", "tab", "escape", "esc", "backspace", "delete", "space",
    "up", "down", "left", "right", "home", "end", "page_up", "page_down",
    "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12",
    "ctrl", "ctrl_l", "ctrl_r", "alt", "alt_l", "alt_r",
    "shift", "shift_l", "shift_r",
    "cmd", "win", "super", "super_l", "super_r", "windows", "meta",
    "insert", "print_screen", "caps_lock", "num_lock",
}
|
||
|
||
|
||
# =========================================================================
# Environment setup — automatic preparation before replay
# =========================================================================
# Mapping of common Windows executable names to their launch command.
# Used as a fallback for the Start-menu search text.
# Format: "processname.exe" (lowercase) -> shell command
_APP_LAUNCH_COMMANDS: Dict[str, str] = {
    "notepad.exe": "notepad",
    "explorer.exe": "explorer",
    "calc.exe": "calc",
    "mspaint.exe": "mspaint",
    "cmd.exe": "cmd",
    "powershell.exe": "powershell",
    "wordpad.exe": "wordpad",
    "charmap.exe": "charmap",
    "snippingtool.exe": "snippingtool",
    "taskmgr.exe": "taskmgr",
    "regedit.exe": "regedit",
    "mstsc.exe": "mstsc",
    "winword.exe": "winword",
    "excel.exe": "excel",
    "powerpnt.exe": "powerpnt",
    "outlook.exe": "outlook",
    "msedge.exe": "msedge",
    "chrome.exe": "chrome",
    "firefox.exe": "firefox",
    "code.exe": "code",
}
|
||
|
||
# Mapping of executables to the visual name to search for in the Start menu.
# Holds the search text (often the French name) and a description for the
# VLM so it can identify the icon among the search results.
# Format: "processname.exe" -> {"search_text": ..., "display_name": ..., "vlm_description": ...}
_APP_VISUAL_SEARCH: Dict[str, Dict[str, str]] = {
    "notepad.exe": {
        "search_text": "Bloc-notes",
        "display_name": "Bloc-notes",
        "vlm_description": "L'application Bloc-notes (Notepad) dans les résultats de recherche",
    },
    "calc.exe": {
        "search_text": "Calculatrice",
        "display_name": "Calculatrice",
        "vlm_description": "L'application Calculatrice dans les résultats de recherche",
    },
    "mspaint.exe": {
        "search_text": "Paint",
        "display_name": "Paint",
        "vlm_description": "L'application Paint dans les résultats de recherche",
    },
    "cmd.exe": {
        "search_text": "Invite de commandes",
        "display_name": "Invite de commandes",
        "vlm_description": "L'Invite de commandes (Command Prompt) dans les résultats",
    },
    "powershell.exe": {
        "search_text": "PowerShell",
        "display_name": "PowerShell",
        "vlm_description": "Windows PowerShell dans les résultats de recherche",
    },
    "wordpad.exe": {
        "search_text": "WordPad",
        "display_name": "WordPad",
        "vlm_description": "L'application WordPad dans les résultats de recherche",
    },
    "winword.exe": {
        "search_text": "Word",
        "display_name": "Microsoft Word",
        "vlm_description": "Microsoft Word dans les résultats de recherche",
    },
    "excel.exe": {
        "search_text": "Excel",
        "display_name": "Microsoft Excel",
        "vlm_description": "Microsoft Excel dans les résultats de recherche",
    },
    "powerpnt.exe": {
        "search_text": "PowerPoint",
        "display_name": "Microsoft PowerPoint",
        "vlm_description": "Microsoft PowerPoint dans les résultats de recherche",
    },
    "outlook.exe": {
        "search_text": "Outlook",
        "display_name": "Microsoft Outlook",
        "vlm_description": "Microsoft Outlook dans les résultats de recherche",
    },
    "msedge.exe": {
        "search_text": "Edge",
        "display_name": "Microsoft Edge",
        "vlm_description": "Microsoft Edge dans les résultats de recherche",
    },
    "chrome.exe": {
        "search_text": "Chrome",
        "display_name": "Google Chrome",
        "vlm_description": "Google Chrome dans les résultats de recherche",
    },
    "firefox.exe": {
        "search_text": "Firefox",
        "display_name": "Mozilla Firefox",
        "vlm_description": "Mozilla Firefox dans les résultats de recherche",
    },
    "code.exe": {
        "search_text": "Visual Studio Code",
        "display_name": "Visual Studio Code",
        "vlm_description": "Visual Studio Code dans les résultats de recherche",
    },
    "taskmgr.exe": {
        "search_text": "Gestionnaire des tâches",
        "display_name": "Gestionnaire des tâches",
        "vlm_description": "Le Gestionnaire des tâches dans les résultats de recherche",
    },
    "snippingtool.exe": {
        "search_text": "Outil Capture",
        "display_name": "Outil Capture d'écran",
        "vlm_description": "L'Outil Capture d'écran dans les résultats de recherche",
    },
    "mstsc.exe": {
        "search_text": "Connexion Bureau à distance",
        "display_name": "Bureau à distance",
        "vlm_description": "La Connexion Bureau à distance dans les résultats",
    },
}
|
||
|
||
# Windows applications to ignore during setup (system processes, agents, etc.)
_SETUP_IGNORE_APPS = {
    "searchhost.exe",  # Windows search bar
    "explorer.exe",  # Explorer is always running (Windows shell)
    "pythonw.exe",  # Python agent (our own agent)
    "python.exe",  # Same
    "shellexperiencehost.exe",
    "startmenuexperiencehost.exe",
    "applicationframehost.exe",
    "systemsettings.exe",
    "textinputhost.exe",
    "runtimebroker.exe",
}
|
||
|
||
|
||
def _extract_required_apps_from_events(raw_events: list) -> Dict[str, Any]:
    """Derive the applications a session requires from its raw events.

    Scans window_focus_change events (and mouse_click events that carry a
    window field) to count application usage, then picks the most used
    non-system application as the primary one.

    Args:
        raw_events: Raw events from live_events.jsonl.

    Returns:
        Dict with keys:
        - primary_app: str (main executable name, e.g. "Notepad.exe")
        - primary_launch_cmd: str (Win+R command, e.g. "notepad")
        - first_window_title: str (title of the first applicative window)
        - apps: dict[str, int] (app_name -> occurrence count)
        Empty dict when nothing usable was found.
    """
    counts: Dict[str, int] = defaultdict(int)
    first_app = None
    first_title = None

    for raw_evt in raw_events:
        payload = raw_evt.get("event", raw_evt)
        kind = payload.get("type", "")

        if kind == "window_focus_change":
            target = payload.get("to", {})
            if not target:
                continue
            name = target.get("app_name", "")
            if not name:
                continue
            counts[name] += 1
            # Remember the first non-system app for the initial setup
            if first_app is None and name.lower() not in _SETUP_IGNORE_APPS:
                first_app = name
                first_title = target.get("title", "")

        elif kind == "mouse_click":
            # mouse_click events may also carry the focused window
            win = payload.get("window", {})
            if isinstance(win, dict):
                name = win.get("app_name", "")
                if name:
                    counts[name] += 1

    if not counts:
        return {}

    # Primary application = most frequent, excluding ignored system apps
    candidates = {
        name: n for name, n in counts.items()
        if name.lower() not in _SETUP_IGNORE_APPS
    }
    if not candidates:
        return {}

    primary = max(candidates, key=candidates.get)

    return {
        "primary_app": primary,
        "primary_launch_cmd": _resolve_launch_command(primary),
        "first_window_title": first_title or "",
        "apps": dict(counts),
    }
|
||
|
||
|
||
def _extract_required_apps_from_workflow(workflow) -> Dict[str, Any]:
    """Extract the required applications from a structured workflow.

    Walks the workflow nodes to collect the required window titles,
    then infers the primary application from them.

    Args:
        workflow: Workflow object or raw dict.

    Returns:
        Same format as _extract_required_apps_from_events, plus
        source_session_id and machine_id copied from the metadata.
    """
    # Access the data (works for both an object and a plain dict)
    if hasattr(workflow, 'nodes'):
        nodes = workflow.nodes
        metadata = workflow.metadata if hasattr(workflow, 'metadata') else {}
    elif isinstance(workflow, dict):
        nodes = workflow.get('nodes', [])
        metadata = workflow.get('metadata', {})
    else:
        return {}

    if not nodes:
        return {}

    # Collect window titles from the nodes (template may be a dict or object)
    window_titles = []
    for node in nodes:
        template = node.template if hasattr(node, 'template') else node.get('template', {})
        if isinstance(template, dict):
            window = template.get('window', {})
        elif hasattr(template, 'window'):
            window = template.window if hasattr(template.window, '__dict__') else {}
        else:
            window = {}

        if isinstance(window, dict):
            title = window.get('title_pattern', '') or window.get('title_contains', '')
        elif hasattr(window, 'title_pattern'):
            title = getattr(window, 'title_pattern', '') or ''
        else:
            title = ''

        if title:
            window_titles.append(title)

    # Infer the primary app from the window titles
    primary_app, primary_launch_cmd, matched_title = _infer_app_from_window_titles(window_titles)
    # Use the title that matched the app (the first node may be "Rechercher")
    first_title = matched_title or (window_titles[0] if window_titles else "")

    if not primary_app:
        return {}

    source_session_id = metadata.get("source_session_id", "") if isinstance(metadata, dict) else ""
    machine_id = metadata.get("machine_id", "") if isinstance(metadata, dict) else ""

    return {
        "primary_app": primary_app,
        "primary_launch_cmd": primary_launch_cmd,
        "first_window_title": first_title,
        "apps": {},
        "source_session_id": source_session_id,
        "machine_id": machine_id,
    }
|
||
|
||
|
||
def _resolve_launch_command(app_name: str) -> str:
    """Map an executable name to its Win+R launch command.

    Falls back to the executable name minus any trailing ".exe" when the
    app is not in the known mapping (works for most applications).
    """
    lowered = app_name.lower()
    mapped = _APP_LAUNCH_COMMANDS.get(lowered)
    if mapped is not None:
        return mapped
    # Fallback: bare executable name without the .exe extension
    return app_name[:-4] if lowered.endswith(".exe") else app_name
|
||
|
||
|
||
def _infer_app_from_window_titles(titles: list) -> tuple:
    """Infer the application name and launch command from window titles.

    Uses substring heuristics over common Windows window-title patterns.

    Args:
        titles: Window titles collected from a session or workflow.

    Returns:
        Tuple (app_name, launch_command, matched_title).
        ("", "", "") when no application could be identified.
    """
    # Order matters: more specific patterns must come before generic ones.
    _TITLE_APP_PATTERNS = [
        ("bloc-notes", "Notepad.exe", "notepad"),
        ("notepad", "Notepad.exe", "notepad"),
        # Fix: "wordpad" must be tested before "word", otherwise a WordPad
        # window title would be mis-attributed to Microsoft Word.
        ("wordpad", "wordpad.exe", "wordpad"),
        ("word", "winword.exe", "winword"),
        ("excel", "excel.exe", "excel"),
        ("powerpoint", "powerpnt.exe", "powerpnt"),
        ("outlook", "outlook.exe", "outlook"),
        ("paint", "mspaint.exe", "mspaint"),
        ("calculatrice", "calc.exe", "calc"),
        ("calculator", "calc.exe", "calc"),
        ("explorateur de fichiers", "explorer.exe", "explorer"),
        ("file explorer", "explorer.exe", "explorer"),
        ("invite de commandes", "cmd.exe", "cmd"),
        ("command prompt", "cmd.exe", "cmd"),
        ("powershell", "powershell.exe", "powershell"),
        ("visual studio code", "code.exe", "code"),
        ("edge", "msedge.exe", "msedge"),
        ("chrome", "chrome.exe", "chrome"),
        ("firefox", "firefox.exe", "firefox"),
    ]

    for title in titles:
        title_lower = title.lower()
        for pattern, app_name, launch_cmd in _TITLE_APP_PATTERNS:
            if pattern in title_lower:
                # Skip system apps (explorer, etc.) — always running
                if app_name.lower() in _SETUP_IGNORE_APPS:
                    continue
                return (app_name, launch_cmd, title)

    return ("", "", "")
|
||
|
||
|
||
def _get_visual_search_info(app_name: str) -> Dict[str, str]:
    """Return the Start-menu visual-search info for an application.

    Looks up _APP_VISUAL_SEARCH; otherwise builds a generic fallback from
    the executable name (e.g. "MonApp.exe" -> search_text "MonApp").

    Args:
        app_name: Executable name (e.g. "Notepad.exe").

    Returns:
        Dict with search_text, display_name, vlm_description.
    """
    lowered = app_name.lower()
    known = _APP_VISUAL_SEARCH.get(lowered)
    if known is not None:
        # Return a copy so callers cannot mutate the shared mapping
        return dict(known)

    # Fallback: executable name without the .exe extension
    base = app_name[:-4] if lowered.endswith(".exe") else app_name
    return {
        "search_text": base,
        "display_name": base,
        "vlm_description": f"L'application {base} dans les résultats de recherche",
    }
|
||
|
||
|
||
def _generate_setup_actions(
    app_info: Dict[str, Any],
    setup_id_prefix: str = "setup",
) -> List[Dict[str, Any]]:
    """Generate the 100% visual actions that open the application before replay.

    Fully visual approach — NEVER keyboard shortcuts (Win, Win+R, Ctrl+X,
    etc.) that were not recorded by the user. Everything goes through
    visual clicks resolved by the VLM (Qwen2.5-VL).

    The sequence is:
        1. Visual click on the Start button (bottom-left corner of the screen)
        2. Wait for the Start menu to open (1s)
        3. Visual click on the Start-menu search bar
        4. Wait for the search bar to become active (500ms)
        5. Type the application name (French text, e.g. "Bloc-notes")
        6. Wait for the search results (1.2s)
        7. Visual click on the matching application result
        8. Wait for the application to open (2-3s depending on weight)
        9. verify_screen: check that the expected window appeared

    Args:
        app_info: Dict returned by _extract_required_apps_from_events or
            _extract_required_apps_from_workflow.
        setup_id_prefix: Prefix for the generated action_ids.

    Returns:
        List of normalized actions, ready to be injected into the queue.
        Empty list when no preparation is needed.
    """
    if not app_info:
        return []

    launch_cmd = app_info.get("primary_launch_cmd", "")
    primary_app = app_info.get("primary_app", "")
    first_title = app_info.get("first_window_title", "")

    if not launch_cmd:
        logger.debug(
            "setup_actions : pas de commande de lancement pour '%s', skip",
            primary_app,
        )
        return []

    # Do not launch system apps (always present)
    if primary_app.lower() in _SETUP_IGNORE_APPS:
        logger.debug("setup_actions : app '%s' ignorée (système)", primary_app)
        return []

    # Get the visual-search information for this app
    visual_info = _get_visual_search_info(primary_app)
    search_text = visual_info["search_text"]
    display_name = visual_info["display_name"]
    vlm_description = visual_info["vlm_description"]

    actions = []

    logger.info(
        "Génération setup env 100%% visuel : lancement de '%s' via clic "
        "Démarrer → recherche visuelle '%s' (fenêtre attendue : '%s')",
        primary_app, search_text, first_title,
    )

    # 1. Visual click on the Start button (always visible, bottom-left)
    # The VLM resolves the exact position; x_pct/y_pct are fallbacks.
    actions.append({
        "action_id": f"act_{setup_id_prefix}_click_start",
        "type": "click",
        "x_pct": 0.02,
        "y_pct": 0.98,
        "button": "left",
        "visual_mode": True,
        "target_spec": {
            "by_text": "Démarrer",
            "by_role": "start_button",
            "vlm_description": (
                "Le bouton Démarrer de Windows (icône Windows), "
                "en bas à gauche de la barre des tâches"
            ),
        },
        "_setup_phase": True,
        "_setup_step": "click_start_menu",
    })

    # 2. Wait for the Start menu to open
    actions.append({
        "action_id": f"act_{setup_id_prefix}_wait_start",
        "type": "wait",
        "duration_ms": 1000,
        "_setup_phase": True,
        "_setup_step": "wait_start_menu",
    })

    # 3. Visual click on the Start-menu search bar
    # On Windows 10/11 the search bar is embedded in the Start menu
    # or visible in the taskbar. The VLM locates it visually.
    actions.append({
        "action_id": f"act_{setup_id_prefix}_click_search",
        "type": "click",
        "x_pct": 0.20,
        "y_pct": 0.92,
        "button": "left",
        "visual_mode": True,
        "target_spec": {
            "by_text": "Rechercher",
            "by_role": "search_box",
            "vlm_description": (
                "La barre ou le champ de recherche dans le menu Démarrer "
                "de Windows, souvent intitulé 'Tapez ici pour rechercher' "
                "ou 'Rechercher'"
            ),
        },
        "_setup_phase": True,
        "_setup_step": "click_search_box",
    })

    # 4. Wait for the search bar to be active and ready
    actions.append({
        "action_id": f"act_{setup_id_prefix}_wait_search_ready",
        "type": "wait",
        "duration_ms": 500,
        "_setup_phase": True,
        "_setup_step": "wait_search_ready",
    })

    # 5. Type the visual name of the application (French text)
    # The search field was clicked visually at step 3,
    # so the type runs in the active field.
    actions.append({
        "action_id": f"act_{setup_id_prefix}_type_search",
        "type": "type",
        "text": search_text,
        "_setup_phase": True,
        "_setup_step": "type_app_name",
    })

    # 6. Wait for Windows search to find the application
    actions.append({
        "action_id": f"act_{setup_id_prefix}_wait_results",
        "type": "wait",
        "duration_ms": 1200,
        "_setup_phase": True,
        "_setup_step": "wait_search_results",
    })

    # 7. Visual click on the application entry in the result list
    # The VLM identifies the app icon/text among the results.
    actions.append({
        "action_id": f"act_{setup_id_prefix}_click_result",
        "type": "click",
        "x_pct": 0.20,
        "y_pct": 0.50,
        "button": "left",
        "visual_mode": True,
        "target_spec": {
            "by_text": display_name,
            "by_role": "app_icon",
            "vlm_description": vlm_description,
        },
        "_setup_phase": True,
        "_setup_step": "click_app_result",
    })

    # 8. Wait for the application to open
    # Variable duration: 3s for heavy apps (Office, VS Code), 2s otherwise
    heavy_apps = {"winword.exe", "excel.exe", "powerpnt.exe", "outlook.exe", "code.exe"}
    wait_ms = 3000 if primary_app.lower() in heavy_apps else 2000
    actions.append({
        "action_id": f"act_{setup_id_prefix}_wait_launch",
        "type": "wait",
        "duration_ms": wait_ms,
        "_setup_phase": True,
        "_setup_step": "wait_app_launch",
    })

    # 9. Visual check that the expected window appeared
    if first_title:
        actions.append({
            "action_id": f"act_{setup_id_prefix}_verify",
            "type": "verify_screen",
            "expected_node": "setup_initial",
            "timeout_ms": 5000,
            "_setup_phase": True,
            "_setup_step": "verify_app_ready",
            "_expected_title": first_title,
        })

    logger.info(
        "Setup env visuel généré : %d actions pour lancer '%s' "
        "(recherche visuelle : '%s')",
        len(actions), primary_app, search_text,
    )

    return actions
|
||
|
||
|
||
def _validate_replay_action(action: dict) -> Optional[str]:
    """Validate a replay action; return an error message, or None when valid."""
    kind = action.get("type", "")

    # Action type must be whitelisted
    if kind not in _ALLOWED_ACTION_TYPES:
        return f"Type d'action non autorisé : '{kind}'. Autorisés : {sorted(_ALLOWED_ACTION_TYPES)}"

    # Text payload length bound
    text = action.get("text", "")
    if isinstance(text, str) and len(text) > _MAX_ACTION_TEXT_LENGTH:
        return f"Texte trop long ({len(text)} > {_MAX_ACTION_TEXT_LENGTH} caractères)"

    # Key-combo checks: count and key-name whitelist
    keys = action.get("keys", [])
    if isinstance(keys, list):
        if len(keys) > _MAX_KEYS_PER_COMBO:
            return f"Trop de touches ({len(keys)} > {_MAX_KEYS_PER_COMBO})"
        for key in keys:
            name = str(key)
            # Single characters (a-z, 0-9, punctuation) and known names pass
            if len(name) != 1 and name.lower() not in _KNOWN_KEY_NAMES:
                return f"Touche inconnue : '{key}'"

    # Normalized coordinates must be floats in [0, 1]
    for coord_name in ("x_pct", "y_pct"):
        raw = action.get(coord_name)
        if raw is None:
            continue
        try:
            val_f = float(raw)
            if not (0.0 <= val_f <= 1.0):
                return f"Coordonnée {coord_name}={val_f} hors limites [0.0, 1.0]"
        except (TypeError, ValueError):
            return f"Coordonnée {coord_name} invalide : {raw}"

    return None  # Valid
|
||
|
||
|
||
# In production (ENVIRONMENT != development), disable the Swagger docs
_is_production = os.environ.get("ENVIRONMENT", "development") != "development"

app = FastAPI(
    title="RPA Vision V3 - Streaming API v1",
    # Bearer token enforced on every route (public paths exempted inside)
    dependencies=[Depends(_verify_token)],
    docs_url=None if _is_production else "/docs",
    redoc_url=None if _is_production else "/redoc",
    openapi_url=None if _is_production else "/openapi.json",
)
|
||
|
||
# CORS — allowed origins (VWB frontend, Agent Chat, Dashboard)
# Configurable via the CORS_ORIGINS environment variable (comma-separated)
# Includes the public domain for internet access through the NPM reverse proxy
# NOTE: adjacent string literals are concatenated; each piece carries its
# trailing comma so the final string splits cleanly on ","
_DEFAULT_CORS_ORIGINS = (
    "http://localhost:3002,"  # VWB Frontend (Vite/React)
    "http://localhost:5002,"  # VWB Backend (Flask)
    "http://localhost:5004,"  # Agent Chat
    "http://localhost:5001,"  # Web Dashboard
    "http://192.168.1.40:3002,"  # VWB Frontend from the local network
    "http://192.168.1.40:5004,"  # Agent Chat from the local network
    "https://lea.labs.laurinebazin.design,"  # Public HTTPS domain
    "https://vwb.labs.laurinebazin.design"  # Public VWB HTTPS
)
CORS_ORIGINS = os.environ.get("CORS_ORIGINS", _DEFAULT_CORS_ORIGINS).split(",")
CORS_ORIGINS = [o.strip() for o in CORS_ORIGINS if o.strip()]

app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["GET", "POST"],
    allow_headers=["Content-Type", "Authorization"],
)
|
||
|
||
|
||
@app.middleware("http")
async def security_headers_middleware(request: Request, call_next):
    """Attach standard security headers to every response."""
    response = await call_next(request)
    headers = response.headers
    headers["X-Content-Type-Options"] = "nosniff"
    headers["X-Frame-Options"] = "DENY"
    headers["X-XSS-Protection"] = "1; mode=block"
    headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
    # HSTS only when the request arrived over HTTPS, either directly or
    # through a reverse proxy that sets X-Forwarded-Proto.
    behind_tls_proxy = request.headers.get("X-Forwarded-Proto") == "https"
    if request.url.scheme == "https" or behind_tls_proxy:
        headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
    return response
|
||
|
||
|
||
@app.middleware("http")
async def rate_limit_middleware(request: Request, call_next):
    """Enforce per-endpoint rate limits before handling the request."""
    path = request.url.path
    # Only the sensitive endpoints listed in _RATE_LIMITS are throttled
    if path not in _RATE_LIMITS:
        return await call_next(request)

    client_ip = request.client.host if request.client else "unknown"
    if _check_rate_limit(path, client_ip):
        return await call_next(request)

    from fastapi.responses import JSONResponse
    logger.warning(f"Rate limit dépassé : {path} par {client_ip}")
    return JSONResponse(
        status_code=429,
        content={"detail": f"Trop de requêtes. Limite : {_RATE_LIMITS[path]}/{_RATE_LIMIT_WINDOW}s"},
    )
|
||
|
||
|
||
# Live-session storage directory
ROOT_DIR = Path(__file__).parent.parent.parent
LIVE_SESSIONS_DIR = ROOT_DIR / "data" / "training" / "live_sessions"
LIVE_SESSIONS_DIR.mkdir(parents=True, exist_ok=True)

# =========================================================================
# Communication with the VLM worker (separate process)
# The HTTP server NEVER runs VLM inference — it writes to files that the
# VLM worker (run_worker.py) reads from its own process.
# =========================================================================
_DATA_DIR = ROOT_DIR / "data" / "training"
WORKER_QUEUE_FILE = _DATA_DIR / "_worker_queue.txt"
REPLAY_LOCK_FILE = _DATA_DIR / "_replay_active.lock"

# Shared global instances (the StreamProcessor stays inside the HTTP
# server for CLIP, FAISS indexing, session management and replay — but it
# does NOT run VLM/reprocess_session; the separate worker handles that)
processor = StreamProcessor(data_dir=str(LIVE_SESSIONS_DIR))
worker = StreamWorker(live_dir=str(LIVE_SESSIONS_DIR), processor=processor)
# =========================================================================
# Guaranteed flush on shutdown — signal handler + atexit (belt and braces)
# =========================================================================
# The FastAPI shutdown handler (@app.on_event("shutdown")) already flushes,
# but if the server is killed by SIGTERM (systemd) or SIGINT (Ctrl+C)
# before uvicorn runs its clean shutdown, that flush never happens. The
# signal handler and atexit hook below act as safety nets.

def _emergency_flush(signum=None, frame=None):
    """Flush dirty sessions to disk before the process exits.

    Invoked from SIGTERM/SIGINT or atexit. Idempotent (flush() is
    thread-safe).
    """
    trigger = signal.Signals(signum).name if signum else "atexit"
    logger.info(f"Flush d'urgence des sessions en cours ({trigger})...")
    try:
        processor.session_manager.flush()
        logger.info("Flush d'urgence terminé — données persistées.")
    except Exception as e:
        logger.error(f"Erreur pendant le flush d'urgence : {e}")
    if signum is None:
        return
    # Triggered by a signal: restore the default handler and re-raise it
    # so the process terminates with the expected exit status.
    signal.signal(signum, signal.SIG_DFL)
    os.kill(os.getpid(), signum)
# Only register the handlers when the module runs as the server (not on a
# plain import from another process such as batch reprocessing).
def _register_shutdown_handlers():
    """Install SIGTERM/SIGINT handlers plus an atexit flush hook."""
    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, _emergency_flush)
    atexit.register(processor.session_manager.flush)
    logger.info("Handlers de shutdown enregistrés (SIGTERM, SIGINT, atexit)")
def _enqueue_to_worker(session_id: str):
    """Append a session_id to the VLM worker queue (a file on disk).

    The VLM worker (separate process) reads this file and processes the
    sessions. Duplicates are avoided: the id is only appended when it is
    not already queued.
    """
    try:
        WORKER_QUEUE_FILE.parent.mkdir(parents=True, exist_ok=True)

        # Load the current queue contents to detect duplicates.
        queued = set()
        if WORKER_QUEUE_FILE.exists():
            raw = WORKER_QUEUE_FILE.read_text(encoding="utf-8")
            queued = {entry.strip() for entry in raw.splitlines() if entry.strip()}

        if session_id in queued:
            logger.info(f"Session {session_id} déjà dans la queue worker, skip")
            return

        # Append at the end of the queue file.
        with open(WORKER_QUEUE_FILE, "a", encoding="utf-8") as f:
            f.write(session_id + "\n")

        logger.info(f"Session {session_id} ajoutée à la queue worker ({WORKER_QUEUE_FILE})")
    except Exception as e:
        logger.error(f"Erreur écriture queue worker : {e}")
def _set_replay_lock(replay_id: str = ""):
    """Create the replay lock file (tells the VLM worker to suspend itself)."""
    try:
        REPLAY_LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
        payload = f"replay_id={replay_id}\ntimestamp={time.time()}\n"
        REPLAY_LOCK_FILE.write_text(payload, encoding="utf-8")
        logger.info(f"Replay lock créé : {REPLAY_LOCK_FILE} (replay={replay_id})")
    except Exception as e:
        logger.error(f"Erreur création replay lock : {e}")
def _clear_replay_lock():
    """Remove the replay lock file so the VLM worker may resume."""
    try:
        REPLAY_LOCK_FILE.unlink(missing_ok=True)
        logger.info("Replay lock supprimé, worker VLM autorisé à reprendre")
    except Exception as e:
        logger.error(f"Erreur suppression replay lock : {e}")
def _get_worker_queue_status() -> Dict[str, Any]:
    """Return the state of the VLM worker queue (for monitoring)."""
    pending: List[str] = []
    if WORKER_QUEUE_FILE.exists():
        try:
            raw = WORKER_QUEUE_FILE.read_text(encoding="utf-8")
            pending = [entry.strip() for entry in raw.splitlines() if entry.strip()]
        except Exception:
            # Best-effort read: an unreadable queue file reports as empty.
            pass

    return {
        "running": True,  # Whether the worker process is alive is unknown, but the queue exists
        "queue_length": len(pending),
        "queue": pending,
        "replay_lock_active": REPLAY_LOCK_FILE.exists(),
        "queue_file": str(WORKER_QUEUE_FILE),
        "note": "Le worker VLM tourne dans un process séparé (run_worker.py)",
    }
# =========================================================================
# Counter of in-flight analyses per session (awaited before finalize)
# =========================================================================
# session_id -> number of analyses still running for that session
_pending_analyses: Dict[str, int] = defaultdict(int)
_pending_lock = threading.Lock()

# =========================================================================
# Per-session replay queue
# Each session has a queue of actions to execute plus a replay state
# =========================================================================
_replay_lock = threading.Lock()
# session_id -> list of pending actions (FIFO)
_replay_queues: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
# machine_id -> session_id (mapping for machine-targeted replay)
_machine_replay_target: Dict[str, str] = {}
# replay_id -> replay state (workflow_id, session_id, status, progress)
_replay_states: Dict[str, Dict[str, Any]] = {}
class StreamEvent(BaseModel):
    """A single recorded event streamed by Agent V1."""
    session_id: str
    timestamp: float
    event: Dict[str, Any]
    machine_id: str = "default"  # Source machine id (multi-machine, backward-compatible)
class ReplayRequest(BaseModel):
    """Request to launch the replay of a workflow."""
    workflow_id: str
    session_id: str
    machine_id: Optional[str] = None  # Target machine for the replay (multi-machine)
    params: Optional[Dict[str, Any]] = None
class RawReplayRequest(BaseModel):
    """Replay request carrying raw actions (Free Agent mode)."""
    actions: List[Dict[str, Any]]
    session_id: str = ""
    machine_id: Optional[str] = None  # Target machine (multi-machine)
    task_description: str = ""
class SingleActionRequest(BaseModel):
    """Request to execute a single action (Copilot mode)."""
    action: Dict[str, Any]
    session_id: str = ""
    machine_id: Optional[str] = None  # Target machine (multi-machine)
class ReplayResultReport(BaseModel):
    """Result report for one action executed by Agent V1."""
    session_id: str
    action_id: str
    success: bool
    error: Optional[str] = None
    warning: Optional[str] = None  # "no_screen_change", "popup_handled", "visual_resolve_failed"
    screenshot: Optional[str] = None  # Path or base64 of the post-action screenshot
    screenshot_after: Optional[str] = None  # Path or base64 of the screenshot AFTER the action
    screenshot_before: Optional[str] = None  # Screenshot BEFORE the action (for the Critic)
    actual_position: Optional[Dict[str, float]] = None  # {"x": px, "y": py} actual click position
    # Visual resolution metrics
    resolution_method: Optional[str] = None  # som_text_match, som_vlm, vlm_quick_find, etc.
    resolution_score: Optional[float] = None
    resolution_elapsed_ms: Optional[float] = None
    # Enriched fields for target_not_found (supervised pause)
    target_description: Optional[str] = None  # Human-readable description of the target
    target_spec: Optional[Dict[str, Any]] = None  # Full spec of the target
class ErrorCallbackConfig(BaseModel):
    """Error-callback configuration for a replay."""
    replay_id: str
    callback_url: str  # URL invoked on a non-recoverable error
# Background thread that periodically prunes finished replays and expired sessions
_cleanup_thread: Optional[threading.Thread] = None
_cleanup_running = False
def _cleanup_loop():
    """Periodic cleanup of finished replay states and expired sessions.

    Runs in the background every 10 minutes:
    - Drops completed/error/failed replay states older than REPLAY_STATE_TTL_SECONDS
    - Prunes in-memory sessions via LiveSessionManager.cleanup_old_sessions()
    - Caps _replay_states at MAX_REPLAY_STATES entries
    """
    while _cleanup_running:
        time.sleep(600)  # 10 minutes
        if not _cleanup_running:
            break
        try:
            _cleanup_replay_states()
            # Prune expired in-memory sessions. Called every 10-minute
            # cycle, but only sessions older than 24h are actually removed.
            processor.session_manager.cleanup_old_sessions(max_age_hours=24)
        except Exception as e:
            logger.error(f"Erreur dans la boucle de nettoyage : {e}")
def _last_activity_ts(state: Dict[str, Any]) -> float:
    """Best-effort timestamp of the last activity recorded in a replay state.

    Looks at the last result entry first, then the last error_log entry.
    Returns 0 when no timestamp can be found (orphan state).
    """
    results = state.get("results", [])
    last_time = results[-1].get("timestamp", 0) if results else 0
    if not last_time:
        error_log = state.get("error_log", [])
        last_time = error_log[-1].get("timestamp", 0) if error_log else 0
    return last_time


def _cleanup_replay_states():
    """Remove finished replay states (completed/error/failed) older than the TTL.

    Also bounds _replay_states to MAX_REPLAY_STATES entries. Fix: the
    previous implementation claimed to sort finished entries before
    trimming but actually evicted in dict insertion order, which could
    drop recent replays while keeping older ones — finished entries are
    now sorted by last activity so the OLDEST go first.
    """
    now = time.time()
    to_delete = []

    with _replay_lock:
        for replay_id, state in _replay_states.items():
            if state["status"] not in ("completed", "error", "failed"):
                continue
            last_time = _last_activity_ts(state)
            if not last_time:
                # No timestamp anywhere: orphan state, drop it.
                to_delete.append(replay_id)
            elif now - last_time > REPLAY_STATE_TTL_SECONDS:
                to_delete.append(replay_id)

        # Delete the expired entries (and their error callbacks).
        for replay_id in to_delete:
            del _replay_states[replay_id]
            _error_callbacks.pop(replay_id, None)

        # Safety bound: if still too many entries, evict the oldest finished ones.
        if len(_replay_states) > MAX_REPLAY_STATES:
            finished = [
                (rid, s) for rid, s in _replay_states.items()
                if s["status"] in ("completed", "error", "failed")
            ]
            # Sort by last activity so the oldest finished replays go first.
            finished.sort(key=lambda item: _last_activity_ts(item[1]))
            excess = len(_replay_states) - MAX_REPLAY_STATES
            for rid, _ in finished[:excess]:
                del _replay_states[rid]
                _error_callbacks.pop(rid, None)

    if to_delete:
        logger.info(f"Nettoyage replay states : {len(to_delete)} entrées supprimées")
@app.get("/health")
async def health_check():
    """Health endpoint (public, no token required)."""
    payload = {"status": "healthy", "version": "1.0.0"}
    return payload
def _check_gpu_ready():
    """Check that the GPU has enough free VRAM for the pipeline.

    At least 6 GB is required for the VLM (gemma4:e4b ~10 GB) and the
    CLIP/FAISS models. Logs a warning when VRAM is insufficient, an info
    line otherwise.
    """
    try:
        import subprocess
        query = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        if query.returncode != 0:
            logger.debug(f"nvidia-smi retour non-zéro : {query.stderr.strip()}")
            return
        # nvidia-smi may print one line per GPU — keep the first one.
        first_line = query.stdout.strip().split("\n")[0].strip()
        free_mb = int(first_line)
        if free_mb >= 6000:  # 6 GB floor for VLM + CLIP
            logger.info(f"GPU OK : {free_mb} MB VRAM libres")
            return
        logger.warning(
            f"VRAM insuffisante : {free_mb} MB libres (minimum 6000 MB). "
            f"Vérifier les process GPU avec nvidia-smi."
        )
        print(
            f"\n [GPU WARNING] VRAM insuffisante : {free_mb} MB libres "
            f"(minimum 6000 MB)\n"
        )
    except FileNotFoundError:
        logger.debug("nvidia-smi non trouvé — pas de GPU NVIDIA détecté")
    except Exception as e:
        logger.debug(f"GPU check échoué : {e}")
@app.on_event("startup")
async def startup():
    """Start the streaming worker and load the existing workflows.

    NOTE: The VLM (SessionWorker) now runs in a separate process
    (run_worker.py). This HTTP server no longer performs any VLM work —
    it stays responsive for replays, events and images at all times.
    """
    global _cleanup_running, _cleanup_thread

    # Check available GPU VRAM at startup
    _check_gpu_ready()

    # Register the shutdown handlers (SIGTERM, SIGINT, atexit)
    _register_shutdown_handlers()

    # Resolve and display the VLM model in use
    from core.detection.vlm_config import get_vlm_model
    _vlm_model_name = get_vlm_model()
    logger.info("VLM model: %s", _vlm_model_name)
    print(f"\n VLM model: {_vlm_model_name}")

    # Print the API token at startup so the user can configure the agent
    _token_source = "env RPA_API_TOKEN" if os.environ.get("RPA_API_TOKEN") else "auto-généré"
    logger.info(f"API Token ({_token_source}): {API_TOKEN}")
    print(f"\n{'='*60}")
    print(f" API Token ({_token_source}):")
    print(f" {API_TOKEN}")
    print(f" Configurer l'agent : export RPA_API_TOKEN={API_TOKEN}")
    print(f"{'='*60}\n")

    worker.start(blocking=False)

    # Load existing workflows from disk
    _load_existing_workflows()

    # Make sure the replay lock is cleared at startup (previous crash)
    _clear_replay_lock()

    # Start the periodic cleanup thread
    _cleanup_running = True
    _cleanup_thread = threading.Thread(target=_cleanup_loop, daemon=True, name="replay_cleanup")
    _cleanup_thread.start()

    logger.info(
        "API Streaming démarrée — StreamProcessor, Worker et Cleanup prêts. "
        "VLM Worker dans un process séparé (run_worker.py)."
    )
def _load_existing_workflows():
    """Load the existing workflow JSON files into processor._workflows.

    Two formats are supported:
    - Workflow.load_from_file (full format with a workflow_id)
    - raw JSON with a 'name' key (simplified VWB/manual format)
    """
    from core.models.workflow_graph import Workflow

    candidate_dirs = [
        ROOT_DIR / "data" / "workflows",
        ROOT_DIR / "data" / "training" / "workflows",
        LIVE_SESSIONS_DIR / "workflows",
    ]

    loaded = 0
    for directory in candidate_dirs:
        if not directory.exists():
            continue
        for wf_file in directory.glob("*.json"):
            # First attempt: the full Workflow format.
            try:
                wf = Workflow.load_from_file(str(wf_file))
                if wf and hasattr(wf, 'workflow_id'):
                    with processor._data_lock:
                        processor._workflows[wf.workflow_id] = wf
                    loaded += 1
                    continue
            except Exception:
                pass

            # Fallback: load as raw JSON and inject a workflow_id.
            try:
                wf_data = json.loads(wf_file.read_text(encoding="utf-8"))
                wf_id = wf_data.get("workflow_id") or wf_file.stem
                # Store the raw dict (enough for _workflow_to_actions).
                with processor._data_lock:
                    processor._workflows[wf_id] = wf_data
                loaded += 1
            except Exception as e:
                logger.debug(f"Skip workflow {wf_file.name}: {e}")

    logger.info(f"Workflows chargés depuis disque: {loaded}")
@app.on_event("shutdown")
async def shutdown():
    """Stop background work and persist sessions before the server exits."""
    global _cleanup_running
    _cleanup_running = False
    worker.stop()
    # Drop the replay lock on shutdown — otherwise the VLM worker would
    # stay suspended forever.
    _clear_replay_lock()
    processor.session_manager.flush()
    logger.info("API Streaming arrêtée.")
# =========================================================================
|
||
# Session management
|
||
# =========================================================================
|
||
|
||
@app.post("/api/v1/traces/stream/register")
async def register_session(session_id: str, machine_id: str = "default"):
    """Register a new streaming session.

    Args:
        session_id: Unique identifier of the session
        machine_id: Identifier of the source machine (multi-machine, default: "default")
    """
    processor.session_manager.register_session(session_id, machine_id=machine_id)
    # Reset this session's counters so nothing leaks in from a previous
    # session with the same id.
    with _pending_lock:
        _pending_analyses[session_id] = 0
        _analyzed_shots[session_id] = set()
    logger.info(f"Session {session_id} enregistrée (machine={machine_id}, compteurs réinitialisés)")
    ack = {"status": "session_registered", "session_id": session_id, "machine_id": machine_id}
    return ack
def _ensure_session_registered(session_id: str, machine_id: str = "default"):
    """Auto-register a session when it is not known yet.

    Robustness against server restarts: Agent V1 does not re-register its
    session but keeps sending events/images, so the session is registered
    here on first contact.

    Args:
        session_id: Session identifier
        machine_id: Machine identifier (propagated from the agent)
    """
    session = processor.session_manager.get_session(session_id)
    if session is not None:
        # Session already known: upgrade the machine_id when the agent
        # sends a real one and only the default placeholder was stored.
        if machine_id != "default" and session.machine_id == "default":
            session.machine_id = machine_id
        return

    logger.info(f"Auto-enregistrement de la session {session_id} (machine={machine_id})")
    processor.session_manager.register_session(session_id, machine_id=machine_id)
    with _pending_lock:
        _pending_analyses[session_id] = 0
        _analyzed_shots[session_id] = set()
# =========================================================================
# Events
# =========================================================================
@app.post("/api/v1/traces/stream/event")
async def stream_event(data: StreamEvent):
    """Receive one event and record it into the session."""
    session_id = data.session_id
    machine_id = data.machine_id or "default"

    # Auto-register the session when unknown (robustness to server restarts)
    _ensure_session_registered(session_id, machine_id=machine_id)

    # Persist to disk (JSONL journal, under a per-machine subfolder when multi-machine)
    if machine_id and machine_id != "default":
        session_path = LIVE_SESSIONS_DIR / machine_id / session_id
    else:
        session_path = LIVE_SESSIONS_DIR / session_id
    session_path.mkdir(parents=True, exist_ok=True)
    event_file = session_path / "live_events.jsonl"
    with open(event_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(data.dict()) + "\n")

    # Direct processing through the StreamProcessor
    result = worker.process_event_direct(session_id, data.event)

    # ── Real-time SomEngine enrichment for mouse_click events ──
    # After the event is recorded, try the enrichment if the screenshot
    # has already arrived. Otherwise the event is parked and will be
    # enriched when the screenshot shows up (see stream_image).
    event = data.event
    if event.get("type") == "mouse_click" and event.get("screenshot_id"):
        session = processor.session_manager.get_session(session_id)
        if session:
            # The event just processed is the last one in the session list.
            event_index = len(session.events) - 1
            submitted = _try_enrich_click_event(
                session_id, event, event_index, machine_id,
            )
            result["som_enrichment"] = "submitted" if submitted else "pending_screenshot"

    return {"status": "event_synced", "session_id": session_id, **result}
# =========================================================================
# Images
# =========================================================================

# Screenshots already analyzed per session (avoids duplicate work on retries)
_analyzed_shots: Dict[str, set] = defaultdict(set)

# Hash of the last analyzed screenshot per session (similarity-based dedup)
_last_screenshot_hash: Dict[str, str] = {}

# Last heartbeat received per session: {session_id: {"path": str, "timestamp": float}}
# Used by the replay pre-check to verify the screen state before acting
_last_heartbeat: Dict[str, Dict[str, Any]] = {}
# Max heartbeat age (seconds) — beyond this the pre-check is skipped
_HEARTBEAT_MAX_AGE_SECONDS = 10.0
# Cosine similarity threshold for the pre-check to pass
_PRECHECK_SIMILARITY_THRESHOLD = 0.85

# ThreadPool for GPU analysis (keeps the async event loop unblocked)
_gpu_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="gpu_analysis")

# =========================================================================
# Real-time SomEngine enrichment
# When a mouse_click arrives with a screenshot_id, SomEngine runs to
# identify the clicked UI element. The result is stored in the session
# event, ready for replay without VLM reprocessing.
# =========================================================================
# Dedicated SomEngine thread pool (a single worker so the GPU is not saturated)
_som_enrichment_executor = ThreadPoolExecutor(
    max_workers=1, thread_name_prefix="som_enrich",
)

# Clicks waiting for enrichment (their screenshot has not arrived yet)
# Key: (session_id, screenshot_id) → dict with the required info
_pending_click_enrichments: Dict[tuple, Dict[str, Any]] = {}
_enrichment_lock = threading.Lock()

# Action screenshots that already arrived (matched against pending events)
# Key: (session_id, screenshot_id) → file path
_arrived_action_screenshots: Dict[tuple, str] = {}
def _get_session_dir(session_id: str, machine_id: str = "default") -> Path:
    """Resolve the live-session directory for a session."""
    base = LIVE_SESSIONS_DIR
    if machine_id and machine_id != "default":
        base = base / machine_id
    return base / session_id
def _get_screen_resolution_for_session(session_id: str) -> tuple:
    """Fetch the screen resolution from the in-memory session (1920x1080 fallback)."""
    fallback = (1920, 1080)
    session = processor.session_manager.get_session(session_id)
    if not session or not session.last_window_info:
        return fallback
    res = session.last_window_info.get("screen_resolution", [1920, 1080])
    if isinstance(res, list) and len(res) == 2:
        return (int(res[0]), int(res[1]))
    return fallback
def _submit_click_enrichment(
    session_id: str,
    event_data: dict,
    screenshot_path: str,
    event_index: int,
    machine_id: str = "default",
) -> None:
    """Queue the SomEngine enrichment of a click on the dedicated pool.

    Never blocks the HTTP handler — the result is written into the
    session event once SomEngine finishes (~1-2 seconds).

    Args:
        session_id: Session identifier.
        event_data: mouse_click event payload (pos, window, etc.).
        screenshot_path: Path to the full screenshot (PNG).
        event_index: Index of the event within session.events.
        machine_id: Machine identifier.
    """
    job_args = (session_id, event_data, screenshot_path, event_index, machine_id)
    _som_enrichment_executor.submit(_enrich_click_background, *job_args)
def _enrich_click_background(
    session_id: str,
    event_data: dict,
    screenshot_path: str,
    event_index: int,
    machine_id: str = "default",
) -> None:
    """Enrich a click with SomEngine in the background (separate thread).

    Calls enrich_click_from_screenshot() and stores the result directly
    into the session event (under the "enrichment" key), then forces a
    persist so the enrichment survives a restart.

    Args:
        session_id: Session identifier.
        event_data: mouse_click event payload (pos, window, vision_info...).
        screenshot_path: Path to the full screenshot (PNG).
        event_index: Index of the event within session.events.
        machine_id: Machine identifier (used to locate the session dir).
    """
    try:
        pos = event_data.get("pos", [0, 0])
        if not pos or len(pos) < 2:
            # No usable click coordinates — nothing to enrich.
            return

        click_x, click_y = int(pos[0]), int(pos[1])
        screen_w, screen_h = _get_screen_resolution_for_session(session_id)

        # Extract the window title (dict form preferred, flat field fallback)
        window = event_data.get("window", {})
        if isinstance(window, dict):
            window_title = window.get("title", "")
        else:
            window_title = event_data.get("window_title", "")

        # Extract vision_info when available (agent-side OCR)
        vision_info = event_data.get("vision_info")

        # Derive session_dir and screenshot_id for the SomEngine cache
        session_dir = _get_session_dir(session_id, machine_id)
        screenshot_id = event_data.get("screenshot_id", "")

        logger.info(
            "[SoM-RT] Enrichissement clic (%d,%d) pour %s/%s",
            click_x, click_y, session_id, screenshot_id,
        )

        enrichment = enrich_click_from_screenshot(
            screenshot_path=Path(screenshot_path),
            click_x=click_x,
            click_y=click_y,
            screen_w=screen_w,
            screen_h=screen_h,
            window_title=window_title,
            vision_info=vision_info,
            session_dir=session_dir,
            screenshot_id=screenshot_id,
        )

        if not enrichment:
            logger.debug(
                "[SoM-RT] Enrichissement vide pour %s/%s (screenshot illisible ?)",
                session_id, screenshot_id,
            )
            return

        # Store the result into the session event
        session = processor.session_manager.get_session(session_id)
        if session and 0 <= event_index < len(session.events):
            session.events[event_index]["enrichment"] = enrichment
            # Force persistence so the enrichment reaches disk
            processor.session_manager._maybe_persist(session_id)
            logger.info(
                "[SoM-RT] Clic enrichi : %s/%s → by_text='%s', by_role='%s', som=%s",
                session_id, screenshot_id,
                enrichment.get("by_text", ""),
                enrichment.get("by_role", ""),
                bool(enrichment.get("som_element")),
            )
        else:
            logger.warning(
                "[SoM-RT] Session %s introuvable ou event_index %d invalide",
                session_id, event_index,
            )

    except Exception as e:
        logger.error(
            "[SoM-RT] Erreur enrichissement clic %s : %s",
            session_id, e, exc_info=True,
        )
def _try_enrich_click_event(
    session_id: str,
    event_data: dict,
    event_index: int,
    machine_id: str = "default",
) -> bool:
    """Attempt the SomEngine enrichment of a mouse_click event.

    When the matching screenshot has already arrived, the enrichment is
    submitted right away; otherwise the event is parked until the
    screenshot shows up.

    Returns:
        True when the enrichment was submitted, False when still waiting
        for the screenshot.
    """
    screenshot_id = event_data.get("screenshot_id", "")
    if not screenshot_id:
        return False

    key = (session_id, screenshot_id)

    with _enrichment_lock:
        screenshot_path = _arrived_action_screenshots.get(key)
        if not screenshot_path:
            # Screenshot not here yet → park the event for later.
            _pending_click_enrichments[key] = {
                "event_data": event_data,
                "event_index": event_index,
                "machine_id": machine_id,
            }
            logger.debug(
                "[SoM-RT] Clic en attente du screenshot %s/%s",
                session_id, screenshot_id,
            )
            return False

        # Screenshot available → submit now and drop the cached path.
        _submit_click_enrichment(
            session_id, event_data, screenshot_path, event_index, machine_id,
        )
        _arrived_action_screenshots.pop(key, None)
        return True
def _on_action_screenshot_arrived(
    session_id: str,
    shot_id: str,
    file_path: str,
    machine_id: str = "default",
) -> bool:
    """Handle the arrival of an action screenshot (shot_XXXX_full).

    When a click is already waiting for this screenshot, its enrichment
    is submitted to the thread pool; otherwise the screenshot path is
    cached until the matching event arrives.

    Args:
        session_id: Session identifier.
        shot_id: Screenshot identifier (e.g. "shot_0003_full").
        file_path: Full path to the PNG file.
        machine_id: Machine identifier.

    Returns:
        True when an enrichment was submitted, False otherwise.
    """
    # "shot_0003_full" → "shot_0003"
    screenshot_id = shot_id.replace("_full", "")
    key = (session_id, screenshot_id)

    with _enrichment_lock:
        pending = _pending_click_enrichments.pop(key, None)
        if pending is None:
            # No click waiting → remember the screenshot for later.
            _arrived_action_screenshots[key] = file_path
            # Cap the cache size: drop the oldest entry (dict insertion
            # order) — it belongs to an event that will never arrive.
            if len(_arrived_action_screenshots) > 200:
                oldest_key = next(iter(_arrived_action_screenshots))
                _arrived_action_screenshots.pop(oldest_key, None)
            return False

        # A click was waiting → submit its enrichment now.
        _submit_click_enrichment(
            session_id,
            pending["event_data"],
            file_path,
            pending["event_index"],
            pending.get("machine_id", machine_id),
        )
        return True
def _merge_enrichments_into_raw_events(
|
||
raw_events: List[Dict[str, Any]],
|
||
session_events: List[Dict[str, Any]],
|
||
) -> int:
|
||
"""Fusionner les enrichissements SomEngine temps réel dans les events JSONL.
|
||
|
||
Les events JSONL (raw_events) sont écrits AVANT l'enrichissement SomEngine.
|
||
Les events en mémoire (session_events) contiennent l'enrichissement dans
|
||
le champ "enrichment". On les fusionne par correspondance screenshot_id.
|
||
|
||
Args:
|
||
raw_events: Events chargés depuis live_events.jsonl (structure
|
||
{"session_id": ..., "event": {...}} ou directement {...}).
|
||
session_events: Events en mémoire depuis LiveSessionState.events
|
||
(contiennent potentiellement le champ "enrichment").
|
||
|
||
Returns:
|
||
Nombre d'enrichissements fusionnés.
|
||
"""
|
||
# Construire un index screenshot_id → enrichment depuis les events mémoire
|
||
enrichment_by_shot: Dict[str, dict] = {}
|
||
for evt in session_events:
|
||
enr = evt.get("enrichment")
|
||
shot_id = evt.get("screenshot_id", "")
|
||
if enr and shot_id:
|
||
enrichment_by_shot[shot_id] = enr
|
||
|
||
if not enrichment_by_shot:
|
||
return 0
|
||
|
||
merged = 0
|
||
for raw_evt in raw_events:
|
||
inner = raw_evt.get("event", raw_evt)
|
||
if inner.get("type") != "mouse_click":
|
||
continue
|
||
shot_id = inner.get("screenshot_id", "")
|
||
if not shot_id:
|
||
continue
|
||
enr = enrichment_by_shot.get(shot_id)
|
||
if enr and "enrichment" not in inner:
|
||
inner["enrichment"] = enr
|
||
merged += 1
|
||
|
||
if merged:
|
||
logger.info(
|
||
"[SoM-RT] %d enrichissement(s) temps réel fusionné(s) dans les events JSONL",
|
||
merged,
|
||
)
|
||
return merged
|
||
|
||
|
||
def _image_hash(file_path: str) -> str:
|
||
"""Hash rapide d'une image pour détecter les doublons (~identiques).
|
||
|
||
Utilise 32x32 au lieu de 16x16 pour une meilleure discrimination
|
||
entre screenshots similaires mais pas identiques (ex: texte modifié
|
||
dans un champ, curseur déplacé, etc.).
|
||
"""
|
||
try:
|
||
from PIL import Image
|
||
import hashlib
|
||
img = Image.open(file_path)
|
||
# Réduire à 32x32 et convertir en niveaux de gris pour un hash perceptuel
|
||
thumb = img.resize((32, 32)).convert('L')
|
||
return hashlib.md5(thumb.tobytes()).hexdigest()
|
||
except Exception:
|
||
return ""
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/image")
async def stream_image(
    session_id: str,
    shot_id: str,
    machine_id: str = "default",
    file: UploadFile = File(...),
    background_tasks: BackgroundTasks = None,
):
    """Receive a screenshot and trigger its analysis through the core pipeline.

    Every image is persisted to disk; whether it gets further processing
    depends on the shot_id prefix (crop, heartbeat, focus, res_shot, or a
    full action screenshot).  Heavy VLM analysis is deferred to the
    separate worker process after finalization.
    """
    # Auto-register the session if unknown (robust to server restarts).
    _ensure_session_registered(session_id, machine_id=machine_id)

    # Persist to disk (inside a per-machine subfolder when multi-machine).
    if machine_id and machine_id != "default":
        session_path = LIVE_SESSIONS_DIR / machine_id / session_id
    else:
        session_path = LIVE_SESSIONS_DIR / session_id
    shots_dir = session_path / "shots"
    shots_dir.mkdir(parents=True, exist_ok=True)

    file_path = shots_dir / f"{shot_id}.png"
    content = await file.read()
    with open(file_path, "wb") as f:
        f.write(content)

    file_path_str = str(file_path)

    # Crops: lightweight processing (no ScreenAnalyzer pass).
    if "_crop" in shot_id:
        result = worker.process_crop_direct(session_id, shot_id, file_path_str)
        return {"status": "crop_stored", "shot_id": shot_id, **result}

    # Filter out the screenshots that do NOT require GPU analysis.
    # Only shot_XXXX_full (action screenshots) are analyzed.
    # The others (heartbeat, focus, res_shot) are stored on disk but never
    # sent to the GPU — otherwise the ThreadPool (1 worker, ~10-30s per
    # analysis) gets swamped and finalization times out with 0 states.
    if shot_id.startswith("heartbeat_"):
        # Remember the latest heartbeat for the replay pre-check.
        _last_heartbeat[session_id] = {
            "path": file_path_str,
            "timestamp": time.time(),
        }
        return {"status": "heartbeat_stored", "shot_id": shot_id}
    if shot_id.startswith("focus_"):
        return {"status": "focus_stored", "shot_id": shot_id}
    if shot_id.startswith("res_shot_"):
        return {"status": "res_stored", "shot_id": shot_id}
    if not shot_id.startswith("shot_") or "_full" not in shot_id:
        # Anything that is not shot_XXXX_full → store without analysis.
        logger.debug(f"Screenshot {shot_id} stocké sans analyse GPU")
        return {"status": "stored_no_analysis", "shot_id": shot_id}

    # Real-time SomEngine enrichment (lightweight, ~1-2s in background).
    # Started BEFORE the VLM deduplication since it is an independent step.
    # If a mouse_click event is waiting for this screenshot, SomEngine runs
    # in the background; otherwise the screenshot is recorded so it can be
    # matched when the event arrives.
    _on_action_screenshot_arrived(session_id, shot_id, file_path_str, machine_id)

    # Dedup by ID: never re-process an already analyzed screenshot.
    with _pending_lock:
        if shot_id in _analyzed_shots[session_id]:
            logger.debug(f"Screenshot {shot_id} déjà analysé, skip")
            return {"status": "already_analyzed", "shot_id": shot_id}

    # Dedup by similarity: skip when the image is near-identical to the
    # previous one (perceptual hash computed by _image_hash).
    img_hash = _image_hash(file_path_str)
    if img_hash and img_hash == _last_screenshot_hash.get(session_id):
        logger.info(f"Screenshot {shot_id} identique au précédent, skip analyse GPU")
        with _pending_lock:
            _analyzed_shots[session_id].add(shot_id)
        return {"status": "duplicate_skipped", "shot_id": shot_id}
    if img_hash:
        _last_screenshot_hash[session_id] = img_hash

    with _pending_lock:
        _analyzed_shots[session_id].add(shot_id)

    # Full screenshots: STORAGE ONLY (no heavy real-time VLM analysis).
    # The full VLM pass (ScreenAnalyzer + CLIP + FAISS) is done by the
    # separate worker (run_worker.py) after the session is finalized.
    logger.debug(f"Screenshot {shot_id} stocké (analyse VLM différée au worker)")

    return {"status": "image_stored", "shot_id": shot_id}
|
||
|
||
|
||
|
||
def _process_screenshot_thread(session_id: str, shot_id: str, path: str):
    """Run the GPU analysis of one screenshot in a dedicated thread.

    Runs outside the FastAPI event loop so the long (~10-30s) analyzer
    call does not block request handling.

    Fix: the original imported ``traceback`` twice — once at the top of
    the ``try`` (unused on the success path) and again inside ``except``.
    A single import at function scope replaces both.

    Args:
        session_id: Session the screenshot belongs to.
        shot_id: Identifier of the screenshot being analyzed.
        path: Filesystem path of the image on disk.
    """
    import traceback

    try:
        logger.info(f"[GPU] Début analyse {shot_id} pour {session_id}")
        result = worker.process_screenshot_direct(session_id, shot_id, path)
        logger.info(
            f"[GPU] Screenshot {shot_id} analysé: "
            f"{result.get('ui_elements_count', 0)} UI, "
            f"{result.get('text_detected', 0)} textes, "
            f"indexed={result.get('embedding_indexed', False)}"
        )
    except Exception as e:
        logger.error(f"[GPU] Erreur analyse {shot_id}: {e}\n{traceback.format_exc()}")
    finally:
        # Always decrement the pending counter (clamped at 0) so
        # finalization never waits on a stuck count.
        with _pending_lock:
            _pending_analyses[session_id] = max(0, _pending_analyses[session_id] - 1)
|
||
|
||
|
||
# =========================================================================
|
||
# Finalisation
|
||
# =========================================================================
|
||
|
||
@app.post("/api/v1/traces/stream/finalize")
async def finalize(session_id: str, machine_id: str = "default"):
    """Close a session and queue its processing.

    Non-blocking: the session is flagged as finalized and appended to the
    VLM worker queue (separate process) for analysis + workflow building.

    Progress can be followed through
    GET /api/v1/traces/stream/processing/status.

    Args:
        session_id: Session to finalize.
        machine_id: Machine identifier (informative only — the session
            already carries its machine_id).
    """
    # The session must exist before anything else happens.
    session = processor.session_manager.get_session(session_id)
    if not session:
        raise HTTPException(
            status_code=404,
            detail=f"Session {session_id} non trouvée",
        )

    # Flag the session as finalized (persisted on disk).
    processor.session_manager.finalize(session_id)
    logger.info(f"Session {session_id} finalisée, ajout à la queue du worker VLM")

    # Drop any real-time enrichment state still held for this session.
    with _enrichment_lock:
        for bucket in (_pending_click_enrichments, _arrived_action_screenshots):
            for key in [k for k in bucket if k[0] == session_id]:
                del bucket[key]

    # Hand the session over to the VLM worker (separate process).
    _enqueue_to_worker(session_id)

    # Count the available full screenshots to give the caller an estimate.
    full_shots_count = 0
    session_dir = processor._find_session_dir(session_id)
    if session_dir:
        shots_folder = session_dir / "shots"
        if shots_folder.exists():
            full_shots_count = len(list(shots_folder.glob("shot_*_full.png")))

    return {
        "status": "queued_for_processing",
        "session_id": session_id,
        "machine_id": session.machine_id,
        "screenshots_to_analyze": full_shots_count,
        "message": (
            f"Session finalisée. {full_shots_count} screenshots seront analysés "
            "en arrière-plan. Suivez la progression via "
            "GET /api/v1/traces/stream/processing/status"
        ),
    }
|
||
|
||
|
||
# =========================================================================
|
||
# Traitement asynchrone — Suivi de la queue de processing
|
||
# =========================================================================
|
||
|
||
@app.get("/api/v1/traces/stream/processing/status")
async def get_processing_status():
    """Report the state of the VLM processing queue (separate worker process).

    Returns:
        - queue_length: number of sessions waiting in the queue file
        - queue: list of waiting session_ids
        - replay_lock_active: whether a replay is running (worker paused)
    """
    status = _get_worker_queue_status()
    return status
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/processing/requeue")
async def requeue_session(session_id: str):
    """Manually re-queue a session for processing.

    Useful to:
    - retry a failed session after a fix
    - force reprocessing of an already-handled session
    """
    if processor.session_manager.get_session(session_id) is None:
        raise HTTPException(
            status_code=404,
            detail=f"Session {session_id} non trouvée",
        )

    _enqueue_to_worker(session_id)

    return {
        "status": "requeued",
        "session_id": session_id,
        "queue_status": _get_worker_queue_status(),
    }
|
||
|
||
|
||
# =========================================================================
|
||
# Monitoring
|
||
# =========================================================================
|
||
|
||
@app.get("/api/v1/traces/stream/stats")
async def get_stats():
    """Streaming-server statistics.

    Returns the worker statistics enriched with the list of known machines.

    Fix: the original wrote ``stats["machines"]`` straight into the dict
    returned by ``worker.stats``; if that attribute exposes a shared
    dictionary, the key leaks into the worker's internal state.  A shallow
    copy keeps the response identical while leaving the worker untouched.
    """
    stats = dict(worker.stats)
    # Known machines, for the dashboard.
    stats["machines"] = processor.session_manager.get_machine_ids()
    return stats
|
||
|
||
|
||
@app.get("/api/v1/traces/stream/machines")
async def list_machines():
    """List every known machine with its active-session counts.

    Used by the dashboard and the chat agent (Léa) to know which
    machines are connected and to target a replay at a specific one.
    """
    machines = []
    for mid in processor.session_manager.get_machine_ids():
        sessions = processor.session_manager.get_sessions_by_machine(mid)
        active_count = sum(1 for s in sessions if not s.finalized)
        if sessions:
            last_activity = max(s.last_activity for s in sessions).isoformat()
        else:
            last_activity = None
        machines.append({
            "machine_id": mid,
            "total_sessions": len(sessions),
            "active_sessions": active_count,
            "last_activity": last_activity,
        })
    return {"machines": machines}
|
||
|
||
|
||
@app.get("/api/v1/traces/stream/sessions")
async def list_sessions(machine_id: Optional[str] = None):
    """List the sessions, both active and finalized.

    Args:
        machine_id: Optional machine filter; when absent, every session
            is returned.
    """
    return {
        "sessions": processor.list_sessions(machine_id=machine_id),
        # Known machines are included for the UI.
        "machines": processor.session_manager.get_machine_ids(),
    }
|
||
|
||
|
||
@app.get("/api/v1/traces/stream/workflows")
async def list_workflows(machine_id: Optional[str] = None):
    """List the built workflows.

    Args:
        machine_id: Optional machine filter; when absent, every workflow
            is returned.
    """
    return {
        "workflows": processor.list_workflows(machine_id=machine_id),
        # Known machines are included for the UI.
        "machines": processor.session_manager.get_machine_ids(),
    }
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/reload-workflows")
async def reload_workflows():
    """Reload the workflows from disk.

    Called by the VWB after an export-for-lea so that the streaming
    server immediately sees the new workflows, without a restart.
    """
    reloaded = processor.reload_workflows()
    return {"success": True, "workflows_count": reloaded}
|
||
|
||
|
||
@app.get("/api/v1/traces/stream/workflow/{workflow_id}")
async def get_workflow_detail(workflow_id: str):
    """Return the full detail of one workflow (core JSON format).

    Used by the VWB to import a learned workflow that is not yet on disk
    (held only in memory by the streaming server).
    """
    with processor._data_lock:
        workflow = processor._workflows.get(workflow_id)

    if workflow is None:
        raise HTTPException(status_code=404, detail=f"Workflow '{workflow_id}' non trouvé")

    return workflow.to_dict()
|
||
|
||
|
||
@app.get("/api/v1/traces/stream/session/{session_id}")
async def get_session(session_id: str):
    """Return the current state of one session."""
    found = processor.session_manager.get_session(session_id)
    if found is None:
        raise HTTPException(status_code=404, detail=f"Session {session_id} non trouvée")
    # Flatten the session object into a JSON-friendly summary.
    summary = {
        "session_id": found.session_id,
        "machine_id": found.machine_id,
        "events_count": len(found.events),
        "screenshots_count": len(found.shot_paths),
        "last_window": found.last_window_info,
        "created_at": found.created_at.isoformat(),
        "last_activity": found.last_activity.isoformat(),
        "finalized": found.finalized,
    }
    return summary
|
||
|
||
|
||
# =========================================================================
|
||
# Replay — Exécution de workflows sur l'Agent V1
|
||
# =========================================================================
|
||
|
||
|
||
def _find_active_agent_session(machine_id: Optional[str] = None) -> Optional[str]:
    """Pick the most relevant Agent V1 session for a replay.

    Two-pass strategy over the 'sess_*' sessions (Agent V1 only):

    1. Prefer a non-finalized session (Agent V1 is live).
    2. Otherwise fall back to the most recent one, even finalized — the
       agent may have restarted and created a new session, or the session
       was finalized by timeout while the agent is still around.  The
       agent polls /replay/next regardless of the finalized flag.

    Args:
        machine_id: When given, restrict the search to that machine;
            when None, consider every machine (backward compatible).

    Returns:
        The selected session_id, or None when no Agent V1 session exists.
    """
    with processor.session_manager._lock:
        candidates = [
            s for s in processor.session_manager._sessions.values()
            if s.session_id.startswith("sess_")
            and (machine_id is None or s.machine_id == machine_id)
        ]

    if not candidates:
        return None

    # Session ids embed a timestamp: sorting descending puts newest first.
    candidates.sort(key=lambda s: s.session_id, reverse=True)

    # Pass 1: a live (non-finalized) session wins.
    live = next((s.session_id for s in candidates if not s.finalized), None)
    if live is not None:
        return live

    # Pass 2: fall back to the most recent session, finalized or not.
    return candidates[0].session_id
|
||
|
||
|
||
def _workflow_to_actions(workflow, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """
    Convert a workflow (ordered nodes + edges) into a list of normalized actions.

    Walks the graph from the entry_nodes, following the edges.
    Each edge yields a normalized action with percentage coordinates.

    Intelligent mode (workflows learned by Léa):
        If the workflow has nodes with prototype_vectors, uses
        StreamProcessor.extract_enriched_actions() which enriches the
        actions with the original session's data, visual targeting and
        CLIP-embedding pre-check/post-check.

    Classic mode (VWB/manual workflows):
        Plain BFS traversal with _edge_to_normalized_actions().

    Args:
        workflow: Workflow object (nodes/edges/entry_nodes attributes).
        params: Optional user parameters for ${var} substitution.

    Returns:
        List of normalized action dicts ready for the replay queue.
    """
    params = params or {}

    # Detect a learned workflow (nodes carrying prototype_vectors
    # plus structured edges).
    if _is_learned_workflow(workflow):
        # Priority 1: hybrid replay (raw events + workflow structure).
        # Much more reliable since it uses the user's real actions rather
        # than the GraphBuilder's compound actions, which lose detail.
        hybrid = processor.build_hybrid_replay(workflow)
        if hybrid:
            logger.info(
                "Replay hybride : %d actions depuis events bruts + structure workflow",
                len(hybrid),
            )
            # Keyboard-gesture optimization, when the catalog is available.
            if _gesture_catalog and hybrid:
                hybrid = _gesture_catalog.optimize_replay_actions(hybrid)
            return hybrid

        # Priority 2: classic enrichment (fallback when hybrid fails).
        enriched = processor.extract_enriched_actions(workflow, params)
        if enriched:
            logger.info(
                "Replay intelligent : %d actions enrichies depuis le workflow appris",
                len(enriched),
            )
            # Keyboard-gesture optimization, when the catalog is available.
            if _gesture_catalog and enriched:
                enriched = _gesture_catalog.optimize_replay_actions(enriched)
            return enriched
        # If enrichment also fails, fall through to classic mode.
        logger.warning(
            "Enrichissement échoué pour le workflow appris, fallback mode classique"
        )

    # Classic mode (VWB/manual workflows, or fallback).
    actions = []

    # Index the outgoing edges per node.
    outgoing: Dict[str, list] = defaultdict(list)
    for edge in workflow.edges:
        outgoing[edge.from_node].append(edge)

    # Linear traversal starting from the first entry_node.
    visited = set()
    current_nodes = list(workflow.entry_nodes) if workflow.entry_nodes else []

    # Fallback: without entry_nodes, start at the first node.
    if not current_nodes and workflow.nodes:
        current_nodes = [workflow.nodes[0].node_id]

    while current_nodes:
        node_id = current_nodes.pop(0)
        if node_id in visited:
            continue
        visited.add(node_id)

        edges = outgoing.get(node_id, [])
        for edge in edges:
            edge_actions = _edge_to_normalized_actions(edge, params)
            actions.extend(edge_actions)
            # Follow the graph to the next node.
            if edge.to_node not in visited:
                current_nodes.append(edge.to_node)

    # Optimization: substitute visual actions with keyboard gestures when possible.
    if _gesture_catalog and actions:
        actions = _gesture_catalog.optimize_replay_actions(actions)

    return actions
|
||
|
||
|
||
def _is_learned_workflow(workflow) -> bool:
|
||
"""Détecter si un workflow est un workflow appris (vs VWB/manuel).
|
||
|
||
Un workflow appris a :
|
||
- Des nodes avec _prototype_vector dans metadata
|
||
- Des edges avec from_node/to_node
|
||
- Un learning_state indicatif (OBSERVATION, COACHING, AUTO_CANDIDATE, etc.)
|
||
|
||
Un workflow VWB/manuel a généralement :
|
||
- Des edges avec des target_spec complets (by_text, by_role remplis)
|
||
- Pas de prototype_vectors
|
||
"""
|
||
# Accéder aux données (objet ou dict)
|
||
if hasattr(workflow, 'nodes'):
|
||
nodes = workflow.nodes
|
||
edges = workflow.edges
|
||
elif isinstance(workflow, dict):
|
||
nodes = workflow.get('nodes', [])
|
||
edges = workflow.get('edges', [])
|
||
else:
|
||
return False
|
||
|
||
if not nodes or not edges:
|
||
return False
|
||
|
||
# Vérifier si au moins un node a un prototype_vector
|
||
has_prototype = False
|
||
for node in nodes:
|
||
metadata = node.metadata if hasattr(node, 'metadata') else node.get('metadata', {})
|
||
if isinstance(metadata, dict) and '_prototype_vector' in metadata:
|
||
has_prototype = True
|
||
break
|
||
|
||
return has_prototype
|
||
|
||
|
||
def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Convert a WorkflowEdge into a list of normalized actions for Agent V1.

    A simple edge yields 1 action; a compound edge yields N (one per step).

    Args:
        edge: Workflow edge carrying an ``action`` (type/target/parameters).
        params: User parameters for ${var} substitution in text inputs.

    Returns:
        List of normalized action dicts; empty when the edge has no
        action or its type is unknown.
    """
    action = edge.action
    if action is None:
        logger.warning(f"Edge {edge.edge_id} sans action, skip")
        return []
    action_type = action.type
    target = action.target
    action_params = action.parameters or {}

    # Extract normalized coordinates from TargetSpec.by_position.
    x_pct = 0.0
    y_pct = 0.0
    if target and target.by_position:
        px, py = target.by_position
        # Heuristic: values <= 1.0 are already percentages; otherwise
        # they are pixels and get divided by the reference resolution.
        if px <= 1.0 and py <= 1.0:
            x_pct = px
            y_pct = py
        else:
            ref_w = action_params.get("ref_width", 1920) or 1920
            ref_h = action_params.get("ref_height", 1080) or 1080
            x_pct = round(px / ref_w, 6)
            y_pct = round(py / ref_h, 6)

    base = {"edge_id": edge.edge_id, "from_node": edge.from_node, "to_node": edge.to_node}

    # Compound: expand into individual actions.
    if action_type == "compound":
        return _expand_compound_steps(action_params.get("steps", []), base, params)

    # Simple actions.
    normalized = {**base, "action_id": f"act_{uuid.uuid4().hex[:8]}"}

    if action_type == "mouse_click":
        normalized["type"] = "click"
        normalized["x_pct"] = x_pct
        normalized["y_pct"] = y_pct
        normalized["button"] = action_params.get("button", "left")

    elif action_type == "text_input":
        normalized["type"] = "type"
        text = action_params.get("text", "")
        # Substitute ${var} placeholders (user params > workflow defaults).
        text = _substitute_variables(text, params, action_params.get("defaults", {}))
        normalized["text"] = text
        normalized["x_pct"] = x_pct
        normalized["y_pct"] = y_pct

    elif action_type == "key_press":
        normalized["type"] = "key_combo"
        keys = action_params.get("keys", [])
        # Legacy format: a single "key" instead of a "keys" list.
        if not keys and action_params.get("key"):
            keys = [action_params["key"]]
        normalized["keys"] = keys

    else:
        logger.warning(f"Type d'action inconnu : {action_type}")
        return []

    # Attach the full target_spec for visual resolution on the agent side.
    target_spec = {}
    if target and target.by_role:
        target_spec["by_role"] = target.by_role
        normalized["target_role"] = target.by_role  # Debug compat
    if target and target.by_text:
        target_spec["by_text"] = target.by_text
        normalized["target_text"] = target.by_text  # Debug compat
    if target and hasattr(target, 'context_hints') and target.context_hints:
        target_spec["context_hints"] = target.context_hints
    if target_spec:
        normalized["target_spec"] = target_spec
        normalized["visual_mode"] = True  # Tell the agent to use visual resolution

    return [normalized]
|
||
|
||
|
||
def _substitute_variables(text: str, params: Dict[str, Any], defaults: Dict[str, Any]) -> str:
|
||
"""Substituer les variables ${var} dans un texte.
|
||
|
||
Priorité : params utilisateur > defaults du workflow > texte brut inchangé.
|
||
Supporte ${var} dans un texte plus long (ex: "${expression}=").
|
||
"""
|
||
import re
|
||
|
||
def replacer(match):
|
||
var_name = match.group(1)
|
||
return str(params.get(var_name, defaults.get(var_name, match.group(0))))
|
||
|
||
return re.sub(r'\$\{(\w+)\}', replacer, text)
|
||
|
||
|
||
def _expand_compound_steps(
|
||
steps: List[Dict[str, Any]], base: Dict[str, Any], params: Dict[str, Any]
|
||
) -> List[Dict[str, Any]]:
|
||
"""Décomposer les steps d'un compound en actions individuelles."""
|
||
actions = []
|
||
for step in steps:
|
||
step_type = step.get("type", "unknown")
|
||
action = {
|
||
**base,
|
||
"action_id": f"act_{uuid.uuid4().hex[:8]}",
|
||
}
|
||
|
||
if step_type == "key_press":
|
||
action["type"] = "key_combo"
|
||
keys = step.get("keys", [])
|
||
if not keys and step.get("key"):
|
||
keys = [step["key"]]
|
||
action["keys"] = keys
|
||
|
||
elif step_type == "text_input":
|
||
action["type"] = "type"
|
||
text = step.get("text", "")
|
||
text = _substitute_variables(text, params, {})
|
||
action["text"] = text
|
||
|
||
elif step_type == "wait":
|
||
action["type"] = "wait"
|
||
action["duration_ms"] = step.get("duration_ms", 500)
|
||
|
||
elif step_type == "mouse_click":
|
||
action["type"] = "click"
|
||
action["x_pct"] = step.get("x_pct", 0.0)
|
||
action["y_pct"] = step.get("y_pct", 0.0)
|
||
action["button"] = step.get("button", "left")
|
||
|
||
else:
|
||
logger.debug(f"Step compound inconnu : {step_type}")
|
||
continue
|
||
|
||
actions.append(action)
|
||
|
||
return actions
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/replay")
async def start_replay(request: ReplayRequest):
    """
    Start the replay of a workflow on an active Agent V1 session.

    The server loads the workflow, converts it into a list of normalized
    actions, and places them in the session's queue.  Agent V1 retrieves
    them via GET /replay/next (pull model).

    If session_id starts with "chat_" or is empty, the most recent active
    Agent V1 session (non-finalized, "sess_" prefix) is auto-detected.
    If machine_id is provided, that specific machine is targeted.

    Raises:
        HTTPException 404: no active Agent V1 session, or unknown workflow.
        HTTPException 400: workflow has no executable action, or too many.
    """
    workflow_id = request.workflow_id
    session_id = request.session_id
    target_machine_id = request.machine_id
    params = request.params or {}

    # Auto-detect the active Agent V1 session (optional machine filter).
    if not session_id or session_id.startswith("chat_"):
        active_session = _find_active_agent_session(machine_id=target_machine_id)
        if active_session:
            logger.info(
                f"Auto-détection session Agent V1 : {active_session} "
                f"(demandé: {session_id}, machine={target_machine_id})"
            )
            session_id = active_session
        else:
            machine_hint = f" sur la machine '{target_machine_id}'" if target_machine_id else ""
            raise HTTPException(
                status_code=404,
                detail=f"Aucune session Agent V1 active{machine_hint}. "
                "Lancez l'Agent V1 et démarrez une session d'abord."
            )

    # Check that the workflow exists.
    with processor._data_lock:
        workflow = processor._workflows.get(workflow_id)

    if not workflow:
        raise HTTPException(
            status_code=404,
            detail=f"Workflow '{workflow_id}' non trouvé. "
            f"Workflows disponibles : {list(processor._workflows.keys())}"
        )

    # Convert the workflow into normalized actions.
    actions = _workflow_to_actions(workflow, params)
    if not actions:
        raise HTTPException(
            status_code=400,
            detail=f"Le workflow '{workflow_id}' ne contient aucune action exécutable."
        )

    # Safety cap on the number of actions.
    if len(actions) > MAX_ACTIONS_PER_REPLAY:
        raise HTTPException(
            status_code=400,
            detail=f"Trop d'actions ({len(actions)} > {MAX_ACTIONS_PER_REPLAY}). "
            "Découpez le workflow en parties plus petites."
        )

    # ── Environment setup — open the applications the workflow needs ──
    setup_actions = []
    app_info = _extract_required_apps_from_workflow(workflow)
    if app_info:
        setup_actions = _generate_setup_actions(app_info, setup_id_prefix="setup_wf")
        if setup_actions:
            # Setup actions are prepended so they run before the replay.
            actions = setup_actions + actions
            logger.info(
                "replay workflow %s : %d actions de setup injectées "
                "(app=%s, cmd=%s)",
                workflow_id, len(setup_actions),
                app_info.get("primary_app"), app_info.get("primary_launch_cmd"),
            )

    # Create the replay identifier.
    replay_id = f"replay_{uuid.uuid4().hex[:8]}"

    # Resolve the machine_id of the target session.
    session_obj = processor.session_manager.get_session(session_id)
    resolved_machine_id = target_machine_id or (session_obj.machine_id if session_obj else "default")

    # Inject the actions into the session's queue.
    with _replay_lock:
        _replay_queues[session_id] = list(actions)  # Replace any existing queue
        _replay_states[replay_id] = _create_replay_state(
            replay_id=replay_id,
            workflow_id=workflow_id,
            session_id=session_id,
            total_actions=len(actions),
            params=params,
            machine_id=resolved_machine_id,
        )
        # Record the machine -> session mapping for targeted replays.
        if resolved_machine_id and resolved_machine_id != "default":
            _machine_replay_target[resolved_machine_id] = session_id

    # Tell the VLM worker (separate process) a replay is active → suspend.
    _set_replay_lock(replay_id)

    logger.info(
        f"Replay démarré : {replay_id} | workflow={workflow_id} | "
        f"session={session_id} | machine={resolved_machine_id} | "
        f"{len(actions)} actions ({len(setup_actions)} setup + "
        f"{len(actions) - len(setup_actions)} replay) (worker suspendu)"
    )

    return {
        "replay_id": replay_id,
        "status": "running",
        "workflow_id": workflow_id,
        "session_id": session_id,
        "machine_id": resolved_machine_id,
        "total_actions": len(actions),
        "setup_actions": len(setup_actions),
        "setup_app": app_info.get("primary_app", "") if app_info else "",
    }
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/replay/raw")
async def start_raw_replay(request: RawReplayRequest):
    """
    Start a replay from raw actions (Free Agent mode).

    Instead of loading a workflow, accepts a list of normalized actions
    generated by the LLM planner directly.  The actions are injected into
    the Agent V1 replay queue.

    Raises:
        HTTPException 400: no actions, too many, or an invalid action.
        HTTPException 404: no active Agent V1 session found.
    """
    session_id = request.session_id
    actions = request.actions
    target_machine_id = request.machine_id
    task = request.task_description or "Tâche libre"

    if not actions:
        raise HTTPException(status_code=400, detail="Aucune action fournie.")

    # Safety cap on the number of actions.
    if len(actions) > MAX_ACTIONS_PER_REPLAY:
        raise HTTPException(
            status_code=400,
            detail=f"Trop d'actions ({len(actions)} > {MAX_ACTIONS_PER_REPLAY}). "
            "Réduisez le plan d'exécution."
        )

    # Validate every action (HIGH security — actions come from the LLM).
    for i, action in enumerate(actions):
        error = _validate_replay_action(action)
        if error:
            raise HTTPException(
                status_code=400,
                detail=f"Action #{i} invalide : {error}"
            )

    # Auto-detect the Agent V1 session (optional machine filter).
    if not session_id or session_id.startswith("chat_"):
        active_session = _find_active_agent_session(machine_id=target_machine_id)
        if active_session:
            session_id = active_session
        else:
            machine_hint = f" sur la machine '{target_machine_id}'" if target_machine_id else ""
            raise HTTPException(
                status_code=404,
                detail=f"Aucune session Agent V1 active{machine_hint}. "
                "Lancez l'Agent V1 sur le PC cible."
            )

    # Assign action_ids where missing.
    for i, action in enumerate(actions):
        if "action_id" not in action:
            action["action_id"] = f"act_free_{uuid.uuid4().hex[:6]}"

    replay_id = f"replay_free_{uuid.uuid4().hex[:8]}"

    # Resolve the machine_id of the target session.
    session_obj = processor.session_manager.get_session(session_id)
    resolved_machine_id = target_machine_id or (session_obj.machine_id if session_obj else "default")

    with _replay_lock:
        _replay_queues[session_id] = list(actions)
        _replay_states[replay_id] = _create_replay_state(
            replay_id=replay_id,
            workflow_id=f"free_task:{task[:50]}",
            session_id=session_id,
            total_actions=len(actions),
            params={},
            machine_id=resolved_machine_id,
        )
        # Record the machine -> session mapping for targeted replays.
        if resolved_machine_id and resolved_machine_id != "default":
            _machine_replay_target[resolved_machine_id] = session_id

    # Tell the VLM worker (separate process) a replay is active → suspend.
    _set_replay_lock(replay_id)

    logger.info(
        f"Replay libre démarré : {replay_id} | task='{task}' | "
        f"session={session_id} | machine={resolved_machine_id} | {len(actions)} actions (worker suspendu)"
    )

    return {
        "replay_id": replay_id,
        "status": "running",
        "task": task,
        "session_id": session_id,
        "machine_id": resolved_machine_id,
        "total_actions": len(actions),
    }
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/replay-session")
async def replay_from_session(
    session_id: str,
    machine_id: str = "default",
):
    """Replay a session directly from its raw recorded events.

    No need to wait for VLM/GraphBuilder processing: the clean replay is
    built on the fly from ``live_events.jsonl``.

    Pipeline:
        1. Load the session's raw events
        2. Filter noise (heartbeat, focus_change, action_result)
        3. Merge consecutive text_input events
        4. Normalise coordinates to percentages
        5. Add contextual waits (after Win+R, Ctrl+S, Alt+F4, Enter)
        6. Cut after Alt+F4
        7. Inject into the replay queue

    Typical result: ~15-20 clean actions, ready to execute immediately.

    Args:
        session_id: Identifier of the recorded source session.
        machine_id: Machine the session belongs to ("default" = search all
            machine sub-folders).

    Returns:
        Replay metadata: replay_id, source/target session ids, action
        counts (setup vs replay), and a preview of the first actions.

    Raises:
        HTTPException: 400 on empty/invalid session data or too many
            actions, 404 when the session file or an active Agent V1 is
            missing, 500 when the events file cannot be read.
    """
    if not session_id:
        raise HTTPException(status_code=400, detail="session_id requis")

    # ── 1. Locate the session's live_events.jsonl file ──
    events_file = None

    # Look in the machine_id sub-folder first (standard layout)
    if machine_id and machine_id != "default":
        candidate = LIVE_SESSIONS_DIR / machine_id / session_id / "live_events.jsonl"
        if candidate.exists():
            events_file = candidate

    # Fallback: scan every machine sub-folder
    if not events_file:
        for machine_dir in LIVE_SESSIONS_DIR.iterdir():
            if not machine_dir.is_dir():
                continue
            candidate = machine_dir / session_id / "live_events.jsonl"
            if candidate.exists():
                events_file = candidate
                # Resolve the machine_id from the folder name
                if machine_id == "default":
                    machine_id = machine_dir.name
                break

    # Last fallback: session folder directly under LIVE_SESSIONS_DIR
    if not events_file:
        candidate = LIVE_SESSIONS_DIR / session_id / "live_events.jsonl"
        if candidate.exists():
            events_file = candidate

    if not events_file:
        raise HTTPException(
            status_code=404,
            detail=f"Session '{session_id}' introuvable. "
            f"Fichier live_events.jsonl non trouvé dans "
            f"{LIVE_SESSIONS_DIR}/{machine_id}/{session_id}/"
        )

    # ── 2. Load the raw events (one JSON object per line; bad lines skipped) ──
    raw_events = []
    try:
        for line in events_file.read_text(encoding="utf-8").splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                raw_events.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Erreur lecture events de la session : {e}"
        )

    if not raw_events:
        raise HTTPException(
            status_code=400,
            detail=f"Session '{session_id}' : aucun événement trouvé dans live_events.jsonl"
        )

    # ── 2b. Merge real-time enrichments from the in-memory session ──
    # The JSONL file does not contain the SomEngine enrichments computed
    # during recording (they are added in memory after the JSONL write).
    # Inject them here so build_replay_from_raw_events can reuse them.
    session_mem = processor.session_manager.get_session(session_id)
    if session_mem and session_mem.events:
        _merge_enrichments_into_raw_events(raw_events, session_mem.events)

    # ── 3. Build the clean replay from the raw events ──
    # Pass the session directory to enable visual replay (reference crops)
    session_dir = str(events_file.parent)
    actions = build_replay_from_raw_events(
        raw_events, session_id=session_id, session_dir=session_dir,
    )

    if not actions:
        raise HTTPException(
            status_code=400,
            detail=f"Session '{session_id}' : aucune action exploitable après nettoyage "
            f"({len(raw_events)} événements bruts)"
        )

    # Safety cap on replay size
    if len(actions) > MAX_ACTIONS_PER_REPLAY:
        raise HTTPException(
            status_code=400,
            detail=f"Trop d'actions ({len(actions)} > {MAX_ACTIONS_PER_REPLAY}). "
            "La session est trop longue pour un replay direct."
        )

    # Validate each action (HIGH security)
    for i, action in enumerate(actions):
        error = _validate_replay_action(action)
        if error:
            logger.warning(
                "replay-session : action #%d invalide (%s), suppression", i, error
            )
            # Drop invalid actions instead of rejecting the whole replay
            actions[i] = None
    actions = [a for a in actions if a is not None]

    if not actions:
        raise HTTPException(
            status_code=400,
            detail=f"Session '{session_id}' : toutes les actions ont été rejetées par la validation"
        )

    # Optimise via keyboard-gesture catalog when available
    if _gesture_catalog and actions:
        actions = _gesture_catalog.optimize_replay_actions(actions)

    # ── 3b. Environment setup — open the required applications ──
    # Analyse the raw events to detect which applications are needed and
    # prepend setup actions to the replay queue.
    setup_actions = []
    app_info = _extract_required_apps_from_events(raw_events)
    if app_info:
        setup_actions = _generate_setup_actions(app_info, setup_id_prefix="setup_sess")
        if setup_actions:
            actions = setup_actions + actions
            logger.info(
                "replay-session %s : %d actions de setup injectées avant le replay "
                "(app=%s, cmd=%s)",
                session_id, len(setup_actions),
                app_info.get("primary_app"), app_info.get("primary_launch_cmd"),
            )

    # ── 4. Find the target replay session (active Agent V1) ──
    # The active agent may run under a session different from the source one
    target_session_id = _find_active_agent_session(machine_id=machine_id)
    if not target_session_id:
        # Fallback: reuse the source session if it is an Agent V1 session
        if session_id.startswith("sess_"):
            target_session_id = session_id
        else:
            raise HTTPException(
                status_code=404,
                detail=f"Aucune session Agent V1 active sur la machine '{machine_id}'. "
                "Lancez l'Agent V1 sur le PC cible."
            )

    # ── 5. Inject into the replay queue ──
    replay_id = f"replay_sess_{uuid.uuid4().hex[:8]}"

    with _replay_lock:
        _replay_queues[target_session_id] = list(actions)
        _replay_states[replay_id] = _create_replay_state(
            replay_id=replay_id,
            workflow_id=f"session_replay:{session_id}",
            session_id=target_session_id,
            total_actions=len(actions),
            params={},
            machine_id=machine_id,
        )
        # Register the machine -> session mapping for targeted replay
        if machine_id and machine_id != "default":
            _machine_replay_target[machine_id] = target_session_id

    # Signal the VLM worker (separate process) that a replay is active → suspend
    _set_replay_lock(replay_id)

    logger.info(
        "Replay session démarré : %s | source=%s | target=%s | machine=%s | "
        "%d actions (%d setup + %d replay) (worker suspendu)",
        replay_id, session_id, target_session_id, machine_id,
        len(actions), len(setup_actions), len(actions) - len(setup_actions),
    )

    return {
        "replay_id": replay_id,
        "status": "running",
        "source_session_id": session_id,
        "target_session_id": target_session_id,
        "machine_id": machine_id,
        "total_actions": len(actions),
        "setup_actions": len(setup_actions),
        "replay_actions": len(actions) - len(setup_actions),
        "total_raw_events": len(raw_events),
        "setup_app": app_info.get("primary_app", "") if app_info else "",
        "actions_preview": [
            {
                k: (
                    # Do not serialise the base64 image into the preview
                    {kk: ("..." if kk == "anchor_image_base64" else vv) for kk, vv in v.items()}
                    if k == "target_spec" and isinstance(v, dict)
                    else v
                )
                for k, v in a.items()
                if k != "action_id"
            }
            for a in actions[:8]  # Show more actions so the setup is included
        ],
    }
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/replay/single")
async def enqueue_single_action(request: SingleActionRequest):
    """
    Enqueue a single action for execution (Copilot mode).

    Unlike /replay and /replay/raw, which inject a whole list at once,
    this endpoint queues exactly ONE action. The Copilot chat agent calls
    it step by step, after each user validation.

    Returns an action_id so the result can be tracked via /replay/result.
    """
    session_id = request.session_id
    action = dict(request.action)
    target_machine_id = request.machine_id

    # HIGH-security validation of the action payload
    error = _validate_replay_action(action)
    if error:
        raise HTTPException(status_code=400, detail=f"Action invalide : {error}")

    # Auto-detect the Agent V1 session (optionally filtered by machine)
    if not session_id or session_id.startswith("chat_"):
        resolved = _find_active_agent_session(machine_id=target_machine_id)
        if not resolved:
            machine_hint = f" sur la machine '{target_machine_id}'" if target_machine_id else ""
            raise HTTPException(
                status_code=404,
                detail=f"Aucune session Agent V1 active{machine_hint}. "
                "Lancez l'Agent V1 sur le PC cible."
            )
        session_id = resolved

    # Assign an action_id only when the payload does not carry one (EAFP)
    try:
        action_id = action["action_id"]
    except KeyError:
        action_id = f"act_copilot_{uuid.uuid4().hex[:8]}"
        action["action_id"] = action_id

    # NOTE(review): assumes _replay_queues auto-creates missing keys
    # (defaultdict(list)) — other endpoints use .get(); confirm.
    with _replay_lock:
        _replay_queues[session_id].append(action)

    logger.info(
        f"Action Copilot enqueued: {action_id} | type={action.get('type')} | "
        f"session={session_id} | machine={target_machine_id}"
    )

    return {
        "action_id": action_id,
        "session_id": session_id,
        "machine_id": target_machine_id,
        "status": "enqueued",
    }
|
||
|
||
|
||
# =========================================================================
|
||
# Pre-check écran — Vérification pré-action par embedding CLIP
|
||
# =========================================================================
|
||
|
||
|
||
def _pre_check_screen_state(
    session_id: str,
    expected_node_id: str,
    current_screenshot_path: str,
    active_processor: StreamProcessor,
) -> Dict[str, Any]:
    """Verify that the current screen matches the node's expected state.

    Compares the current screenshot with the expected node's prototype
    via CLIP embedding similarity (fast, ~200ms).

    Args:
        session_id: Replay session identifier.
        expected_node_id: Identifier of the action's source node (from_node).
        current_screenshot_path: Path to the recent heartbeat screenshot.
        active_processor: StreamProcessor instance with the CLIPEmbedder loaded.

    Returns:
        {"match": True/False, "similarity": float, "expected_node": str,
         "reason": str (on mismatch/skip), "popup_detected": bool}

        On any internal error the check is permissive: match=True with a
        "precheck_error: ..." reason, so the replay is never blocked.
    """
    result: Dict[str, Any] = {
        "match": True,
        "similarity": 1.0,
        "expected_node": expected_node_id,
        "popup_detected": False,
    }

    try:
        # 1. Find the active replay for this session
        replay_state = None
        workflow = None
        with _replay_lock:
            for state in _replay_states.values():
                if state["session_id"] == session_id and state["status"] == "running":
                    replay_state = state
                    break

        if not replay_state:
            result["reason"] = "no_active_replay"
            return result

        workflow_id = replay_state.get("workflow_id", "")
        with active_processor._data_lock:
            workflow = active_processor._workflows.get(workflow_id)

        if workflow is None:
            result["reason"] = "workflow_not_found"
            return result

        # 2. Fetch the expected node's prototype
        # Support both Workflow objects and raw dicts
        node = None
        if hasattr(workflow, "get_node"):
            node = workflow.get_node(expected_node_id)
        elif isinstance(workflow, dict):
            # Raw dict format (VWB/manual workflows)
            for n in workflow.get("nodes", []):
                if n.get("node_id") == expected_node_id:
                    node = n
                    break

        if node is None:
            result["reason"] = "node_not_found"
            return result

        # Extract the prototype vector
        metadata = node.metadata if hasattr(node, "metadata") else node.get("metadata", {})
        proto_list = metadata.get("_prototype_vector")
        if not proto_list or not isinstance(proto_list, (list, tuple)):
            result["reason"] = "no_prototype_vector"
            return result

        import numpy as np
        prototype_vector = np.array(proto_list, dtype=np.float32)

        # 3. Compute the CLIP embedding of the current screenshot
        active_processor._ensure_initialized()
        if active_processor._clip_embedder is None:
            result["reason"] = "clip_embedder_unavailable"
            return result

        from PIL import Image
        # FIX: Image.open keeps the file handle open lazily — close it
        # deterministically with a context manager (was leaked before).
        with Image.open(current_screenshot_path) as pil_image:
            current_vector = active_processor._clip_embedder.embed_image(pil_image)

        if current_vector is None or len(current_vector) == 0:
            result["reason"] = "embedding_failed"
            return result

        # 4. Cosine similarity
        current_vector = current_vector.flatten().astype(np.float32)
        prototype_vector = prototype_vector.flatten().astype(np.float32)

        norm_current = np.linalg.norm(current_vector)
        norm_proto = np.linalg.norm(prototype_vector)
        if norm_current < 1e-8 or norm_proto < 1e-8:
            # Degenerate embedding — treat as a hard mismatch
            result["reason"] = "zero_norm_vector"
            result["match"] = False
            result["similarity"] = 0.0
            return result

        similarity = float(
            np.dot(current_vector, prototype_vector) / (norm_current * norm_proto)
        )
        result["similarity"] = round(similarity, 4)
        result["match"] = similarity >= _PRECHECK_SIMILARITY_THRESHOLD

        if not result["match"]:
            result["reason"] = "screen_mismatch"
            logger.warning(
                f"Pre-check MISMATCH pour session={session_id} "
                f"node={expected_node_id}: similarity={similarity:.4f} "
                f"< seuil={_PRECHECK_SIMILARITY_THRESHOLD}"
            )

        # 5. Popup detection via window-title change
        result["popup_detected"] = _detect_popup_hint(
            session_id, workflow, expected_node_id
        )

    except Exception as e:
        # Never block the replay on a pre-check failure
        logger.error(f"Pre-check échoué (non bloquant): {e}")
        result["match"] = True  # Permissive fallback
        result["reason"] = f"precheck_error: {e}"

    return result
|
||
|
||
|
||
def _detect_popup_hint(
    session_id: str,
    workflow: Any,
    expected_node_id: str,
) -> bool:
    """Heuristically detect whether a popup or modal dialog is likely present.

    The session's current window title (from ``last_window_info``) is
    compared against the title declared on the expected workflow node.
    When the expected fragment is no longer contained in the current
    title, an unexpected popup/dialog is assumed.

    Args:
        session_id: Session identifier.
        workflow: Workflow object or raw dict.
        expected_node_id: Identifier of the expected node.

    Returns:
        True when a title change suggests a popup; False otherwise,
        including on any internal error (best-effort hint only).
    """
    try:
        # Current title from the live session
        sess = processor.session_manager.get_session(session_id)
        if not sess:
            return False

        title_now = sess.last_window_info.get("title", "").strip().lower()
        if not title_now or title_now == "unknown":
            # No usable title information — cannot conclude anything.
            return False

        # Expected title, supporting both the Workflow-object form and
        # the raw-dict form of the workflow.
        wanted = ""
        if hasattr(workflow, "get_node"):
            node = workflow.get_node(expected_node_id)
            if node and hasattr(node, "template") and hasattr(node.template, "window"):
                spec = node.template.window
                if getattr(spec, "title_contains", None):
                    wanted = spec.title_contains.strip().lower()
        elif isinstance(workflow, dict):
            match = next(
                (n for n in workflow.get("nodes", [])
                 if n.get("node_id") == expected_node_id),
                None,
            )
            if match is not None:
                window = match.get("template", {}).get("window", {})
                wanted = (window.get("title_contains") or "").strip().lower()

        if not wanted:
            return False

        # Popup likely if the expected fragment vanished from the title
        if wanted not in title_now:
            logger.info(
                f"Popup détectée: titre actuel='{title_now}' "
                f"ne contient pas '{wanted}'"
            )
            return True

    except Exception as e:
        logger.debug(f"Détection popup échouée: {e}")

    return False
|
||
|
||
|
||
@app.get("/api/v1/traces/stream/replay/next")
async def get_next_action(session_id: str, machine_id: str = "default"):
    """
    Agent V1 polls this endpoint to fetch its next action to execute.

    Returns the next queued action, or {"action": null} when nothing is
    pending. Pull model: the agent asks — no WebSocket required.

    Includes an optional pre-check: when a recent heartbeat is available,
    the current screen is compared against the expected node via CLIP
    similarity. On mismatch, a "wait" action is returned instead of the
    real one, giving the client time to recover the expected state.

    Multi-machine: when machine_id is provided, only actions targeted at
    this machine are returned (prevents cross-machine leaks).

    If the agent's session has no pending actions, other queues of the
    SAME machine are searched (never cross-machine).
    """
    with _replay_lock:
        # Check whether the replay is in supervised pause (target_not_found).
        # In that case do NOT send any action — wait for user intervention.
        for state in _replay_states.values():
            if (state["session_id"] == session_id
                    and state["status"] == "paused_need_help"):
                logger.debug(
                    f"Replay {state['replay_id']} en pause supervisee "
                    f"pour session {session_id} — pas d'action envoyee"
                )
                return {
                    "action": None,
                    "session_id": session_id,
                    "machine_id": machine_id,
                    "replay_paused": True,
                    "pause_message": state.get("pause_message", "Replay en pause"),
                    "replay_id": state["replay_id"],
                }

        queue = _replay_queues.get(session_id, [])
        # Log only when there are actions to hand out
        if queue:
            logger.info(f"[REPLAY-QUEUE] session={session_id}, actions_en_attente={len(queue)}")

        if not queue and machine_id != "default":
            # Lookup 1: machine_replay_target (explicit mapping from POST /replay)
            target_sid = _machine_replay_target.get(machine_id)
            if target_sid and target_sid != session_id:
                target_queue = _replay_queues.get(target_sid, [])
                if target_queue:
                    # Re-home the queue and running state onto the polling session
                    queue = target_queue
                    _replay_queues[session_id] = target_queue
                    del _replay_queues[target_sid]
                    for state in _replay_states.values():
                        if state["session_id"] == target_sid and state["status"] == "running":
                            state["session_id"] = session_id
                    _machine_replay_target[machine_id] = session_id
                    logger.info(f"Replay machine-target: {machine_id} -> {target_sid} -> {session_id}")

            # Lookup 2: scan the active replay states for this machine
            if not queue:
                for state in _replay_states.values():
                    if (state.get("machine_id") == machine_id
                            and state["status"] == "running"
                            and state["session_id"] != session_id):
                        other_sid = state["session_id"]
                        other_queue = _replay_queues.get(other_sid, [])
                        if other_queue:
                            queue = other_queue
                            _replay_queues[session_id] = other_queue
                            del _replay_queues[other_sid]
                            state["session_id"] = session_id
                            _machine_replay_target[machine_id] = session_id
                            logger.info(f"Replay machine-state: {machine_id} -> {other_sid} -> {session_id}")
                            break

        if not queue:
            return {"action": None, "session_id": session_id, "machine_id": machine_id}

        # Peek at the next action WITHOUT removing it (needed for the pre-check)
        action = queue[0]

    # ---- Screen pre-check (optional, non-blocking) ----
    # Only applies to actions carrying a from_node (workflow actions — not
    # auto-injected wait/retry nor Copilot/Free-Agent actions)
    from_node = action.get("from_node")
    precheck_result = None
    if from_node and action.get("type") not in ("wait",):
        heartbeat = _last_heartbeat.get(session_id)
        if heartbeat:
            age = time.time() - heartbeat["timestamp"]
            if age <= _HEARTBEAT_MAX_AGE_SECONDS:
                try:
                    import asyncio
                    loop = asyncio.get_event_loop()
                    # Run the pre-check in a separate thread so the async
                    # event loop is not blocked (CLIP embed ~200ms)
                    precheck_result = await asyncio.wait_for(
                        loop.run_in_executor(
                            None,  # default ThreadPool
                            _pre_check_screen_state,
                            session_id,
                            from_node,
                            heartbeat["path"],
                            processor,
                        ),
                        timeout=0.5,  # pre-check capped at 500ms
                    )
                except asyncio.TimeoutError:
                    logger.warning(
                        f"Pre-check timeout (>500ms) pour session={session_id} "
                        f"node={from_node}, skip"
                    )
                    precheck_result = None
                except Exception as e:
                    logger.error(f"Pre-check exception (non bloquant): {e}")
                    precheck_result = None
            else:
                logger.debug(
                    f"Pre-check skip: heartbeat trop ancien ({age:.1f}s "
                    f"> {_HEARTBEAT_MAX_AGE_SECONDS}s)"
                )

    # On pre-check mismatch, leave the action in the queue and return a
    # "wait" action so the client pauses and retries
    if precheck_result and not precheck_result["match"]:
        # ---- Auto-auth: detect an authentication screen (optional) ----
        # If the mismatch is caused by an auth screen, prepend the auth
        # actions so the agent authenticates automatically.
        if _auth_handler and not precheck_result.get("popup_detected"):
            try:
                # Build a minimal ScreenState from the heartbeat
                heartbeat = _last_heartbeat.get(session_id, {})
                _auth_screen_state = {
                    "perception": {"detected_text": heartbeat.get("detected_text", [])},
                    "ui_elements": heartbeat.get("ui_elements", []),
                    "window": heartbeat.get("window_info", {}),
                    "ocr_text": heartbeat.get("ocr_text", ""),
                }
                auth_request = _auth_handler.detect_auth_screen(_auth_screen_state)
                if auth_request and auth_request.confidence >= 0.5:
                    auth_actions = _auth_handler.get_auth_actions(auth_request)
                    if auth_actions:
                        # Prepend the auth actions (before the blocked action)
                        with _replay_lock:
                            current_q = _replay_queues.get(session_id, [])
                            _replay_queues[session_id] = auth_actions + current_q
                        logger.info(
                            f"Auth auto : {len(auth_actions)} actions injectées pour "
                            f"session={session_id} app={auth_request.app_name} "
                            f"type={auth_request.auth_type} (confiance={auth_request.confidence:.2f})"
                        )
                        # Immediately return the first auth action
                        with _replay_lock:
                            first_auth = _replay_queues[session_id].pop(0)
                        return {
                            "action": first_auth,
                            "session_id": session_id,
                            "machine_id": machine_id,
                            "precheck": precheck_result,
                            "auth_detected": True,
                        }
            except Exception as e:
                logger.warning(f"Auth auto : détection échouée (non bloquant) : {e}")

        if precheck_result.get("popup_detected"):
            wait_action = {
                "action_id": f"precheck_wait_{uuid.uuid4().hex[:6]}",
                "type": "wait",
                "reason": "popup_detected",
                "suggestion": "press_escape_or_click_close",
                "expected_node": from_node,
                "similarity": precheck_result["similarity"],
                "duration_ms": 2000,
            }
            logger.warning(
                f"Pre-check: popup détectée pour session={session_id} "
                f"node={from_node}, envoi wait+suggestion"
            )
        else:
            wait_action = {
                "action_id": f"precheck_wait_{uuid.uuid4().hex[:6]}",
                "type": "wait",
                "reason": "screen_mismatch",
                "expected_node": from_node,
                "similarity": precheck_result["similarity"],
                "threshold": _PRECHECK_SIMILARITY_THRESHOLD,
                "duration_ms": 1500,
            }
            logger.warning(
                f"Pre-check: mismatch écran pour session={session_id} "
                f"node={from_node} (sim={precheck_result['similarity']:.4f}), envoi wait"
            )
        return {
            "action": wait_action,
            "session_id": session_id,
            "machine_id": machine_id,
            "precheck": precheck_result,
        }

    # Pre-check OK (or skipped): remove the action from the queue and send it
    with _replay_lock:
        current_queue = _replay_queues.get(session_id, [])
        if current_queue and current_queue[0].get("action_id") == action.get("action_id"):
            current_queue.pop(0)
        # Else: queue changed in between (benign race), send it anyway

    # Remember the sent action for retry (in case verification fails).
    # Do NOT overwrite if _schedule_retry already set the right retry_count
    action_id_sent = action.get("action_id", "")
    if action_id_sent and action_id_sent not in _retry_pending:
        _retry_pending[action_id_sent] = {
            "action": dict(action),
            "retry_count": 0,
            "replay_id": "",
        }

    logger.info(
        f"Action envoyée à {session_id} (machine={machine_id}) : "
        f"{action.get('type')} (id={action.get('action_id')})"
        f"{' [precheck OK sim=' + str(precheck_result['similarity']) + ']' if precheck_result else ''}"
    )

    response: Dict[str, Any] = {
        "action": action,
        "session_id": session_id,
        "machine_id": machine_id,
    }
    if precheck_result:
        response["precheck"] = precheck_result
    return response
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/replay/result")
async def report_action_result(report: ReplayResultReport):
    """
    Agent V1 reports back the execution result of one action.

    Lets the server track progress and detect failures. Integrates
    post-action verification (screenshot comparison / semantic Critic)
    and automatic retry (max 3 attempts) before declaring a failure.

    Retry strategy:
    - Retry 1: re-resolve the target visually and re-inject the action
    - Retry 2: wait 2s then re-inject the action (possible loading)
    - Retry 3: last identical attempt; on failure -> non-recoverable error

    Returns a dict with the recorded status, remaining queue length,
    retry count, and the verification result (if any).
    """
    session_id = report.session_id
    action_id = report.action_id

    # Find the running replay bound to this session.
    with _replay_lock:
        replay_state = None
        for state in _replay_states.values():
            if state["session_id"] == session_id and state["status"] == "running":
                replay_state = state
                break

    if not replay_state:
        logger.warning(
            f"Résultat reçu pour session {session_id} mais aucun replay actif"
        )
        return {"status": "no_active_replay", "session_id": session_id}

    # Pull the retry bookkeeping for this action (present only if this
    # report is for a retried action).
    retry_info = _retry_pending.pop(action_id, None)
    retry_count = retry_info["retry_count"] if retry_info else 0
    original_action = retry_info["action"] if retry_info else None

    # Safety guard: recover retry_count from the action_id suffixes when
    # _retry_pending is out of sync. Prevents an infinite retry loop.
    if retry_count == 0 and "_retry" in action_id:
        import re
        retry_suffixes = re.findall(r"_retry\d+", action_id)
        retry_count = max(retry_count, len(retry_suffixes))
        if retry_count > 0:
            logger.warning(
                f"retry_count corrigé par action_id : {retry_count} "
                f"(action_id contient {len(retry_suffixes)} suffixes _retry)"
            )

    # Record the latest screenshot received from the agent.
    screenshot_after = report.screenshot_after or report.screenshot
    if screenshot_after:
        with _replay_lock:
            replay_state["last_screenshot"] = screenshot_after

    # === Post-action verification ===
    # Only "click" actions are verified — "type" / "key_combo" / "wait" are
    # accepted as-is when the agent says success (no position to verify,
    # and the screen barely changes for a keystroke).
    #
    # If the agent sent a "no_screen_change" or "popup_handled" warning,
    # it already tried to handle the situation (popup handler). Do NOT
    # trigger a server-side retry — move on to the next action.
    agent_warning = report.warning or ""
    agent_handled_popup = agent_warning in ("no_screen_change", "popup_handled")
    if agent_handled_popup:
        logger.info(
            f"Action {action_id} : agent warning='{agent_warning}' — "
            f"popup déjà gérée côté agent, pas de retry serveur"
        )

    action_type_for_verify = (original_action or {}).get("type", "unknown")
    skip_verify = action_type_for_verify in ("type", "key_combo", "wait")
    # Also skip server-side verification when the agent already handled a popup.
    skip_verify = skip_verify or agent_handled_popup
    verification = None
    if report.success and screenshot_after and not skip_verify:
        # Prefer the screenshot_before sent by the agent (reliable Critic
        # input); fall back to the last screenshot stored server-side.
        screenshot_before = report.screenshot_before or replay_state.get("_last_screenshot_before")
        if screenshot_before:
            try:
                action_dict = original_action or {"type": "unknown", "action_id": action_id}
                result_dict = {
                    "success": report.success,
                    "error": report.error,
                }
                # Use the semantic Critic when the action carries an expected_result.
                expected_result = (original_action or {}).get("expected_result", "")
                action_intention = (original_action or {}).get("intention", "")
                if expected_result:
                    # Full Critic: pixel diff + semantic VLM check.
                    workflow_ctx = (
                        f"Action {replay_state.get('completed_actions', 0)+1}"
                        f"/{len(replay_state.get('actions', []))}"
                    )
                    verification = _replay_verifier.verify_with_critic(
                        action=action_dict,
                        result=result_dict,
                        screenshot_before=screenshot_before,
                        screenshot_after=screenshot_after,
                        expected_result=expected_result,
                        action_intention=action_intention,
                        workflow_context=workflow_ctx,
                    )
                    if verification.semantic_verified is not None:
                        logger.info(
                            f"Critic sémantique : {'OK' if verification.semantic_verified else 'ÉCHEC'} "
                            f"en {verification.semantic_elapsed_ms:.0f}ms — {verification.semantic_detail[:80]}"
                        )
                else:
                    # Pixel-only verification (no expected_result available).
                    verification = _replay_verifier.verify_action(
                        action=action_dict,
                        result=result_dict,
                        screenshot_before=screenshot_before,
                        screenshot_after=screenshot_after,
                    )
            except Exception as e:
                logger.warning(f"Vérification post-action échouée: {e}")

    # Keep the current screenshot as the "before" image for the next action.
    if screenshot_after:
        with _replay_lock:
            replay_state["_last_screenshot_before"] = screenshot_after

    # === Record the result ===
    with _replay_lock:
        result_entry = {
            "action_id": action_id,
            "success": report.success,
            "error": report.error,
            "warning": report.warning,
            "has_screenshot": bool(screenshot_after),
            "actual_position": report.actual_position,
            "retry_count": retry_count,
            "verification": verification.to_dict() if verification else None,
            "resolution_method": report.resolution_method,
            "resolution_score": report.resolution_score,
            "resolution_elapsed_ms": report.resolution_elapsed_ms,
        }
        replay_state["results"].append(result_entry)

    # === Learning: feed the result into the continuous-improvement loop ===
    # Best-effort: a learning failure never breaks the replay.
    try:
        _replay_learner.record_from_replay_result(
            session_id=session_id,
            action=original_action or {"action_id": action_id, "type": "unknown"},
            result=result_entry,
            verification=verification.to_dict() if verification else None,
        )
    except Exception as e:
        logger.debug(f"Learning: échec enregistrement: {e}")

    # === Audit trail: full traceability for hospital compliance ===
    # Best-effort as well: auditing must never break the replay flow.
    try:
        _action = original_action or {"action_id": action_id, "type": "unknown"}
        _target_spec = _action.get("target_spec", {})

        # Map the report + verification onto the audit outcome.
        if report.success and (verification is None or verification.verified):
            _audit_result = "success"
        elif report.success and verification and not verification.verified:
            _audit_result = "recovered" if retry_count > 0 else "failed"
        elif not report.success:
            _audit_result = "failed"
        else:
            _audit_result = "success"

        # Summarize the Critic outcome for the audit entry.
        _critic = ""
        if verification:
            if verification.semantic_verified is True:
                _critic = "semantic_ok"
            elif verification.semantic_verified is False:
                _critic = f"semantic_fail: {verification.semantic_detail[:100]}"
            elif verification.verified:
                _critic = "pixel_ok"
            else:
                _critic = f"pixel_fail: {verification.detail[:100]}"

        _audit_trail.record(AuditEntry(
            session_id=session_id,
            action_id=action_id,
            user_id=replay_state.get("params", {}).get("user_id", ""),
            user_name=replay_state.get("params", {}).get("user_name", ""),
            machine_id=replay_state.get("machine_id", ""),
            action_type=_action.get("type", ""),
            action_detail=_target_spec.get("by_text", "") or _action.get("intention", ""),
            target_app=_target_spec.get("window_title", ""),
            execution_mode=replay_state.get("params", {}).get("execution_mode", "autonomous"),
            result=_audit_result,
            resolution_method=result_entry.get("resolution_method", ""),
            critic_result=_critic,
            recovery_action=report.warning or "",
            domain=replay_state.get("params", {}).get("domain", ""),
            workflow_id=replay_state.get("workflow_id", ""),
            workflow_name=replay_state.get("params", {}).get("workflow_name", ""),
            duration_ms=result_entry.get("resolution_elapsed_ms", 0.0) or 0.0,
        ))
    except Exception as e:
        logger.debug(f"Audit Trail: échec enregistrement: {e}")

    with _replay_lock:
        # === Retry / success / failure state machine ===
        if report.success and (verification is None or verification.verified):
            # Action succeeded (verification passed or was skipped).
            replay_state["completed_actions"] += 1
            replay_state["current_action_index"] += 1

        elif report.success and verification and not verification.verified:
            # Agent says "success" but verification failed (nothing changed).
            replay_state["unverified_actions"] += 1
            logger.warning(
                f"Action {action_id} marquée success mais non vérifiée: "
                f"{verification.detail}"
            )
            if verification.suggestion == "retry" and retry_count < MAX_RETRIES_PER_ACTION:
                # Re-inject the action for a retry.
                _schedule_retry(
                    session_id, replay_state, original_action or {"action_id": action_id},
                    retry_count, "verification_failed"
                )
            else:
                # Continue anyway (action counted but flagged unverified).
                replay_state["completed_actions"] += 1
                replay_state["current_action_index"] += 1

        elif not report.success and agent_warning == "no_screen_change":
            # The action ran but the screen did not change.
            # NO retry — log the failure and move on to the next action.
            # More honest than "success" and avoids retry loops.
            replay_state["unverified_actions"] += 1
            replay_state["completed_actions"] += 1
            replay_state["current_action_index"] += 1
            logger.warning(
                f"Action {action_id} : écran inchangé (no_screen_change) — "
                f"action sans effet visible, on continue"
            )

        elif not report.success and (report.error or "") == "target_not_found":
            # Target not found visually — supervised PAUSE, NOT a fatal error.
            # The user must intervene (navigate to the right screen, close a
            # popup, etc.). The queue is NOT cleared: remaining actions resume
            # after the intervention.
            target_desc = report.target_description or "élément inconnu"
            replay_state["status"] = "paused_need_help"
            replay_state["failed_action"] = {
                "action_id": action_id,
                "type": (original_action or {}).get("type", "unknown"),
                "target_description": target_desc,
                "screenshot_b64": screenshot_after or report.screenshot,
                "target_spec": report.target_spec,
            }
            replay_state["pause_message"] = f"Je ne vois pas '{target_desc}' à l'écran"
            error_entry = {
                "action_id": action_id,
                "error": f"target_not_found: {target_desc}",
                "retry_count": 0,
                "timestamp": time.time(),
            }
            replay_state["error_log"].append(error_entry)
            logger.warning(
                f"Replay PAUSE supervisée : cible '{target_desc}' non trouvée "
                f"pour {action_id} — en attente d'intervention utilisateur"
            )
            # Log the failure for future learning.
            log_replay_failure(
                replay_id=replay_state["replay_id"],
                action_id=action_id,
                target_spec=report.target_spec,
                screenshot_b64=screenshot_after or report.screenshot,
                resolution_attempts=[
                    r for r in replay_state["results"]
                    if r.get("action_id") == action_id and r.get("resolution_method")
                ],
                error="target_not_found",
                extra={
                    "target_description": target_desc,
                    "actions_completed": replay_state["completed_actions"],
                    "actions_remaining": len(_replay_queues.get(session_id, [])),
                },
            )

        elif not report.success and "visual resolve" in (report.error or "").lower():
            # Visual resolve failed (legacy error format) — supervised PAUSE too.
            # Compatibility with agents that don't yet send "target_not_found".
            target_desc = report.target_description or (report.error or "Visual resolve échoué")
            replay_state["status"] = "paused_need_help"
            replay_state["failed_action"] = {
                "action_id": action_id,
                "type": (original_action or {}).get("type", "unknown"),
                "target_description": target_desc,
                "screenshot_b64": screenshot_after or report.screenshot,
                "target_spec": report.target_spec,
            }
            replay_state["pause_message"] = f"Je ne vois pas '{target_desc}' à l'écran"
            error_entry = {
                "action_id": action_id,
                "error": report.error or "Visual resolve échoué",
                "retry_count": 0,
                "timestamp": time.time(),
            }
            replay_state["error_log"].append(error_entry)
            logger.warning(
                f"Replay PAUSE supervisée (compat) : visual resolve échoué pour {action_id} — "
                f"{report.error}"
            )
            # Log the failure for future learning.
            log_replay_failure(
                replay_id=replay_state["replay_id"],
                action_id=action_id,
                target_spec=report.target_spec,
                screenshot_b64=screenshot_after or report.screenshot,
                error="visual_resolve_failed",
            )

        elif not report.success and retry_count < MAX_RETRIES_PER_ACTION:
            # Genuine failure (not just an unchanged screen or a visual
            # resolve issue) — schedule a retry.
            action_to_retry = original_action or {"action_id": action_id, "type": "unknown"}
            _schedule_retry(
                session_id, replay_state, action_to_retry,
                retry_count, report.error or "unknown_error"
            )

        else:
            # Definitive failure (retries exhausted).
            replay_state["failed_actions"] += 1
            error_entry = {
                "action_id": action_id,
                "error": report.error or "Retries épuisés",
                "retry_count": retry_count,
                "timestamp": time.time(),
            }
            replay_state["error_log"].append(error_entry)

            # Mark the replay as errored and drain the queue.
            replay_state["status"] = "error"
            _replay_queues[session_id] = []
            logger.error(
                f"Replay {replay_state['replay_id']} échoué à l'action {action_id} "
                f"après {retry_count} retries: {report.error}"
            )

            # Notify via callback when one is configured (non-blocking).
            _notify_error_callback(replay_state, action_id, report.error)

        # Check whether the replay is finished (empty queue + still running).
        remaining = len(_replay_queues.get(session_id, []))
        if remaining == 0 and replay_state["status"] == "running":
            replay_state["status"] = "completed"
            logger.info(
                f"Replay {replay_state['replay_id']} terminé avec succès : "
                f"{replay_state['completed_actions']}/{replay_state['total_actions']} actions"
                f" ({replay_state['retried_actions']} retries, "
                f"{replay_state['unverified_actions']} non vérifiées)"
            )
            # Summarize the visual-resolution metrics for this replay.
            results_with_method = [
                r for r in replay_state["results"]
                if r.get("resolution_method")
            ]
            if results_with_method:
                methods_count = {}
                total_elapsed = 0.0
                total_score = 0.0
                for r in results_with_method:
                    m = r["resolution_method"]
                    methods_count[m] = methods_count.get(m, 0) + 1
                    total_elapsed += r.get("resolution_elapsed_ms") or 0
                    total_score += r.get("resolution_score") or 0
                avg_elapsed = total_elapsed / len(results_with_method)
                avg_score = total_score / len(results_with_method)
                methods_str = ", ".join(
                    f"{m}={c}" for m, c in sorted(methods_count.items())
                )
                logger.info(
                    f"Replay {replay_state['replay_id']} métriques résolution : "
                    f"{len(results_with_method)} resolves [{methods_str}] "
                    f"score_moy={avg_score:.2f} temps_moy={avg_elapsed:.0f}ms"
                )

        # Release the GPU for the VLM worker once the replay has finished
        # (either successfully or in error).
        if replay_state["status"] in ("completed", "error"):
            _clear_replay_lock()
            logger.info(
                f"Replay {replay_state['replay_id']} terminé (status={replay_state['status']}) "
                f"— worker VLM autorisé à reprendre"
            )

    return {
        "status": "recorded",
        "action_id": action_id,
        "success": report.success,
        "replay_status": replay_state["status"],
        "remaining_actions": remaining,
        "retry_count": retry_count,
        "verification": verification.to_dict() if verification else None,
    }
|
||
|
||
|
||
def _create_replay_state(
|
||
replay_id: str,
|
||
workflow_id: str,
|
||
session_id: str,
|
||
total_actions: int,
|
||
params: Optional[Dict[str, Any]] = None,
|
||
machine_id: Optional[str] = None,
|
||
) -> Dict[str, Any]:
|
||
"""Créer un état de replay enrichi avec les champs de suivi d'erreur."""
|
||
return {
|
||
"replay_id": replay_id,
|
||
"workflow_id": workflow_id,
|
||
"session_id": session_id,
|
||
"machine_id": machine_id or "default", # Machine cible du replay
|
||
"status": "running",
|
||
"total_actions": total_actions,
|
||
"completed_actions": 0,
|
||
"failed_actions": 0,
|
||
"current_action_index": 0,
|
||
"params": params or {},
|
||
"results": [], # Historique des résultats action par action
|
||
# Champs enrichis pour le suivi d'erreur (#7)
|
||
"retried_actions": 0,
|
||
"unverified_actions": 0,
|
||
"error_log": [], # Liste des erreurs rencontrées
|
||
"last_screenshot": None, # Path du dernier screenshot reçu
|
||
"_last_screenshot_before": None, # Interne: screenshot avant la dernière action
|
||
# Champs pour pause supervisée (target_not_found)
|
||
"failed_action": None, # Contexte de l'action en echec (quand paused_need_help)
|
||
"pause_message": None, # Message a afficher a l'utilisateur
|
||
}
|
||
|
||
|
||
def _schedule_retry(
    session_id: str,
    replay_state: Dict[str, Any],
    action: Dict[str, Any],
    current_retry: int,
    reason: str,
):
    """
    Schedule a retry for a failed action.

    Strategy:
    - Retry 1: re-inject the action directly (agent re-resolves visually)
    - Retry 2: inject a 2s wait before the action (loading may be in progress)
    - Retry 3: last direct attempt

    The action is re-inserted at the head of the queue so it is the next
    one executed. _replay_lock must be held by the caller.

    Args:
        session_id: agent session whose queue receives the retry.
        replay_state: mutable replay-state dict (retried_actions is incremented).
        action: the original action dict to retry.
        current_retry: retries already attempted for this action.
        reason: short tag recorded in _retry_pending for diagnostics.
    """
    next_retry = current_retry + 1
    replay_state["retried_actions"] += 1

    # Copy the action under a new action_id so each attempt is tracked
    # independently by report_action_result.
    retry_action = dict(action)
    retry_action_id = f"{action.get('action_id', 'unknown')}_retry{next_retry}"
    retry_action["action_id"] = retry_action_id

    # Store the retry bookkeeping for the next report_action_result call.
    _retry_pending[retry_action_id] = {
        "action": action,
        "retry_count": next_retry,
        "replay_id": replay_state["replay_id"],
        "reason": reason,
    }

    # Pick the retry strategy based on the attempt number.
    actions_to_insert = []

    if next_retry == 2:
        # Retry 2: inject a 2s wait before the action.
        wait_action = {
            "action_id": f"wait_retry_{uuid.uuid4().hex[:6]}",
            "type": "wait",
            "duration_ms": 2000,
        }
        actions_to_insert.append(wait_action)

    actions_to_insert.append(retry_action)

    # Insert at the head of the queue (next action to execute).
    queue = _replay_queues.get(session_id, [])
    _replay_queues[session_id] = actions_to_insert + queue

    logger.info(
        f"Retry {next_retry}/{MAX_RETRIES_PER_ACTION} programmé pour {action.get('action_id')} "
        f"(raison: {reason}) | nouveau id: {retry_action_id}"
    )
|
||
|
||
|
||
def _notify_error_callback(
    replay_state: Dict[str, Any],
    action_id: str,
    error: Optional[str],
):
    """
    Notify the error callback, if one is configured for this replay.

    Fires a non-blocking HTTP POST to the registered callback URL (see
    register_error_callback). A notification failure is only logged and
    never blocks the replay flow.

    Args:
        replay_state: replay-state dict (read-only here).
        action_id: identifier of the action that failed.
        error: error message to forward (defaults to "Erreur inconnue").
    """
    replay_id = replay_state["replay_id"]
    callback_url = _error_callbacks.get(replay_id)
    if not callback_url:
        # No callback registered for this replay — nothing to do.
        return

    def _send_callback():
        # Runs on a daemon thread; all failures are swallowed after logging.
        try:
            import urllib.request
            payload = json.dumps({
                "replay_id": replay_id,
                "workflow_id": replay_state.get("workflow_id"),
                "session_id": replay_state.get("session_id"),
                "action_id": action_id,
                "error": error or "Erreur inconnue",
                "retried_actions": replay_state.get("retried_actions", 0),
                "error_log": replay_state.get("error_log", []),
                "status": replay_state.get("status"),
            }).encode("utf-8")

            req = urllib.request.Request(
                callback_url,
                data=payload,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=5) as resp:
                logger.info(
                    f"Error callback envoyé à {callback_url}: {resp.status}"
                )
        except Exception as e:
            logger.warning(
                f"Échec envoi error callback à {callback_url}: {e}"
            )

    # Send in the background so the caller is never blocked.
    threading.Thread(target=_send_callback, daemon=True).start()
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/replay/error_callback")
async def register_error_callback(config: ErrorCallbackConfig):
    """
    Register a callback URL for a replay's non-recoverable errors.

    The chat server configures this URL when launching the replay.
    When a non-recoverable error occurs (retries exhausted), the server
    POSTs the error details to this URL.
    """
    replay_id = config.replay_id
    callback_url = config.callback_url

    with _replay_lock:
        # Reject registrations for unknown replays.
        if replay_id not in _replay_states:
            raise HTTPException(
                status_code=404,
                detail=f"Replay '{replay_id}' non trouvé"
            )

        _error_callbacks[replay_id] = callback_url
        logger.info(f"Error callback enregistré pour {replay_id}: {callback_url}")

    response = {
        "status": "callback_registered",
        "replay_id": replay_id,
        "callback_url": callback_url,
    }
    return response
|
||
|
||
|
||
@app.get("/api/v1/traces/stream/replay/{replay_id}")
async def get_replay_status(replay_id: str):
    """Inspect the state of a running or finished replay.

    When the replay is in supervised pause (paused_need_help), the response
    also carries the full failure context: failed action, screenshot,
    target_spec, and a user-facing message.
    """
    with _replay_lock:
        state = _replay_states.get(replay_id)

        if state is None:
            raise HTTPException(
                status_code=404, detail=f"Replay '{replay_id}' non trouvé"
            )

        # Drop internal fields (underscore-prefixed) from the payload.
        payload = {key: val for key, val in state.items() if not key.startswith("_")}

        # Enrich with pause context when user intervention is required.
        if state["status"] == "paused_need_help":
            sid = state["session_id"]
            payload["actions_completed"] = state["completed_actions"]
            payload["actions_remaining"] = len(_replay_queues.get(sid, []))
            payload["message"] = state.get("pause_message", "Replay en pause")
            # failed_action already carries screenshot_b64 and target_spec.

        return payload
|
||
|
||
|
||
@app.get("/api/v1/traces/stream/replays")
async def list_replays():
    """List every replay (active, completed, errored)."""
    with _replay_lock:
        # Expose only public fields: internal keys are underscore-prefixed.
        public_states = []
        for state in _replay_states.values():
            visible = {key: val for key, val in state.items() if not key.startswith("_")}
            public_states.append(visible)
        return {"replays": public_states}
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/replay/{replay_id}/resume")
async def resume_replay(replay_id: str):
    """Resume a replay that is in supervised pause (paused_need_help).

    The user intervened manually (navigated to the right screen, closed a
    popup, etc.) and wants the replay to continue. The failed action is
    re-injected at the head of the queue so it is re-attempted first.

    Raises:
        HTTPException 404: unknown replay_id.
        HTTPException 409: the replay is not currently paused.
    """
    with _replay_lock:
        state = _replay_states.get(replay_id)

        if not state:
            raise HTTPException(
                status_code=404, detail=f"Replay '{replay_id}' non trouvé"
            )

        if state["status"] != "paused_need_help":
            # Resuming only makes sense from the supervised-pause state.
            raise HTTPException(
                status_code=409,
                detail=(
                    f"Replay '{replay_id}' n'est pas en pause "
                    f"(status actuel: {state['status']})"
                ),
            )

        # Grab the failed action so it can be re-injected.
        failed_action = state.get("failed_action")
        session_id = state["session_id"]

        # Put the replay back into running mode and clear the pause context.
        state["status"] = "running"
        state["failed_action"] = None
        state["pause_message"] = None

        # Re-inject the failed action at the head of the queue (re-attempted).
        if failed_action and failed_action.get("action_id"):
            # Rebuild the action from retry_pending or from the original.
            original_action_id = failed_action["action_id"]
            # Look the original action up in the retry bookkeeping.
            original = _retry_pending.pop(original_action_id, {}).get("action")
            if not original:
                # Rebuild a minimal action from the failed_action context.
                original = {
                    "action_id": original_action_id,
                    "type": failed_action.get("type", "click"),
                    "target_spec": failed_action.get("target_spec"),
                    "visual_mode": True,
                }
            # Create a new action_id so the resumed attempt is tracked.
            resume_id = f"{original_action_id}_resume"
            resume_action = dict(original)
            resume_action["action_id"] = resume_id
            # Record in retry_pending for follow-up by report_action_result.
            _retry_pending[resume_id] = {
                "action": original,
                "retry_count": 0,
                "replay_id": replay_id,
                "reason": "resume_after_pause",
            }
            queue = _replay_queues.get(session_id, [])
            _replay_queues[session_id] = [resume_action] + queue

        remaining = len(_replay_queues.get(session_id, []))
        logger.info(
            f"Replay {replay_id} repris apres pause supervisee — "
            f"{remaining} actions en attente"
        )

    return {
        "status": "resumed",
        "replay_id": replay_id,
        "session_id": session_id,
        "remaining_actions": remaining,
    }
|
||
|
||
|
||
# =========================================================================
|
||
# Visual Replay — Résolution visuelle des cibles
|
||
# =========================================================================
|
||
|
||
|
||
class ResolveTargetRequest(BaseModel):
    """Request payload for visually resolving a UI target."""
    session_id: str
    screenshot_b64: str  # JPEG screenshot, base64-encoded
    target_spec: Dict[str, Any]  # {by_role, by_text, by_position, ...}
    fallback_x_pct: float = 0.0  # Fallback coordinates (fractions of screen size)
    fallback_y_pct: float = 0.0
    screen_width: int = 1920
    screen_height: int = 1080
    strict_mode: bool = False  # True for replay sessions (template threshold 0.90 + YOLO)
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/replay/resolve_target")
async def resolve_target(request: ResolveTargetRequest):
    """
    Visually resolve a UI target from a screenshot.

    Agent V1 sends a screenshot + target_spec BEFORE executing the action.
    The server analyzes the image with UIDetector/OCR and returns the
    coordinates of the matched element.

    Matching strategy (by priority):
    1. OpenCV template matching (~100ms) — when anchor_image_base64 is supplied
    2. VLM Quick Find (~5-10s) — one VLM call to locate the element
    3. Full semantic matching (~15-20s) — ScreenAnalyzer + OCR + UI detection
    4. Fallback — static coordinates
    """
    import base64
    import io
    import tempfile

    from PIL import Image

    # Decode the screenshot; on failure return the static fallback.
    try:
        img_bytes = base64.b64decode(request.screenshot_b64)
        img = Image.open(io.BytesIO(img_bytes))
    except Exception as e:
        logger.error(f"Décodage screenshot échoué: {e}")
        return _fallback_response(request, "decode_error", str(e))

    # Persist to a temp file: the analyzers expect a file path.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        img.save(tmp, format="JPEG", quality=90)
        tmp_path = tmp.name

    try:
        # Run the visual resolution in a SEPARATE thread (not the GPU
        # executor): template matching is CPU-only.
        import asyncio
        loop = asyncio.get_event_loop()
        result = await loop.run_in_executor(
            None,  # default ThreadPool (not _gpu_executor)
            _resolve_target_sync,
            tmp_path,
            request.target_spec,
            request.screen_width,
            request.screen_height,
            request.fallback_x_pct,
            request.fallback_y_pct,
            request.strict_mode,
        )
        return result
    except Exception as e:
        logger.error(f"Résolution visuelle échouée: {e}")
        return _fallback_response(request, "analysis_error", str(e))
    finally:
        # Always clean up the temp file, even on failure.
        import os
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
|
||
|
||
|
||
# =========================================================================
|
||
# Observer — Pré-analyse écran avant résolution
|
||
# =========================================================================
|
||
|
||
|
||
class PreAnalyzeRequest(BaseModel):
    """Request payload for screen pre-analysis (Observer)."""
    screenshot_b64: str
    expected_state: str = ""  # Expected description of the screen state
    window_title: str = ""  # Expected window title
    screen_width: int = 1920
    screen_height: int = 1080
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/replay/pre_analyze")
async def pre_analyze_screen(request: PreAnalyzeRequest):
    """Observer: analyze the screen BEFORE target resolution.

    Detects popups, modal dialogs, and unexpected states that would
    prevent visual resolution from working.

    Returns:
        - screen_state: "ok" | "popup" | "unexpected"
        - popup_label: text of the popup button to click (if popup)
        - popup_coords: {x_pct, y_pct} of the button (if popup)
        - detail: description of the problem
    """
    import asyncio
    import base64
    import io

    from PIL import Image

    # Validate that the payload decodes to an image. On failure we report
    # "ok" so the caller proceeds without Observer input (best-effort).
    try:
        img_bytes = base64.b64decode(request.screenshot_b64)
        # NOTE(review): `img` is only used to validate decodability here —
        # the raw base64 string is what gets forwarded to the VLM.
        img = Image.open(io.BytesIO(img_bytes))
    except Exception as e:
        return {"screen_state": "ok", "detail": f"decode error: {e}"}

    # Run the blocking VLM call off the event loop (default thread pool).
    loop = asyncio.get_event_loop()
    result = await loop.run_in_executor(
        None,
        _pre_analyze_screen_sync,
        request.screenshot_b64,
        request.expected_state,
        request.window_title,
        request.screen_width,
        request.screen_height,
    )
    return result
||
|
||
|
||
def _pre_analyze_screen_sync(
    screenshot_b64: str,
    expected_state: str,
    window_title: str,
    screen_width: int,
    screen_height: int,
) -> Dict[str, Any]:
    """Synchronous screen pre-analysis via VLM.

    Uses gemma4 (Docker, port 11435) to detect:
    1. Popups/modal dialogs (with coordinates of the button to click)
    2. States inconsistent with what was expected

    Fast (~2-5s) as gemma4 is lightweight and runs in text+image mode.

    NOTE(review): `expected_state` and `window_title` are currently unused
    here — the VLM prompt only asks about popups. Kept for interface
    stability with pre_analyze_screen.

    Returns a dict with at least "screen_state" ("ok" | "popup" |
    "unexpected"); for popups, optionally "popup_label"/"popup_coords".
    All failure paths degrade to {"screen_state": "ok", ...} so the caller
    never blocks on Observer errors.
    """
    import os
    import time
    import requests as _requests

    gemma4_port = os.environ.get("GEMMA4_PORT", "11435")
    gemma4_url = f"http://localhost:{gemma4_port}/api/chat"

    # Load the business-domain context for the Observer.
    from .domain_context import get_domain_context
    domain = get_domain_context(os.environ.get("RPA_DOMAIN", "generic"))

    # Concise prompt for popup detection (French, matches parsing below).
    prompt = (
        "Regarde cette capture d'écran.\n"
        "Y a-t-il une popup, boîte de dialogue, message d'erreur, ou fenêtre modale visible ?\n\n"
        "Réponds EXACTEMENT dans ce format :\n"
        "ÉTAT: OK ou POPUP ou INATTENDU\n"
        "BOUTON: texte du bouton à cliquer (si POPUP, sinon 'aucun')\n"
        "DÉTAIL: description courte (1 ligne)"
    )

    # Build the message list, prepending the domain system prompt if any.
    messages = []
    if domain.system_prompt:
        messages.append({"role": "system", "content": domain.system_prompt})
    messages.append({"role": "user", "content": prompt, "images": [screenshot_b64]})

    try:
        t_start = time.time()
        resp = _requests.post(
            gemma4_url,
            json={
                "model": "gemma4:e4b",
                "messages": messages,
                "stream": False,
                "think": True,
                "options": {"temperature": 0.1, "num_predict": 800},
            },
            timeout=30,
        )
        elapsed_ms = (time.time() - t_start) * 1000

        if not resp.ok:
            logger.warning(f"Observer VLM HTTP {resp.status_code}")
            return {"screen_state": "ok", "detail": f"VLM HTTP {resp.status_code}"}

        content = resp.json().get("message", {}).get("content", "").strip()
        logger.info(f"Observer VLM ({elapsed_ms:.0f}ms) : {content[:100]}")

        # Parse the structured VLM reply (ÉTAT / BOUTON / DÉTAIL lines).
        state = "ok"
        button = ""
        detail = content

        for line in content.split("\n"):
            line_clean = line.strip()
            upper = line_clean.upper()
            if upper.startswith("ÉTAT:") or upper.startswith("ETAT:"):
                val = upper.split(":", 1)[1].strip()
                if "POPUP" in val:
                    state = "popup"
                elif "INATTENDU" in val or "UNEXPECTED" in val:
                    state = "unexpected"
                else:
                    state = "ok"
            elif upper.startswith("BOUTON:"):
                button = line_clean.split(":", 1)[1].strip().strip("'\"")
                if button.lower() in ("aucun", "none", "n/a", ""):
                    button = ""
            elif upper.startswith("DÉTAIL:") or upper.startswith("DETAIL:"):
                detail = line_clean.split(":", 1)[1].strip()

        if state == "ok":
            return {"screen_state": "ok"}

        result = {
            "screen_state": state,
            "detail": detail,
            "elapsed_ms": round(elapsed_ms, 1),
        }

        # If a popup with a button label was detected, try to locate it.
        if state == "popup" and button:
            result["popup_label"] = button
            # Locate the button by VLM grounding (qwen2.5vl).
            coords = _locate_popup_button(screenshot_b64, button, screen_width, screen_height)
            if coords:
                result["popup_coords"] = coords

        return result

    except _requests.Timeout:
        # Fix: message previously said "(15s)" while the request uses timeout=30.
        logger.debug("Observer VLM timeout (30s)")
        return {"screen_state": "ok", "detail": "VLM timeout"}
    except Exception as e:
        logger.debug(f"Observer VLM erreur : {e}")
        return {"screen_state": "ok", "detail": str(e)}
|
||
|
||
|
||
def _locate_popup_button(
    screenshot_b64: str, button_text: str,
    screen_width: int, screen_height: int,
) -> Optional[Dict[str, float]]:
    """Locate a popup button on screen via VLM grounding (qwen2.5vl).

    Relies on qwen2.5vl's native ``bbox_2d`` output to pinpoint the
    button on the screenshot. Returns the box center normalized to the
    screen dimensions ([0, 1] range), or None when the button cannot be
    found, the VLM call fails, or the coordinates fall out of bounds.
    """
    import requests as _requests
    import re

    ollama_url = "http://localhost:11434/api/chat"
    prompt = f"Detect the button with text '{button_text}' with a bounding box."

    try:
        response = _requests.post(
            ollama_url,
            json={
                "model": "qwen2.5vl:7b",
                "messages": [{"role": "user", "content": prompt, "images": [screenshot_b64]}],
                "stream": False,
                "options": {"temperature": 0.1, "num_predict": 50},
            },
            timeout=15,
        )
        if not response.ok:
            return None

        answer = response.json().get("message", {}).get("content", "")

        # qwen2.5vl reports bbox_2d in pixels of the submitted image,
        # NOT on a 1000x1000 grid. Expected JSON shape:
        # [{"bbox_2d": [x1, y1, x2, y2], "label": "..."}]
        box = re.search(
            r'"bbox_2d"\s*:\s*\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]',
            answer,
        )
        if box:
            x1, y1, x2, y2 = (int(box.group(i)) for i in range(1, 5))
            # Box center, normalized from pixels to the 0-1 screen range.
            cx = (x1 + x2) / 2 / screen_width
            cy = (y1 + y2) / 2 / screen_height
            if 0.0 <= cx <= 1.0 and 0.0 <= cy <= 1.0:
                logger.info(f"Observer : bouton '{button_text}' localisé à ({cx:.3f}, {cy:.3f})")
                return {"x_pct": cx, "y_pct": cy}

    except Exception as e:
        logger.debug(f"Observer grounding bouton erreur : {e}")

    return None
|
||
|
||
|
||
def _resolve_by_template_matching(
    screenshot_path: str,
    anchor_image_b64: str,
    screen_width: int,
    screen_height: int,
    confidence_threshold: float = 0.7,
) -> Optional[Dict[str, Any]]:
    """Resolve an anchor's position via OpenCV template matching.

    Compares the anchor image (a crop saved at learning time) against the
    current screenshot using cv2.matchTemplate (TM_CCOEFF_NORMED), trying
    a range of scales to tolerate resolution changes between learning
    and replay (e.g. 2560x1600 -> 1280x720 ~= 0.5x).

    Args:
        screenshot_path: Path to the current screen screenshot.
        anchor_image_b64: Anchor image (PNG) encoded in base64.
        screen_width: Screen width in pixels (accepted for interface
            symmetry with the other resolvers; the screenshot's own
            dimensions are what is actually used for normalization).
        screen_height: Screen height in pixels (same note as above).
        confidence_threshold: Minimum accepted match score (0.0 to 1.0).

    Returns:
        Dict with resolved=True, normalized x_pct/y_pct, score, scale and
        the match box, or None when no match clears the threshold or an
        error occurs (OpenCV missing, unreadable images, ...).
    """
    import base64

    # OpenCV/numpy are optional server-side dependencies.
    try:
        import cv2
        import numpy as np
    except ImportError:
        logger.warning("OpenCV non disponible pour template matching")
        return None

    try:
        # Load the current screenshot.
        screenshot = cv2.imread(screenshot_path)
        if screenshot is None:
            logger.warning("Impossible de lire le screenshot : %s", screenshot_path)
            return None

        # Decode the anchor image from base64.
        anchor_bytes = base64.b64decode(anchor_image_b64)
        anchor_array = np.frombuffer(anchor_bytes, dtype=np.uint8)
        anchor_img = cv2.imdecode(anchor_array, cv2.IMREAD_COLOR)
        if anchor_img is None:
            logger.warning("Impossible de décoder l'image de l'ancre")
            return None

        # Grayscale matching is faster and robust to minor color shifts.
        screenshot_gray = cv2.cvtColor(screenshot, cv2.COLOR_BGR2GRAY)
        anchor_gray = cv2.cvtColor(anchor_img, cv2.COLOR_BGR2GRAY)

        # The template must fit inside the search image.
        sh, sw = screenshot_gray.shape[:2]
        ah, aw = anchor_gray.shape[:2]
        if ah > sh or aw > sw:
            logger.warning(
                "Ancre (%dx%d) plus grande que le screenshot (%dx%d)",
                aw, ah, sw, sh,
            )
            return None

        # Multi-scale template matching: try scale 1.0 first, then a few
        # variations in case the resolution changed. Extended 0.5x-2.0x
        # range to cover large gaps (e.g. learned at 2560x1600, replayed
        # at 1280x720 = ~0.5x ratio).
        best_val = -1.0
        best_loc = None
        best_scale = 1.0
        best_anchor_size = (aw, ah)

        for scale in [1.0, 0.9, 1.1, 0.8, 1.2, 0.75, 1.25, 0.6, 1.5, 0.5, 1.75, 2.0]:
            if scale != 1.0:
                new_w = int(aw * scale)
                new_h = int(ah * scale)
                # Skip degenerate or oversized templates at this scale.
                if new_w < 10 or new_h < 10 or new_w > sw or new_h > sh:
                    continue
                scaled_anchor = cv2.resize(anchor_gray, (new_w, new_h))
            else:
                scaled_anchor = anchor_gray
                new_w, new_h = aw, ah

            result = cv2.matchTemplate(screenshot_gray, scaled_anchor, cv2.TM_CCOEFF_NORMED)
            _, max_val, _, max_loc = cv2.minMaxLoc(result)

            if max_val > best_val:
                best_val = max_val
                best_loc = max_loc
                best_scale = scale
                best_anchor_size = (new_w, new_h)

            # A near-perfect match: no need to try further scales.
            if best_val >= 0.95:
                break

        if best_val < confidence_threshold:
            logger.info(
                "Template matching : meilleur score=%.3f < seuil=%.3f (ancre %dx%d, écran %dx%d)",
                best_val, confidence_threshold, aw, ah, sw, sh,
            )
            return None

        # Center of the best match, in screenshot pixels.
        match_w, match_h = best_anchor_size
        cx = best_loc[0] + match_w / 2.0
        cy = best_loc[1] + match_h / 2.0

        # Convert to normalized proportions of the screenshot.
        x_pct = round(cx / sw, 6) if sw > 0 else 0.0
        y_pct = round(cy / sh, 6) if sh > 0 else 0.0

        logger.info(
            "Template matching OK : score=%.3f, échelle=%.2f, "
            "centre=(%d, %d) → (%.4f, %.4f) sur %dx%d",
            best_val, best_scale, int(cx), int(cy), x_pct, y_pct, sw, sh,
        )

        return {
            "resolved": True,
            "method": "template_matching",
            "x_pct": x_pct,
            "y_pct": y_pct,
            "matched_element": {
                "label": "anchor_template",
                "type": "visual_anchor",
                "role": "anchor",
                "center": [int(cx), int(cy)],
                "confidence": best_val,
            },
            "score": best_val,
            "scale": best_scale,
            "match_box": {
                "x": best_loc[0],
                "y": best_loc[1],
                "width": match_w,
                "height": match_h,
            },
        }

    except Exception as e:
        logger.error("Erreur template matching : %s", e)
        return None
|
||
|
||
|
||
def _validate_match_context(
|
||
result: Dict[str, Any],
|
||
original_x_pct: float,
|
||
original_y_pct: float,
|
||
target_spec: Dict[str, Any],
|
||
max_distance: float = 0.35,
|
||
) -> bool:
|
||
"""Vérifier que la position trouvée est dans la même zone que l'originale.
|
||
|
||
Évite les faux positifs du template matching : un bouton similaire visuellement
|
||
mais situé dans une zone très différente de l'écran.
|
||
|
||
Args:
|
||
result: Résultat du template matching (contient x_pct, y_pct).
|
||
original_x_pct: Position X originale (pourcentage, 0.0-1.0).
|
||
original_y_pct: Position Y originale (pourcentage, 0.0-1.0).
|
||
target_spec: Spécification de la cible (non utilisé pour l'instant,
|
||
mais disponible pour des règles contextuelles futures).
|
||
max_distance: Distance euclidienne maximum acceptée (en pourcentage de l'écran).
|
||
Défaut 0.35 = ~35% de la diagonale, assez permissif pour les UI dynamiques.
|
||
|
||
Returns:
|
||
True si la position est valide (même zone), False sinon.
|
||
"""
|
||
found_x = result.get("x_pct", 0.0)
|
||
found_y = result.get("y_pct", 0.0)
|
||
|
||
# Distance euclidienne en pourcentage de l'écran
|
||
dx = found_x - original_x_pct
|
||
dy = found_y - original_y_pct
|
||
distance = (dx ** 2 + dy ** 2) ** 0.5
|
||
|
||
if distance > max_distance:
|
||
logger.debug(
|
||
"Context validation : distance=%.3f > max=%.3f "
|
||
"(found=(%.3f, %.3f), original=(%.3f, %.3f))",
|
||
distance, max_distance, found_x, found_y, original_x_pct, original_y_pct,
|
||
)
|
||
return False
|
||
|
||
return True
|
||
|
||
|
||
# =========================================================================
# YOLO/OmniParser — resolution via UI element detection
# =========================================================================

# Lazy-loaded OmniParser singleton (GPU-backed).
_omniparser_available: Optional[bool] = None  # None = not probed yet
_omniparser_instance = None
_omniparser_lock = threading.Lock()


def _get_omniparser():
    """Return the shared OmniParserAdapter instance (lazy, thread-safe).

    Uses double-checked locking: a lock-free fast path once the outcome
    is known, then a second check under the lock before attempting the
    costly import and initialization.

    Returns:
        OmniParserAdapter, or None when OmniParser is unavailable.
    """
    global _omniparser_available, _omniparser_instance

    # Fast path: availability already decided, or instance already built.
    if _omniparser_available is False:
        return None
    if _omniparser_instance is not None:
        return _omniparser_instance

    with _omniparser_lock:
        # Re-check under the lock: another thread may have finished first.
        if _omniparser_available is False:
            return None
        if _omniparser_instance is not None:
            return _omniparser_instance
        try:
            from core.detection.omniparser_adapter import OmniParserAdapter

            candidate = OmniParserAdapter()
            if candidate.available:
                _omniparser_instance = candidate
                _omniparser_available = True
                logger.info("OmniParser disponible pour la résolution YOLO")
                return candidate
            _omniparser_available = False
            logger.info("OmniParser : modèles non trouvés, YOLO désactivé")
            return None
        except ImportError:
            _omniparser_available = False
            logger.info("OmniParser non installé, YOLO désactivé")
            return None
        except Exception as e:
            _omniparser_available = False
            logger.warning("OmniParser init échouée : %s", e)
            return None
|
||
|
||
|
||
def _resolve_by_yolo(
    screenshot_path: str,
    anchor_image_b64: str,
    screen_width: int,
    screen_height: int,
    target_spec: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
    """Resolve a target via YOLO/OmniParser: detect every UI element,
    then match the reference crop against the detected elements.

    Strategy:
        1. OmniParser detects all UI elements on the screenshot (~0.6-0.8s)
        2. For each detected element, run local template matching against the anchor
        3. Exactly one good match (score >= 0.50) -> accept it
        4. Two or more ambiguous matches -> return None (the VLM decides later)

    Args:
        screenshot_path: Path to the JPEG screenshot
        anchor_image_b64: Anchor image encoded in base64
        screen_width: Screen width (note: normalization below actually uses
            the screenshot's own dimensions, not this parameter)
        screen_height: Screen height (same note as screen_width)
        target_spec: Target specification (currently not consulted here)

    Returns:
        Dict with resolved=True/False, x_pct, y_pct, score,
        or None when OmniParser is unavailable or nothing matches
    """
    import base64

    # OpenCV/numpy are optional; without them this resolver is disabled.
    try:
        import cv2
        import numpy as np
    except ImportError:
        return None

    omniparser = _get_omniparser()
    if omniparser is None:
        return None

    t0 = time.time()

    try:
        from PIL import Image as PILImage

        # Load the screenshot as PIL (OmniParser consumes PIL images).
        screenshot_pil = PILImage.open(screenshot_path)
        sw, sh = screenshot_pil.size

        # Also load it as numpy/OpenCV for the per-element template matching.
        screenshot_np = np.array(screenshot_pil)
        if len(screenshot_np.shape) == 3 and screenshot_np.shape[2] == 3:
            # PIL is RGB; OpenCV expects BGR.
            screenshot_bgr = cv2.cvtColor(screenshot_np, cv2.COLOR_RGB2BGR)
        else:
            screenshot_bgr = screenshot_np
        screenshot_gray = cv2.cvtColor(screenshot_bgr, cv2.COLOR_BGR2GRAY)

        # Decode the anchor crop from base64.
        anchor_bytes = base64.b64decode(anchor_image_b64)
        anchor_array = np.frombuffer(anchor_bytes, dtype=np.uint8)
        anchor_img = cv2.imdecode(anchor_array, cv2.IMREAD_COLOR)
        if anchor_img is None:
            logger.warning("YOLO resolve : impossible de décoder l'anchor")
            return None
        anchor_gray = cv2.cvtColor(anchor_img, cv2.COLOR_BGR2GRAY)
        anchor_h, anchor_w = anchor_gray.shape[:2]

        # Detect every UI element with OmniParser.
        elements = omniparser.detect(screenshot_pil)
        if not elements:
            elapsed = time.time() - t0
            logger.info("YOLO resolve : 0 éléments détectés (%.1fs)", elapsed)
            return None

        logger.info(
            "YOLO resolve : %d éléments détectés, matching anchor %dx%d...",
            len(elements), anchor_w, anchor_h,
        )

        # Match the anchor against each detected element.
        YOLO_MATCH_THRESHOLD = 0.50
        matches = []

        for elem in elements:
            x1, y1, x2, y2 = elem.bbox
            elem_w = x2 - x1
            elem_h = y2 - y1

            # Skip elements too small to be meaningful templates.
            if elem_w < 5 or elem_h < 5:
                continue

            # Extract this element's crop from the screenshot.
            elem_crop = screenshot_gray[y1:y2, x1:x2]
            if elem_crop.size == 0:
                continue

            # Local template matching: resize the anchor to the element's
            # size, or the other way around, depending on relative sizes.
            try:
                # Approach 1: resize the anchor to the crop size and compare.
                if elem_w > 0 and elem_h > 0:
                    anchor_resized = cv2.resize(anchor_gray, (elem_w, elem_h))
                    result = cv2.matchTemplate(
                        elem_crop, anchor_resized, cv2.TM_CCOEFF_NORMED
                    )
                    _, max_val, _, _ = cv2.minMaxLoc(result)
                else:
                    continue

                # Approach 2: also try the original-size anchor when the
                # element crop is large enough to contain it; keep the best.
                if elem_w >= anchor_w and elem_h >= anchor_h:
                    result2 = cv2.matchTemplate(
                        elem_crop, anchor_gray, cv2.TM_CCOEFF_NORMED
                    )
                    _, max_val2, _, _ = cv2.minMaxLoc(result2)
                    max_val = max(max_val, max_val2)

                if max_val >= YOLO_MATCH_THRESHOLD:
                    matches.append((elem, max_val))

            except cv2.error:
                # Degenerate crop/template combinations: skip the element.
                continue

        elapsed = time.time() - t0

        if not matches:
            logger.info(
                "YOLO resolve : aucun match >= %.2f parmi %d éléments (%.1fs)",
                YOLO_MATCH_THRESHOLD, len(elements), elapsed,
            )
            return None

        # Sort matches by descending score.
        matches.sort(key=lambda m: m[1], reverse=True)
        best_elem, best_score = matches[0]

        # Two or more matches with close scores (< 0.10 apart) is ambiguous
        # -> let the VLM decide instead of guessing.
        if len(matches) >= 2:
            second_score = matches[1][1]
            if best_score - second_score < 0.10:
                logger.info(
                    "YOLO resolve : %d matchs ambigus (best=%.3f, second=%.3f, "
                    "écart=%.3f < 0.10), VLM requis (%.1fs)",
                    len(matches), best_score, second_score,
                    best_score - second_score, elapsed,
                )
                return None

        # A single clear match -> accept it, normalized to screenshot size.
        cx, cy = best_elem.center
        x_pct = round(cx / sw, 6) if sw > 0 else 0.0
        y_pct = round(cy / sh, 6) if sh > 0 else 0.0

        logger.info(
            "YOLO resolve OK : '%s' (%s) score=%.3f → (%.4f, %.4f) "
            "parmi %d éléments, %d matchs (%.1fs)",
            best_elem.label, best_elem.element_type, best_score,
            x_pct, y_pct, len(elements), len(matches), elapsed,
        )

        return {
            "resolved": True,
            "method": "yolo_omniparser",
            "x_pct": x_pct,
            "y_pct": y_pct,
            "matched_element": {
                "label": best_elem.label,
                "type": best_elem.element_type,
                "role": "yolo_detected",
                "center": [cx, cy],
                "confidence": best_score,
            },
            "score": best_score,
            "yolo_elements_count": len(elements),
            "yolo_matches_count": len(matches),
        }

    except Exception as e:
        elapsed = time.time() - t0
        logger.warning("YOLO resolve : exception (%.1fs) — %s", elapsed, e)
        return None
|
||
|
||
|
||
# =========================================================================
# VLM Quick Find — lightweight fallback when template matching fails
# =========================================================================

# Singleton Ollama client (created on first use, not at server startup)
_vlm_client = None
_vlm_client_lock = threading.Lock()

# Dedicated timeout for VLM Quick Find (shorter than the default timeout)
_VLM_QUICK_FIND_TIMEOUT = 30  # seconds


def _get_vlm_client():
    """Get or lazily create the singleton Ollama client for VLM Quick Find.

    Lazy initialization: the client is only built on the first call, not
    at server startup (avoids blocking when Ollama is down). The model is
    resolved automatically via vlm_config (RPA_VLM_MODEL).

    Returns:
        The shared OllamaClient, or None when initialization fails.
    """
    global _vlm_client
    if _vlm_client is not None:
        return _vlm_client
    with _vlm_client_lock:
        # Re-check under the lock: another thread may have built it first.
        if _vlm_client is not None:
            return _vlm_client
        try:
            from core.detection.ollama_client import OllamaClient
            from core.detection.vlm_config import get_vlm_model

            _model = get_vlm_model()
            _vlm_client = OllamaClient(
                endpoint="http://localhost:11434",
                model=_model,
                timeout=_VLM_QUICK_FIND_TIMEOUT,
            )
            logger.info("VLM Quick Find : client Ollama initialisé (%s)", _model)
        except Exception as e:
            # Lazy %-style args, consistent with the module's other log calls
            # (the original f-string formatted eagerly even when suppressed).
            logger.warning("VLM Quick Find : impossible d'initialiser le client Ollama : %s", e)
            return None
    return _vlm_client
|
||
|
||
|
||
def _build_target_description(target_spec: Dict[str, Any]) -> str:
|
||
"""Construire une description textuelle de l'élément à trouver.
|
||
|
||
Utilisé par le VLM Quick Find pour savoir quoi chercher sur le screenshot.
|
||
|
||
Args:
|
||
target_spec: Spécification de la cible (by_text, by_role, etc.)
|
||
|
||
Returns:
|
||
Description en langage naturel, ex: "un bouton contenant 'Valider'"
|
||
"""
|
||
by_text = target_spec.get("by_text", "").strip()
|
||
by_role = target_spec.get("by_role", "").strip()
|
||
|
||
if by_text and by_role:
|
||
return f"un {by_role} contenant '{by_text}'"
|
||
elif by_text:
|
||
return f"élément contenant le texte '{by_text}'"
|
||
elif by_role:
|
||
return f"un {by_role}"
|
||
else:
|
||
return "l'élément interactif principal"
|
||
|
||
|
||
def _vlm_quick_find(
    screenshot_path: str,
    target_description: str,
    anchor_image_b64: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Ask the VLM to locate an element on the screenshot.

    VLM-first replay strategy: the VLM understands the screen context and
    can find an element even when its appearance has changed.

    Operating modes:
    - anchor_image_b64 + description: multi-image (screenshot + reference crop).
      The VLM sees both the screenshot AND the crop, which is far more precise.
    - description only: single-image, the VLM searches from the text alone.
    - anchor_image_b64 only (no description): multi-image with a purely
      visual prompt.

    Args:
        screenshot_path: Path to the current screenshot
        target_description: Rich description of the element to find.
            E.g.: "Dans la fenêtre 'Exécuter', l'élément cliqué en bas au centre"
        anchor_image_b64: Reference crop in base64 (optional).
            When provided, it is sent as a second image for visual comparison.

    Returns:
        {"x_pct": float, "y_pct": float, "confidence": float, "method": "vlm_quick_find"}
        or None when the element is not found or an error occurs
    """
    client = _get_vlm_client()
    if client is None:
        logger.debug("VLM Quick Find : client Ollama non disponible, skip")
        return None

    t0 = time.time()

    # Build the prompt according to the information available.
    has_anchor = bool(anchor_image_b64)
    has_description = bool(target_description and target_description.strip())

    if has_anchor and has_description:
        # Optimal mode: screenshot + reference crop + textual description.
        prompt = (
            "The first image is the current screen. "
            "The second image shows the element I want to click.\n\n"
            f"Context: {target_description}\n\n"
            "Find this exact element on the screen and return its CENTER coordinates "
            "as percentage of the screen dimensions.\n"
            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
        )
    elif has_anchor:
        # Purely visual mode: screenshot + crop, no description.
        prompt = (
            "The first image is the current screen. "
            "The second image shows the element I want to click.\n\n"
            "Find this exact element on the screen and return its CENTER coordinates "
            "as percentage of the screen dimensions.\n"
            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
        )
    else:
        # Description-only mode.
        prompt = (
            "Look at this screenshot carefully.\n\n"
            f"{target_description}\n\n"
            "Find this element and return its CENTER coordinates "
            "as percentage of the image dimensions.\n"
            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
        )

    system_prompt = "You are a UI element locator. Output raw JSON only. No explanation."

    try:
        # Additional images for the VLM (the anchor crop, when available).
        extra_images = [anchor_image_b64] if has_anchor else None

        result = client.generate(
            prompt=prompt,
            image_path=screenshot_path,
            system_prompt=system_prompt,
            temperature=0.1,
            max_tokens=200,
            force_json=False,
            extra_images_b64=extra_images,
        )

        elapsed = time.time() - t0

        if not result.get("success"):
            logger.info(
                "VLM Quick Find : échec appel VLM (%.1fs) — %s",
                elapsed, result.get("error", "?"),
            )
            return None

        response_text = result.get("response", "").strip()
        if not response_text:
            logger.info("VLM Quick Find : réponse vide du VLM (%.1fs)", elapsed)
            return None

        # Parse the JSON response (reuse OllamaClient's robust parser).
        # NOTE(review): relies on a private client method — consider making
        # _extract_json_from_response public on OllamaClient.
        parsed = client._extract_json_from_response(response_text)
        if parsed is None:
            logger.info(
                "VLM Quick Find : réponse non-JSON (%.1fs) — %.80s",
                elapsed, response_text,
            )
            return None

        # Validate the coordinates.
        x_pct = parsed.get("x_pct")
        y_pct = parsed.get("y_pct")
        confidence = float(parsed.get("confidence", 0.0))

        # Reject "not visible" answers and low-confidence guesses.
        if x_pct is None or y_pct is None or confidence < 0.3:
            logger.info(
                "VLM Quick Find : élément non trouvé ou confiance trop basse "
                "(%.1fs, confidence=%.2f) pour '%s'",
                elapsed, confidence,
                target_description[:80] if target_description else "(anchor only)",
            )
            return None

        x_pct = float(x_pct)
        y_pct = float(y_pct)

        # Make sure the coordinates are within the [0, 1] range.
        if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
            logger.info(
                "VLM Quick Find : coordonnées hors bornes (%.4f, %.4f), ignoré",
                x_pct, y_pct,
            )
            return None

        mode_str = "multi-image" if has_anchor else "description"
        desc_short = (target_description[:60] + "...") if target_description and len(target_description) > 60 else (target_description or "(anchor)")
        logger.info(
            "VLM Quick Find OK [%s] : '%s' → (%.4f, %.4f) confidence=%.2f en %.1fs",
            mode_str, desc_short, x_pct, y_pct, confidence, elapsed,
        )

        return {
            "resolved": True,
            "method": "vlm_quick_find",
            "x_pct": round(x_pct, 6),
            "y_pct": round(y_pct, 6),
            "matched_element": {
                "label": target_description or "anchor_visual",
                "type": "vlm_located",
                "role": "vlm_quick_find",
                "confidence": confidence,
            },
            "score": confidence,
        }

    except Exception as e:
        elapsed = time.time() - t0
        logger.warning(
            "VLM Quick Find : exception (%.1fs) — %s", elapsed, e,
        )
        return None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Résolution par VLM Grounding Direct (configurable via RPA_VLM_MODEL)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _resolve_by_grounding(
    screenshot_path: str,
    target_spec: Dict[str, Any],
    screen_width: int,
    screen_height: int,
) -> Optional[Dict[str, Any]]:
    """Resolve a target via direct VLM grounding.

    The grounding VLM (qwen2.5vl:7b by default, configurable via
    RPA_GROUNDING_MODEL) receives the screenshot plus a textual
    description and directly returns the element coordinates. No
    SomEngine, no numbering — the VLM performs native UI grounding.

    More reliable than SomEngine+VLM for icons and visual elements
    without text (Windows logo, floppy-disk icon, close button).

    Args:
        screenshot_path: Path to the current full-screen screenshot.
        target_spec: Target specification (by_text / vlm_description /
            window_capture / original_position / anchor_image_base64).
        screen_width: Screen width in pixels, used to convert
            window-relative coordinates back to screen-relative ones.
        screen_height: Screen height in pixels (same usage).

    Returns:
        Dict with resolved=True and normalized x_pct/y_pct, or None when
        no description is available, the VLM fails, or the answer cannot
        be parsed or is out of bounds.
    """
    import base64
    import io
    import re

    t0 = time.time()

    # Build the target description: explicit text wins over the
    # free-form VLM description; without either we cannot ground.
    by_text = target_spec.get("by_text", "").strip()
    vlm_desc = target_spec.get("vlm_description", "").strip()

    if by_text:
        description = by_text
    elif vlm_desc:
        description = vlm_desc
    else:
        return None

    # Prefer the window capture when available (more focused, less noise);
    # otherwise fall back to the full screen.
    window_capture = target_spec.get("window_capture", {})
    window_rect = window_capture.get("rect")  # [x1, y1, x2, y2] screen coords

    try:
        from PIL import Image as PILImage

        # Use the active window: crop it out of the full screenshot via
        # window_rect (works both at replay and at recording time).
        img = PILImage.open(screenshot_path)

        if window_rect:
            x1, y1, x2, y2 = window_rect
            img = img.crop((x1, y1, x2, y2))
            using_window = True
            logger.debug("Grounding : crop fenêtre (%d,%d,%d,%d) → %dx%d", x1, y1, x2, y2, *img.size)
        else:
            using_window = False

        orig_w, orig_h = img.size
        small_w, small_h = orig_w, orig_h  # no resizing

        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=80)
        shot_b64 = base64.b64encode(buf.getvalue()).decode()
    except Exception as e:
        logger.warning("Grounding : erreur chargement image — %s", e)
        return None

    # Native Qwen2.5-VL prompt — bbox_2d format (the only reliable one).
    # Add the relative position to disambiguate (e.g. two "Rechercher"
    # labels visible at once).
    original_pos = target_spec.get("original_position", {})
    pos_hint = ""
    y_rel = original_pos.get("y_relative", "")
    x_rel = original_pos.get("x_relative", "")
    if y_rel or x_rel:
        # BUGFIX: the previous code stripped the whole hint, which removed
        # the leading space and glued the hint onto the quoted description
        # ("'desc'located top left..."). Strip only the inner part so the
        # separating space survives.
        rel = f"{y_rel} {x_rel}".strip()
        pos_hint = f" located {rel} of the screen"
    prompt = f"Detect '{description}'{pos_hint} in this image with a bounding box."

    # Grounding requires a model trained for coordinate output (bbox_2d).
    # Qwen2.5-VL is the only one returning precise positions; gemma4
    # understands images but cannot localize in coordinates.
    _grounding_model = os.environ.get("RPA_GROUNDING_MODEL", "qwen2.5vl:7b")

    # VLM call — vLLM (GPU, fast) first, Ollama as fallback.
    import requests as _requests
    content = ""

    # vLLM endpoint configurable via env.
    _vllm_port = os.environ.get("VLLM_PORT", "8100")
    _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")

    # Attempt 1: vLLM (OpenAI-compatible API, GPU).
    try:
        vllm_resp = _requests.post(
            f"http://localhost:{_vllm_port}/v1/chat/completions",
            json={
                "model": _vllm_model,
                "messages": [
                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
                    {"role": "user", "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
                    ]},
                ],
                "temperature": 0.1,
                "max_tokens": 80,
            },
            timeout=30,
        )
        if vllm_resp.ok:
            content = vllm_resp.json().get("choices", [{}])[0].get("message", {}).get("content", "")
            if content:
                logger.debug("Grounding via vLLM OK")
    except Exception as e:
        logger.debug("vLLM non disponible (%s), fallback Ollama", e)

    # Attempt 2: Ollama (qwen2.5vl:7b grounding — native bbox_2d format).
    if not content:
        try:
            resp = _requests.post("http://localhost:11434/api/chat", json={
                "model": _grounding_model,
                "messages": [
                    {"role": "user", "content": prompt, "images": [shot_b64]},
                ],
                "stream": False,
                "options": {"temperature": 0.1, "num_predict": 100},
            }, timeout=60)
            content = resp.json().get("message", {}).get("content", "")
        except Exception as e:
            logger.info("Grounding VLM timeout/erreur : %s", e)
            return None

    elapsed = time.time() - t0

    # Parse the answer — supports bbox_2d in pixels, JSON %, raw arrays.
    x_pct, y_pct = None, None

    # Format 1: bbox_2d in pixels, [x, y] or [x1, y1, x2, y2].
    bbox_match = re.search(r'"bbox_2d"\s*:\s*\[([^\]]+)\]', content)
    if bbox_match:
        coords = [float(v.strip()) for v in bbox_match.group(1).split(",")]
        if len(coords) == 2:
            x_pct = coords[0] / small_w
            y_pct = coords[1] / small_h
        elif len(coords) >= 4:
            x_pct = (coords[0] + coords[2]) / 2 / small_w
            y_pct = (coords[1] + coords[3]) / 2 / small_h

    # Format 2: JSON {"x": 0.XX, "y": 0.YY}.
    if x_pct is None:
        json_match = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content)
        if json_match:
            x_val, y_val = float(json_match.group(1)), float(json_match.group(2))
            # Values above 1 are pixel coordinates.
            if x_val > 1:
                x_pct = x_val / small_w
                y_pct = y_val / small_h
            else:
                x_pct = x_val
                y_pct = y_val

    # Format 3: {"x_pct": 0.XX, "y_pct": 0.YY}.
    if x_pct is None:
        pct_match = re.search(r'"x_pct"\s*:\s*([\d.]+).*?"y_pct"\s*:\s*([\d.]+)', content)
        if pct_match:
            x_pct = float(pct_match.group(1))
            y_pct = float(pct_match.group(2))

    # Format 4: raw array [x1, y1, x2, y2] or [x, y].
    if x_pct is None:
        arr_match = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content)
        if arr_match:
            vals = [float(v) for v in arr_match.groups() if v is not None]
            if len(vals) >= 4:
                x_pct = (vals[0] + vals[2]) / 2 / small_w
                y_pct = (vals[1] + vals[3]) / 2 / small_h
            elif len(vals) == 2:
                x_pct = vals[0] / small_w
                y_pct = vals[1] / small_h

    if x_pct is None or y_pct is None:
        # Multi-image fallback: screenshot + crop → grounding without description.
        anchor_b64 = target_spec.get("anchor_image_base64", "")
        if anchor_b64:
            try:
                prompt_mi = (
                    "Image 1 is a screenshot. Image 2 shows a UI element.\n"
                    "Find where Image 2 appears on Image 1.\n"
                    'Return position: {"x": NNN, "y": NNN} in pixels of Image 1.'
                )
                resp2 = _requests.post("http://localhost:11434/api/chat", json={
                    "model": _grounding_model,
                    "messages": [
                        {"role": "user", "content": prompt_mi, "images": [shot_b64, anchor_b64]},
                    ],
                    "stream": False,
                    "options": {"temperature": 0.1, "num_predict": 50},
                }, timeout=60)
                content2 = resp2.json().get("message", {}).get("content", "")
                elapsed = time.time() - t0

                # Parse all supported formats.
                arr2 = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content2)
                if arr2:
                    vals = [float(v) for v in arr2.groups() if v is not None]
                    if len(vals) >= 4:
                        x_pct = (vals[0] + vals[2]) / 2 / small_w
                        y_pct = (vals[1] + vals[3]) / 2 / small_h
                    elif len(vals) == 2:
                        x_pct = vals[0] / small_w
                        y_pct = vals[1] / small_h
                if x_pct is None:
                    json2 = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content2)
                    if json2:
                        x_pct = float(json2.group(1)) / small_w
                        y_pct = float(json2.group(2)) / small_h
                if x_pct is not None:
                    logger.info("Grounding multi-image OK (%.1fs)", elapsed)
            except Exception as e:
                logger.debug("Grounding multi-image erreur: %s", e)

    if x_pct is None or y_pct is None:
        logger.info(
            "Grounding : réponse non parsable (%.1fs) — %s",
            elapsed, content[:120],
        )
        return None

    # Validate bounds.
    if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
        logger.info("Grounding : coordonnées hors bornes (%.3f, %.3f)", x_pct, y_pct)
        return None

    # Convert window coordinates → screen coordinates.
    if using_window and window_rect:
        win_x1, win_y1, win_x2, win_y2 = window_rect
        win_w = win_x2 - win_x1
        win_h = win_y2 - win_y1
        # x_pct/y_pct are window-relative; make them screen-relative.
        abs_x = win_x1 + x_pct * win_w
        abs_y = win_y1 + y_pct * win_h
        x_pct = abs_x / screen_width
        y_pct = abs_y / screen_height
        logger.info(
            "Grounding OK [%s/window] : '%s' → (%.4f, %.4f) en %.1fs",
            _grounding_model, description[:50], x_pct, y_pct, elapsed,
        )
    else:
        logger.info(
            "Grounding OK [%s/full] : '%s' → (%.4f, %.4f) en %.1fs",
            _grounding_model, description[:50], x_pct, y_pct, elapsed,
        )

    return {
        "resolved": True,
        "method": "grounding_vlm",
        "x_pct": round(x_pct, 6),
        "y_pct": round(y_pct, 6),
        "matched_element": {
            "label": description[:60],
            "type": "grounding",
            "role": "grounding_vlm",
            "confidence": 0.85,
        },
        "score": 0.85,
    }
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Résolution Set-of-Mark : SomEngine (détection) + VLM (identification)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _get_som_engine_api():
|
||
"""Singleton SomEngine partagé."""
|
||
try:
|
||
from core.detection.som_engine import get_shared_engine
|
||
return get_shared_engine()
|
||
except ImportError:
|
||
return None
|
||
|
||
|
||
def _resolve_by_som(
    screenshot_path: str,
    target_spec: Dict[str, Any],
    screen_width: int,
    screen_height: int,
) -> Optional[Dict[str, Any]]:
    """Resolve a UI target via Set-of-Mark + VLM.

    Pipeline:
      1. SomEngine detects every element and numbers them on the screenshot.
      2. The VLM receives the annotated image + the target description.
      3. The VLM identifies the mark number -> precise coordinates.

    Advantages over direct VLM grounding:
      - The VLM only has to identify (its strength), not localize.
      - Coordinates come from SomEngine (pixel-perfect).
      - Simple question "which number?" -> simple answer.

    Args:
        screenshot_path: Path of the current screenshot.
        target_spec: Target specification (vlm_description, som_element, etc.).
        screen_width: Screen width in pixels (unused here; kept for API symmetry).
        screen_height: Screen height in pixels (unused here; kept for API symmetry).

    Returns:
        Dict with resolved=True and normalized coordinates, or None when
        resolution is unavailable or fails at any stage.
    """
    engine = _get_som_engine_api()
    if engine is None:
        return None

    client = _get_vlm_client()
    if client is None:
        return None

    t0 = time.time()

    # -- 1. Run SomEngine on the current screenshot --
    try:
        from PIL import Image as PILImage
        img = PILImage.open(screenshot_path).convert("RGB")
        som_result = engine.analyze(img)
    except Exception as e:
        logger.warning("SoM resolve : erreur analyse — %s", e)
        return None

    if not som_result.elements:
        logger.info("SoM resolve : 0 éléments détectés")
        return None

    # -- 2. Build the target description --
    som_element = target_spec.get("som_element", {})
    vlm_description = target_spec.get("vlm_description", "")
    anchor_label = som_element.get("label", "")

    # Build a rich prompt from whatever hints are available.
    target_parts = []
    if anchor_label:
        target_parts.append(f"texte '{anchor_label}'")
    if vlm_description:
        target_parts.append(vlm_description)
    if not target_parts:
        # Without any description, SoM resolve cannot identify an element.
        logger.debug("SoM resolve : pas de description pour identifier l'élément")
        return None

    target_desc = ", ".join(target_parts)

    # -- 2.5. Shortcut: if the label is known, search by text directly --
    # No VLM call needed when we already know the element's exact text.
    if anchor_label and len(anchor_label) >= 2:
        label_lower = anchor_label.lower()
        # Exact match first, then partial (substring either way).
        exact_matches = [
            e for e in som_result.elements
            if e.label and e.label.lower() == label_lower
        ]
        if not exact_matches:
            exact_matches = [
                e for e in som_result.elements
                if e.label and len(e.label) >= 3 and (
                    label_lower in e.label.lower()
                    or e.label.lower() in label_lower
                )
            ]

        if len(exact_matches) == 1:
            # Unique text match -> return immediately, skipping the VLM.
            elem = exact_matches[0]
            elapsed = time.time() - t0
            cx_norm, cy_norm = elem.center_norm
            logger.info(
                "SoM resolve FAST : match texte unique '#%d %s' → (%.4f, %.4f) en %.1fs",
                elem.id, elem.label, cx_norm, cy_norm, elapsed,
            )
            return {
                "resolved": True,
                "method": "som_text_match",
                "x_pct": round(cx_norm, 6),
                "y_pct": round(cy_norm, 6),
                "matched_element": {
                    "label": elem.label,
                    "type": elem.source,
                    "role": "som_text_match",
                    # Floor the confidence at 0.85 for a unique text hit.
                    "confidence": max(elem.confidence, 0.85),
                    "som_id": elem.id,
                },
                "score": max(elem.confidence, 0.85),
            }
        elif len(exact_matches) > 1:
            # Several text matches -> disambiguate by proximity to the
            # position recorded at capture time.
            ref_center = som_element.get("center_norm", [])
            if ref_center and len(ref_center) == 2:
                ref_x, ref_y = ref_center
                best = min(
                    exact_matches,
                    key=lambda e: (
                        (e.center_norm[0] - ref_x) ** 2
                        + (e.center_norm[1] - ref_y) ** 2
                    ),
                )
                elapsed = time.time() - t0
                cx_norm, cy_norm = best.center_norm
                dist = ((cx_norm - ref_x) ** 2 + (cy_norm - ref_y) ** 2) ** 0.5
                if dist < 0.15:  # Tolerance: 15% of the screen
                    logger.info(
                        "SoM resolve FAST : match texte proximité '#%d %s' (dist=%.3f) "
                        "→ (%.4f, %.4f) en %.1fs",
                        best.id, best.label, dist, cx_norm, cy_norm, elapsed,
                    )
                    return {
                        "resolved": True,
                        "method": "som_text_match",
                        "x_pct": round(cx_norm, 6),
                        "y_pct": round(cy_norm, 6),
                        "matched_element": {
                            "label": best.label,
                            "type": best.source,
                            "role": "som_text_match_proximity",
                            # Slightly lower floor (0.80) than a unique match.
                            "confidence": max(best.confidence, 0.80),
                            "som_id": best.id,
                        },
                        "score": max(best.confidence, 0.80),
                    }
            logger.info(
                "SoM resolve : %d matchs texte pour '%s', VLM nécessaire",
                len(exact_matches), anchor_label,
            )

    # -- 2.7. Fallback: template-match the anchor crop vs SomEngine elements --
    # For icons without text: compare the reference crop against the whole
    # screenshot, then snap to the nearest SomEngine-detected region.
    anchor_b64 = target_spec.get("anchor_image_base64", "")
    by_text = target_spec.get("by_text", "").strip()
    if anchor_b64 and (not anchor_label or not by_text):
        try:
            import cv2
            import numpy as np

            # Decode the anchor crop (grayscale for matching robustness).
            anc_bytes = base64.b64decode(anchor_b64)
            anc_array = np.frombuffer(anc_bytes, dtype=np.uint8)
            anc_img = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE)

            # Load the current screenshot with OpenCV.
            screenshot_cv = cv2.imread(screenshot_path, cv2.IMREAD_GRAYSCALE)

            if anc_img is not None and screenshot_cv is not None:
                # Template matching of the anchor on the WHOLE SCREENSHOT
                # (not per-region — the anchor is often larger than a region).
                anc_h, anc_w = anc_img.shape[:2]
                if screenshot_cv.shape[0] >= anc_h and screenshot_cv.shape[1] >= anc_w:
                    res = cv2.matchTemplate(screenshot_cv, anc_img, cv2.TM_CCOEFF_NORMED)
                    _, max_score, _, max_loc = cv2.minMaxLoc(res)

                    if max_score >= 0.5:
                        # Center of the matched region.
                        match_cx = max_loc[0] + anc_w // 2
                        match_cy = max_loc[1] + anc_h // 2

                        # Find the SomEngine element closest to the match center.
                        best_elem = None
                        best_dist = float("inf")
                        for elem in som_result.elements:
                            cx, cy = elem.center
                            dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5
                            if dist < best_dist:
                                best_dist = dist
                                best_elem = elem

                        if best_elem and best_dist < 100:  # Max 100 px away
                            elapsed = time.time() - t0
                            cx_norm, cy_norm = best_elem.center_norm
                            logger.info(
                                "SoM resolve ANCHOR : match crop score=%.3f → "
                                "elem '#%d %s' (dist=%.0fpx) → (%.4f, %.4f) en %.1fs",
                                max_score, best_elem.id, best_elem.label,
                                best_dist, cx_norm, cy_norm, elapsed,
                            )
                            return {
                                "resolved": True,
                                "method": "som_anchor_match",
                                "x_pct": round(cx_norm, 6),
                                "y_pct": round(cy_norm, 6),
                                "matched_element": {
                                    "label": best_elem.label or f"icon #{best_elem.id}",
                                    "type": best_elem.source,
                                    "role": "som_anchor_match",
                                    "confidence": max_score,
                                    "som_id": best_elem.id,
                                },
                                "score": max_score,
                            }
        except ImportError:
            # OpenCV / NumPy not installed: silently skip this fallback.
            pass
        except Exception as e:
            logger.debug("SoM anchor match erreur : %s", e)

    # -- 3. Save the SoM-annotated image to a temp file for the VLM --
    if som_result.som_image is None:
        logger.debug("SoM resolve : pas d'image annotée, skip VLM")
        return None

    import tempfile
    try:
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
            som_result.som_image.save(tmp, format="JPEG", quality=85)
            som_img_path = tmp.name
    except Exception as e:
        logger.warning("SoM resolve : erreur sauvegarde image annotée — %s", e)
        return None

    # -- 4. VLM: identify the mark number --
    # List only labeled elements, capped at 30 (more concise for the VLM).
    labeled_elements = [e for e in som_result.elements if e.label][:30]
    elements_list = "\n".join(
        f" #{e.id}: '{e.label}'"
        for e in labeled_elements
    )

    # Multi-image prompt: SoM-annotated screenshot + anchor crop (if any).
    anchor_b64 = target_spec.get("anchor_image_base64", "")
    extra_images = [anchor_b64] if anchor_b64 else None

    if extra_images:
        prompt = (
            "Image 1 shows the screen with numbered marks on each UI element.\n"
            "Image 2 shows the element I'm looking for.\n\n"
            f"Target: {target_desc}\n\n"
            f"Detected elements:\n{elements_list}\n\n"
            "Which mark number matches the target element in Image 2?\n"
            'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
        )
    else:
        prompt = (
            f"I'm looking for: {target_desc}\n\n"
            f"Detected elements:\n{elements_list}\n\n"
            "Which number is the correct element?\n"
            'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
        )

    system_prompt = "You identify UI elements by number. Output JSON only, no explanation."

    try:
        result = client.generate(
            prompt=prompt,
            image_path=som_img_path,
            system_prompt=system_prompt,
            temperature=0.1,
            max_tokens=50,
            force_json=False,
            extra_images_b64=extra_images,
        )
    except Exception as e:
        logger.warning("SoM resolve : erreur VLM — %s", e)
        return None
    finally:
        # Always remove the temp annotated image, success or failure.
        import os
        try:
            os.unlink(som_img_path)
        except OSError:
            pass

    elapsed = time.time() - t0

    if not result.get("success"):
        logger.info("SoM resolve : VLM échoué (%.1fs)", elapsed)
        return None

    # -- 5. Parse the answer and return the coordinates --
    response_text = result.get("response", "").strip()

    # Try standard JSON extraction first.
    parsed = client._extract_json_from_response(response_text)

    # Fallback: pull a bare number out of the answer, accept it only if
    # it corresponds to an actually-detected mark id.
    if parsed is None:
        import re
        numbers = re.findall(r'\b(\d+)\b', response_text)
        if numbers:
            candidate = int(numbers[0])
            if som_result.get_element_by_id(candidate) is not None:
                parsed = {"mark_id": candidate, "confidence": 0.7}
                logger.debug("SoM resolve : extraction numéro fallback → #%d", candidate)

    if parsed is None:
        logger.info("SoM resolve : réponse non-JSON (%.1fs) — %.80s", elapsed, response_text)
        return None

    mark_id = parsed.get("mark_id")
    confidence = float(parsed.get("confidence", 0.0))

    # Reject missing marks and low-confidence identifications (< 0.3).
    if mark_id is None or confidence < 0.3:
        logger.info(
            "SoM resolve : mark non trouvé ou confiance trop basse (mark=%s, conf=%.2f, %.1fs)",
            mark_id, confidence, elapsed,
        )
        return None

    mark_id = int(mark_id)
    elem = som_result.get_element_by_id(mark_id)
    if elem is None:
        # The VLM hallucinated a mark number that does not exist.
        logger.warning("SoM resolve : mark #%d inexistant (%.1fs)", mark_id, elapsed)
        return None

    cx_norm, cy_norm = elem.center_norm
    logger.info(
        "SoM resolve OK : mark #%d '%s' → (%.4f, %.4f) conf=%.2f en %.1fs (%d éléments)",
        mark_id, elem.label, cx_norm, cy_norm, confidence, elapsed, len(som_result.elements),
    )

    return {
        "resolved": True,
        "method": "som_vlm",
        "x_pct": round(cx_norm, 6),
        "y_pct": round(cy_norm, 6),
        "matched_element": {
            "label": elem.label or f"mark #{mark_id}",
            "type": elem.source,
            "role": "som_identified",
            "confidence": confidence,
            "som_id": mark_id,
        },
        "score": confidence,
    }
|
||
|
||
|
||
def _resolve_target_sync(
    screenshot_path: str,
    target_spec: Dict[str, Any],
    screen_width: int,
    screen_height: int,
    fallback_x_pct: float,
    fallback_y_pct: float,
    strict_mode: bool = False,
) -> Dict[str, Any]:
    """Resolve the target visually (executed in a separate thread).

    Resolution hierarchy (strict_mode=True, replay sessions) — VLM-FIRST:
      1. VLM Quick Find (~3-8s) — semantic understanding of the screen,
         multi-image (screenshot + reference crop + rich description)
      1.5. SoM + VLM (~5-15s) — SomEngine numbers the elements, the VLM
         identifies the right one
      2. OpenCV template matching (~100ms) — pixel fallback, STRICT 0.90 threshold
      3. resolved=False → STOP the replay

    The VLM understands context (window title, element type, position) and
    can find an element even when the screen differs from the recording.
    Template matching only compares pixels and yields false positives.

    Classic hierarchy (strict_mode=False, VWB and others) — UNCHANGED:
      1. OpenCV template matching (~100ms) — 0.70 threshold
      1.5. VLM Quick Find when template fails and by_text/by_role exist
      2. by_text/by_role → VLM Quick Find then ScreenAnalyzer
      3. static-coordinate fallback
    """
    anchor_image_b64 = target_spec.get("anchor_image_base64", "")

    # ===================================================================
    # STRICT MODE (replay sessions) — VLM-FIRST strategy
    # ===================================================================
    if strict_mode and anchor_image_b64:
        vlm_description = target_spec.get("vlm_description", "")
        by_text_strict = target_spec.get("by_text", "").strip()

        # Fallback: build the description from by_text/by_role.
        if not vlm_description:
            by_role = target_spec.get("by_role", "").strip()
            if by_text_strict or by_role:
                vlm_description = _build_target_description(target_spec)

        # ---------------------------------------------------------------
        # Step -1: CLIP check (when a reference embedding is provided).
        # Verifies we are in the right application before hunting for the
        # element. Safety net against clicking in the wrong place.
        # ---------------------------------------------------------------
        clip_embedding = target_spec.get("clip_embedding")
        if clip_embedding:
            try:
                from core.embedding.clip_embedder import CLIPEmbedder
                from PIL import Image as _PILImage
                import numpy as _np

                _clip = CLIPEmbedder()
                # Embed the current screen (window crop when available).
                window_capture = target_spec.get("window_capture", {})
                window_rect = window_capture.get("rect")
                current_img = _PILImage.open(screenshot_path)
                if window_rect:
                    current_img = current_img.crop(tuple(window_rect))

                current_emb = _np.array(_clip.embed_image(current_img), dtype=_np.float32).flatten()
                ref_emb = _np.array(clip_embedding, dtype=_np.float32).flatten()

                # Cosine similarity between current screen and the recording.
                clip_sim = float(_np.dot(current_emb, ref_emb) / (
                    _np.linalg.norm(current_emb) * _np.linalg.norm(ref_emb)
                ))
                logger.info(f"CLIP vérification : similarité={clip_sim:.3f}")

                if clip_sim < 0.75:
                    logger.warning(
                        f"CLIP MISMATCH : sim={clip_sim:.3f} < 0.75 — "
                        f"écran actuel trop différent de l'enregistrement"
                    )
                    return {
                        "resolved": False,
                        "method": "clip_mismatch",
                        "reason": f"clip_similarity_{clip_sim:.3f}",
                        "x_pct": fallback_x_pct,
                        "y_pct": fallback_y_pct,
                    }
            except Exception as e:
                # Non-blocking: a failed CLIP check never stops resolution.
                logger.debug(f"CLIP vérification erreur (non-bloquant) : {e}")

        # ---------------------------------------------------------------
        # Step 0: choose the strategy depending on the element type.
        # ---------------------------------------------------------------
        by_text_source = target_spec.get("by_text_source", "")

        has_window = bool(target_spec.get("window_capture", {}).get("rect"))

        if by_text_strict and by_text_source in ("ocr", "vlm") and has_window:
            # Text visible INSIDE a window → VLM grounding on the cropped window.
            grounding_result = _resolve_by_grounding(
                screenshot_path=screenshot_path,
                target_spec=target_spec,
                screen_width=screen_width,
                screen_height=screen_height,
            )
            if grounding_result and grounding_result.get("resolved"):
                logger.info(
                    "Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
                    grounding_result.get("x_pct", 0),
                    grounding_result.get("y_pct", 0),
                    by_text_strict[:50],
                )
                return grounding_result

        if not by_text_strict or by_text_source not in ("ocr", "vlm"):
            # Template matching for text-less elements (pure icons).
            # Prefer the "_window.png" sibling capture when it exists.
            window_capture = target_spec.get("window_capture", {})
            window_rect = window_capture.get("rect")
            from pathlib import Path as _Path
            _full = _Path(screenshot_path)
            _win = _full.parent / _full.name.replace("_full.png", "_window.png")
            tm_path = str(_win) if _win.is_file() and window_rect else screenshot_path
            tm_screen_w = (window_rect[2] - window_rect[0]) if window_rect and _win.is_file() else screen_width
            tm_screen_h = (window_rect[3] - window_rect[1]) if window_rect and _win.is_file() else screen_height

            result = _resolve_by_template_matching(
                screenshot_path=tm_path,
                anchor_image_b64=anchor_image_b64,
                screen_width=tm_screen_w,
                screen_height=tm_screen_h,
                confidence_threshold=0.90,
            )
            if result and result.get("score", 0) >= 0.90:
                x_tm, y_tm = result["x_pct"], result["y_pct"]
                # Convert window-relative coordinates → screen-relative.
                if window_rect and _win.is_file():
                    abs_x = window_rect[0] + x_tm * tm_screen_w
                    abs_y = window_rect[1] + y_tm * tm_screen_h
                    result["x_pct"] = round(abs_x / screen_width, 6)
                    result["y_pct"] = round(abs_y / screen_height, 6)
                logger.info(
                    "Strict resolve TEMPLATE : icon match (score=%.3f)",
                    result.get("score", 0),
                )
                return result

        # ---------------------------------------------------------------
        # Step 1: VLM Quick Find (fallback, multi-image).
        # ---------------------------------------------------------------
        if vlm_description or anchor_image_b64:
            vlm_result = _vlm_quick_find(
                screenshot_path=screenshot_path,
                target_description=vlm_description,
                anchor_image_b64=anchor_image_b64,
            )
            if vlm_result and vlm_result.get("resolved"):
                if vlm_result.get("score", 0) >= 0.3:
                    logger.info(
                        "Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'",
                        vlm_result.get("score", 0),
                        vlm_description[:60] if vlm_description else "(anchor)",
                    )
                    return vlm_result
                else:
                    logger.info(
                        "Strict resolve VLM-first : VLM score=%.2f trop bas, passage template",
                        vlm_result.get("score", 0),
                    )
            else:
                logger.info(
                    "Strict resolve VLM-first : VLM échoué pour '%s', passage template matching",
                    vlm_description[:60] if vlm_description else "(anchor)",
                )

        # ---------------------------------------------------------------
        # Step 1.5: SoM + VLM (Set-of-Mark + identification).
        # SomEngine numbers the elements, the VLM identifies the right one.
        # More reliable than direct VLM because the VLM only identifies,
        # never localizes — and the coordinates are pixel-perfect.
        # ---------------------------------------------------------------
        som_element = target_spec.get("som_element", {})
        if som_element or vlm_description:
            som_result = _resolve_by_som(
                screenshot_path=screenshot_path,
                target_spec=target_spec,
                screen_width=screen_width,
                screen_height=screen_height,
            )
            if som_result and som_result.get("resolved"):
                logger.info(
                    "Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)",
                    som_result.get("score", 0),
                    som_result.get("matched_element", {}).get("som_id", "?"),
                )
                return som_result
            else:
                logger.info("Strict resolve SoM+VLM : échoué, passage template matching")

        # ---------------------------------------------------------------
        # Step 2: template matching (pixel fallback) — STRICT 0.90 threshold.
        # ---------------------------------------------------------------
        result = _resolve_by_template_matching(
            screenshot_path=screenshot_path,
            anchor_image_b64=anchor_image_b64,
            screen_width=screen_width,
            screen_height=screen_height,
            confidence_threshold=0.90,
        )
        if result:
            score = result.get("score", 0)
            # Score >= 0.95: near-perfect match, context validation skipped.
            if score >= 0.95:
                logger.info(
                    "Strict resolve VLM-first : template matching fallback OK "
                    "(score=%.3f >= 0.95, contexte skip — match quasi-parfait)",
                    score,
                )
                return result
            elif _validate_match_context(result, fallback_x_pct, fallback_y_pct, target_spec):
                logger.info(
                    "Strict resolve VLM-first : template matching fallback OK "
                    "(score=%.3f >= 0.90, context OK)",
                    score,
                )
                return result
            else:
                logger.warning(
                    "Strict resolve VLM-first : template score=%.3f MAIS contexte invalide, rejeté",
                    score,
                )

        # ---------------------------------------------------------------
        # Step 3: NOTHING worked → resolved=False → STOP the replay.
        # ---------------------------------------------------------------
        return {
            "resolved": False,
            "method": "strict_vlm_template_failed",
            "reason": "vlm_and_template_all_failed",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    # ===================================================================
    # CLASSIC MODE (VWB and others) — pre-existing behavior
    # ===================================================================

    # ---------------------------------------------------------------
    # Strategy 1: template matching on the anchor image (0.70 threshold).
    # ---------------------------------------------------------------
    if anchor_image_b64:
        result = _resolve_by_template_matching(
            screenshot_path=screenshot_path,
            anchor_image_b64=anchor_image_b64,
            screen_width=screen_width,
            screen_height=screen_height,
            confidence_threshold=0.7,
        )
        if result:
            return result
        logger.info(
            "Template matching échoué pour ancre '%s', tentative VLM Quick Find",
            target_spec.get("anchor_id", "?"),
        )

        # ---------------------------------------------------------------
        # Strategy 1.5: VLM Quick Find (light fallback after template).
        # ---------------------------------------------------------------
        by_text = target_spec.get("by_text", "").strip()
        by_role = target_spec.get("by_role", "").strip()
        if by_text or by_role:
            vlm_desc = _build_target_description(target_spec)
            vlm_result = _vlm_quick_find(
                screenshot_path=screenshot_path,
                target_description=vlm_desc,
                anchor_image_b64=anchor_image_b64,
            )
            if vlm_result:
                return vlm_result
            logger.info(
                "VLM Quick Find échoué pour ancre '%s', fallback coordonnées",
                target_spec.get("anchor_id", "?"),
            )

        return {
            "resolved": False,
            "method": "fallback",
            "reason": "template_matching_failed",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    # ---------------------------------------------------------------
    # Strategy 2: VLM Quick Find (light, ~5-10s).
    # ---------------------------------------------------------------
    by_text = target_spec.get("by_text", "")
    by_role = target_spec.get("by_role", "")

    # No semantic criteria and no anchor → fall back directly.
    if not by_text and not by_role and not anchor_image_b64:
        return {
            "resolved": False,
            "method": "fallback",
            "reason": "no_target_criteria",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    # Try VLM Quick Find BEFORE ScreenAnalyzer (much faster).
    if by_text or by_role:
        vlm_desc = _build_target_description(target_spec)
        vlm_result = _vlm_quick_find(
            screenshot_path=screenshot_path,
            target_description=vlm_desc,
        )
        if vlm_result:
            return vlm_result
        logger.info(
            "VLM Quick Find échoué pour '%s', fallback ScreenAnalyzer",
            vlm_desc,
        )

    # ---------------------------------------------------------------
    # Strategy 3: semantic matching via ScreenAnalyzer (~15-20s).
    # ---------------------------------------------------------------
    processor._ensure_initialized()

    if processor._screen_analyzer is None:
        return {
            "resolved": False,
            "method": "fallback",
            "reason": "screen_analyzer_unavailable",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    # Analyze the screenshot (levels 1-3: raw, OCR, UI elements).
    try:
        screen_state = processor._screen_analyzer.analyze(screenshot_path)
    except Exception as e:
        logger.warning(f"Analyse screenshot échouée: {e}")
        return {
            "resolved": False,
            "method": "fallback",
            "reason": f"analysis_failed: {e}",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    ui_elements = screen_state.ui_elements or []
    if not ui_elements:
        logger.info("Aucun élément UI détecté, fallback coordonnées")
        return {
            "resolved": False,
            "method": "fallback",
            "reason": "no_ui_elements",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
        }

    # Match the target among the detected elements.
    candidates = []

    for elem in ui_elements:
        score = 0.0

        # Score by text (label): substring match beats fuzzy match.
        if by_text and elem.label:
            text_lower = by_text.lower()
            label_lower = elem.label.lower()
            if text_lower in label_lower or label_lower in text_lower:
                score += 0.6
            elif _fuzzy_match(text_lower, label_lower):
                score += 0.3

        # Score by role: role and type contribute independently.
        if by_role:
            role_lower = by_role.lower()
            if elem.role and role_lower in elem.role.lower():
                score += 0.3
            if elem.type and role_lower in elem.type.lower():
                score += 0.2

        if score > 0:
            candidates.append((elem, score))

    if not candidates:
        logger.info(
            f"Aucun match visuel pour target(text='{by_text}', role='{by_role}') "
            f"parmi {len(ui_elements)} éléments"
        )
        return {
            "resolved": False,
            "method": "fallback",
            "reason": "no_match",
            "x_pct": fallback_x_pct,
            "y_pct": fallback_y_pct,
            "ui_elements_count": len(ui_elements),
        }

    # Sort by decreasing score and keep the best candidate.
    candidates.sort(key=lambda c: c[1], reverse=True)
    best_elem, best_score = candidates[0]

    # Convert pixel coordinates into screen proportions.
    cx, cy = best_elem.center
    x_pct = round(cx / screen_width, 6) if screen_width > 0 else 0.0
    y_pct = round(cy / screen_height, 6) if screen_height > 0 else 0.0

    logger.info(
        f"Cible résolue visuellement: '{best_elem.label}' ({best_elem.type}/{best_elem.role}) "
        f"score={best_score:.2f} → ({x_pct:.4f}, {y_pct:.4f})"
    )

    return {
        "resolved": True,
        "method": "visual",
        "x_pct": x_pct,
        "y_pct": y_pct,
        "matched_element": {
            "label": best_elem.label,
            "type": best_elem.type,
            "role": best_elem.role,
            "center": list(best_elem.center),
            "confidence": best_elem.label_confidence,
        },
        "score": best_score,
        "candidates_count": len(candidates),
        "ui_elements_count": len(ui_elements),
    }
|
||
|
||
|
||
def _fuzzy_match(a: str, b: str, threshold: float = 0.6) -> bool:
|
||
"""Match approximatif par ratio de caractères communs."""
|
||
if not a or not b:
|
||
return False
|
||
common = sum(1 for c in a if c in b)
|
||
return (common / max(len(a), len(b))) >= threshold
|
||
|
||
|
||
def _fallback_response(request: ResolveTargetRequest, reason: str, detail: str) -> Dict:
    """Build the fallback payload returned when visual resolution fails.

    The Agent is told resolution failed (resolved=False) and handed the
    static coordinates carried by the request as a last resort.
    """
    payload = {
        "resolved": False,
        "method": "fallback",
        "reason": reason,
        "detail": detail,
    }
    payload["x_pct"] = request.fallback_x_pct
    payload["y_pct"] = request.fallback_y_pct
    return payload
|
||
|
||
|
||
# =========================================================================
# Learning Pack — export / import for learning federation
# =========================================================================


class LearningPackImportRequest(BaseModel):
    """Request body for importing a Learning Pack."""
    # The complete pack as JSON (structure of LearningPack.to_dict()).
    pack: Dict[str, Any]
|
||
|
||
|
||
@app.get("/api/v1/traces/stream/learning-pack/export")
|
||
async def export_learning_pack(client_id: str, request: Request):
|
||
"""
|
||
Exporter les apprentissages d'un client en Learning Pack anonymisé.
|
||
|
||
Le client_id est haché (SHA-256) dans le pack exporté —
|
||
aucune donnée d'identification ne sort du serveur.
|
||
|
||
Query params:
|
||
client_id: identifiant du client (obligatoire).
|
||
|
||
Returns:
|
||
JSON du LearningPack anonymisé.
|
||
"""
|
||
try:
|
||
from core.federation.learning_pack import LearningPackExporter
|
||
from core.models.workflow_graph import Workflow
|
||
except ImportError as exc:
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"Module federation non disponible : {exc}",
|
||
)
|
||
|
||
if not client_id or not client_id.strip():
|
||
raise HTTPException(status_code=400, detail="client_id requis")
|
||
|
||
# Récupérer tous les workflows chargés par le StreamProcessor
|
||
workflows = list(processor._workflows.values())
|
||
if not workflows:
|
||
raise HTTPException(
|
||
status_code=404,
|
||
detail="Aucun workflow trouvé pour l'export",
|
||
)
|
||
|
||
exporter = LearningPackExporter()
|
||
pack = exporter.export(workflows, client_id=client_id.strip())
|
||
|
||
logger.info(
|
||
"Learning pack exporté pour client_id=%s (hash=%s) : %d workflows, %d prototypes",
|
||
client_id[:8] + "...", pack.source_hash[:16] + "...",
|
||
len(workflows), len(pack.screen_prototypes),
|
||
)
|
||
return pack.to_dict()
|
||
|
||
|
||
@app.post("/api/v1/traces/stream/learning-pack/import")
|
||
async def import_learning_pack(body: LearningPackImportRequest, request: Request):
|
||
"""
|
||
Importer un Learning Pack dans l'index FAISS global.
|
||
|
||
Body JSON:
|
||
{ "pack": { ... } } — structure LearningPack complète
|
||
|
||
Returns:
|
||
Statistiques de l'import (vecteurs ajoutés, total index, etc.).
|
||
"""
|
||
try:
|
||
from core.federation.learning_pack import LearningPack
|
||
from core.federation.faiss_global import GlobalFAISSIndex
|
||
except ImportError as exc:
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"Module federation non disponible : {exc}",
|
||
)
|
||
|
||
try:
|
||
pack = LearningPack.from_dict(body.pack)
|
||
except Exception as exc:
|
||
raise HTTPException(
|
||
status_code=400,
|
||
detail=f"Format de Learning Pack invalide : {exc}",
|
||
)
|
||
|
||
# Utiliser ou créer l'index global (singleton au niveau du module)
|
||
global _global_faiss_index
|
||
if _global_faiss_index is None:
|
||
_global_faiss_index = GlobalFAISSIndex()
|
||
|
||
added = _global_faiss_index.add_pack(pack)
|
||
stats = _global_faiss_index.get_stats()
|
||
|
||
logger.info(
|
||
"Learning pack importé : pack_id=%s, +%d vecteurs (total=%d)",
|
||
pack.pack_id, added, stats["total_vectors"],
|
||
)
|
||
return {
|
||
"status": "ok",
|
||
"pack_id": pack.pack_id,
|
||
"source_hash": pack.source_hash,
|
||
"vectors_added": added,
|
||
"index_stats": stats,
|
||
}
|
||
|
||
|
||
# Global FAISS index (singleton, initialized on the first pack import).
_global_faiss_index = None
|
||
|
||
|
||
# =========================================================================
|
||
# Endpoints Audit Trail — traçabilité complète des actions RPA
|
||
# =========================================================================
|
||
|
||
@app.get("/api/v1/audit/history")
async def audit_history(
    date_from: str = "",
    date_to: str = "",
    user_id: str = "",
    session_id: str = "",
    result: str = "",
    action_type: str = "",
    workflow_id: str = "",
    domain: str = "",
    limit: int = 100,
    offset: int = 0,
):
    """
    Paginated audit history with filters.

    Query parameters:
        date_from : start date (YYYY-MM-DD), default = today
        date_to   : end date (YYYY-MM-DD), default = date_from
        user_id   : filter by TIM identifier
        session_id: filter by session
        result    : filter by outcome (success, failed, recovered, skipped)
        action_type: filter by action type (click, type, key_combo, ...)
        workflow_id: filter by workflow
        domain    : filter by business domain
        limit     : max number of results (default 100, cap 1000)
        offset    : pagination offset

    Returns the matching entries sorted by descending timestamp.
    """
    # Clamp pagination parameters to sane bounds (abuse protection).
    limit = max(1, min(limit, 1000))
    offset = max(offset, 0)

    matching = _audit_trail.query(
        date_from=date_from,
        date_to=date_to,
        user_id=user_id,
        session_id=session_id,
        result=result,
        action_type=action_type,
        workflow_id=workflow_id,
        domain=domain,
        limit=limit,
        offset=offset,
    )

    return {
        "status": "ok",
        "count": len(matching),
        "offset": offset,
        "limit": limit,
        "entries": matching,
    }
|
||
|
||
|
||
@app.get("/api/v1/audit/summary")
async def audit_summary(
    date: str = "",
):
    """
    Daily audit summary.

    Query parameter:
        date : target date (YYYY-MM-DD), default = today

    Returns aggregated statistics: action count, success rate, and
    breakdowns per user, result, action type, workflow and mode.
    """
    daily = _audit_trail.get_summary(target_date=date)
    payload = {"status": "ok"}
    payload.update(daily)
    return payload
|
||
|
||
|
||
@app.get("/api/v1/audit/export")
async def audit_export(
    date_from: str = "",
    date_to: str = "",
    user_id: str = "",
    session_id: str = "",
):
    """
    CSV export of the audit history.

    Query parameters:
        date_from : start date (YYYY-MM-DD), default = today
        date_to   : end date (YYYY-MM-DD), default = date_from
        user_id   : filter by TIM identifier
        session_id: filter by session

    Returns:
        The CSV file as plain text (Content-Type: text/csv) with a
        Content-Disposition header carrying a date-based filename.

    Raises:
        HTTPException 404 when no audit entry matches the filters.
    """
    from fastapi.responses import Response

    csv_data = _audit_trail.export_csv(
        date_from=date_from,
        date_to=date_to,
        user_id=user_id,
        session_id=session_id,
    )

    if not csv_data:
        raise HTTPException(
            status_code=404,
            detail="Aucune entrée d'audit trouvée pour les filtres spécifiés.",
        )

    # Download filename reflecting the requested date range.
    filename = f"audit_{date_from or 'today'}"
    if date_to and date_to != date_from:
        filename += f"_to_{date_to}"
    filename += ".csv"

    return Response(
        content=csv_data,
        media_type="text/csv; charset=utf-8",
        headers={
            # BUG FIX: the computed `filename` was never interpolated — the
            # header previously contained the literal placeholder "(unknown)".
            "Content-Disposition": f'attachment; filename="{filename}"',
        },
    )
|
||
|
||
|
||
# =========================================================================
|
||
# Task Planner — Comprendre et exécuter des ordres en langage naturel
|
||
# =========================================================================
|
||
|
||
# Task planner singleton: understands natural-language instructions (via the
# LLM) and maps them onto recorded workflows for the endpoints below.
# NOTE(review): mid-file import — conventionally this belongs at the top of
# the file with the other imports.
from .task_planner import TaskPlanner

_task_planner = TaskPlanner()
|
||
|
||
|
||
class TaskRequest(BaseModel):
    """Natural-language task request submitted to the planner."""
    instruction: str  # free-form instruction, e.g. "Traite les dossiers de janvier"
    machine_id: str = "default"  # target machine for execution
    dry_run: bool = False  # True = produce the plan without executing it
|
||
|
||
|
||
@app.post("/api/v1/task")
async def execute_task(request: TaskRequest):
    """Execute a task described in natural language.

    Léa understands the instruction, finds the matching workflow, and runs
    it. This is the main user-facing entry point.

    Examples:
        - "Ouvre le bloc-notes et écris bonjour"
        - "Traite les dossiers de janvier"
        - "Recherche voiture électrique sur Google"
    """
    import asyncio

    # 1. List the workflows the planner can choose from.
    workflows = _list_available_workflows()

    # 2. Understand the instruction (blocking LLM call → thread executor).
    # FIX: use get_running_loop() — get_event_loop() is deprecated inside a
    # coroutine and unreliable on Python 3.10+.
    loop = asyncio.get_running_loop()
    plan = await loop.run_in_executor(
        None,
        lambda: _task_planner.understand(
            instruction=request.instruction,
            available_workflows=workflows,
        ),
    )

    if not plan.understood:
        return {
            "status": "not_understood",
            "instruction": request.instruction,
            "error": plan.error or "Instruction non comprise",
            "plan": plan.to_dict(),
        }

    # 3. Dry run = return the plan without executing it.
    if request.dry_run:
        return {
            "status": "planned",
            "instruction": request.instruction,
            "plan": plan.to_dict(),
        }

    # 4. Execute.
    def replay_callback(session_id="", machine_id="", params=None, actions=None, task_description=""):
        """Callback used by the planner to trigger a replay.

        Returns the replay_id on success. Falls through (returns None) when
        neither a session_id nor planned actions are supplied.
        """
        if session_id:
            # Replay mode: re-run a known recorded workflow.
            import requests as _req
            resp = _req.post(
                f"http://localhost:5005/api/v1/traces/stream/replay-session"
                f"?session_id={session_id}&machine_id={machine_id}",
                headers={"Authorization": f"Bearer {API_TOKEN}"},
                timeout=600,
            )
            if resp.ok:
                return resp.json().get("replay_id", "")
            # RuntimeError instead of bare Exception; still caught by any
            # `except Exception` in the planner (backward compatible).
            raise RuntimeError(f"Replay échoué: {resp.text[:200]}")
        elif actions:
            # Free mode: execute planner-generated actions.
            import requests as _req
            resp = _req.post(
                f"http://localhost:5005/api/v1/traces/stream/replay/raw",
                json={
                    "session_id": "",
                    "actions": actions,
                    "machine_id": machine_id,
                    "task_description": task_description,
                },
                headers={"Authorization": f"Bearer {API_TOKEN}"},
                timeout=30,
            )
            if resp.ok:
                return resp.json().get("replay_id", "")
            raise RuntimeError(f"Replay raw échoué: {resp.text[:200]}")

    result = await loop.run_in_executor(
        None,
        lambda: _task_planner.execute(
            plan=plan,
            replay_callback=replay_callback,
            machine_id=request.machine_id,
        ),
    )

    return {
        "status": "executed" if result.success else "failed",
        "instruction": request.instruction,
        "plan": plan.to_dict(),
        "result": result.to_dict(),
    }
|
||
|
||
|
||
@app.get("/api/v1/task/capabilities")
async def list_capabilities():
    """List what Léa knows how to do (learned workflows)."""
    known = _list_available_workflows()
    return {
        "capabilities": _task_planner.list_capabilities(known),
        "workflows": known,
        "total": len(known),
    }
|
||
|
||
|
||
def _list_available_workflows() -> List[Dict[str, Any]]:
    """Enumerate recorded sessions usable as workflows by the planner.

    Scans LIVE_SESSIONS_DIR for <machine>/<sess_*>/live_events.jsonl files
    and returns one descriptor per session. Any filesystem error is logged
    at debug level and yields an empty (or partial) list.
    """
    found: List[Dict[str, Any]] = []
    try:
        for machine_dir in LIVE_SESSIONS_DIR.iterdir():
            # Skip files and internal directories (hidden, embeddings, streaming).
            if not machine_dir.is_dir():
                continue
            if machine_dir.name.startswith((".", "embeddings", "streaming")):
                continue
            for session_dir in machine_dir.iterdir():
                if not (session_dir.is_dir() and session_dir.name.startswith("sess_")):
                    continue
                events_file = session_dir / "live_events.jsonl"
                if not events_file.is_file():
                    continue
                # Derive a business-level description from the recorded events.
                meta = _extract_session_description(events_file)
                found.append({
                    "session_id": session_dir.name,
                    "name": meta.get("name", session_dir.name),
                    "description": meta.get("description", ""),
                    "machine": machine_dir.name,
                    "event_count": meta.get("event_count", 0),
                })
    except Exception as e:
        logger.debug(f"Erreur listage workflows: {e}")

    return found
|
||
|
||
|
||
def _extract_session_description(events_file) -> Dict[str, Any]:
    """Extract a business-level description of a session from its events.

    Reads up to ~100 JSONL events and builds a semantic, action-oriented
    description (not just an app list) to help the TaskPlanner match
    instructions against recorded workflows.

    Args:
        events_file: path to a live_events.jsonl file (presumably a
            pathlib.Path — accepted by open(); TODO confirm).

    Returns:
        Dict with "name", "description", "event_count", "apps",
        "typed_text_preview". On any error, a minimal fallback dict
        {"name": "?", "description": "", "event_count": 0}.

    Examples of produced descriptions:
        - "Ouvrir Bloc-notes via Exécuter (Win+R) et écrire du texte"
        - "Naviguer dans l'Explorateur de fichiers et ouvrir des images"
        - "Utiliser cmd.exe pour exécuter des commandes"
    """
    try:
        apps = set()
        app_names = set()  # Application names (right-hand part of window titles)
        typed_texts = []  # Text typed by the user
        key_combos = []  # Keyboard shortcuts used
        event_types = {}  # Counter per event type
        window_sequence = []  # Sequence of visited windows (for the flow)
        event_count = 0

        with open(events_file) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                event_count += 1
                # Cap the scan at ~100 events; enough to characterize the session.
                # NOTE(review): the 101st event is counted but not processed,
                # and event_count never exceeds 101 even for longer sessions.
                if event_count > 100:
                    break
                try:
                    obj = json.loads(line)
                    # Events may be wrapped in {"event": {...}} or be bare.
                    evt = obj.get("event", obj)
                    evt_type = evt.get("type", "")

                    # Count events per type.
                    event_types[evt_type] = event_types.get(evt_type, 0) + 1

                    # Collect windows (ignoring placeholder titles).
                    title = evt.get("window", {}).get("title", "")
                    if title and title not in ("unknown_window", "Program Manager"):
                        # [-1:] slice = compare against the last entry only
                        # (safe on an empty list): dedupe consecutive repeats.
                        if title not in window_sequence[-1:]:
                            window_sequence.append(title)
                        # Extract the app name (right-hand part of the title),
                        # trying en-dash, hyphen, then em-dash separators.
                        for sep in [" – ", " - ", " — "]:
                            if sep in title:
                                app_name = title.split(sep)[-1].strip()
                                app_names.add(app_name)
                                apps.add(title)
                                break
                        else:
                            # No separator: keep a truncated title as the name.
                            app_names.add(title[:30])
                            apps.add(title[:30])

                    # Collect typed text (skip single keystrokes).
                    if evt_type == "text_input":
                        text = evt.get("text", "")
                        if text and len(text) > 1:
                            typed_texts.append(text)

                    # Collect keyboard shortcuts.
                    if evt_type == "key_combo":
                        keys = evt.get("keys", [])
                        if keys:
                            key_combos.append("+".join(keys))

                    # Window focus change → extend the flow sequence.
                    if evt_type == "window_focus_change":
                        to_title = evt.get("to", {}).get("title", "")
                        if to_title and to_title not in ("unknown_window", "Program Manager"):
                            if to_title not in window_sequence[-1:]:
                                window_sequence.append(to_title)

                except json.JSONDecodeError:
                    # Tolerate malformed lines; keep scanning.
                    continue

        # --- Build the semantic description ---
        apps_list = sorted(app_names)[:5]
        apps_str = ", ".join(apps_list)

        # Build an action-oriented description.
        desc_parts = []

        # Detect common launcher patterns (Run dialog, Windows search).
        has_run_dialog = any("Exécuter" in w for w in window_sequence)
        has_search = any("Rechercher" in w or "Recherche" in w for w in window_sequence)
        has_win_r = "win+r" in [k.lower() for k in key_combos]
        has_win_s = "win+s" in [k.lower() for k in key_combos]

        # Main applications used (excluding the launchers themselves).
        main_apps = [a for a in apps_list if a not in ("Exécuter", "Rechercher")]
        launcher = ""
        if has_run_dialog or has_win_r:
            launcher = "via Exécuter (Win+R)"
        elif has_search or has_win_s:
            launcher = "via la recherche Windows"

        if main_apps:
            verb = "Ouvrir" if launcher else "Utiliser"
            desc_parts.append(f"{verb} {', '.join(main_apps)} {launcher}".strip())
        elif launcher:
            desc_parts.append(f"Lancer une application {launcher}")

        # Typed text: long input gets a generic label, short input is quoted
        # verbatim (presumably to avoid embedding long content — confirm).
        total_typed = "".join(typed_texts)
        if len(total_typed) > 5:
            desc_parts.append("écrire du texte")
        elif typed_texts:
            desc_parts.append(f"saisir '{total_typed[:30]}'")

        # Notable shortcuts (launcher shortcuts already captured above).
        notable_combos = [k for k in key_combos if k.lower() not in ("win+r", "win+s")]
        if notable_combos:
            combos_str = ", ".join(sorted(set(notable_combos))[:3])
            desc_parts.append(f"raccourcis : {combos_str}")

        # Click count, mentioned only when significant.
        click_count = event_types.get("mouse_click", 0)
        if click_count > 5:
            desc_parts.append(f"{click_count} clics")

        description = " et ".join(desc_parts) if desc_parts else f"Workflow avec {apps_str}"
        name = apps_str or "Session sans nom"

        return {
            "name": name,
            "description": description,
            "event_count": event_count,
            "apps": apps_list,
            "typed_text_preview": total_typed[:50] if typed_texts else "",
        }
    except Exception:
        # Best-effort: any failure (unreadable file, bad structure) yields
        # a minimal placeholder rather than breaking workflow listing.
        return {"name": "?", "description": "", "event_count": 0}
|
||
|
||
|
||
if __name__ == "__main__":
    # Standalone entry point: run the FastAPI app under uvicorn.
    import uvicorn

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [API-STREAM] %(message)s",
    )
    # NOTE(review): binds to all interfaces (0.0.0.0) on port 5005 — make
    # sure this is firewalled or token-protected in production.
    uvicorn.run(app, host="0.0.0.0", port=5005)
|