From 4509038bf0c363b6f9cddf6fbf1167e51210939c Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Thu, 9 Apr 2026 21:37:44 +0200
Subject: [PATCH] =?UTF-8?q?refactor:=20=C3=A9clater=20api=5Fstream.py=20(6?=
 =?UTF-8?q?400=E2=86=923350=20lignes)=20en=20modules?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- resolve_engine.py (1953 lignes) — résolution visuelle (template, VLM, SoM, YOLO)
- replay_engine.py (1284 lignes) — gestion des replays (queue, setup, retry, validation)
- api_stream.py (3352 lignes) — routeur principal (endpoints HTTP thin layer)

Préparation V4 : base propre pour le WorkflowIR et l'ExecutionCompiler.
137 tests passent, 0 régression, aucun endpoint modifié.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 agent_v0/server_v1/api_stream.py     | 3209 +-------------------------
 agent_v0/server_v1/replay_engine.py  | 1284 +++++++++++
 agent_v0/server_v1/resolve_engine.py | 1953 ++++++++++++++++
 3 files changed, 3304 insertions(+), 3142 deletions(-)
 create mode 100644 agent_v0/server_v1/replay_engine.py
 create mode 100644 agent_v0/server_v1/resolve_engine.py

diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py
index e3e2aa33d..853b25260 100644
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -133,615 +133,63 @@ def _check_rate_limit(endpoint: str, client_ip: str) -> bool:
 
 
 # =========================================================================
-# Validation des actions de replay (sécurité HIGH)
+# Replay Engine — fonctions de replay extraites dans replay_engine.py
 # =========================================================================
-_ALLOWED_ACTION_TYPES = {
-    "click", "type", "key_combo", "scroll", "wait",
-    "file_open", "file_save", "file_close", "file_new", "file_dialog",
-    "double_click", "right_click", "drag",
-    "verify_screen",  # Replay hybride : vérification visuelle entre groupes
-}
-_MAX_ACTION_TEXT_LENGTH = 10000
-_MAX_KEYS_PER_COMBO = 10
-# Touches autorisées dans les key_combo (modificateurs + touches spéciales + caractères simples)
-_KNOWN_KEY_NAMES = {
-    "enter", "return", "tab", "escape", "esc", "backspace", "delete", "space",
-    "up", "down", "left", "right", "home", "end", "page_up", "page_down",
-    "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12",
-    "ctrl", "ctrl_l", "ctrl_r", "alt", "alt_l", "alt_r",
-    "shift", "shift_l", "shift_r",
-    "cmd", "win", "super", "super_l", "super_r", "windows", "meta",
-    "insert", "print_screen", "caps_lock", "num_lock",
-}
+from .replay_engine import (
+    _ALLOWED_ACTION_TYPES,
+    _MAX_ACTION_TEXT_LENGTH,
+    _MAX_KEYS_PER_COMBO,
+    _KNOWN_KEY_NAMES,
+    _validate_replay_action,
+    _APP_LAUNCH_COMMANDS,
+    _APP_VISUAL_SEARCH,
+    _SETUP_IGNORE_APPS,
+    _extract_required_apps_from_events,
+    _extract_required_apps_from_workflow,
+    _resolve_launch_command,
+    _infer_app_from_window_titles,
+    _get_visual_search_info,
+    _generate_setup_actions,
+    _find_active_agent_session as _find_active_agent_session_impl,
+    _workflow_to_actions as _workflow_to_actions_impl,
+    _is_learned_workflow,
+    _edge_to_normalized_actions,
+    _substitute_variables,
+    _expand_compound_steps,
+    _pre_check_screen_state as _pre_check_screen_state_impl,
+    _detect_popup_hint as _detect_popup_hint_impl,
+    _create_replay_state,
+    _schedule_retry as _schedule_retry_impl,
+    _notify_error_callback as _notify_error_callback_impl,
+)
 
 
-# =========================================================================
-# Setup environnement — Préparation automatique avant le replay
-# =========================================================================
-# Mapping des noms d'exécutables Windows courants vers la commande de lancement.
-# Utilisé comme fallback pour le texte de recherche dans le menu Démarrer.
-# Le format est : "processname.exe" (minuscule) -> commande shell
-_APP_LAUNCH_COMMANDS: Dict[str, str] = {
-    "notepad.exe": "notepad",
-    "explorer.exe": "explorer",
-    "calc.exe": "calc",
-    "mspaint.exe": "mspaint",
-    "cmd.exe": "cmd",
-    "powershell.exe": "powershell",
-    "wordpad.exe": "wordpad",
-    "charmap.exe": "charmap",
-    "snippingtool.exe": "snippingtool",
-    "taskmgr.exe": "taskmgr",
-    "regedit.exe": "regedit",
-    "mstsc.exe": "mstsc",
-    "winword.exe": "winword",
-    "excel.exe": "excel",
-    "powerpnt.exe": "powerpnt",
-    "outlook.exe": "outlook",
-    "msedge.exe": "msedge",
-    "chrome.exe": "chrome",
-    "firefox.exe": "firefox",
-    "code.exe": "code",
-}
 
-# Mapping des exécutables vers le nom visuel à chercher dans le menu Démarrer.
-# Contient le texte de recherche (souvent le nom français) et une description
-# pour le VLM afin d'identifier l'icône dans les résultats de recherche.
-# Format : "processname.exe" -> {"search_text": ..., "display_name": ..., "vlm_description": ...}
-_APP_VISUAL_SEARCH: Dict[str, Dict[str, str]] = {
-    "notepad.exe": {
-        "search_text": "Bloc-notes",
-        "display_name": "Bloc-notes",
-        "vlm_description": "L'application Bloc-notes (Notepad) dans les résultats de recherche",
-    },
-    "calc.exe": {
-        "search_text": "Calculatrice",
-        "display_name": "Calculatrice",
-        "vlm_description": "L'application Calculatrice dans les résultats de recherche",
-    },
-    "mspaint.exe": {
-        "search_text": "Paint",
-        "display_name": "Paint",
-        "vlm_description": "L'application Paint dans les résultats de recherche",
-    },
-    "cmd.exe": {
-        "search_text": "Invite de commandes",
-        "display_name": "Invite de commandes",
-        "vlm_description": "L'Invite de commandes (Command Prompt) dans les résultats",
-    },
-    "powershell.exe": {
-        "search_text": "PowerShell",
-        "display_name": "PowerShell",
-        "vlm_description": "Windows PowerShell dans les résultats de recherche",
-    },
-    "wordpad.exe": {
-        "search_text": "WordPad",
-        "display_name": "WordPad",
-        "vlm_description": "L'application WordPad dans les résultats de recherche",
-    },
-    "winword.exe": {
-        "search_text": "Word",
-        "display_name": "Microsoft Word",
-        "vlm_description": "Microsoft Word dans les résultats de recherche",
-    },
-    "excel.exe": {
-        "search_text": "Excel",
-        "display_name": "Microsoft Excel",
-        "vlm_description": "Microsoft Excel dans les résultats de recherche",
-    },
-    "powerpnt.exe": {
-        "search_text": "PowerPoint",
-        "display_name": "Microsoft PowerPoint",
-        "vlm_description": "Microsoft PowerPoint dans les résultats de recherche",
-    },
-    "outlook.exe": {
-        "search_text": "Outlook",
-        "display_name": "Microsoft Outlook",
-        "vlm_description": "Microsoft Outlook dans les résultats de recherche",
-    },
-    "msedge.exe": {
-        "search_text": "Edge",
-        "display_name": "Microsoft Edge",
-        "vlm_description": "Microsoft Edge dans les résultats de recherche",
-    },
-    "chrome.exe": {
-        "search_text": "Chrome",
-        "display_name": "Google Chrome",
-        "vlm_description": "Google Chrome dans les résultats de recherche",
-    },
-    "firefox.exe": {
-        "search_text": "Firefox",
-        "display_name": "Mozilla Firefox",
-        "vlm_description": "Mozilla Firefox dans les résultats de recherche",
-    },
-    "code.exe": {
-        "search_text": "Visual Studio Code",
-        "display_name": "Visual Studio Code",
-        "vlm_description": "Visual Studio Code dans les résultats de recherche",
-    },
-    "taskmgr.exe": {
-        "search_text": "Gestionnaire des tâches",
-        "display_name": "Gestionnaire des tâches",
-        "vlm_description": "Le Gestionnaire des tâches dans les résultats de recherche",
-    },
-    "snippingtool.exe": {
-        "search_text": "Outil Capture",
-        "display_name": "Outil Capture d'écran",
-        "vlm_description": "L'Outil Capture d'écran dans les résultats de recherche",
-    },
-    "mstsc.exe": {
-        "search_text": "Connexion Bureau à distance",
-        "display_name": "Bureau à distance",
-        "vlm_description": "La Connexion Bureau à distance dans les résultats",
-    },
-}
+# Thin wrappers around replay_engine functions that need this module's global state: each wrapper
+# appends the globals its *_impl takes as extra arguments (processor, _replay_states, _replay_lock, ...).
+def _find_active_agent_session(machine_id=None):  # wrapper: machine_id is forwarded unchanged
+    return _find_active_agent_session_impl(processor.session_manager, machine_id)  # binds this module's session manager
 
-# Applications Windows à ignorer pour le setup (processus système, agents, etc.)
-_SETUP_IGNORE_APPS = {
-    "searchhost.exe",     # Barre de recherche Windows
-    "explorer.exe",       # Explorer est toujours lancé (shell Windows)
-    "pythonw.exe",        # Agent Python (notre propre agent)
-    "python.exe",         # Idem
-    "shellexperiencehost.exe",
-    "startmenuexperiencehost.exe",
-    "applicationframehost.exe",
-    "systemsettings.exe",
-    "textinputhost.exe",
-    "runtimebroker.exe",
-}
+def _workflow_to_actions(workflow, params=None):  # wrapper: appends module-level processor and _gesture_catalog
+    return _workflow_to_actions_impl(workflow, params, processor, _gesture_catalog)
 
-
-def _extract_required_apps_from_events(raw_events: list) -> Dict[str, Any]:
-    """Extraire les applications requises depuis les événements bruts d'une session.
-
-    Analyse les window_focus_change pour identifier :
-    - L'application principale (la plus utilisée hors apps système)
-    - La première fenêtre ciblée (pour le setup initial)
-
-    Args:
-        raw_events: Événements bruts depuis live_events.jsonl.
-
-    Returns:
-        Dict avec les clés :
-        - primary_app: str (nom de l'exécutable principal, ex: "Notepad.exe")
-        - primary_launch_cmd: str (commande Win+R, ex: "notepad")
-        - first_window_title: str (titre de la première fenêtre applicative)
-        - apps: dict[str, int] (app_name -> nombre d'occurrences)
-    """
-    app_counts: Dict[str, int] = defaultdict(int)
-    first_app = None
-    first_window_title = None
-
-    for raw_evt in raw_events:
-        event_data = raw_evt.get("event", raw_evt)
-        evt_type = event_data.get("type", "")
-
-        if evt_type == "window_focus_change":
-            to_info = event_data.get("to", {})
-            if not to_info:
-                continue
-            app_name = to_info.get("app_name", "")
-            title = to_info.get("title", "")
-            if app_name:
-                app_counts[app_name] += 1
-                if first_app is None and app_name.lower() not in _SETUP_IGNORE_APPS:
-                    first_app = app_name
-                    first_window_title = title
-
-        # Aussi extraire depuis les mouse_click qui ont un champ window
-        elif evt_type == "mouse_click":
-            window = event_data.get("window", {})
-            if isinstance(window, dict):
-                app_name = window.get("app_name", "")
-                if app_name:
-                    app_counts[app_name] += 1
-
-    if not app_counts:
-        return {}
-
-    # Déterminer l'application principale (la plus fréquente hors apps ignorées)
-    filtered_apps = {
-        k: v for k, v in app_counts.items()
-        if k.lower() not in _SETUP_IGNORE_APPS
-    }
-    if not filtered_apps:
-        return {}
-
-    primary_app = max(filtered_apps, key=filtered_apps.get)
-
-    # Résoudre la commande de lancement
-    primary_launch_cmd = _resolve_launch_command(primary_app)
-
-    return {
-        "primary_app": primary_app,
-        "primary_launch_cmd": primary_launch_cmd,
-        "first_window_title": first_window_title or "",
-        "apps": dict(app_counts),
-    }
-
-
-def _extract_required_apps_from_workflow(workflow) -> Dict[str, Any]:
-    """Extraire les applications requises depuis un workflow structuré.
-
-    Analyse les nodes du workflow pour identifier les titres de fenêtres
-    requis, puis infère l'application principale.
-
-    Args:
-        workflow: Objet Workflow ou dict brut.
-
-    Returns:
-        Même format que _extract_required_apps_from_events.
-    """
-    # Accéder aux données (objet ou dict)
-    if hasattr(workflow, 'nodes'):
-        nodes = workflow.nodes
-        metadata = workflow.metadata if hasattr(workflow, 'metadata') else {}
-    elif isinstance(workflow, dict):
-        nodes = workflow.get('nodes', [])
-        metadata = workflow.get('metadata', {})
-    else:
-        return {}
-
-    if not nodes:
-        return {}
-
-    # Collecter les titres de fenêtres depuis les nodes
-    window_titles = []
-    for node in nodes:
-        template = node.template if hasattr(node, 'template') else node.get('template', {})
-        if isinstance(template, dict):
-            window = template.get('window', {})
-        elif hasattr(template, 'window'):
-            window = template.window if hasattr(template.window, '__dict__') else {}
-        else:
-            window = {}
-
-        if isinstance(window, dict):
-            title = window.get('title_pattern', '') or window.get('title_contains', '')
-        elif hasattr(window, 'title_pattern'):
-            title = getattr(window, 'title_pattern', '') or ''
-        else:
-            title = ''
-
-        if title:
-            window_titles.append(title)
-
-    # Inférer l'app principale depuis les titres de fenêtres
-    primary_app, primary_launch_cmd, matched_title = _infer_app_from_window_titles(window_titles)
-    # Utiliser le titre qui a matché l'app (pas le premier node qui peut être "Rechercher")
-    first_title = matched_title or (window_titles[0] if window_titles else "")
-
-    if not primary_app:
-        return {}
-
-    source_session_id = metadata.get("source_session_id", "") if isinstance(metadata, dict) else ""
-    machine_id = metadata.get("machine_id", "") if isinstance(metadata, dict) else ""
-
-    return {
-        "primary_app": primary_app,
-        "primary_launch_cmd": primary_launch_cmd,
-        "first_window_title": first_title,
-        "apps": {},
-        "source_session_id": source_session_id,
-        "machine_id": machine_id,
-    }
-
-
-def _resolve_launch_command(app_name: str) -> str:
-    """Résoudre la commande Win+R pour lancer une application.
-
-    Si l'app n'est pas dans le mapping, utilise le nom de l'exécutable
-    directement sans l'extension .exe (fonctionne pour la plupart des apps).
-    """
-    app_lower = app_name.lower()
-    if app_lower in _APP_LAUNCH_COMMANDS:
-        return _APP_LAUNCH_COMMANDS[app_lower]
-    # Fallback : utiliser le nom sans l'extension .exe
-    if app_lower.endswith(".exe"):
-        return app_name[:-4]
-    return app_name
-
-
-def _infer_app_from_window_titles(titles: list) -> tuple:
-    """Inférer le nom de l'application et la commande de lancement depuis des titres de fenêtres.
-
-    Utilise des heuristiques basées sur les patterns de titres Windows courants.
-
-    Returns:
-        Tuple (app_name, launch_command, matched_title).
-        ("", "", "") si non identifié.
-    """
-    _TITLE_APP_PATTERNS = [
-        ("bloc-notes", "Notepad.exe", "notepad"),
-        ("notepad", "Notepad.exe", "notepad"),
-        ("word", "winword.exe", "winword"),
-        ("excel", "excel.exe", "excel"),
-        ("powerpoint", "powerpnt.exe", "powerpnt"),
-        ("outlook", "outlook.exe", "outlook"),
-        ("paint", "mspaint.exe", "mspaint"),
-        ("calculatrice", "calc.exe", "calc"),
-        ("calculator", "calc.exe", "calc"),
-        ("explorateur de fichiers", "explorer.exe", "explorer"),
-        ("file explorer", "explorer.exe", "explorer"),
-        ("invite de commandes", "cmd.exe", "cmd"),
-        ("command prompt", "cmd.exe", "cmd"),
-        ("powershell", "powershell.exe", "powershell"),
-        ("visual studio code", "code.exe", "code"),
-        ("edge", "msedge.exe", "msedge"),
-        ("chrome", "chrome.exe", "chrome"),
-        ("firefox", "firefox.exe", "firefox"),
-    ]
-
-    for title in titles:
-        title_lower = title.lower()
-        for pattern, app_name, launch_cmd in _TITLE_APP_PATTERNS:
-            if pattern in title_lower:
-                # Ignorer les apps système (explorer, etc.)
-                if app_name.lower() in _SETUP_IGNORE_APPS:
-                    continue
-                return (app_name, launch_cmd, title)
-
-    return ("", "", "")
-
-
-def _get_visual_search_info(app_name: str) -> Dict[str, str]:
-    """Obtenir les informations de recherche visuelle pour une application.
-
-    Consulte _APP_VISUAL_SEARCH, sinon construit un fallback à partir du nom
-    de l'exécutable (ex: "MonApp.exe" → search_text="MonApp").
-
-    Args:
-        app_name: Nom de l'exécutable (ex: "Notepad.exe").
-
-    Returns:
-        Dict avec search_text, display_name, vlm_description.
-    """
-    app_lower = app_name.lower()
-    if app_lower in _APP_VISUAL_SEARCH:
-        return dict(_APP_VISUAL_SEARCH[app_lower])
-
-    # Fallback : utiliser le nom sans .exe
-    base_name = app_name[:-4] if app_lower.endswith(".exe") else app_name
-    return {
-        "search_text": base_name,
-        "display_name": base_name,
-        "vlm_description": f"L'application {base_name} dans les résultats de recherche",
-    }
-
-
-def _generate_setup_actions(
-    app_info: Dict[str, Any],
-    setup_id_prefix: str = "setup",
-) -> List[Dict[str, Any]]:
-    """Générer les actions 100% visuelles pour ouvrir l'application avant le replay.
-
-    Approche entièrement visuelle — JAMAIS de raccourcis clavier (Win, Win+R,
-    Ctrl+X, etc.) qui n'ont pas été enregistrés par l'utilisateur. Tout passe
-    par des clics visuels résolus par le VLM (Qwen2.5-VL).
-
-    La séquence est :
-    1. Clic visuel sur le bouton Démarrer (coin bas-gauche de l'écran)
-    2. Attendre que le menu Démarrer s'ouvre (1s)
-    3. Clic visuel sur la barre de recherche du menu Démarrer
-    4. Attendre que la barre de recherche soit active (500ms)
-    5. Taper le nom de l'application (texte français, ex: "Bloc-notes")
-    6. Attendre les résultats de recherche (1.2s)
-    7. Clic visuel sur le résultat de l'application trouvée
-    8. Attendre que l'application s'ouvre (2-3s selon le poids)
-    9. verify_screen : vérifier que la fenêtre attendue est apparue
-
-    Args:
-        app_info: Dict retourné par _extract_required_apps_from_events ou
-            _extract_required_apps_from_workflow.
-        setup_id_prefix: Préfixe pour les action_id générés.
-
-    Returns:
-        Liste d'actions normalisées, prêtes à injecter dans la queue.
-        Liste vide si aucune préparation n'est nécessaire.
-    """
-    if not app_info:
-        return []
-
-    launch_cmd = app_info.get("primary_launch_cmd", "")
-    primary_app = app_info.get("primary_app", "")
-    first_title = app_info.get("first_window_title", "")
-
-    if not launch_cmd:
-        logger.debug(
-            "setup_actions : pas de commande de lancement pour '%s', skip",
-            primary_app,
-        )
-        return []
-
-    # Ne pas lancer les apps système (toujours présentes)
-    if primary_app.lower() in _SETUP_IGNORE_APPS:
-        logger.debug("setup_actions : app '%s' ignorée (système)", primary_app)
-        return []
-
-    # Obtenir les informations de recherche visuelle pour cette app
-    visual_info = _get_visual_search_info(primary_app)
-    search_text = visual_info["search_text"]
-    display_name = visual_info["display_name"]
-    vlm_description = visual_info["vlm_description"]
-
-    actions = []
-
-    logger.info(
-        "Génération setup env 100%% visuel : lancement de '%s' via clic "
-        "Démarrer → recherche visuelle '%s' (fenêtre attendue : '%s')",
-        primary_app, search_text, first_title,
+def _pre_check_screen_state(session_id, expected_node_id, current_screenshot_path, active_processor):
+    return _pre_check_screen_state_impl(  # caller's arguments first, then this module's globals
+        session_id, expected_node_id, current_screenshot_path, active_processor,
+        _replay_states, _replay_lock, _PRECHECK_SIMILARITY_THRESHOLD,  # module-level state, lock and threshold
     )
 
-    # 1. Clic visuel sur le bouton Démarrer (toujours visible, bas-gauche)
-    #    Le VLM résout la position exacte ; x_pct/y_pct sont des fallbacks.
-    actions.append({
-        "action_id": f"act_{setup_id_prefix}_click_start",
-        "type": "click",
-        "x_pct": 0.02,
-        "y_pct": 0.98,
-        "button": "left",
-        "visual_mode": True,
-        "target_spec": {
-            "by_text": "Démarrer",
-            "by_role": "start_button",
-            "vlm_description": (
-                "Le bouton Démarrer de Windows (icône Windows), "
-                "en bas à gauche de la barre des tâches"
-            ),
-        },
-        "_setup_phase": True,
-        "_setup_step": "click_start_menu",
-    })
+def _detect_popup_hint(session_id, workflow, expected_node_id):  # wrapper: appends the module-level processor
+    return _detect_popup_hint_impl(session_id, workflow, expected_node_id, processor)
 
-    # 2. Attendre que le menu Démarrer s'ouvre
-    actions.append({
-        "action_id": f"act_{setup_id_prefix}_wait_start",
-        "type": "wait",
-        "duration_ms": 1000,
-        "_setup_phase": True,
-        "_setup_step": "wait_start_menu",
-    })
-
-    # 3. Clic visuel sur la barre de recherche du menu Démarrer
-    #    Sur Windows 10/11, la barre de recherche est intégrée au menu Démarrer
-    #    ou visible dans la barre des tâches. Le VLM la trouve visuellement.
-    actions.append({
-        "action_id": f"act_{setup_id_prefix}_click_search",
-        "type": "click",
-        "x_pct": 0.20,
-        "y_pct": 0.92,
-        "button": "left",
-        "visual_mode": True,
-        "target_spec": {
-            "by_text": "Rechercher",
-            "by_role": "search_box",
-            "vlm_description": (
-                "La barre ou le champ de recherche dans le menu Démarrer "
-                "de Windows, souvent intitulé 'Tapez ici pour rechercher' "
-                "ou 'Rechercher'"
-            ),
-        },
-        "_setup_phase": True,
-        "_setup_step": "click_search_box",
-    })
-
-    # 4. Attendre que la barre de recherche soit active et prête
-    actions.append({
-        "action_id": f"act_{setup_id_prefix}_wait_search_ready",
-        "type": "wait",
-        "duration_ms": 500,
-        "_setup_phase": True,
-        "_setup_step": "wait_search_ready",
-    })
-
-    # 5. Taper le nom visuel de l'application (texte français)
-    #    Le champ de recherche a été cliqué visuellement à l'étape 3,
-    #    donc le type s'exécute dans le champ actif.
-    actions.append({
-        "action_id": f"act_{setup_id_prefix}_type_search",
-        "type": "type",
-        "text": search_text,
-        "_setup_phase": True,
-        "_setup_step": "type_app_name",
-    })
-
-    # 6. Attendre que la recherche Windows trouve l'application
-    actions.append({
-        "action_id": f"act_{setup_id_prefix}_wait_results",
-        "type": "wait",
-        "duration_ms": 1200,
-        "_setup_phase": True,
-        "_setup_step": "wait_search_results",
-    })
-
-    # 7. Clic visuel sur le résultat de l'application dans la liste
-    #    Le VLM identifie l'icône/texte de l'app dans les résultats.
-    actions.append({
-        "action_id": f"act_{setup_id_prefix}_click_result",
-        "type": "click",
-        "x_pct": 0.20,
-        "y_pct": 0.50,
-        "button": "left",
-        "visual_mode": True,
-        "target_spec": {
-            "by_text": display_name,
-            "by_role": "app_icon",
-            "vlm_description": vlm_description,
-        },
-        "_setup_phase": True,
-        "_setup_step": "click_app_result",
-    })
-
-    # 8. Attendre que l'application s'ouvre
-    #    Durée variable : 3s pour les apps lourdes (Office, VS Code), 2s sinon
-    heavy_apps = {"winword.exe", "excel.exe", "powerpnt.exe", "outlook.exe", "code.exe"}
-    wait_ms = 3000 if primary_app.lower() in heavy_apps else 2000
-    actions.append({
-        "action_id": f"act_{setup_id_prefix}_wait_launch",
-        "type": "wait",
-        "duration_ms": wait_ms,
-        "_setup_phase": True,
-        "_setup_step": "wait_app_launch",
-    })
-
-    # 9. Vérification visuelle que la fenêtre attendue est apparue
-    if first_title:
-        actions.append({
-            "action_id": f"act_{setup_id_prefix}_verify",
-            "type": "verify_screen",
-            "expected_node": "setup_initial",
-            "timeout_ms": 5000,
-            "_setup_phase": True,
-            "_setup_step": "verify_app_ready",
-            "_expected_title": first_title,
-        })
-
-    logger.info(
-        "Setup env visuel généré : %d actions pour lancer '%s' "
-        "(recherche visuelle : '%s')",
-        len(actions), primary_app, search_text,
+def _schedule_retry(session_id, replay_state, action, current_retry, reason):
+    _schedule_retry_impl(  # NOTE(review): the impl's result is not returned — confirm it returns None
+        session_id, replay_state, action, current_retry, reason,
+        _replay_queues, _retry_pending, MAX_RETRIES_PER_ACTION,  # module-level queues, pending map and retry limit
     )
 
-    return actions
-
-
-def _validate_replay_action(action: dict) -> Optional[str]:
-    """Valide une action de replay. Retourne un message d'erreur ou None si valide."""
-    action_type = action.get("type", "")
-
-    # Vérifier le type d'action
-    if action_type not in _ALLOWED_ACTION_TYPES:
-        return f"Type d'action non autorisé : '{action_type}'. Autorisés : {sorted(_ALLOWED_ACTION_TYPES)}"
-
-    # Vérifier la longueur du texte
-    text = action.get("text", "")
-    if isinstance(text, str) and len(text) > _MAX_ACTION_TEXT_LENGTH:
-        return f"Texte trop long ({len(text)} > {_MAX_ACTION_TEXT_LENGTH} caractères)"
-
-    # Vérifier les touches
-    keys = action.get("keys", [])
-    if isinstance(keys, list):
-        if len(keys) > _MAX_KEYS_PER_COMBO:
-            return f"Trop de touches ({len(keys)} > {_MAX_KEYS_PER_COMBO})"
-        for key in keys:
-            key_lower = str(key).lower()
-            # Accepter les caractères simples (a-z, 0-9, ponctuation) et les noms connus
-            if len(str(key)) == 1 or key_lower in _KNOWN_KEY_NAMES:
-                continue
-            return f"Touche inconnue : '{key}'"
-
-    # Vérifier les coordonnées normalisées
-    for coord_name in ("x_pct", "y_pct"):
-        val = action.get(coord_name)
-        if val is not None:
-            try:
-                val_f = float(val)
-                if not (0.0 <= val_f <= 1.0):
-                    return f"Coordonnée {coord_name}={val_f} hors limites [0.0, 1.0]"
-            except (TypeError, ValueError):
-                return f"Coordonnée {coord_name} invalide : {val}"
-
-    return None  # Valide
+def _notify_error_callback(replay_state, action_id, error):  # NOTE(review): impl result is discarded — confirm it returns None
+    _notify_error_callback_impl(replay_state, action_id, error, _error_callbacks)  # appends module-level _error_callbacks
 
 
 # En production (ENVIRONMENT != development), désactiver la doc Swagger
@@ -1973,304 +1421,6 @@ async def get_session(session_id: str):
 # =========================================================================
 
 
-def _find_active_agent_session(machine_id: Optional[str] = None) -> Optional[str]:
-    """Trouver la dernière session Agent V1 pour le replay.
-
-    Stratégie en 2 passes :
-    1. D'abord chercher une session non-finalisée (Agent V1 actif)
-    2. Sinon, prendre la plus récente même finalisée (Agent V1 peut avoir
-       redémarré et créé une nouvelle session, ou la session a été finalisée
-       par timeout mais l'agent est toujours là)
-
-    Dans les deux cas, on ne considère que les sessions 'sess_*' (Agent V1).
-
-    Args:
-        machine_id: Si fourni, ne chercher que les sessions de cette machine.
-                    Si None, chercher toutes les sessions (rétrocompatible).
-    """
-    with processor.session_manager._lock:
-        all_agent_sessions = [
-            s for s in processor.session_manager._sessions.values()
-            if s.session_id.startswith("sess_")
-            and (machine_id is None or s.machine_id == machine_id)
-        ]
-
-    if not all_agent_sessions:
-        return None
-
-    # Trier par session_id (contient un timestamp) — plus récent d'abord
-    all_agent_sessions.sort(key=lambda s: s.session_id, reverse=True)
-
-    # Passe 1 : préférer une session non-finalisée
-    for s in all_agent_sessions:
-        if not s.finalized:
-            return s.session_id
-
-    # Passe 2 : fallback sur la plus récente (même finalisée)
-    # L'Agent V1 poll /replay/next indépendamment de l'état finalized
-    return all_agent_sessions[0].session_id
-
-
-def _workflow_to_actions(workflow, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
-    """
-    Convertir un workflow (nodes + edges ordonnés) en liste d'actions normalisées.
-
-    Parcourt le graphe depuis les entry_nodes en suivant les edges.
-    Chaque edge produit une action normalisée avec coordonnées en pourcentage.
-
-    Mode intelligent (workflows appris par Léa) :
-        Si le workflow a des nodes avec des prototype_vectors, utilise le
-        StreamProcessor.extract_enriched_actions() qui enrichit les actions
-        avec les données de la session originale, le ciblage visuel et le
-        pre-check/post-check par embedding CLIP.
-
-    Mode classique (workflows VWB/manuels) :
-        Parcours BFS classique avec _edge_to_normalized_actions().
-    """
-    params = params or {}
-
-    # Détection d'un workflow appris (a des nodes avec prototype_vectors)
-    # et qui a des edges structurés
-    if _is_learned_workflow(workflow):
-        # Priorité 1 : replay hybride (événements bruts + structure workflow)
-        # Beaucoup plus fiable car utilise les actions utilisateur réelles
-        # au lieu des compound actions du GraphBuilder qui perdent les détails
-        hybrid = processor.build_hybrid_replay(workflow)
-        if hybrid:
-            logger.info(
-                "Replay hybride : %d actions depuis events bruts + structure workflow",
-                len(hybrid),
-            )
-            # Optimisation par gestes clavier si disponible
-            if _gesture_catalog and hybrid:
-                hybrid = _gesture_catalog.optimize_replay_actions(hybrid)
-            return hybrid
-
-        # Priorité 2 : enrichissement classique (fallback si hybride échoue)
-        enriched = processor.extract_enriched_actions(workflow, params)
-        if enriched:
-            logger.info(
-                "Replay intelligent : %d actions enrichies depuis le workflow appris",
-                len(enriched),
-            )
-            # Optimisation par gestes clavier si disponible
-            if _gesture_catalog and enriched:
-                enriched = _gesture_catalog.optimize_replay_actions(enriched)
-            return enriched
-        # Si l'enrichissement échoue aussi, fallback sur le mode classique
-        logger.warning(
-            "Enrichissement échoué pour le workflow appris, fallback mode classique"
-        )
-
-    # Mode classique (VWB/manuels ou fallback)
-    actions = []
-
-    # Construire un index des edges sortants par node
-    outgoing: Dict[str, list] = defaultdict(list)
-    for edge in workflow.edges:
-        outgoing[edge.from_node].append(edge)
-
-    # Parcours linéaire depuis le premier entry_node
-    visited = set()
-    current_nodes = list(workflow.entry_nodes) if workflow.entry_nodes else []
-
-    # Fallback : si pas d'entry_nodes, prendre le premier node
-    if not current_nodes and workflow.nodes:
-        current_nodes = [workflow.nodes[0].node_id]
-
-    while current_nodes:
-        node_id = current_nodes.pop(0)
-        if node_id in visited:
-            continue
-        visited.add(node_id)
-
-        edges = outgoing.get(node_id, [])
-        for edge in edges:
-            edge_actions = _edge_to_normalized_actions(edge, params)
-            actions.extend(edge_actions)
-            # Suivre le graphe vers le prochain node
-            if edge.to_node not in visited:
-                current_nodes.append(edge.to_node)
-
-    # Optimisation : substituer les actions visuelles par des gestes clavier si possible
-    if _gesture_catalog and actions:
-        actions = _gesture_catalog.optimize_replay_actions(actions)
-
-    return actions
-
-
-def _is_learned_workflow(workflow) -> bool:
-    """Détecter si un workflow est un workflow appris (vs VWB/manuel).
-
-    Un workflow appris a :
-    - Des nodes avec _prototype_vector dans metadata
-    - Des edges avec from_node/to_node
-    - Un learning_state indicatif (OBSERVATION, COACHING, AUTO_CANDIDATE, etc.)
-
-    Un workflow VWB/manuel a généralement :
-    - Des edges avec des target_spec complets (by_text, by_role remplis)
-    - Pas de prototype_vectors
-    """
-    # Accéder aux données (objet ou dict)
-    if hasattr(workflow, 'nodes'):
-        nodes = workflow.nodes
-        edges = workflow.edges
-    elif isinstance(workflow, dict):
-        nodes = workflow.get('nodes', [])
-        edges = workflow.get('edges', [])
-    else:
-        return False
-
-    if not nodes or not edges:
-        return False
-
-    # Vérifier si au moins un node a un prototype_vector
-    has_prototype = False
-    for node in nodes:
-        metadata = node.metadata if hasattr(node, 'metadata') else node.get('metadata', {})
-        if isinstance(metadata, dict) and '_prototype_vector' in metadata:
-            has_prototype = True
-            break
-
-    return has_prototype
-
-
-def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str, Any]]:
-    """
-    Convertir un WorkflowEdge en liste d'actions normalisées pour l'Agent V1.
-
-    Un edge simple produit 1 action, un edge compound produit N actions (une par step).
-    """
-    action = edge.action
-    if action is None:
-        logger.warning(f"Edge {edge.edge_id} sans action, skip")
-        return []
-    action_type = action.type
-    target = action.target
-    action_params = action.parameters or {}
-
-    # Extraire les coordonnées normalisées depuis TargetSpec.by_position
-    x_pct = 0.0
-    y_pct = 0.0
-    if target and target.by_position:
-        px, py = target.by_position
-        if px <= 1.0 and py <= 1.0:
-            x_pct = px
-            y_pct = py
-        else:
-            ref_w = action_params.get("ref_width", 1920) or 1920
-            ref_h = action_params.get("ref_height", 1080) or 1080
-            x_pct = round(px / ref_w, 6)
-            y_pct = round(py / ref_h, 6)
-
-    base = {"edge_id": edge.edge_id, "from_node": edge.from_node, "to_node": edge.to_node}
-
-    # Compound : décomposer en actions individuelles
-    if action_type == "compound":
-        return _expand_compound_steps(action_params.get("steps", []), base, params)
-
-    # Actions simples
-    normalized = {**base, "action_id": f"act_{uuid.uuid4().hex[:8]}"}
-
-    if action_type == "mouse_click":
-        normalized["type"] = "click"
-        normalized["x_pct"] = x_pct
-        normalized["y_pct"] = y_pct
-        normalized["button"] = action_params.get("button", "left")
-
-    elif action_type == "text_input":
-        normalized["type"] = "type"
-        text = action_params.get("text", "")
-        text = _substitute_variables(text, params, action_params.get("defaults", {}))
-        normalized["text"] = text
-        normalized["x_pct"] = x_pct
-        normalized["y_pct"] = y_pct
-
-    elif action_type == "key_press":
-        normalized["type"] = "key_combo"
-        keys = action_params.get("keys", [])
-        if not keys and action_params.get("key"):
-            keys = [action_params["key"]]
-        normalized["keys"] = keys
-
-    else:
-        logger.warning(f"Type d'action inconnu : {action_type}")
-        return []
-
-    # Ajouter le target_spec complet pour la résolution visuelle
-    target_spec = {}
-    if target and target.by_role:
-        target_spec["by_role"] = target.by_role
-        normalized["target_role"] = target.by_role  # Compat debug
-    if target and target.by_text:
-        target_spec["by_text"] = target.by_text
-        normalized["target_text"] = target.by_text  # Compat debug
-    if target and hasattr(target, 'context_hints') and target.context_hints:
-        target_spec["context_hints"] = target.context_hints
-    if target_spec:
-        normalized["target_spec"] = target_spec
-        normalized["visual_mode"] = True  # Signal à l'agent d'utiliser la résolution visuelle
-
-    return [normalized]
-
-
-def _substitute_variables(text: str, params: Dict[str, Any], defaults: Dict[str, Any]) -> str:
-    """Substituer les variables ${var} dans un texte.
-
-    Priorité : params utilisateur > defaults du workflow > texte brut inchangé.
-    Supporte ${var} dans un texte plus long (ex: "${expression}=").
-    """
-    import re
-
-    def replacer(match):
-        var_name = match.group(1)
-        return str(params.get(var_name, defaults.get(var_name, match.group(0))))
-
-    return re.sub(r'\$\{(\w+)\}', replacer, text)
-
-
-def _expand_compound_steps(
-    steps: List[Dict[str, Any]], base: Dict[str, Any], params: Dict[str, Any]
-) -> List[Dict[str, Any]]:
-    """Décomposer les steps d'un compound en actions individuelles."""
-    actions = []
-    for step in steps:
-        step_type = step.get("type", "unknown")
-        action = {
-            **base,
-            "action_id": f"act_{uuid.uuid4().hex[:8]}",
-        }
-
-        if step_type == "key_press":
-            action["type"] = "key_combo"
-            keys = step.get("keys", [])
-            if not keys and step.get("key"):
-                keys = [step["key"]]
-            action["keys"] = keys
-
-        elif step_type == "text_input":
-            action["type"] = "type"
-            text = step.get("text", "")
-            text = _substitute_variables(text, params, {})
-            action["text"] = text
-
-        elif step_type == "wait":
-            action["type"] = "wait"
-            action["duration_ms"] = step.get("duration_ms", 500)
-
-        elif step_type == "mouse_click":
-            action["type"] = "click"
-            action["x_pct"] = step.get("x_pct", 0.0)
-            action["y_pct"] = step.get("y_pct", 0.0)
-            action["button"] = step.get("button", "left")
-
-        else:
-            logger.debug(f"Step compound inconnu : {step_type}")
-            continue
-
-        actions.append(action)
-
-    return actions
 
 
 @app.post("/api/v1/traces/stream/replay")
@@ -2761,195 +1911,6 @@ async def enqueue_single_action(request: SingleActionRequest):
 # =========================================================================
 
 
-def _pre_check_screen_state(
-    session_id: str,
-    expected_node_id: str,
-    current_screenshot_path: str,
-    active_processor: StreamProcessor,
-) -> Dict[str, Any]:
-    """Vérifier que l'écran actuel correspond à l'état attendu du node.
-
-    Compare le screenshot actuel avec le prototype du node attendu
-    via similarité d'embedding CLIP (rapide, ~200ms).
-
-    Args:
-        session_id: ID de la session de replay
-        expected_node_id: ID du node source de l'action (from_node)
-        current_screenshot_path: Chemin du screenshot heartbeat récent
-        active_processor: Instance StreamProcessor avec le CLIPEmbedder chargé
-
-    Returns:
-        {"match": True/False, "similarity": float, "expected_node": str,
-         "reason": str (si mismatch), "popup_detected": bool}
-    """
-    result: Dict[str, Any] = {
-        "match": True,
-        "similarity": 1.0,
-        "expected_node": expected_node_id,
-        "popup_detected": False,
-    }
-
-    try:
-        # 1. Trouver le workflow actif pour cette session
-        replay_state = None
-        workflow = None
-        with _replay_lock:
-            for state in _replay_states.values():
-                if state["session_id"] == session_id and state["status"] == "running":
-                    replay_state = state
-                    break
-
-        if not replay_state:
-            result["reason"] = "no_active_replay"
-            return result
-
-        workflow_id = replay_state.get("workflow_id", "")
-        with active_processor._data_lock:
-            workflow = active_processor._workflows.get(workflow_id)
-
-        if workflow is None:
-            result["reason"] = "workflow_not_found"
-            return result
-
-        # 2. Récupérer le prototype du node attendu
-        # Supporter à la fois les objets Workflow et les dicts bruts
-        node = None
-        if hasattr(workflow, "get_node"):
-            node = workflow.get_node(expected_node_id)
-        elif isinstance(workflow, dict):
-            # Format dict brut (workflows VWB/manuels)
-            for n in workflow.get("nodes", []):
-                if n.get("node_id") == expected_node_id:
-                    node = n
-                    break
-
-        if node is None:
-            result["reason"] = "node_not_found"
-            return result
-
-        # Extraire le prototype vector
-        metadata = node.metadata if hasattr(node, "metadata") else node.get("metadata", {})
-        proto_list = metadata.get("_prototype_vector")
-        if not proto_list or not isinstance(proto_list, (list, tuple)):
-            result["reason"] = "no_prototype_vector"
-            return result
-
-        import numpy as np
-        prototype_vector = np.array(proto_list, dtype=np.float32)
-
-        # 3. Calculer l'embedding CLIP du screenshot actuel
-        active_processor._ensure_initialized()
-        if active_processor._clip_embedder is None:
-            result["reason"] = "clip_embedder_unavailable"
-            return result
-
-        from PIL import Image
-        pil_image = Image.open(current_screenshot_path)
-        current_vector = active_processor._clip_embedder.embed_image(pil_image)
-
-        if current_vector is None or len(current_vector) == 0:
-            result["reason"] = "embedding_failed"
-            return result
-
-        # 4. Similarité cosine
-        current_vector = current_vector.flatten().astype(np.float32)
-        prototype_vector = prototype_vector.flatten().astype(np.float32)
-
-        norm_current = np.linalg.norm(current_vector)
-        norm_proto = np.linalg.norm(prototype_vector)
-        if norm_current < 1e-8 or norm_proto < 1e-8:
-            result["reason"] = "zero_norm_vector"
-            result["match"] = False
-            result["similarity"] = 0.0
-            return result
-
-        similarity = float(
-            np.dot(current_vector, prototype_vector) / (norm_current * norm_proto)
-        )
-        result["similarity"] = round(similarity, 4)
-        result["match"] = similarity >= _PRECHECK_SIMILARITY_THRESHOLD
-
-        if not result["match"]:
-            result["reason"] = "screen_mismatch"
-            logger.warning(
-                f"Pre-check MISMATCH pour session={session_id} "
-                f"node={expected_node_id}: similarity={similarity:.4f} "
-                f"< seuil={_PRECHECK_SIMILARITY_THRESHOLD}"
-            )
-
-            # 5. Détection de popup par changement de titre de fenêtre
-            result["popup_detected"] = _detect_popup_hint(
-                session_id, workflow, expected_node_id
-            )
-
-    except Exception as e:
-        # Ne jamais bloquer le replay en cas d'erreur du pre-check
-        logger.error(f"Pre-check échoué (non bloquant): {e}")
-        result["match"] = True  # Fallback permissif
-        result["reason"] = f"precheck_error: {e}"
-
-    return result
-
-
-def _detect_popup_hint(
-    session_id: str,
-    workflow: Any,
-    expected_node_id: str,
-) -> bool:
-    """Détecter si une popup ou un dialogue modal est probable.
-
-    Compare le titre de fenêtre actuel (via last_window_info de la session)
-    avec le titre attendu du node dans le workflow. Un changement de titre
-    suggère une popup/dialogue inattendu.
-
-    Args:
-        session_id: ID de la session
-        workflow: Workflow object ou dict
-        expected_node_id: ID du node attendu
-
-    Returns:
-        True si un changement de titre suggère une popup
-    """
-    try:
-        # Titre actuel depuis la session
-        session = processor.session_manager.get_session(session_id)
-        if not session:
-            return False
-        current_title = session.last_window_info.get("title", "").strip().lower()
-        if not current_title or current_title == "unknown":
-            return False
-
-        # Titre attendu depuis le node du workflow
-        expected_title = ""
-        if hasattr(workflow, "get_node"):
-            node = workflow.get_node(expected_node_id)
-            if node and hasattr(node, "template") and hasattr(node.template, "window"):
-                window_spec = node.template.window
-                if hasattr(window_spec, "title_contains") and window_spec.title_contains:
-                    expected_title = window_spec.title_contains.strip().lower()
-        elif isinstance(workflow, dict):
-            for n in workflow.get("nodes", []):
-                if n.get("node_id") == expected_node_id:
-                    template = n.get("template", {})
-                    window = template.get("window", {})
-                    expected_title = (window.get("title_contains") or "").strip().lower()
-                    break
-
-        if not expected_title:
-            return False
-
-        # Si le titre actuel ne contient plus le titre attendu, popup probable
-        if expected_title not in current_title:
-            logger.info(
-                f"Popup détectée: titre actuel='{current_title}' "
-                f"ne contient pas '{expected_title}'"
-            )
-            return True
-
-    except Exception as e:
-        logger.debug(f"Détection popup échouée: {e}")
-
-    return False
 
 
 @app.get("/api/v1/traces/stream/replay/next")
@@ -3584,144 +2545,6 @@ async def report_action_result(report: ReplayResultReport):
     }
 
 
-def _create_replay_state(
-    replay_id: str,
-    workflow_id: str,
-    session_id: str,
-    total_actions: int,
-    params: Optional[Dict[str, Any]] = None,
-    machine_id: Optional[str] = None,
-) -> Dict[str, Any]:
-    """Créer un état de replay enrichi avec les champs de suivi d'erreur."""
-    return {
-        "replay_id": replay_id,
-        "workflow_id": workflow_id,
-        "session_id": session_id,
-        "machine_id": machine_id or "default",  # Machine cible du replay
-        "status": "running",
-        "total_actions": total_actions,
-        "completed_actions": 0,
-        "failed_actions": 0,
-        "current_action_index": 0,
-        "params": params or {},
-        "results": [],  # Historique des résultats action par action
-        # Champs enrichis pour le suivi d'erreur (#7)
-        "retried_actions": 0,
-        "unverified_actions": 0,
-        "error_log": [],         # Liste des erreurs rencontrées
-        "last_screenshot": None, # Path du dernier screenshot reçu
-        "_last_screenshot_before": None,  # Interne: screenshot avant la dernière action
-        # Champs pour pause supervisée (target_not_found)
-        "failed_action": None,   # Contexte de l'action en echec (quand paused_need_help)
-        "pause_message": None,   # Message a afficher a l'utilisateur
-    }
-
-
-def _schedule_retry(
-    session_id: str,
-    replay_state: Dict[str, Any],
-    action: Dict[str, Any],
-    current_retry: int,
-    reason: str,
-):
-    """
-    Programmer un retry pour une action échouée.
-
-    Stratégie :
-    - Retry 1 : réinjecter l'action directement (re-résolution visuelle par l'agent)
-    - Retry 2 : injecter un wait de 2s avant l'action (possible loading en cours)
-    - Retry 3 : dernier essai direct
-
-    L'action est réinsérée en tête de la queue pour être la prochaine exécutée.
-    _replay_lock doit être acquis par l'appelant.
-    """
-    next_retry = current_retry + 1
-    replay_state["retried_actions"] += 1
-
-    # Créer une copie de l'action avec un nouveau action_id pour le tracking
-    retry_action = dict(action)
-    retry_action_id = f"{action.get('action_id', 'unknown')}_retry{next_retry}"
-    retry_action["action_id"] = retry_action_id
-
-    # Stocker l'info de retry pour le prochain report_action_result
-    _retry_pending[retry_action_id] = {
-        "action": action,
-        "retry_count": next_retry,
-        "replay_id": replay_state["replay_id"],
-        "reason": reason,
-    }
-
-    # Stratégie de retry selon le numéro
-    actions_to_insert = []
-
-    if next_retry == 2:
-        # Retry 2 : injecter un wait de 2s avant l'action
-        wait_action = {
-            "action_id": f"wait_retry_{uuid.uuid4().hex[:6]}",
-            "type": "wait",
-            "duration_ms": 2000,
-        }
-        actions_to_insert.append(wait_action)
-
-    actions_to_insert.append(retry_action)
-
-    # Insérer en tête de la queue (prochaine action à exécuter)
-    queue = _replay_queues.get(session_id, [])
-    _replay_queues[session_id] = actions_to_insert + queue
-
-    logger.info(
-        f"Retry {next_retry}/{MAX_RETRIES_PER_ACTION} programmé pour {action.get('action_id')} "
-        f"(raison: {reason}) | nouveau id: {retry_action_id}"
-    )
-
-
-def _notify_error_callback(
-    replay_state: Dict[str, Any],
-    action_id: str,
-    error: Optional[str],
-):
-    """
-    Notifier le callback d'erreur si configuré pour ce replay.
-
-    Appel HTTP POST non-bloquant vers l'URL de callback.
-    En cas d'échec de notification, on log mais on ne bloque pas.
-    """
-    replay_id = replay_state["replay_id"]
-    callback_url = _error_callbacks.get(replay_id)
-    if not callback_url:
-        return
-
-    def _send_callback():
-        try:
-            import urllib.request
-            payload = json.dumps({
-                "replay_id": replay_id,
-                "workflow_id": replay_state.get("workflow_id"),
-                "session_id": replay_state.get("session_id"),
-                "action_id": action_id,
-                "error": error or "Erreur inconnue",
-                "retried_actions": replay_state.get("retried_actions", 0),
-                "error_log": replay_state.get("error_log", []),
-                "status": replay_state.get("status"),
-            }).encode("utf-8")
-
-            req = urllib.request.Request(
-                callback_url,
-                data=payload,
-                headers={"Content-Type": "application/json"},
-                method="POST",
-            )
-            with urllib.request.urlopen(req, timeout=5) as resp:
-                logger.info(
-                    f"Error callback envoyé à {callback_url}: {resp.status}"
-                )
-        except Exception as e:
-            logger.warning(
-                f"Échec envoi error callback à {callback_url}: {e}"
-            )
-
-    # Envoyer en arrière-plan pour ne pas bloquer
-    threading.Thread(target=_send_callback, daemon=True).start()
 
 
 @app.post("/api/v1/traces/stream/replay/error_callback")
@@ -3876,20 +2699,27 @@ async def resume_replay(replay_id: str):
 
 
 # =========================================================================
-# Visual Replay — Résolution visuelle des cibles
+# Visual Replay — Résolution visuelle des cibles (module resolve_engine)
 # =========================================================================
-
-
-class ResolveTargetRequest(BaseModel):
-    """Requête de résolution visuelle d'une cible."""
-    session_id: str
-    screenshot_b64: str  # Screenshot JPEG en base64
-    target_spec: Dict[str, Any]  # {by_role, by_text, by_position, ...}
-    fallback_x_pct: float = 0.0  # Coordonnées de fallback
-    fallback_y_pct: float = 0.0
-    screen_width: int = 1920
-    screen_height: int = 1080
-    strict_mode: bool = False  # True pour replay sessions (seuil template 0.90 + YOLO)
+from .resolve_engine import (
+    ResolveTargetRequest,
+    PreAnalyzeRequest,
+    _resolve_by_template_matching,
+    _validate_match_context,
+    _get_omniparser,
+    _resolve_by_yolo,
+    _get_vlm_client,
+    _build_target_description,
+    _vlm_quick_find,
+    _resolve_by_grounding,
+    _get_som_engine_api,
+    _resolve_by_som,
+    _resolve_target_sync,
+    _fuzzy_match,
+    _fallback_response,
+    _pre_analyze_screen_sync,
+    _locate_popup_button,
+)
 
 
 @app.post("/api/v1/traces/stream/replay/resolve_target")
@@ -3941,6 +2771,7 @@ async def resolve_target(request: ResolveTargetRequest):
             request.fallback_x_pct,
             request.fallback_y_pct,
             request.strict_mode,
+            processor,
         )
         return result
     except Exception as e:
@@ -3954,20 +2785,6 @@ async def resolve_target(request: ResolveTargetRequest):
             pass
 
 
-# =========================================================================
-# Observer — Pré-analyse écran avant résolution
-# =========================================================================
-
-
-class PreAnalyzeRequest(BaseModel):
-    """Requête de pré-analyse écran (Observer)."""
-    screenshot_b64: str
-    expected_state: str = ""       # Description attendue de l'état écran
-    window_title: str = ""         # Titre fenêtre attendu
-    screen_width: int = 1920
-    screen_height: int = 1080
-
-
 @app.post("/api/v1/traces/stream/replay/pre_analyze")
 async def pre_analyze_screen(request: PreAnalyzeRequest):
     """Observer : analyser l'écran AVANT la résolution de cible.
@@ -4006,1898 +2823,6 @@ async def pre_analyze_screen(request: PreAnalyzeRequest):
     return result
 
 
-def _pre_analyze_screen_sync(
-    screenshot_b64: str,
-    expected_state: str,
-    window_title: str,
-    screen_width: int,
-    screen_height: int,
-) -> Dict[str, Any]:
-    """Pré-analyse synchrone de l'écran via VLM.
-
-    Utilise gemma4 (Docker port 11435) pour détecter :
-    1. Popups/dialogues modaux (avec coordonnées du bouton à cliquer)
-    2. États incohérents avec l'attendu
-
-    Rapide (~2-5s) car gemma4 est léger et en mode texte+image.
-    """
-    import os
-    import time
-    import requests as _requests
-
-    gemma4_port = os.environ.get("GEMMA4_PORT", "11435")
-    gemma4_url = f"http://localhost:{gemma4_port}/api/chat"
-
-    # Charger le contexte métier pour l'Observer
-    from .domain_context import get_domain_context
-    domain = get_domain_context(os.environ.get("RPA_DOMAIN", "generic"))
-
-    # Prompt concis pour détection popup
-    prompt = (
-        "Regarde cette capture d'écran.\n"
-        "Y a-t-il une popup, boîte de dialogue, message d'erreur, ou fenêtre modale visible ?\n\n"
-        "Réponds EXACTEMENT dans ce format :\n"
-        "ÉTAT: OK ou POPUP ou INATTENDU\n"
-        "BOUTON: texte du bouton à cliquer (si POPUP, sinon 'aucun')\n"
-        "DÉTAIL: description courte (1 ligne)"
-    )
-
-    # Messages avec contexte métier
-    messages = []
-    if domain.system_prompt:
-        messages.append({"role": "system", "content": domain.system_prompt})
-    messages.append({"role": "user", "content": prompt, "images": [screenshot_b64]})
-
-    try:
-        t_start = time.time()
-        resp = _requests.post(
-            gemma4_url,
-            json={
-                "model": "gemma4:e4b",
-                "messages": messages,
-                "stream": False,
-                "think": True,
-                "options": {"temperature": 0.1, "num_predict": 800},
-            },
-            timeout=30,
-        )
-        elapsed_ms = (time.time() - t_start) * 1000
-
-        if not resp.ok:
-            logger.warning(f"Observer VLM HTTP {resp.status_code}")
-            return {"screen_state": "ok", "detail": f"VLM HTTP {resp.status_code}"}
-
-        content = resp.json().get("message", {}).get("content", "").strip()
-        logger.info(f"Observer VLM ({elapsed_ms:.0f}ms) : {content[:100]}")
-
-        # Parser la réponse
-        state = "ok"
-        button = ""
-        detail = content
-
-        for line in content.split("\n"):
-            line_clean = line.strip()
-            upper = line_clean.upper()
-            if upper.startswith("ÉTAT:") or upper.startswith("ETAT:"):
-                val = upper.split(":", 1)[1].strip()
-                if "POPUP" in val:
-                    state = "popup"
-                elif "INATTENDU" in val or "UNEXPECTED" in val:
-                    state = "unexpected"
-                else:
-                    state = "ok"
-            elif upper.startswith("BOUTON:"):
-                button = line_clean.split(":", 1)[1].strip().strip("'\"")
-                if button.lower() in ("aucun", "none", "n/a", ""):
-                    button = ""
-            elif upper.startswith("DÉTAIL:") or upper.startswith("DETAIL:"):
-                detail = line_clean.split(":", 1)[1].strip()
-
-        if state == "ok":
-            return {"screen_state": "ok"}
-
-        result = {
-            "screen_state": state,
-            "detail": detail,
-            "elapsed_ms": round(elapsed_ms, 1),
-        }
-
-        # Si popup détectée avec un texte de bouton, essayer de le localiser
-        if state == "popup" and button:
-            result["popup_label"] = button
-            # Localiser le bouton par grounding VLM (qwen2.5vl)
-            coords = _locate_popup_button(screenshot_b64, button, screen_width, screen_height)
-            if coords:
-                result["popup_coords"] = coords
-
-        return result
-
-    except _requests.Timeout:
-        logger.debug("Observer VLM timeout (15s)")
-        return {"screen_state": "ok", "detail": "VLM timeout"}
-    except Exception as e:
-        logger.debug(f"Observer VLM erreur : {e}")
-        return {"screen_state": "ok", "detail": str(e)}
-
-
-def _locate_popup_button(
-    screenshot_b64: str, button_text: str,
-    screen_width: int, screen_height: int,
-) -> Optional[Dict[str, float]]:
-    """Localiser un bouton de popup par grounding VLM (qwen2.5vl).
-
-    Utilise le format bbox_2d natif de qwen2.5vl pour trouver
-    la position exacte du bouton sur le screenshot.
-    """
-    import requests as _requests
-    import re
-
-    ollama_url = "http://localhost:11434/api/chat"
-    prompt = f"Detect the button with text '{button_text}' with a bounding box."
-
-    try:
-        resp = _requests.post(
-            ollama_url,
-            json={
-                "model": "qwen2.5vl:7b",
-                "messages": [{"role": "user", "content": prompt, "images": [screenshot_b64]}],
-                "stream": False,
-                "options": {"temperature": 0.1, "num_predict": 50},
-            },
-            timeout=15,
-        )
-        if not resp.ok:
-            return None
-
-        content = resp.json().get("message", {}).get("content", "")
-
-        # Parser bbox_2d — qwen2.5vl retourne des coordonnées en pixels
-        # relatifs à l'image envoyée, PAS sur une grille 1000x1000.
-        # Format JSON : [{"bbox_2d": [x1, y1, x2, y2], "label": "..."}]
-        bbox_match = re.search(
-            r'"bbox_2d"\s*:\s*\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]',
-            content,
-        )
-        if bbox_match:
-            x1, y1, x2, y2 = [int(bbox_match.group(i)) for i in range(1, 5)]
-            # Normaliser par les dimensions de l'écran (pixels → 0-1)
-            cx = (x1 + x2) / 2 / screen_width
-            cy = (y1 + y2) / 2 / screen_height
-            if 0.0 <= cx <= 1.0 and 0.0 <= cy <= 1.0:
-                logger.info(f"Observer : bouton '{button_text}' localisé à ({cx:.3f}, {cy:.3f})")
-                return {"x_pct": cx, "y_pct": cy}
-
-    except Exception as e:
-        logger.debug(f"Observer grounding bouton erreur : {e}")
-
-    return None
-
-
-def _resolve_by_template_matching(
-    screenshot_path: str,
-    anchor_image_b64: str,
-    screen_width: int,
-    screen_height: int,
-    confidence_threshold: float = 0.7,
-) -> Optional[Dict[str, Any]]:
-    """Résoudre la position d'une ancre par template matching OpenCV.
-
-    Compare l'image de l'ancre (crop) avec le screenshot actuel pour trouver
-    la meilleure correspondance. Utilise cv2.matchTemplate avec TM_CCOEFF_NORMED.
-
-    Args:
-        screenshot_path: Chemin du screenshot de l'écran actuel
-        anchor_image_b64: Image de l'ancre encodée en base64 (PNG)
-        screen_width: Largeur de l'écran en pixels
-        screen_height: Hauteur de l'écran en pixels
-        confidence_threshold: Seuil minimum de confiance (0.0 à 1.0)
-
-    Returns:
-        Dict avec resolved=True et coordonnées, ou None si pas de match
-    """
-    import base64
-    import io
-
-    try:
-        import cv2
-        import numpy as np
-    except ImportError:
-        logger.warning("OpenCV non disponible pour template matching")
-        return None
-
-    try:
-        # Charger le screenshot
-        screenshot = cv2.imread(screenshot_path)
-        if screenshot is None:
-            logger.warning("Impossible de lire le screenshot : %s", screenshot_path)
-            return None
-
-        # Décoder l'image de l'ancre depuis base64
-        anchor_bytes = base64.b64decode(anchor_image_b64)
-        anchor_array = np.frombuffer(anchor_bytes, dtype=np.uint8)
-        anchor_img = cv2.imdecode(anchor_array, cv2.IMREAD_COLOR)
-        if anchor_img is None:
-            logger.warning("Impossible de décoder l'image de l'ancre")
-            return None
-
-        # Convertir en niveaux de gris pour le matching
-        screenshot_gray = cv2.cvtColor(screenshot, cv2.COLOR_BGR2GRAY)
-        anchor_gray = cv2.cvtColor(anchor_img, cv2.COLOR_BGR2GRAY)
-
-        # Vérifier que l'ancre n'est pas plus grande que le screenshot
-        sh, sw = screenshot_gray.shape[:2]
-        ah, aw = anchor_gray.shape[:2]
-        if ah > sh or aw > sw:
-            logger.warning(
-                "Ancre (%dx%d) plus grande que le screenshot (%dx%d)",
-                aw, ah, sw, sh,
-            )
-            return None
-
-        # Template matching multi-échelle : essayer l'échelle 1.0 d'abord,
-        # puis quelques variations si la résolution a changé.
-        # Plage étendue 0.5x-2.0x pour couvrir les écarts importants
-        # (ex: apprentissage 2560x1600 → replay 1280x720 = ratio ~0.5x)
-        best_val = -1.0
-        best_loc = None
-        best_scale = 1.0
-        best_anchor_size = (aw, ah)
-
-        for scale in [1.0, 0.9, 1.1, 0.8, 1.2, 0.75, 1.25, 0.6, 1.5, 0.5, 1.75, 2.0]:
-            if scale != 1.0:
-                new_w = int(aw * scale)
-                new_h = int(ah * scale)
-                if new_w < 10 or new_h < 10 or new_w > sw or new_h > sh:
-                    continue
-                scaled_anchor = cv2.resize(anchor_gray, (new_w, new_h))
-            else:
-                scaled_anchor = anchor_gray
-                new_w, new_h = aw, ah
-
-            result = cv2.matchTemplate(screenshot_gray, scaled_anchor, cv2.TM_CCOEFF_NORMED)
-            _, max_val, _, max_loc = cv2.minMaxLoc(result)
-
-            if max_val > best_val:
-                best_val = max_val
-                best_loc = max_loc
-                best_scale = scale
-                best_anchor_size = (new_w, new_h)
-
-            # Si on a un très bon match, pas besoin de continuer
-            if best_val >= 0.95:
-                break
-
-        if best_val < confidence_threshold:
-            logger.info(
-                "Template matching : meilleur score=%.3f < seuil=%.3f (ancre %dx%d, écran %dx%d)",
-                best_val, confidence_threshold, aw, ah, sw, sh,
-            )
-            return None
-
-        # Calculer le centre du match
-        match_w, match_h = best_anchor_size
-        cx = best_loc[0] + match_w / 2.0
-        cy = best_loc[1] + match_h / 2.0
-
-        # Convertir en proportions normalisées
-        x_pct = round(cx / sw, 6) if sw > 0 else 0.0
-        y_pct = round(cy / sh, 6) if sh > 0 else 0.0
-
-        logger.info(
-            "Template matching OK : score=%.3f, échelle=%.2f, "
-            "centre=(%d, %d) → (%.4f, %.4f) sur %dx%d",
-            best_val, best_scale, int(cx), int(cy), x_pct, y_pct, sw, sh,
-        )
-
-        return {
-            "resolved": True,
-            "method": "template_matching",
-            "x_pct": x_pct,
-            "y_pct": y_pct,
-            "matched_element": {
-                "label": f"anchor_template",
-                "type": "visual_anchor",
-                "role": "anchor",
-                "center": [int(cx), int(cy)],
-                "confidence": best_val,
-            },
-            "score": best_val,
-            "scale": best_scale,
-            "match_box": {
-                "x": best_loc[0],
-                "y": best_loc[1],
-                "width": match_w,
-                "height": match_h,
-            },
-        }
-
-    except Exception as e:
-        logger.error("Erreur template matching : %s", e)
-        return None
-
-
-def _validate_match_context(
-    result: Dict[str, Any],
-    original_x_pct: float,
-    original_y_pct: float,
-    target_spec: Dict[str, Any],
-    max_distance: float = 0.35,
-) -> bool:
-    """Vérifier que la position trouvée est dans la même zone que l'originale.
-
-    Évite les faux positifs du template matching : un bouton similaire visuellement
-    mais situé dans une zone très différente de l'écran.
-
-    Args:
-        result: Résultat du template matching (contient x_pct, y_pct).
-        original_x_pct: Position X originale (pourcentage, 0.0-1.0).
-        original_y_pct: Position Y originale (pourcentage, 0.0-1.0).
-        target_spec: Spécification de la cible (non utilisé pour l'instant,
-            mais disponible pour des règles contextuelles futures).
-        max_distance: Distance euclidienne maximum acceptée (en pourcentage de l'écran).
-            Défaut 0.35 = ~35% de la diagonale, assez permissif pour les UI dynamiques.
-
-    Returns:
-        True si la position est valide (même zone), False sinon.
-    """
-    found_x = result.get("x_pct", 0.0)
-    found_y = result.get("y_pct", 0.0)
-
-    # Distance euclidienne en pourcentage de l'écran
-    dx = found_x - original_x_pct
-    dy = found_y - original_y_pct
-    distance = (dx ** 2 + dy ** 2) ** 0.5
-
-    if distance > max_distance:
-        logger.debug(
-            "Context validation : distance=%.3f > max=%.3f "
-            "(found=(%.3f, %.3f), original=(%.3f, %.3f))",
-            distance, max_distance, found_x, found_y, original_x_pct, original_y_pct,
-        )
-        return False
-
-    return True
-
-
-# =========================================================================
-# YOLO/OmniParser — Résolution par détection d'éléments UI
-# =========================================================================
-
-# Chargement paresseux d'OmniParser (singleton, GPU)
-_omniparser_available: Optional[bool] = None  # None = pas encore vérifié
-_omniparser_instance = None
-_omniparser_lock = threading.Lock()
-
-
-def _get_omniparser():
-    """Obtenir l'instance OmniParser (lazy loading, thread-safe).
-
-    Returns:
-        OmniParserAdapter ou None si non disponible.
-    """
-    global _omniparser_available, _omniparser_instance
-    if _omniparser_available is False:
-        return None
-    if _omniparser_instance is not None:
-        return _omniparser_instance
-
-    with _omniparser_lock:
-        if _omniparser_available is False:
-            return None
-        if _omniparser_instance is not None:
-            return _omniparser_instance
-        try:
-            from core.detection.omniparser_adapter import OmniParserAdapter
-            adapter = OmniParserAdapter()
-            if adapter.available:
-                _omniparser_instance = adapter
-                _omniparser_available = True
-                logger.info("OmniParser disponible pour la résolution YOLO")
-                return adapter
-            else:
-                _omniparser_available = False
-                logger.info("OmniParser : modèles non trouvés, YOLO désactivé")
-                return None
-        except ImportError:
-            _omniparser_available = False
-            logger.info("OmniParser non installé, YOLO désactivé")
-            return None
-        except Exception as e:
-            _omniparser_available = False
-            logger.warning("OmniParser init échouée : %s", e)
-            return None
-
-
-def _resolve_by_yolo(
-    screenshot_path: str,
-    anchor_image_b64: str,
-    screen_width: int,
-    screen_height: int,
-    target_spec: Dict[str, Any],
-) -> Optional[Dict[str, Any]]:
-    """Résolution via YOLO/OmniParser : détecte tous les éléments UI
-    puis matche le crop de référence contre les éléments détectés.
-
-    Stratégie :
-    1. OmniParser détecte tous les éléments UI du screenshot (~0.6-0.8s)
-    2. Pour chaque élément détecté, template matching local contre l'anchor
-    3. Si 1 seul bon match (score >= 0.50) → accepter
-    4. Si 2+ matchs ambigus → retourner None (le VLM tranchera)
-
-    Args:
-        screenshot_path: Chemin vers le screenshot JPEG
-        anchor_image_b64: Image de l'anchor encodée en base64
-        screen_width: Largeur de l'écran
-        screen_height: Hauteur de l'écran
-        target_spec: Spécification de la cible
-
-    Returns:
-        Dict avec resolved=True/False, x_pct, y_pct, score
-        ou None si OmniParser pas disponible ou aucun match
-    """
-    import base64
-
-    try:
-        import cv2
-        import numpy as np
-    except ImportError:
-        return None
-
-    omniparser = _get_omniparser()
-    if omniparser is None:
-        return None
-
-    t0 = time.time()
-
-    try:
-        from PIL import Image as PILImage
-
-        # Charger le screenshot en PIL
-        screenshot_pil = PILImage.open(screenshot_path)
-        sw, sh = screenshot_pil.size
-
-        # Charger le screenshot en numpy/OpenCV pour le template matching
-        screenshot_np = np.array(screenshot_pil)
-        if len(screenshot_np.shape) == 3 and screenshot_np.shape[2] == 3:
-            # PIL est RGB, convertir en BGR pour OpenCV
-            screenshot_bgr = cv2.cvtColor(screenshot_np, cv2.COLOR_RGB2BGR)
-        else:
-            screenshot_bgr = screenshot_np
-        screenshot_gray = cv2.cvtColor(screenshot_bgr, cv2.COLOR_BGR2GRAY)
-
-        # Décoder l'anchor depuis base64
-        anchor_bytes = base64.b64decode(anchor_image_b64)
-        anchor_array = np.frombuffer(anchor_bytes, dtype=np.uint8)
-        anchor_img = cv2.imdecode(anchor_array, cv2.IMREAD_COLOR)
-        if anchor_img is None:
-            logger.warning("YOLO resolve : impossible de décoder l'anchor")
-            return None
-        anchor_gray = cv2.cvtColor(anchor_img, cv2.COLOR_BGR2GRAY)
-        anchor_h, anchor_w = anchor_gray.shape[:2]
-
-        # Détecter tous les éléments UI avec OmniParser
-        elements = omniparser.detect(screenshot_pil)
-        if not elements:
-            elapsed = time.time() - t0
-            logger.info("YOLO resolve : 0 éléments détectés (%.1fs)", elapsed)
-            return None
-
-        logger.info(
-            "YOLO resolve : %d éléments détectés, matching anchor %dx%d...",
-            len(elements), anchor_w, anchor_h,
-        )
-
-        # Matcher l'anchor contre chaque élément détecté
-        YOLO_MATCH_THRESHOLD = 0.50
-        matches = []
-
-        for elem in elements:
-            x1, y1, x2, y2 = elem.bbox
-            elem_w = x2 - x1
-            elem_h = y2 - y1
-
-            # Ignorer les éléments trop petits
-            if elem_w < 5 or elem_h < 5:
-                continue
-
-            # Extraire le crop de l'élément depuis le screenshot
-            elem_crop = screenshot_gray[y1:y2, x1:x2]
-            if elem_crop.size == 0:
-                continue
-
-            # Template matching local : resize anchor pour matcher la taille de l'élément
-            # ou inversement, selon les dimensions relatives
-            try:
-                # Approche : resize l'anchor à la taille du crop et comparer
-                if elem_w > 0 and elem_h > 0:
-                    anchor_resized = cv2.resize(anchor_gray, (elem_w, elem_h))
-                    result = cv2.matchTemplate(
-                        elem_crop, anchor_resized, cv2.TM_CCOEFF_NORMED
-                    )
-                    _, max_val, _, _ = cv2.minMaxLoc(result)
-                else:
-                    continue
-
-                # Aussi essayer le crop à la taille de l'anchor si c'est plus grand
-                if elem_w >= anchor_w and elem_h >= anchor_h:
-                    result2 = cv2.matchTemplate(
-                        elem_crop, anchor_gray, cv2.TM_CCOEFF_NORMED
-                    )
-                    _, max_val2, _, _ = cv2.minMaxLoc(result2)
-                    max_val = max(max_val, max_val2)
-
-                if max_val >= YOLO_MATCH_THRESHOLD:
-                    matches.append((elem, max_val))
-
-            except cv2.error:
-                continue
-
-        elapsed = time.time() - t0
-
-        if not matches:
-            logger.info(
-                "YOLO resolve : aucun match >= %.2f parmi %d éléments (%.1fs)",
-                YOLO_MATCH_THRESHOLD, len(elements), elapsed,
-            )
-            return None
-
-        # Trier par score décroissant
-        matches.sort(key=lambda m: m[1], reverse=True)
-        best_elem, best_score = matches[0]
-
-        # Si 2+ matchs avec des scores proches (< 0.10 d'écart), c'est ambigu
-        # → laisser le VLM trancher
-        if len(matches) >= 2:
-            second_score = matches[1][1]
-            if best_score - second_score < 0.10:
-                logger.info(
-                    "YOLO resolve : %d matchs ambigus (best=%.3f, second=%.3f, "
-                    "écart=%.3f < 0.10), VLM requis (%.1fs)",
-                    len(matches), best_score, second_score,
-                    best_score - second_score, elapsed,
-                )
-                return None
-
-        # 1 seul match clair → accepter
-        cx, cy = best_elem.center
-        x_pct = round(cx / sw, 6) if sw > 0 else 0.0
-        y_pct = round(cy / sh, 6) if sh > 0 else 0.0
-
-        logger.info(
-            "YOLO resolve OK : '%s' (%s) score=%.3f → (%.4f, %.4f) "
-            "parmi %d éléments, %d matchs (%.1fs)",
-            best_elem.label, best_elem.element_type, best_score,
-            x_pct, y_pct, len(elements), len(matches), elapsed,
-        )
-
-        return {
-            "resolved": True,
-            "method": "yolo_omniparser",
-            "x_pct": x_pct,
-            "y_pct": y_pct,
-            "matched_element": {
-                "label": best_elem.label,
-                "type": best_elem.element_type,
-                "role": "yolo_detected",
-                "center": [cx, cy],
-                "confidence": best_score,
-            },
-            "score": best_score,
-            "yolo_elements_count": len(elements),
-            "yolo_matches_count": len(matches),
-        }
-
-    except Exception as e:
-        elapsed = time.time() - t0
-        logger.warning("YOLO resolve : exception (%.1fs) — %s", elapsed, e)
-        return None
-
-
-# =========================================================================
-# VLM Quick Find — Fallback léger quand le template matching échoue
-# =========================================================================
-
-# Client Ollama singleton (initialisé au premier appel, pas au démarrage)
-_vlm_client = None
-_vlm_client_lock = threading.Lock()
-
-# Timeout dédié pour le VLM Quick Find (plus court que le timeout par défaut)
-_VLM_QUICK_FIND_TIMEOUT = 30  # secondes
-
-
-def _get_vlm_client():
-    """Obtenir ou créer le client Ollama singleton pour le VLM Quick Find.
-
-    Initialisation paresseuse : le client n'est créé qu'au premier appel,
-    pas au démarrage du serveur (évite de bloquer si Ollama est down).
-    Le modèle est résolu automatiquement via vlm_config (RPA_VLM_MODEL).
-    """
-    global _vlm_client
-    if _vlm_client is not None:
-        return _vlm_client
-    with _vlm_client_lock:
-        if _vlm_client is not None:
-            return _vlm_client
-        try:
-            from core.detection.ollama_client import OllamaClient
-            from core.detection.vlm_config import get_vlm_model
-            _model = get_vlm_model()
-            _vlm_client = OllamaClient(
-                endpoint="http://localhost:11434",
-                model=_model,
-                timeout=_VLM_QUICK_FIND_TIMEOUT,
-            )
-            logger.info("VLM Quick Find : client Ollama initialisé (%s)", _model)
-        except Exception as e:
-            logger.warning(f"VLM Quick Find : impossible d'initialiser le client Ollama : {e}")
-            return None
-    return _vlm_client
-
-
-def _build_target_description(target_spec: Dict[str, Any]) -> str:
-    """Construire une description textuelle de l'élément à trouver.
-
-    Utilisé par le VLM Quick Find pour savoir quoi chercher sur le screenshot.
-
-    Args:
-        target_spec: Spécification de la cible (by_text, by_role, etc.)
-
-    Returns:
-        Description en langage naturel, ex: "un bouton contenant 'Valider'"
-    """
-    by_text = target_spec.get("by_text", "").strip()
-    by_role = target_spec.get("by_role", "").strip()
-
-    if by_text and by_role:
-        return f"un {by_role} contenant '{by_text}'"
-    elif by_text:
-        return f"élément contenant le texte '{by_text}'"
-    elif by_role:
-        return f"un {by_role}"
-    else:
-        return "l'élément interactif principal"
-
-
-def _vlm_quick_find(
-    screenshot_path: str,
-    target_description: str,
-    anchor_image_b64: Optional[str] = None,
-) -> Optional[Dict[str, Any]]:
-    """Demander au VLM de localiser un élément sur le screenshot.
-
-    Stratégie VLM-first pour le replay : le VLM comprend le contexte
-    de l'écran et peut trouver un élément même si l'apparence a changé.
-
-    Modes de fonctionnement :
-    - Avec anchor_image_b64 + description : multi-image (screenshot + crop de référence).
-      Le VLM voit le screenshot ET le crop, ce qui est beaucoup plus précis.
-    - Avec description seule : single-image, le VLM cherche par la description textuelle.
-    - Avec anchor_image_b64 seule (pas de description) : multi-image avec prompt visuel pur.
-
-    Args:
-        screenshot_path: Chemin du screenshot actuel
-        target_description: Description riche de l'élément à trouver.
-            Ex: "Dans la fenêtre 'Exécuter', l'élément cliqué en bas au centre"
-        anchor_image_b64: Image de référence (crop) en base64 (optionnel).
-            Si fourni, envoyé comme seconde image au VLM pour comparaison visuelle.
-
-    Returns:
-        {"x_pct": float, "y_pct": float, "confidence": float, "method": "vlm_quick_find"}
-        ou None si l'élément n'est pas trouvé ou en cas d'erreur
-    """
-    client = _get_vlm_client()
-    if client is None:
-        logger.debug("VLM Quick Find : client Ollama non disponible, skip")
-        return None
-
-    t0 = time.time()
-
-    # Construire le prompt adapté selon les informations disponibles
-    has_anchor = bool(anchor_image_b64)
-    has_description = bool(target_description and target_description.strip())
-
-    if has_anchor and has_description:
-        # Mode optimal : screenshot + crop de référence + description textuelle
-        prompt = (
-            "The first image is the current screen. "
-            "The second image shows the element I want to click.\n\n"
-            f"Context: {target_description}\n\n"
-            "Find this exact element on the screen and return its CENTER coordinates "
-            "as percentage of the screen dimensions.\n"
-            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
-            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
-        )
-    elif has_anchor:
-        # Mode visuel pur : screenshot + crop, pas de description
-        prompt = (
-            "The first image is the current screen. "
-            "The second image shows the element I want to click.\n\n"
-            "Find this exact element on the screen and return its CENTER coordinates "
-            "as percentage of the screen dimensions.\n"
-            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
-            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
-        )
-    else:
-        # Mode description seule
-        prompt = (
-            "Look at this screenshot carefully.\n\n"
-            f"{target_description}\n\n"
-            "Find this element and return its CENTER coordinates "
-            "as percentage of the image dimensions.\n"
-            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
-            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
-        )
-
-    system_prompt = "You are a UI element locator. Output raw JSON only. No explanation."
-
-    try:
-        # Préparer les images supplémentaires (anchor crop)
-        extra_images = [anchor_image_b64] if has_anchor else None
-
-        result = client.generate(
-            prompt=prompt,
-            image_path=screenshot_path,
-            system_prompt=system_prompt,
-            temperature=0.1,
-            max_tokens=200,
-            force_json=False,
-            extra_images_b64=extra_images,
-        )
-
-        elapsed = time.time() - t0
-
-        if not result.get("success"):
-            logger.info(
-                "VLM Quick Find : échec appel VLM (%.1fs) — %s",
-                elapsed, result.get("error", "?"),
-            )
-            return None
-
-        response_text = result.get("response", "").strip()
-        if not response_text:
-            logger.info("VLM Quick Find : réponse vide du VLM (%.1fs)", elapsed)
-            return None
-
-        # Parser la réponse JSON (réutiliser le parser robuste d'OllamaClient)
-        parsed = client._extract_json_from_response(response_text)
-        if parsed is None:
-            logger.info(
-                "VLM Quick Find : réponse non-JSON (%.1fs) — %.80s",
-                elapsed, response_text,
-            )
-            return None
-
-        # Valider les coordonnées
-        x_pct = parsed.get("x_pct")
-        y_pct = parsed.get("y_pct")
-        confidence = float(parsed.get("confidence", 0.0))
-
-        if x_pct is None or y_pct is None or confidence < 0.3:
-            logger.info(
-                "VLM Quick Find : élément non trouvé ou confiance trop basse "
-                "(%.1fs, confidence=%.2f) pour '%s'",
-                elapsed, confidence,
-                target_description[:80] if target_description else "(anchor only)",
-            )
-            return None
-
-        x_pct = float(x_pct)
-        y_pct = float(y_pct)
-
-        # Vérifier que les coordonnées sont dans les bornes [0, 1]
-        if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
-            logger.info(
-                "VLM Quick Find : coordonnées hors bornes (%.4f, %.4f), ignoré",
-                x_pct, y_pct,
-            )
-            return None
-
-        mode_str = "multi-image" if has_anchor else "description"
-        desc_short = (target_description[:60] + "...") if target_description and len(target_description) > 60 else (target_description or "(anchor)")
-        logger.info(
-            "VLM Quick Find OK [%s] : '%s' → (%.4f, %.4f) confidence=%.2f en %.1fs",
-            mode_str, desc_short, x_pct, y_pct, confidence, elapsed,
-        )
-
-        return {
-            "resolved": True,
-            "method": "vlm_quick_find",
-            "x_pct": round(x_pct, 6),
-            "y_pct": round(y_pct, 6),
-            "matched_element": {
-                "label": target_description or "anchor_visual",
-                "type": "vlm_located",
-                "role": "vlm_quick_find",
-                "confidence": confidence,
-            },
-            "score": confidence,
-        }
-
-    except Exception as e:
-        elapsed = time.time() - t0
-        logger.warning(
-            "VLM Quick Find : exception (%.1fs) — %s", elapsed, e,
-        )
-        return None
-
-
-# ---------------------------------------------------------------------------
-# Résolution par VLM Grounding Direct (configurable via RPA_VLM_MODEL)
-# ---------------------------------------------------------------------------
-
-
-def _resolve_by_grounding(
-    screenshot_path: str,
-    target_spec: Dict[str, Any],
-    screen_width: int,
-    screen_height: int,
-) -> Optional[Dict[str, Any]]:
-    """Résoudre une cible via grounding VLM direct.
-
-    Le modèle VLM (gemma4:e4b par défaut, configurable via RPA_VLM_MODEL)
-    reçoit le screenshot + une description textuelle et retourne
-    directement les coordonnées de l'élément. Pas de SomEngine,
-    pas de numérotation — le VLM fait du grounding UI natif.
-
-    Approche plus fiable que SomEngine+VLM pour les icônes et éléments
-    visuels sans texte (logo Windows, disquette, bouton fermer).
-    """
-    import base64
-    import io
-    import re
-
-    t0 = time.time()
-
-    # Construire la description de la cible
-    by_text = target_spec.get("by_text", "").strip()
-    vlm_desc = target_spec.get("vlm_description", "").strip()
-    window_title = target_spec.get("window_title", "").strip()
-
-    if by_text:
-        description = by_text
-    elif vlm_desc:
-        description = vlm_desc
-    else:
-        return None
-
-    # Utiliser la capture fenêtre si disponible (plus ciblée, moins de bruit)
-    # Sinon fallback sur le full screen
-    window_capture = target_spec.get("window_capture", {})
-    window_rect = window_capture.get("rect")  # [x1, y1, x2, y2] écran
-
-    try:
-        from PIL import Image as PILImage
-        from pathlib import Path
-
-        # Utiliser la fenêtre active : cropper depuis le screenshot full
-        # via window_rect (fonctionne au replay comme à l'enregistrement)
-        img = PILImage.open(screenshot_path)
-
-        if window_rect:
-            x1, y1, x2, y2 = window_rect
-            img = img.crop((x1, y1, x2, y2))
-            using_window = True
-            logger.debug("Grounding : crop fenêtre (%d,%d,%d,%d) → %dx%d", x1, y1, x2, y2, *img.size)
-        else:
-            using_window = False
-
-        orig_w, orig_h = img.size
-        small_w, small_h = orig_w, orig_h  # pas de redimensionnement
-
-        buf = io.BytesIO()
-        img.save(buf, format="JPEG", quality=80)
-        shot_b64 = base64.b64encode(buf.getvalue()).decode()
-    except Exception as e:
-        logger.warning("Grounding : erreur chargement image — %s", e)
-        return None
-
-    # Prompt natif Qwen2.5-VL — format bbox_2d (le seul fiable)
-    # Ajouter la position relative pour désambiguïser (ex: deux "Rechercher" à l'écran)
-    original_pos = target_spec.get("original_position", {})
-    pos_hint = ""
-    y_rel = original_pos.get("y_relative", "")
-    x_rel = original_pos.get("x_relative", "")
-    if y_rel or x_rel:
-        pos_hint = f" located {y_rel} {x_rel} of the screen".strip()
-    prompt = f"Detect '{description}'{pos_hint} in this image with a bounding box."
-
-    # Le grounding nécessite un modèle entraîné pour les coordonnées (bbox_2d).
-    # Qwen2.5-VL est le seul qui retourne des positions précises.
-    # gemma4 comprend les images mais ne sait pas localiser en coordonnées.
-    _grounding_model = os.environ.get("RPA_GROUNDING_MODEL", "qwen2.5vl:7b")
-
-    # Appel VLM — vLLM (GPU, rapide) en priorité, Ollama en fallback
-    import requests as _requests
-    content = ""
-
-    # Port vLLM configurable via env
-    _vllm_port = os.environ.get("VLLM_PORT", "8100")
-    _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
-
-    # Essai 1 : vLLM (API OpenAI-compatible, GPU)
-    try:
-        vllm_resp = _requests.post(
-            f"http://localhost:{_vllm_port}/v1/chat/completions",
-            json={
-                "model": _vllm_model,
-                "messages": [
-                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
-                    {"role": "user", "content": [
-                        {"type": "text", "text": prompt},
-                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
-                    ]},
-                ],
-                "temperature": 0.1,
-                "max_tokens": 80,
-            },
-            timeout=30,
-        )
-        if vllm_resp.ok:
-            content = vllm_resp.json().get("choices", [{}])[0].get("message", {}).get("content", "")
-            if content:
-                logger.debug("Grounding via vLLM OK")
-    except Exception as e:
-        logger.debug("vLLM non disponible (%s), fallback Ollama", e)
-
-    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif)
-    if not content:
-        try:
-            resp = _requests.post("http://localhost:11434/api/chat", json={
-                "model": _grounding_model,
-                "messages": [
-                    {"role": "user", "content": prompt, "images": [shot_b64]},
-                ],
-                "stream": False,
-                "options": {"temperature": 0.1, "num_predict": 100},
-            }, timeout=60)
-            content = resp.json().get("message", {}).get("content", "")
-        except Exception as e:
-            logger.info("Grounding VLM timeout/erreur : %s", e)
-            return None
-
-    elapsed = time.time() - t0
-
-    # Parser la réponse — supporte bbox_2d en pixels, JSON %, arrays bruts
-    x_pct, y_pct = None, None
-
-    # Format 1 : bbox_2d en pixels [x, y] ou [x1, y1, x2, y2]
-    bbox_match = re.search(r'"bbox_2d"\s*:\s*\[([^\]]+)\]', content)
-    if bbox_match:
-        coords = [float(v.strip()) for v in bbox_match.group(1).split(",")]
-        if len(coords) == 2:
-            x_pct = coords[0] / small_w
-            y_pct = coords[1] / small_h
-        elif len(coords) >= 4:
-            x_pct = (coords[0] + coords[2]) / 2 / small_w
-            y_pct = (coords[1] + coords[3]) / 2 / small_h
-
-    # Format 2 : JSON {"x": 0.XX, "y": 0.YY}
-    if x_pct is None:
-        json_match = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content)
-        if json_match:
-            x_val, y_val = float(json_match.group(1)), float(json_match.group(2))
-            # Si > 1, c'est en pixels
-            if x_val > 1:
-                x_pct = x_val / small_w
-                y_pct = y_val / small_h
-            else:
-                x_pct = x_val
-                y_pct = y_val
-
-    # Format 3 : {"x_pct": 0.XX, "y_pct": 0.YY}
-    if x_pct is None:
-        pct_match = re.search(r'"x_pct"\s*:\s*([\d.]+).*?"y_pct"\s*:\s*([\d.]+)', content)
-        if pct_match:
-            x_pct = float(pct_match.group(1))
-            y_pct = float(pct_match.group(2))
-
-    # Format 4 : array brut [x1, y1, x2, y2] ou [x, y]
-    if x_pct is None:
-        arr_match = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content)
-        if arr_match:
-            vals = [float(v) for v in arr_match.groups() if v is not None]
-            if len(vals) >= 4:
-                x_pct = (vals[0] + vals[2]) / 2 / small_w
-                y_pct = (vals[1] + vals[3]) / 2 / small_h
-            elif len(vals) == 2:
-                x_pct = vals[0] / small_w
-                y_pct = vals[1] / small_h
-
-    if x_pct is None or y_pct is None:
-        # Fallback multi-image : screenshot + crop → grounding sans description
-        anchor_b64 = target_spec.get("anchor_image_base64", "")
-        if anchor_b64:
-            try:
-                prompt_mi = (
-                    "Image 1 is a screenshot. Image 2 shows a UI element.\n"
-                    "Find where Image 2 appears on Image 1.\n"
-                    'Return position: {"x": NNN, "y": NNN} in pixels of Image 1.'
-                )
-                resp2 = _requests.post("http://localhost:11434/api/chat", json={
-                    "model": _grounding_model,
-                    "messages": [
-                        {"role": "user", "content": prompt_mi, "images": [shot_b64, anchor_b64]},
-                    ],
-                    "stream": False,
-                    "options": {"temperature": 0.1, "num_predict": 50},
-                }, timeout=60)
-                content2 = resp2.json().get("message", {}).get("content", "")
-                elapsed = time.time() - t0
-
-                # Parser tous les formats
-                arr2 = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content2)
-                if arr2:
-                    vals = [float(v) for v in arr2.groups() if v is not None]
-                    if len(vals) >= 4:
-                        x_pct = (vals[0] + vals[2]) / 2 / small_w
-                        y_pct = (vals[1] + vals[3]) / 2 / small_h
-                    elif len(vals) == 2:
-                        x_pct = vals[0] / small_w
-                        y_pct = vals[1] / small_h
-                if x_pct is None:
-                    json2 = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content2)
-                    if json2:
-                        x_pct = float(json2.group(1)) / small_w
-                        y_pct = float(json2.group(2)) / small_h
-                if x_pct is not None:
-                    logger.info("Grounding multi-image OK (%.1fs)", elapsed)
-            except Exception as e:
-                logger.debug("Grounding multi-image erreur: %s", e)
-
-    if x_pct is None or y_pct is None:
-        logger.info(
-            "Grounding : réponse non parsable (%.1fs) — %s",
-            elapsed, content[:120],
-        )
-        return None
-
-    # Valider les bornes
-    if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
-        logger.info("Grounding : coordonnées hors bornes (%.3f, %.3f)", x_pct, y_pct)
-        return None
-
-    # Convertir coordonnées fenêtre → coordonnées écran
-    if using_window and window_rect:
-        win_x1, win_y1, win_x2, win_y2 = window_rect
-        win_w = win_x2 - win_x1
-        win_h = win_y2 - win_y1
-        # x_pct/y_pct sont relatifs à la fenêtre, convertir en relatif à l'écran
-        abs_x = win_x1 + x_pct * win_w
-        abs_y = win_y1 + y_pct * win_h
-        x_pct = abs_x / screen_width
-        y_pct = abs_y / screen_height
-        logger.info(
-            "Grounding OK [%s/window] : '%s' → (%.4f, %.4f) en %.1fs",
-            _grounding_model, description[:50], x_pct, y_pct, elapsed,
-        )
-    else:
-        logger.info(
-            "Grounding OK [%s/full] : '%s' → (%.4f, %.4f) en %.1fs",
-            _grounding_model, description[:50], x_pct, y_pct, elapsed,
-        )
-
-    return {
-        "resolved": True,
-        "method": "grounding_vlm",
-        "x_pct": round(x_pct, 6),
-        "y_pct": round(y_pct, 6),
-        "matched_element": {
-            "label": description[:60],
-            "type": "grounding",
-            "role": "grounding_vlm",
-            "confidence": 0.85,
-        },
-        "score": 0.85,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Résolution Set-of-Mark : SomEngine (détection) + VLM (identification)
-# ---------------------------------------------------------------------------
-
-def _get_som_engine_api():
-    """Singleton SomEngine partagé."""
-    try:
-        from core.detection.som_engine import get_shared_engine
-        return get_shared_engine()
-    except ImportError:
-        return None
-
-
-def _resolve_by_som(
-    screenshot_path: str,
-    target_spec: Dict[str, Any],
-    screen_width: int,
-    screen_height: int,
-) -> Optional[Dict[str, Any]]:
-    """Résoudre une cible UI via Set-of-Mark + VLM.
-
-    Pipeline :
-    1. SomEngine détecte tous les éléments et les numérote sur le screenshot
-    2. VLM reçoit l'image annotée + description de la cible
-    3. VLM identifie le numéro du mark → coordonnées précises
-
-    Avantages vs VLM direct :
-    - Le VLM n'a qu'à identifier (son point fort), pas localiser
-    - Les coordonnées viennent de SomEngine (pixel-perfect)
-    - Question simple "quel numéro ?" → réponse simple
-
-    Args:
-        screenshot_path: Chemin du screenshot actuel
-        target_spec: Spécification de la cible (vlm_description, som_element, etc.)
-        screen_width: Largeur écran en pixels
-        screen_height: Hauteur écran en pixels
-
-    Returns:
-        Dict avec resolved=True et coordonnées, ou None si indisponible.
-    """
-    engine = _get_som_engine_api()
-    if engine is None:
-        return None
-
-    client = _get_vlm_client()
-    if client is None:
-        return None
-
-    t0 = time.time()
-
-    # ── 1. Lancer SomEngine sur le screenshot actuel ──
-    try:
-        from PIL import Image as PILImage
-        img = PILImage.open(screenshot_path).convert("RGB")
-        som_result = engine.analyze(img)
-    except Exception as e:
-        logger.warning("SoM resolve : erreur analyse — %s", e)
-        return None
-
-    if not som_result.elements:
-        logger.info("SoM resolve : 0 éléments détectés")
-        return None
-
-    # ── 2. Construire la description de la cible ──
-    som_element = target_spec.get("som_element", {})
-    vlm_description = target_spec.get("vlm_description", "")
-    anchor_label = som_element.get("label", "")
-
-    # Construire un prompt riche
-    target_parts = []
-    if anchor_label:
-        target_parts.append(f"texte '{anchor_label}'")
-    if vlm_description:
-        target_parts.append(vlm_description)
-    if not target_parts:
-        # Sans description, SoM resolve ne peut pas fonctionner
-        logger.debug("SoM resolve : pas de description pour identifier l'élément")
-        return None
-
-    target_desc = ", ".join(target_parts)
-
-    # ── 2.5. Raccourci : si le label est connu, chercher par texte directement ──
-    # Pas besoin du VLM si on connaît le texte exact de l'élément !
-    if anchor_label and len(anchor_label) >= 2:
-        label_lower = anchor_label.lower()
-        # Match exact d'abord, puis partiel
-        exact_matches = [
-            e for e in som_result.elements
-            if e.label and e.label.lower() == label_lower
-        ]
-        if not exact_matches:
-            exact_matches = [
-                e for e in som_result.elements
-                if e.label and len(e.label) >= 3 and (
-                    label_lower in e.label.lower()
-                    or e.label.lower() in label_lower
-                )
-            ]
-
-        if len(exact_matches) == 1:
-            # Match unique par texte → pas besoin du VLM
-            elem = exact_matches[0]
-            elapsed = time.time() - t0
-            cx_norm, cy_norm = elem.center_norm
-            logger.info(
-                "SoM resolve FAST : match texte unique '#%d %s' → (%.4f, %.4f) en %.1fs",
-                elem.id, elem.label, cx_norm, cy_norm, elapsed,
-            )
-            return {
-                "resolved": True,
-                "method": "som_text_match",
-                "x_pct": round(cx_norm, 6),
-                "y_pct": round(cy_norm, 6),
-                "matched_element": {
-                    "label": elem.label,
-                    "type": elem.source,
-                    "role": "som_text_match",
-                    "confidence": max(elem.confidence, 0.85),
-                    "som_id": elem.id,
-                },
-                "score": max(elem.confidence, 0.85),
-            }
-        elif len(exact_matches) > 1:
-            # Plusieurs matchs texte → disambiguïser par proximité à la position originale
-            ref_center = som_element.get("center_norm", [])
-            if ref_center and len(ref_center) == 2:
-                ref_x, ref_y = ref_center
-                best = min(
-                    exact_matches,
-                    key=lambda e: (
-                        (e.center_norm[0] - ref_x) ** 2
-                        + (e.center_norm[1] - ref_y) ** 2
-                    ),
-                )
-                elapsed = time.time() - t0
-                cx_norm, cy_norm = best.center_norm
-                dist = ((cx_norm - ref_x) ** 2 + (cy_norm - ref_y) ** 2) ** 0.5
-                if dist < 0.15:  # Tolérance 15% de l'écran
-                    logger.info(
-                        "SoM resolve FAST : match texte proximité '#%d %s' (dist=%.3f) "
-                        "→ (%.4f, %.4f) en %.1fs",
-                        best.id, best.label, dist, cx_norm, cy_norm, elapsed,
-                    )
-                    return {
-                        "resolved": True,
-                        "method": "som_text_match",
-                        "x_pct": round(cx_norm, 6),
-                        "y_pct": round(cy_norm, 6),
-                        "matched_element": {
-                            "label": best.label,
-                            "type": best.source,
-                            "role": "som_text_match_proximity",
-                            "confidence": max(best.confidence, 0.80),
-                            "som_id": best.id,
-                        },
-                        "score": max(best.confidence, 0.80),
-                    }
-            logger.info(
-                "SoM resolve : %d matchs texte pour '%s', VLM nécessaire",
-                len(exact_matches), anchor_label,
-            )
-
-    # ── 2.7. Fallback : template matching anchor vs éléments SomEngine ──
-    # Pour les icônes sans texte : comparer le crop de référence contre
-    # chaque région YOLO détectée par SomEngine.
-    anchor_b64 = target_spec.get("anchor_image_base64", "")
-    by_text = target_spec.get("by_text", "").strip()
-    if anchor_b64 and (not anchor_label or not by_text):
-        try:
-            import cv2
-            import numpy as np
-
-            # Décoder l'anchor
-            anc_bytes = base64.b64decode(anchor_b64)
-            anc_array = np.frombuffer(anc_bytes, dtype=np.uint8)
-            anc_img = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE)
-
-            # Charger le screenshot en OpenCV
-            screenshot_cv = cv2.imread(screenshot_path, cv2.IMREAD_GRAYSCALE)
-
-            if anc_img is not None and screenshot_cv is not None:
-                # Template matching de l'anchor sur le SCREENSHOT ENTIER
-                # (pas sur les régions individuelles — l'anchor est souvent plus grand)
-                anc_h, anc_w = anc_img.shape[:2]
-                if screenshot_cv.shape[0] >= anc_h and screenshot_cv.shape[1] >= anc_w:
-                    res = cv2.matchTemplate(screenshot_cv, anc_img, cv2.TM_CCOEFF_NORMED)
-                    _, max_score, _, max_loc = cv2.minMaxLoc(res)
-
-                    if max_score >= 0.5:
-                        # Centre du match
-                        match_cx = max_loc[0] + anc_w // 2
-                        match_cy = max_loc[1] + anc_h // 2
-
-                        # Trouver l'élément SomEngine le plus proche du centre du match
-                        best_elem = None
-                        best_dist = float("inf")
-                        for elem in som_result.elements:
-                            cx, cy = elem.center
-                            dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5
-                            if dist < best_dist:
-                                best_dist = dist
-                                best_elem = elem
-
-                        if best_elem and best_dist < 100:  # Max 100px de distance
-                            elapsed = time.time() - t0
-                            cx_norm, cy_norm = best_elem.center_norm
-                            logger.info(
-                                "SoM resolve ANCHOR : match crop score=%.3f → "
-                                "elem '#%d %s' (dist=%.0fpx) → (%.4f, %.4f) en %.1fs",
-                                max_score, best_elem.id, best_elem.label,
-                                best_dist, cx_norm, cy_norm, elapsed,
-                            )
-                            return {
-                                "resolved": True,
-                                "method": "som_anchor_match",
-                                "x_pct": round(cx_norm, 6),
-                                "y_pct": round(cy_norm, 6),
-                                "matched_element": {
-                                    "label": best_elem.label or f"icon #{best_elem.id}",
-                                    "type": best_elem.source,
-                                    "role": "som_anchor_match",
-                                    "confidence": max_score,
-                                    "som_id": best_elem.id,
-                                },
-                                "score": max_score,
-                            }
-        except ImportError:
-            pass
-        except Exception as e:
-            logger.debug("SoM anchor match erreur : %s", e)
-
-    # ── 3. Sauvegarder l'image annotée SoM temporairement ──
-    if som_result.som_image is None:
-        logger.debug("SoM resolve : pas d'image annotée, skip VLM")
-        return None
-
-    import tempfile
-    try:
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
-            som_result.som_image.save(tmp, format="JPEG", quality=85)
-            som_img_path = tmp.name
-    except Exception as e:
-        logger.warning("SoM resolve : erreur sauvegarde image annotée — %s", e)
-        return None
-
-    # ── 4. VLM : identifier le numéro du mark ──
-    # Lister uniquement les éléments avec un label (plus concis pour le VLM)
-    labeled_elements = [e for e in som_result.elements if e.label][:30]
-    elements_list = "\n".join(
-        f"  #{e.id}: '{e.label}'"
-        for e in labeled_elements
-    )
-
-    # Multi-image : SoM annotée + anchor crop (si disponible)
-    anchor_b64 = target_spec.get("anchor_image_base64", "")
-    extra_images = [anchor_b64] if anchor_b64 else None
-
-    if extra_images:
-        prompt = (
-            "Image 1 shows the screen with numbered marks on each UI element.\n"
-            "Image 2 shows the element I'm looking for.\n\n"
-            f"Target: {target_desc}\n\n"
-            f"Detected elements:\n{elements_list}\n\n"
-            "Which mark number matches the target element in Image 2?\n"
-            'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
-        )
-    else:
-        prompt = (
-            f"I'm looking for: {target_desc}\n\n"
-            f"Detected elements:\n{elements_list}\n\n"
-            "Which number is the correct element?\n"
-            'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
-        )
-
-    system_prompt = "You identify UI elements by number. Output JSON only, no explanation."
-
-    try:
-        result = client.generate(
-            prompt=prompt,
-            image_path=som_img_path,
-            system_prompt=system_prompt,
-            temperature=0.1,
-            max_tokens=50,
-            force_json=False,
-            extra_images_b64=extra_images,
-        )
-    except Exception as e:
-        logger.warning("SoM resolve : erreur VLM — %s", e)
-        return None
-    finally:
-        import os
-        try:
-            os.unlink(som_img_path)
-        except OSError:
-            pass
-
-    elapsed = time.time() - t0
-
-    if not result.get("success"):
-        logger.info("SoM resolve : VLM échoué (%.1fs)", elapsed)
-        return None
-
-    # ── 5. Parser la réponse et retourner les coordonnées ──
-    response_text = result.get("response", "").strip()
-
-    # Tenter d'abord l'extraction JSON standard
-    parsed = client._extract_json_from_response(response_text)
-
-    # Fallback : extraire un nombre simple de la réponse
-    if parsed is None:
-        import re
-        numbers = re.findall(r'\b(\d+)\b', response_text)
-        if numbers:
-            candidate = int(numbers[0])
-            if som_result.get_element_by_id(candidate) is not None:
-                parsed = {"mark_id": candidate, "confidence": 0.7}
-                logger.debug("SoM resolve : extraction numéro fallback → #%d", candidate)
-
-    if parsed is None:
-        logger.info("SoM resolve : réponse non-JSON (%.1fs) — %.80s", elapsed, response_text)
-        return None
-
-    mark_id = parsed.get("mark_id")
-    confidence = float(parsed.get("confidence", 0.0))
-
-    if mark_id is None or confidence < 0.3:
-        logger.info(
-            "SoM resolve : mark non trouvé ou confiance trop basse (mark=%s, conf=%.2f, %.1fs)",
-            mark_id, confidence, elapsed,
-        )
-        return None
-
-    mark_id = int(mark_id)
-    elem = som_result.get_element_by_id(mark_id)
-    if elem is None:
-        logger.warning("SoM resolve : mark #%d inexistant (%.1fs)", mark_id, elapsed)
-        return None
-
-    cx_norm, cy_norm = elem.center_norm
-    logger.info(
-        "SoM resolve OK : mark #%d '%s' → (%.4f, %.4f) conf=%.2f en %.1fs (%d éléments)",
-        mark_id, elem.label, cx_norm, cy_norm, confidence, elapsed, len(som_result.elements),
-    )
-
-    return {
-        "resolved": True,
-        "method": "som_vlm",
-        "x_pct": round(cx_norm, 6),
-        "y_pct": round(cy_norm, 6),
-        "matched_element": {
-            "label": elem.label or f"mark #{mark_id}",
-            "type": elem.source,
-            "role": "som_identified",
-            "confidence": confidence,
-            "som_id": mark_id,
-        },
-        "score": confidence,
-    }
-
-
-def _resolve_target_sync(
-    screenshot_path: str,
-    target_spec: Dict[str, Any],
-    screen_width: int,
-    screen_height: int,
-    fallback_x_pct: float,
-    fallback_y_pct: float,
-    strict_mode: bool = False,
-) -> Dict[str, Any]:
-    """Résoudre la cible visuellement (exécuté dans un thread séparé).
-
-    Hiérarchie de résolution (strict_mode=True, replay sessions) — VLM-FIRST :
-    1. VLM Quick Find (~3-8s) — compréhension sémantique de l'écran, multi-image
-       (screenshot + crop de référence + description riche)
-    1.5. SoM + VLM (~5-15s) — SomEngine numérote les éléments, VLM identifie le bon
-    2. Template matching OpenCV (~100ms) — fallback pixel, seuil STRICT 0.90
-    3. resolved=False → STOP le replay
-
-    Le VLM comprend le contexte (titre de fenêtre, type d'élément, position)
-    et peut trouver un élément même si l'écran est différent de l'enregistrement.
-    Le template matching ne compare que des pixels et produit des faux positifs.
-
-    Hiérarchie classique (strict_mode=False, VWB et autres) — INCHANGÉE :
-    1. Template matching OpenCV (~100ms) — seuil 0.70
-    1.5. VLM Quick Find si template échoue et by_text/by_role dispo
-    2. by_text/by_role → VLM Quick Find puis ScreenAnalyzer
-    3. fallback coordonnées statiques
-    """
-    anchor_image_b64 = target_spec.get("anchor_image_base64", "")
-
-    # ===================================================================
-    # MODE STRICT (replay sessions) — Stratégie VLM-FIRST
-    # ===================================================================
-    if strict_mode and anchor_image_b64:
-        vlm_description = target_spec.get("vlm_description", "")
-        by_text_strict = target_spec.get("by_text", "").strip()
-
-        # Fallback : construire la description depuis by_text/by_role
-        if not vlm_description:
-            by_role = target_spec.get("by_role", "").strip()
-            if by_text_strict or by_role:
-                vlm_description = _build_target_description(target_spec)
-
-        # ---------------------------------------------------------------
-        # Étape -1 : Vérification CLIP (si embedding de référence fourni)
-        # Vérifie qu'on est dans la bonne application avant de chercher
-        # l'élément. Filet de sécurité contre les clics au mauvais endroit.
-        # ---------------------------------------------------------------
-        clip_embedding = target_spec.get("clip_embedding")
-        if clip_embedding:
-            try:
-                from core.embedding.clip_embedder import CLIPEmbedder
-                from PIL import Image as _PILImage
-                import numpy as _np
-
-                _clip = CLIPEmbedder()
-                # Embedding de l'écran actuel (fenêtre si possible)
-                window_capture = target_spec.get("window_capture", {})
-                window_rect = window_capture.get("rect")
-                current_img = _PILImage.open(screenshot_path)
-                if window_rect:
-                    current_img = current_img.crop(tuple(window_rect))
-
-                current_emb = _np.array(_clip.embed_image(current_img), dtype=_np.float32).flatten()
-                ref_emb = _np.array(clip_embedding, dtype=_np.float32).flatten()
-
-                clip_sim = float(_np.dot(current_emb, ref_emb) / (
-                    _np.linalg.norm(current_emb) * _np.linalg.norm(ref_emb)
-                ))
-                logger.info(f"CLIP vérification : similarité={clip_sim:.3f}")
-
-                if clip_sim < 0.75:
-                    logger.warning(
-                        f"CLIP MISMATCH : sim={clip_sim:.3f} < 0.75 — "
-                        f"écran actuel trop différent de l'enregistrement"
-                    )
-                    return {
-                        "resolved": False,
-                        "method": "clip_mismatch",
-                        "reason": f"clip_similarity_{clip_sim:.3f}",
-                        "x_pct": fallback_x_pct,
-                        "y_pct": fallback_y_pct,
-                    }
-            except Exception as e:
-                logger.debug(f"CLIP vérification erreur (non-bloquant) : {e}")
-
-        # ---------------------------------------------------------------
-        # Étape 0 : Choisir la stratégie selon le type d'élément
-        # ---------------------------------------------------------------
-        by_text_source = target_spec.get("by_text_source", "")
-
-        has_window = bool(target_spec.get("window_capture", {}).get("rect"))
-
-        if by_text_strict and by_text_source in ("ocr", "vlm") and has_window:
-            # Texte visible DANS une fenêtre → grounding VLM sur fenêtre croppée
-            grounding_result = _resolve_by_grounding(
-                screenshot_path=screenshot_path,
-                target_spec=target_spec,
-                screen_width=screen_width,
-                screen_height=screen_height,
-            )
-            if grounding_result and grounding_result.get("resolved"):
-                logger.info(
-                    "Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
-                    grounding_result.get("x_pct", 0),
-                    grounding_result.get("y_pct", 0),
-                    by_text_strict[:50],
-                )
-                return grounding_result
-
-        if not by_text_strict or by_text_source not in ("ocr", "vlm"):
-            # Template matching pour les éléments sans texte (icônes pures)
-            window_capture = target_spec.get("window_capture", {})
-            window_rect = window_capture.get("rect")
-            from pathlib import Path as _Path
-            _full = _Path(screenshot_path)
-            _win = _full.parent / _full.name.replace("_full.png", "_window.png")
-            tm_path = str(_win) if _win.is_file() and window_rect else screenshot_path
-            tm_screen_w = (window_rect[2] - window_rect[0]) if window_rect and _win.is_file() else screen_width
-            tm_screen_h = (window_rect[3] - window_rect[1]) if window_rect and _win.is_file() else screen_height
-
-            result = _resolve_by_template_matching(
-                screenshot_path=tm_path,
-                anchor_image_b64=anchor_image_b64,
-                screen_width=tm_screen_w,
-                screen_height=tm_screen_h,
-                confidence_threshold=0.90,
-            )
-            if result and result.get("score", 0) >= 0.90:
-                x_tm, y_tm = result["x_pct"], result["y_pct"]
-                # Convertir coordonnées fenêtre → écran si nécessaire
-                if window_rect and _win.is_file():
-                    abs_x = window_rect[0] + x_tm * tm_screen_w
-                    abs_y = window_rect[1] + y_tm * tm_screen_h
-                    result["x_pct"] = round(abs_x / screen_width, 6)
-                    result["y_pct"] = round(abs_y / screen_height, 6)
-                logger.info(
-                    "Strict resolve TEMPLATE : icon match (score=%.3f)",
-                    result.get("score", 0),
-                )
-                return result
-
-        # ---------------------------------------------------------------
-        # Étape 1 : VLM Quick Find (fallback, multi-image)
-        # ---------------------------------------------------------------
-        if vlm_description or anchor_image_b64:
-            vlm_result = _vlm_quick_find(
-                screenshot_path=screenshot_path,
-                target_description=vlm_description,
-                anchor_image_b64=anchor_image_b64,
-            )
-            if vlm_result and vlm_result.get("resolved"):
-                if vlm_result.get("score", 0) >= 0.3:
-                    logger.info(
-                        "Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'",
-                        vlm_result.get("score", 0),
-                        vlm_description[:60] if vlm_description else "(anchor)",
-                    )
-                    return vlm_result
-                else:
-                    logger.info(
-                        "Strict resolve VLM-first : VLM score=%.2f trop bas, passage template",
-                        vlm_result.get("score", 0),
-                    )
-            else:
-                logger.info(
-                    "Strict resolve VLM-first : VLM échoué pour '%s', passage template matching",
-                    vlm_description[:60] if vlm_description else "(anchor)",
-                )
-
-        # ---------------------------------------------------------------
-        # Étape 1.5 : SoM + VLM (Set-of-Mark + identification)
-        # SomEngine numérote les éléments, VLM identifie le bon numéro.
-        # Plus fiable que le VLM direct car le VLM n'a qu'à identifier,
-        # pas localiser — et les coordonnées sont pixel-perfect.
-        # ---------------------------------------------------------------
-        som_element = target_spec.get("som_element", {})
-        if som_element or vlm_description:
-            som_result = _resolve_by_som(
-                screenshot_path=screenshot_path,
-                target_spec=target_spec,
-                screen_width=screen_width,
-                screen_height=screen_height,
-            )
-            if som_result and som_result.get("resolved"):
-                logger.info(
-                    "Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)",
-                    som_result.get("score", 0),
-                    som_result.get("matched_element", {}).get("som_id", "?"),
-                )
-                return som_result
-            else:
-                logger.info("Strict resolve SoM+VLM : échoué, passage template matching")
-
-        # ---------------------------------------------------------------
-        # Étape 2 : Template matching (fallback pixel) — seuil STRICT 0.90
-        # ---------------------------------------------------------------
-        result = _resolve_by_template_matching(
-            screenshot_path=screenshot_path,
-            anchor_image_b64=anchor_image_b64,
-            screen_width=screen_width,
-            screen_height=screen_height,
-            confidence_threshold=0.90,
-        )
-        if result:
-            score = result.get("score", 0)
-            # Score >= 0.95 : match quasi-parfait, pas besoin de valider le contexte
-            if score >= 0.95:
-                logger.info(
-                    "Strict resolve VLM-first : template matching fallback OK "
-                    "(score=%.3f >= 0.95, contexte skip — match quasi-parfait)",
-                    score,
-                )
-                return result
-            elif _validate_match_context(result, fallback_x_pct, fallback_y_pct, target_spec):
-                logger.info(
-                    "Strict resolve VLM-first : template matching fallback OK "
-                    "(score=%.3f >= 0.90, context OK)",
-                    score,
-                )
-                return result
-            else:
-                logger.warning(
-                    "Strict resolve VLM-first : template score=%.3f MAIS contexte invalide, rejeté",
-                    score,
-                )
-
-        # ---------------------------------------------------------------
-        # Étape 3 : RIEN ne fonctionne → resolved=False → STOP replay
-        # ---------------------------------------------------------------
-        return {
-            "resolved": False,
-            "method": "strict_vlm_template_failed",
-            "reason": "vlm_and_template_all_failed",
-            "x_pct": fallback_x_pct,
-            "y_pct": fallback_y_pct,
-        }
-
-    # ===================================================================
-    # MODE CLASSIQUE (VWB et autres) — Comportement existant
-    # ===================================================================
-
-    # ---------------------------------------------------------------
-    # Stratégie 1 : Template matching par image d'ancre (seuil 0.70)
-    # ---------------------------------------------------------------
-    if anchor_image_b64:
-        result = _resolve_by_template_matching(
-            screenshot_path=screenshot_path,
-            anchor_image_b64=anchor_image_b64,
-            screen_width=screen_width,
-            screen_height=screen_height,
-            confidence_threshold=0.7,
-        )
-        if result:
-            return result
-        logger.info(
-            "Template matching échoué pour ancre '%s', tentative VLM Quick Find",
-            target_spec.get("anchor_id", "?"),
-        )
-
-        # ---------------------------------------------------------------
-        # Stratégie 1.5 : VLM Quick Find (fallback léger après template matching)
-        # ---------------------------------------------------------------
-        by_text = target_spec.get("by_text", "").strip()
-        by_role = target_spec.get("by_role", "").strip()
-        if by_text or by_role:
-            vlm_desc = _build_target_description(target_spec)
-            vlm_result = _vlm_quick_find(
-                screenshot_path=screenshot_path,
-                target_description=vlm_desc,
-                anchor_image_b64=anchor_image_b64,
-            )
-            if vlm_result:
-                return vlm_result
-            logger.info(
-                "VLM Quick Find échoué pour ancre '%s', fallback coordonnées",
-                target_spec.get("anchor_id", "?"),
-            )
-
-        return {
-            "resolved": False,
-            "method": "fallback",
-            "reason": "template_matching_failed",
-            "x_pct": fallback_x_pct,
-            "y_pct": fallback_y_pct,
-        }
-
-    # ---------------------------------------------------------------
-    # Stratégie 2 : VLM Quick Find (léger, ~5-10s)
-    # ---------------------------------------------------------------
-    by_text = target_spec.get("by_text", "")
-    by_role = target_spec.get("by_role", "")
-
-    # Si aucun critère sémantique et pas d'ancre, fallback direct
-    if not by_text and not by_role and not anchor_image_b64:
-        return {
-            "resolved": False,
-            "method": "fallback",
-            "reason": "no_target_criteria",
-            "x_pct": fallback_x_pct,
-            "y_pct": fallback_y_pct,
-        }
-
-    # Tenter le VLM Quick Find AVANT ScreenAnalyzer (beaucoup plus rapide)
-    if by_text or by_role:
-        vlm_desc = _build_target_description(target_spec)
-        vlm_result = _vlm_quick_find(
-            screenshot_path=screenshot_path,
-            target_description=vlm_desc,
-        )
-        if vlm_result:
-            return vlm_result
-        logger.info(
-            "VLM Quick Find échoué pour '%s', fallback ScreenAnalyzer",
-            vlm_desc,
-        )
-
-    # ---------------------------------------------------------------
-    # Stratégie 3 : Matching sémantique via ScreenAnalyzer (~15-20s)
-    # ---------------------------------------------------------------
-    processor._ensure_initialized()
-
-    if processor._screen_analyzer is None:
-        return {
-            "resolved": False,
-            "method": "fallback",
-            "reason": "screen_analyzer_unavailable",
-            "x_pct": fallback_x_pct,
-            "y_pct": fallback_y_pct,
-        }
-
-    # Analyser le screenshot (Niveaux 1-3 : raw, OCR, UI elements)
-    try:
-        screen_state = processor._screen_analyzer.analyze(screenshot_path)
-    except Exception as e:
-        logger.warning(f"Analyse screenshot échouée: {e}")
-        return {
-            "resolved": False,
-            "method": "fallback",
-            "reason": f"analysis_failed: {e}",
-            "x_pct": fallback_x_pct,
-            "y_pct": fallback_y_pct,
-        }
-
-    ui_elements = screen_state.ui_elements or []
-    if not ui_elements:
-        logger.info("Aucun élément UI détecté, fallback coordonnées")
-        return {
-            "resolved": False,
-            "method": "fallback",
-            "reason": "no_ui_elements",
-            "x_pct": fallback_x_pct,
-            "y_pct": fallback_y_pct,
-        }
-
-    # Matching de la cible parmi les éléments détectés
-    candidates = []
-
-    for elem in ui_elements:
-        score = 0.0
-
-        # Score par texte (label)
-        if by_text and elem.label:
-            text_lower = by_text.lower()
-            label_lower = elem.label.lower()
-            if text_lower in label_lower or label_lower in text_lower:
-                score += 0.6
-            elif _fuzzy_match(text_lower, label_lower):
-                score += 0.3
-
-        # Score par rôle
-        if by_role:
-            role_lower = by_role.lower()
-            if elem.role and role_lower in elem.role.lower():
-                score += 0.3
-            if elem.type and role_lower in elem.type.lower():
-                score += 0.2
-
-        if score > 0:
-            candidates.append((elem, score))
-
-    if not candidates:
-        logger.info(
-            f"Aucun match visuel pour target(text='{by_text}', role='{by_role}') "
-            f"parmi {len(ui_elements)} éléments"
-        )
-        return {
-            "resolved": False,
-            "method": "fallback",
-            "reason": "no_match",
-            "x_pct": fallback_x_pct,
-            "y_pct": fallback_y_pct,
-            "ui_elements_count": len(ui_elements),
-        }
-
-    # Trier par score décroissant et prendre le meilleur
-    candidates.sort(key=lambda c: c[1], reverse=True)
-    best_elem, best_score = candidates[0]
-
-    # Convertir les coordonnées pixel en proportions
-    cx, cy = best_elem.center
-    x_pct = round(cx / screen_width, 6) if screen_width > 0 else 0.0
-    y_pct = round(cy / screen_height, 6) if screen_height > 0 else 0.0
-
-    logger.info(
-        f"Cible résolue visuellement: '{best_elem.label}' ({best_elem.type}/{best_elem.role}) "
-        f"score={best_score:.2f} → ({x_pct:.4f}, {y_pct:.4f})"
-    )
-
-    return {
-        "resolved": True,
-        "method": "visual",
-        "x_pct": x_pct,
-        "y_pct": y_pct,
-        "matched_element": {
-            "label": best_elem.label,
-            "type": best_elem.type,
-            "role": best_elem.role,
-            "center": list(best_elem.center),
-            "confidence": best_elem.label_confidence,
-        },
-        "score": best_score,
-        "candidates_count": len(candidates),
-        "ui_elements_count": len(ui_elements),
-    }
-
-
-def _fuzzy_match(a: str, b: str, threshold: float = 0.6) -> bool:
-    """Match approximatif par ratio de caractères communs."""
-    if not a or not b:
-        return False
-    common = sum(1 for c in a if c in b)
-    return (common / max(len(a), len(b))) >= threshold
-
-
-def _fallback_response(request: ResolveTargetRequest, reason: str, detail: str) -> Dict:
-    """Réponse de fallback quand la résolution visuelle échoue."""
-    return {
-        "resolved": False,
-        "method": "fallback",
-        "reason": reason,
-        "detail": detail,
-        "x_pct": request.fallback_x_pct,
-        "y_pct": request.fallback_y_pct,
-    }
-
-
 # =========================================================================
 # Learning Pack — Export / Import pour la fédération des apprentissages
 # =========================================================================
diff --git a/agent_v0/server_v1/replay_engine.py b/agent_v0/server_v1/replay_engine.py
new file mode 100644
index 000000000..b5c8bac41
--- /dev/null
+++ b/agent_v0/server_v1/replay_engine.py
@@ -0,0 +1,1284 @@
+# agent_v0/server_v1/replay_engine.py
+"""
+Replay Engine — Gestion des replays de workflows.
+
+Contient :
+- Setup environnement (préparation apps avant replay)
+- Validation des actions de replay (sécurité)
+- Conversion workflow → actions normalisées
+- Fonctions utilitaires de replay (session detection, state management, retry)
+- Pre-check écran par embedding CLIP
+- Détection popup
+
+Extrait de api_stream.py pour clarifier l'architecture.
+"""
+
+import json
+import logging
+import re
+import threading
+import time
+import uuid
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger("api_stream")
+
+
+# =========================================================================
+# Replay action validation (HIGH security)
+# =========================================================================
+_ALLOWED_ACTION_TYPES = {
+    "click", "type", "key_combo", "scroll", "wait",
+    "file_open", "file_save", "file_close", "file_new", "file_dialog",
+    "double_click", "right_click", "drag",
+    "verify_screen",  # Hybrid replay: visual verification between groups
+}
+_MAX_ACTION_TEXT_LENGTH = 10000
+_MAX_KEYS_PER_COMBO = 10
+# Named keys accepted in key_combo (modifiers + special keys); single characters are accepted by the validator itself
+_KNOWN_KEY_NAMES = {
+    "enter", "return", "tab", "escape", "esc", "backspace", "delete", "space",
+    "up", "down", "left", "right", "home", "end", "page_up", "page_down",
+    "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12",
+    "ctrl", "ctrl_l", "ctrl_r", "alt", "alt_l", "alt_r",
+    "shift", "shift_l", "shift_r",
+    "cmd", "win", "super", "super_l", "super_r", "windows", "meta",
+    "insert", "print_screen", "caps_lock", "num_lock",
+}
+
+
+def _validate_replay_action(action: dict) -> Optional[str]:
+    """Validate a replay action. Return an error message, or None when the action is valid."""
+    action_type = action.get("type", "")
+
+    # Reject any action type that is not in the allow-list
+    if action_type not in _ALLOWED_ACTION_TYPES:
+        return f"Type d'action non autorisé : '{action_type}'. Autorisés : {sorted(_ALLOWED_ACTION_TYPES)}"
+
+    # Cap the text length (NOTE(review): a non-str "text" value is not checked here)
+    text = action.get("text", "")
+    if isinstance(text, str) and len(text) > _MAX_ACTION_TEXT_LENGTH:
+        return f"Texte trop long ({len(text)} > {_MAX_ACTION_TEXT_LENGTH} caractères)"
+
+    # Check the keys (NOTE(review): a non-list "keys" value skips this check entirely)
+    keys = action.get("keys", [])
+    if isinstance(keys, list):
+        if len(keys) > _MAX_KEYS_PER_COMBO:
+            return f"Trop de touches ({len(keys)} > {_MAX_KEYS_PER_COMBO})"
+        for key in keys:
+            key_lower = str(key).lower()
+            # Accept any single character (a-z, 0-9, punctuation) and the known key names
+            if len(str(key)) == 1 or key_lower in _KNOWN_KEY_NAMES:
+                continue
+            return f"Touche inconnue : '{key}'"
+
+    # Normalized coordinates, when present, must be numeric and within [0.0, 1.0]
+    for coord_name in ("x_pct", "y_pct"):
+        val = action.get(coord_name)
+        if val is not None:
+            try:
+                val_f = float(val)
+                if not (0.0 <= val_f <= 1.0):
+                    return f"Coordonnée {coord_name}={val_f} hors limites [0.0, 1.0]"
+            except (TypeError, ValueError):
+                return f"Coordonnée {coord_name} invalide : {val}"
+
+    return None  # Valid
+
+
+# =========================================================================
+# Environment setup — automatic preparation before the replay
+# =========================================================================
+# Maps common Windows executable names to their launch command.
+# Per the original note, also a fallback for the Start-menu search text (usage not visible here — confirm).
+# Format: "processname.exe" (lowercase) -> shell command
+_APP_LAUNCH_COMMANDS: Dict[str, str] = {
+    "notepad.exe": "notepad",
+    "explorer.exe": "explorer",
+    "calc.exe": "calc",
+    "mspaint.exe": "mspaint",
+    "cmd.exe": "cmd",
+    "powershell.exe": "powershell",
+    "wordpad.exe": "wordpad",
+    "charmap.exe": "charmap",
+    "snippingtool.exe": "snippingtool",
+    "taskmgr.exe": "taskmgr",
+    "regedit.exe": "regedit",
+    "mstsc.exe": "mstsc",
+    "winword.exe": "winword",
+    "excel.exe": "excel",
+    "powerpnt.exe": "powerpnt",
+    "outlook.exe": "outlook",
+    "msedge.exe": "msedge",
+    "chrome.exe": "chrome",
+    "firefox.exe": "firefox",
+    "code.exe": "code",
+}
+
+# Maps executables to the visual name to look for in the Start menu.
+# Holds the search text (often the French name) and a description
+# for the VLM so it can identify the icon in the search results.
+# Format: "processname.exe" -> {"search_text": ..., "display_name": ..., "vlm_description": ...}
+_APP_VISUAL_SEARCH: Dict[str, Dict[str, str]] = {
+    "notepad.exe": {
+        "search_text": "Bloc-notes",
+        "display_name": "Bloc-notes",
+        "vlm_description": "L'application Bloc-notes (Notepad) dans les résultats de recherche",
+    },
+    "calc.exe": {
+        "search_text": "Calculatrice",
+        "display_name": "Calculatrice",
+        "vlm_description": "L'application Calculatrice dans les résultats de recherche",
+    },
+    "mspaint.exe": {
+        "search_text": "Paint",
+        "display_name": "Paint",
+        "vlm_description": "L'application Paint dans les résultats de recherche",
+    },
+    "cmd.exe": {
+        "search_text": "Invite de commandes",
+        "display_name": "Invite de commandes",
+        "vlm_description": "L'Invite de commandes (Command Prompt) dans les résultats",
+    },
+    "powershell.exe": {
+        "search_text": "PowerShell",
+        "display_name": "PowerShell",
+        "vlm_description": "Windows PowerShell dans les résultats de recherche",
+    },
+    "wordpad.exe": {
+        "search_text": "WordPad",
+        "display_name": "WordPad",
+        "vlm_description": "L'application WordPad dans les résultats de recherche",
+    },
+    "winword.exe": {
+        "search_text": "Word",
+        "display_name": "Microsoft Word",
+        "vlm_description": "Microsoft Word dans les résultats de recherche",
+    },
+    "excel.exe": {
+        "search_text": "Excel",
+        "display_name": "Microsoft Excel",
+        "vlm_description": "Microsoft Excel dans les résultats de recherche",
+    },
+    "powerpnt.exe": {
+        "search_text": "PowerPoint",
+        "display_name": "Microsoft PowerPoint",
+        "vlm_description": "Microsoft PowerPoint dans les résultats de recherche",
+    },
+    "outlook.exe": {
+        "search_text": "Outlook",
+        "display_name": "Microsoft Outlook",
+        "vlm_description": "Microsoft Outlook dans les résultats de recherche",
+    },
+    "msedge.exe": {
+        "search_text": "Edge",
+        "display_name": "Microsoft Edge",
+        "vlm_description": "Microsoft Edge dans les résultats de recherche",
+    },
+    "chrome.exe": {
+        "search_text": "Chrome",
+        "display_name": "Google Chrome",
+        "vlm_description": "Google Chrome dans les résultats de recherche",
+    },
+    "firefox.exe": {
+        "search_text": "Firefox",
+        "display_name": "Mozilla Firefox",
+        "vlm_description": "Mozilla Firefox dans les résultats de recherche",
+    },
+    "code.exe": {
+        "search_text": "Visual Studio Code",
+        "display_name": "Visual Studio Code",
+        "vlm_description": "Visual Studio Code dans les résultats de recherche",
+    },
+    "taskmgr.exe": {
+        "search_text": "Gestionnaire des tâches",
+        "display_name": "Gestionnaire des tâches",
+        "vlm_description": "Le Gestionnaire des tâches dans les résultats de recherche",
+    },
+    "snippingtool.exe": {
+        "search_text": "Outil Capture",
+        "display_name": "Outil Capture d'écran",
+        "vlm_description": "L'Outil Capture d'écran dans les résultats de recherche",
+    },
+    "mstsc.exe": {
+        "search_text": "Connexion Bureau à distance",
+        "display_name": "Bureau à distance",
+        "vlm_description": "La Connexion Bureau à distance dans les résultats",
+    },
+}
+
+# Windows applications to ignore during setup (system processes, agents, etc.)
+_SETUP_IGNORE_APPS = {
+    "searchhost.exe",     # Windows search bar
+    "explorer.exe",       # Explorer is always running (Windows shell)
+    "pythonw.exe",        # Python agent (our own agent)
+    "python.exe",         # Same as above
+    "shellexperiencehost.exe",
+    "startmenuexperiencehost.exe",
+    "applicationframehost.exe",
+    "systemsettings.exe",
+    "textinputhost.exe",
+    "runtimebroker.exe",
+}
+
+
+def _extract_required_apps_from_events(raw_events: list) -> Dict[str, Any]:
+    """Extract the required applications from a session's raw events.
+
+    Scans window_focus_change (and mouse_click) events to identify:
+    - The primary application (most frequent one outside the ignored system apps)
+    - The first targeted window (for the initial setup)
+
+    Args:
+        raw_events: Raw events from live_events.jsonl.
+
+    Returns:
+        Dict with the keys below (empty dict when no non-ignored app is found):
+        - primary_app: str (name of the main executable, e.g. "Notepad.exe")
+        - primary_launch_cmd: str (Win+R command, e.g. "notepad")
+        - first_window_title: str (title of the first application window)
+        - apps: dict[str, int] (app_name -> occurrence count)
+    """
+    app_counts: Dict[str, int] = defaultdict(int)
+    first_app = None
+    first_window_title = None
+
+    for raw_evt in raw_events:
+        event_data = raw_evt.get("event", raw_evt)
+        evt_type = event_data.get("type", "")
+
+        if evt_type == "window_focus_change":
+            to_info = event_data.get("to", {})
+            if not to_info:
+                continue
+            app_name = to_info.get("app_name", "")
+            title = to_info.get("title", "")
+            if app_name:
+                app_counts[app_name] += 1
+                if first_app is None and app_name.lower() not in _SETUP_IGNORE_APPS:
+                    first_app = app_name
+                    first_window_title = title
+
+        # Also count apps from mouse_click events that carry a window field
+        elif evt_type == "mouse_click":
+            window = event_data.get("window", {})
+            if isinstance(window, dict):
+                app_name = window.get("app_name", "")
+                if app_name:
+                    app_counts[app_name] += 1
+
+    if not app_counts:
+        return {}
+
+    # Pick the primary application (most frequent among the non-ignored apps)
+    filtered_apps = {
+        k: v for k, v in app_counts.items()
+        if k.lower() not in _SETUP_IGNORE_APPS
+    }
+    if not filtered_apps:
+        return {}
+
+    primary_app = max(filtered_apps, key=filtered_apps.get)
+
+    # Resolve the launch command
+    primary_launch_cmd = _resolve_launch_command(primary_app)
+
+    return {
+        "primary_app": primary_app,
+        "primary_launch_cmd": primary_launch_cmd,
+        "first_window_title": first_window_title or "",
+        "apps": dict(app_counts),
+    }
+
+
+def _extract_required_apps_from_workflow(workflow) -> Dict[str, Any]:
+    """Extract the required applications from a structured workflow.
+
+    Walks the workflow nodes to collect the required window titles,
+    then infers the primary application from them.
+
+    Args:
+        workflow: Workflow object or raw dict.
+
+    Returns:
+        Same base keys as _extract_required_apps_from_events, plus source_session_id and machine_id; {} if nothing is inferred.
+    """
+    # Read nodes/metadata from either an object or a dict
+    if hasattr(workflow, 'nodes'):
+        nodes = workflow.nodes
+        metadata = workflow.metadata if hasattr(workflow, 'metadata') else {}
+    elif isinstance(workflow, dict):
+        nodes = workflow.get('nodes', [])
+        metadata = workflow.get('metadata', {})
+    else:
+        return {}
+
+    if not nodes:
+        return {}
+
+    # Collect the window titles from the nodes
+    window_titles = []
+    for node in nodes:
+        template = node.template if hasattr(node, 'template') else node.get('template', {})
+        if isinstance(template, dict):
+            window = template.get('window', {})
+        elif hasattr(template, 'window'):
+            window = template.window if hasattr(template.window, '__dict__') else {}
+        else:
+            window = {}
+
+        if isinstance(window, dict):
+            title = window.get('title_pattern', '') or window.get('title_contains', '')
+        elif hasattr(window, 'title_pattern'):
+            title = getattr(window, 'title_pattern', '') or ''
+        else:
+            title = ''
+
+        if title:
+            window_titles.append(title)
+
+    # Infer the primary app from the window titles
+    primary_app, primary_launch_cmd, matched_title = _infer_app_from_window_titles(window_titles)
+    # Prefer the title that matched the app (the first node's title may be "Rechercher")
+    first_title = matched_title or (window_titles[0] if window_titles else "")
+
+    if not primary_app:
+        return {}
+
+    source_session_id = metadata.get("source_session_id", "") if isinstance(metadata, dict) else ""
+    machine_id = metadata.get("machine_id", "") if isinstance(metadata, dict) else ""
+
+    return {
+        "primary_app": primary_app,
+        "primary_launch_cmd": primary_launch_cmd,
+        "first_window_title": first_title,
+        "apps": {},
+        "source_session_id": source_session_id,
+        "machine_id": machine_id,
+    }
+
+
+def _resolve_launch_command(app_name: str) -> str:
+    """Resolve the Win+R command used to launch an application.
+
+    If the app is not in the mapping, fall back to the executable name
+    itself without its .exe extension (the original casing is kept).
+    """
+    app_lower = app_name.lower()
+    if app_lower in _APP_LAUNCH_COMMANDS:
+        return _APP_LAUNCH_COMMANDS[app_lower]
+    # Fallback: use the name without the .exe extension
+    if app_lower.endswith(".exe"):
+        return app_name[:-4]
+    return app_name
+
+
+def _infer_app_from_window_titles(titles: list) -> tuple:
+    """Infer the application name and launch command from window titles.
+
+    Case-insensitive substring heuristics; first match wins (NOTE(review): 'word' also matches 'WordPad' titles).
+
+    Returns:
+        Tuple (app_name, launch_command, matched_title).
+        ("", "", "") when nothing is identified.
+    """
+    _TITLE_APP_PATTERNS = [
+        ("bloc-notes", "Notepad.exe", "notepad"),
+        ("notepad", "Notepad.exe", "notepad"),
+        ("word", "winword.exe", "winword"),
+        ("excel", "excel.exe", "excel"),
+        ("powerpoint", "powerpnt.exe", "powerpnt"),
+        ("outlook", "outlook.exe", "outlook"),
+        ("paint", "mspaint.exe", "mspaint"),
+        ("calculatrice", "calc.exe", "calc"),
+        ("calculator", "calc.exe", "calc"),
+        ("explorateur de fichiers", "explorer.exe", "explorer"),
+        ("file explorer", "explorer.exe", "explorer"),
+        ("invite de commandes", "cmd.exe", "cmd"),
+        ("command prompt", "cmd.exe", "cmd"),
+        ("powershell", "powershell.exe", "powershell"),
+        ("visual studio code", "code.exe", "code"),
+        ("edge", "msedge.exe", "msedge"),
+        ("chrome", "chrome.exe", "chrome"),
+        ("firefox", "firefox.exe", "firefox"),
+    ]
+
+    for title in titles:
+        title_lower = title.lower()
+        for pattern, app_name, launch_cmd in _TITLE_APP_PATTERNS:
+            if pattern in title_lower:
+                # Skip system apps (explorer, etc.) and keep scanning the remaining patterns
+                if app_name.lower() in _SETUP_IGNORE_APPS:
+                    continue
+                return (app_name, launch_cmd, title)
+
+    return ("", "", "")
+
+
+def _get_visual_search_info(app_name: str) -> Dict[str, str]:
+    """Return the visual-search information for an application.
+
+    Looks up _APP_VISUAL_SEARCH; otherwise builds a fallback from the
+    executable name (e.g. "MonApp.exe" -> search_text="MonApp").
+
+    Args:
+        app_name: Executable name (e.g. "Notepad.exe").
+
+    Returns:
+        Dict with search_text, display_name, vlm_description.
+    """
+    app_lower = app_name.lower()
+    if app_lower in _APP_VISUAL_SEARCH:
+        return dict(_APP_VISUAL_SEARCH[app_lower])
+
+    # Fallback: use the name without .exe
+    base_name = app_name[:-4] if app_lower.endswith(".exe") else app_name
+    return {
+        "search_text": base_name,
+        "display_name": base_name,
+        "vlm_description": f"L'application {base_name} dans les résultats de recherche",
+    }
+
+
+def _generate_setup_actions(
+    app_info: Dict[str, Any],
+    setup_id_prefix: str = "setup",
+) -> List[Dict[str, Any]]:
+    """Générer les actions 100% visuelles pour ouvrir l'application avant le replay.
+
+    Approche entièrement visuelle -- JAMAIS de raccourcis clavier (Win, Win+R,
+    Ctrl+X, etc.) qui n'ont pas été enregistrés par l'utilisateur. Tout passe
+    par des clics visuels résolus par le VLM (Qwen2.5-VL).
+
+    La séquence est :
+    1. Clic visuel sur le bouton Démarrer (coin bas-gauche de l'écran)
+    2. Attendre que le menu Démarrer s'ouvre (1s)
+    3. Clic visuel sur la barre de recherche du menu Démarrer
+    4. Attendre que la barre de recherche soit active (500ms)
+    5. Taper le nom de l'application (texte français, ex: "Bloc-notes")
+    6. Attendre les résultats de recherche (1.2s)
+    7. Clic visuel sur le résultat de l'application trouvée
+    8. Attendre que l'application s'ouvre (2-3s selon le poids)
+    9. verify_screen : vérifier que la fenêtre attendue est apparue
+
+    Args:
+        app_info: Dict retourné par _extract_required_apps_from_events ou
+            _extract_required_apps_from_workflow.
+        setup_id_prefix: Préfixe pour les action_id générés.
+
+    Returns:
+        Liste d'actions normalisées, prêtes à injecter dans la queue.
+        Liste vide si aucune préparation n'est nécessaire.
+    """
+    if not app_info:
+        return []
+
+    launch_cmd = app_info.get("primary_launch_cmd", "")
+    primary_app = app_info.get("primary_app", "")
+    first_title = app_info.get("first_window_title", "")
+
+    if not launch_cmd:
+        logger.debug(
+            "setup_actions : pas de commande de lancement pour '%s', skip",
+            primary_app,
+        )
+        return []
+
+    # Ne pas lancer les apps système (toujours présentes)
+    if primary_app.lower() in _SETUP_IGNORE_APPS:
+        logger.debug("setup_actions : app '%s' ignorée (système)", primary_app)
+        return []
+
+    # Obtenir les informations de recherche visuelle pour cette app
+    visual_info = _get_visual_search_info(primary_app)
+    search_text = visual_info["search_text"]
+    display_name = visual_info["display_name"]
+    vlm_description = visual_info["vlm_description"]
+
+    actions = []
+
+    logger.info(
+        "Génération setup env 100%% visuel : lancement de '%s' via clic "
+        "Démarrer → recherche visuelle '%s' (fenêtre attendue : '%s')",
+        primary_app, search_text, first_title,
+    )
+
+    # 1. Clic visuel sur le bouton Démarrer (toujours visible, bas-gauche)
+    #    Le VLM résout la position exacte ; x_pct/y_pct sont des fallbacks.
+    actions.append({
+        "action_id": f"act_{setup_id_prefix}_click_start",
+        "type": "click",
+        "x_pct": 0.02,
+        "y_pct": 0.98,
+        "button": "left",
+        "visual_mode": True,
+        "target_spec": {
+            "by_text": "Démarrer",
+            "by_role": "start_button",
+            "vlm_description": (
+                "Le bouton Démarrer de Windows (icône Windows), "
+                "en bas à gauche de la barre des tâches"
+            ),
+        },
+        "_setup_phase": True,
+        "_setup_step": "click_start_menu",
+    })
+
+    # 2. Attendre que le menu Démarrer s'ouvre
+    actions.append({
+        "action_id": f"act_{setup_id_prefix}_wait_start",
+        "type": "wait",
+        "duration_ms": 1000,
+        "_setup_phase": True,
+        "_setup_step": "wait_start_menu",
+    })
+
+    # 3. Clic visuel sur la barre de recherche du menu Démarrer
+    actions.append({
+        "action_id": f"act_{setup_id_prefix}_click_search",
+        "type": "click",
+        "x_pct": 0.20,
+        "y_pct": 0.92,
+        "button": "left",
+        "visual_mode": True,
+        "target_spec": {
+            "by_text": "Rechercher",
+            "by_role": "search_box",
+            "vlm_description": (
+                "La barre ou le champ de recherche dans le menu Démarrer "
+                "de Windows, souvent intitulé 'Tapez ici pour rechercher' "
+                "ou 'Rechercher'"
+            ),
+        },
+        "_setup_phase": True,
+        "_setup_step": "click_search_box",
+    })
+
+    # 4. Attendre que la barre de recherche soit active et prête
+    actions.append({
+        "action_id": f"act_{setup_id_prefix}_wait_search_ready",
+        "type": "wait",
+        "duration_ms": 500,
+        "_setup_phase": True,
+        "_setup_step": "wait_search_ready",
+    })
+
+    # 5. Taper le nom visuel de l'application (texte français)
+    actions.append({
+        "action_id": f"act_{setup_id_prefix}_type_search",
+        "type": "type",
+        "text": search_text,
+        "_setup_phase": True,
+        "_setup_step": "type_app_name",
+    })
+
+    # 6. Attendre que la recherche Windows trouve l'application
+    actions.append({
+        "action_id": f"act_{setup_id_prefix}_wait_results",
+        "type": "wait",
+        "duration_ms": 1200,
+        "_setup_phase": True,
+        "_setup_step": "wait_search_results",
+    })
+
+    # 7. Clic visuel sur le résultat de l'application dans la liste
+    actions.append({
+        "action_id": f"act_{setup_id_prefix}_click_result",
+        "type": "click",
+        "x_pct": 0.20,
+        "y_pct": 0.50,
+        "button": "left",
+        "visual_mode": True,
+        "target_spec": {
+            "by_text": display_name,
+            "by_role": "app_icon",
+            "vlm_description": vlm_description,
+        },
+        "_setup_phase": True,
+        "_setup_step": "click_app_result",
+    })
+
+    # 8. Attendre que l'application s'ouvre
+    heavy_apps = {"winword.exe", "excel.exe", "powerpnt.exe", "outlook.exe", "code.exe"}
+    wait_ms = 3000 if primary_app.lower() in heavy_apps else 2000
+    actions.append({
+        "action_id": f"act_{setup_id_prefix}_wait_launch",
+        "type": "wait",
+        "duration_ms": wait_ms,
+        "_setup_phase": True,
+        "_setup_step": "wait_app_launch",
+    })
+
+    # 9. Vérification visuelle que la fenêtre attendue est apparue
+    if first_title:
+        actions.append({
+            "action_id": f"act_{setup_id_prefix}_verify",
+            "type": "verify_screen",
+            "expected_node": "setup_initial",
+            "timeout_ms": 5000,
+            "_setup_phase": True,
+            "_setup_step": "verify_app_ready",
+            "_expected_title": first_title,
+        })
+
+    logger.info(
+        "Setup env visuel généré : %d actions pour lancer '%s' "
+        "(recherche visuelle : '%s')",
+        len(actions), primary_app, search_text,
+    )
+
+    return actions
+
+
+# =========================================================================
+# Replay — Fonctions de conversion workflow → actions
+# =========================================================================
+
+def _find_active_agent_session(session_manager, machine_id: Optional[str] = None) -> Optional[str]:
+    """Pick the Agent V1 session that a replay should target.
+
+    Candidates are the sessions whose id starts with "sess_" (restricted to
+    ``machine_id`` when one is given). They are ranked by session_id in
+    descending order; the first non-finalized one wins, otherwise the
+    top-ranked one is returned even if it is finalized.
+
+    Args:
+        session_manager: Object exposing ``_lock`` and ``_sessions``.
+        machine_id: When provided, only sessions of this machine are considered.
+
+    Returns:
+        The selected session_id, or None when no candidate exists.
+    """
+    # Snapshot the candidates under the manager lock; ranking happens outside.
+    with session_manager._lock:
+        candidates = [
+            sess for sess in session_manager._sessions.values()
+            if sess.session_id.startswith("sess_")
+            and (machine_id is None or sess.machine_id == machine_id)
+        ]
+
+    if not candidates:
+        return None
+
+    # Descending session_id order (the id is said to embed a timestamp).
+    ranked = sorted(candidates, key=lambda sess: sess.session_id, reverse=True)
+    # Pass 1: first non-finalized session; pass 2: fall back to the top-ranked.
+    chosen = next((sess for sess in ranked if not sess.finalized), ranked[0])
+    return chosen.session_id
+
+
+def _workflow_to_actions(
+    workflow,
+    params: Optional[Dict[str, Any]] = None,
+    processor=None,
+    gesture_catalog=None,
+) -> List[Dict[str, Any]]:
+    """
+    Convert a workflow (ordered nodes + edges) into a list of normalized actions.
+
+    Walks the graph from the entry_nodes by following the edges.
+    Each edge is converted through _edge_to_normalized_actions().
+
+    Smart mode (learned workflows):
+        Taken when _is_learned_workflow(workflow) is true and a processor is
+        given: processor.build_hybrid_replay() is tried first, then
+        processor.extract_enriched_actions(); the first non-empty result is
+        returned (after optional gesture_catalog optimization).
+
+    Classic mode (VWB/manual workflows, or fallback when both are empty):
+        Breadth-first traversal using _edge_to_normalized_actions().
+    """
+    params = params or {}
+
+    # Learned-workflow detection (nodes carrying prototype vectors, plus edges);
+    # the smart path also requires a processor to be supplied
+    if _is_learned_workflow(workflow) and processor is not None:
+        # Priority 1: hybrid replay (raw events + workflow structure)
+        hybrid = processor.build_hybrid_replay(workflow)
+        if hybrid:
+            logger.info(
+                "Replay hybride : %d actions depuis events bruts + structure workflow",
+                len(hybrid),
+            )
+            # Keyboard-gesture optimization when a catalog is available
+            if gesture_catalog and hybrid:
+                hybrid = gesture_catalog.optimize_replay_actions(hybrid)
+            return hybrid
+
+        # Priority 2: classic enrichment (fallback when the hybrid result is empty)
+        enriched = processor.extract_enriched_actions(workflow, params)
+        if enriched:
+            logger.info(
+                "Replay intelligent : %d actions enrichies depuis le workflow appris",
+                len(enriched),
+            )
+            if gesture_catalog and enriched:
+                enriched = gesture_catalog.optimize_replay_actions(enriched)
+            return enriched
+        # Enrichment returned nothing either: fall through to classic mode
+        logger.warning(
+            "Enrichissement échoué pour le workflow appris, fallback mode classique"
+        )
+
+    # Classic mode (VWB/manual workflows, or fallback)
+    actions = []
+
+    # Index the outgoing edges by source node
+    outgoing: Dict[str, list] = defaultdict(list)
+    for edge in workflow.edges:
+        outgoing[edge.from_node].append(edge)
+
+    # FIFO (breadth-first) traversal seeded with all entry nodes
+    visited = set()
+    current_nodes = list(workflow.entry_nodes) if workflow.entry_nodes else []
+
+    # Fallback: without entry_nodes, start from the first node
+    if not current_nodes and workflow.nodes:
+        current_nodes = [workflow.nodes[0].node_id]
+
+    while current_nodes:
+        node_id = current_nodes.pop(0)
+        if node_id in visited:
+            continue
+        visited.add(node_id)
+
+        edges = outgoing.get(node_id, [])
+        for edge in edges:
+            edge_actions = _edge_to_normalized_actions(edge, params)
+            actions.extend(edge_actions)
+            # Follow the graph to the next node
+            if edge.to_node not in visited:
+                current_nodes.append(edge.to_node)
+
+    # Optimization: let the gesture catalog rewrite the action list when provided
+    if gesture_catalog and actions:
+        actions = gesture_catalog.optimize_replay_actions(actions)
+
+    return actions
+
+
+def _is_learned_workflow(workflow) -> bool:
+    """Tell whether a workflow was learned (as opposed to VWB/manual).
+
+    The workflow may be an object exposing ``nodes``/``edges`` attributes or
+    a plain dict with "nodes"/"edges" keys; any other value is not learned.
+    Nodes may likewise be objects with a ``metadata`` attribute or dicts
+    with a "metadata" key.
+
+    Returns:
+        True when the workflow has at least one node and one edge, and at
+        least one node carries a "_prototype_vector" key in a dict metadata.
+        False otherwise.
+    """
+    # Resolve nodes/edges from either representation (object or dict).
+    if hasattr(workflow, 'nodes'):
+        node_list, edge_list = workflow.nodes, workflow.edges
+    elif isinstance(workflow, dict):
+        node_list = workflow.get('nodes', [])
+        edge_list = workflow.get('edges', [])
+    else:
+        return False
+
+    # An empty graph (no nodes or no edges) is never a learned workflow.
+    if not (node_list and edge_list):
+        return False
+
+    def _has_prototype(node) -> bool:
+        # Only dict metadata is inspected; other metadata types never match.
+        meta = node.metadata if hasattr(node, 'metadata') else node.get('metadata', {})
+        return isinstance(meta, dict) and '_prototype_vector' in meta
+
+    # True as soon as one node carries a prototype vector (any() stops at the
+    # first match).
+    return any(_has_prototype(node) for node in node_list)
+
+
+def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    Convert a WorkflowEdge into a list of normalized actions for Agent V1.
+
+    A simple edge yields 1 action; a compound edge is expanded step by step.
+    """
+    action = edge.action
+    if action is None:
+        logger.warning(f"Edge {edge.edge_id} sans action, skip")
+        return []
+    action_type = action.type
+    target = action.target
+    action_params = action.parameters or {}
+
+    # by_position <= 1.0 on both axes is used as-is; otherwise divide by the ref size
+    x_pct = 0.0
+    y_pct = 0.0
+    if target and target.by_position:
+        px, py = target.by_position
+        if px <= 1.0 and py <= 1.0:
+            x_pct = px
+            y_pct = py
+        else:
+            ref_w = action_params.get("ref_width", 1920) or 1920
+            ref_h = action_params.get("ref_height", 1080) or 1080
+            x_pct = round(px / ref_w, 6)
+            y_pct = round(py / ref_h, 6)
+
+    base = {"edge_id": edge.edge_id, "from_node": edge.from_node, "to_node": edge.to_node}
+
+    # Compound: split into individual actions
+    if action_type == "compound":
+        return _expand_compound_steps(action_params.get("steps", []), base, params)
+
+    # Simple actions
+    normalized = {**base, "action_id": f"act_{uuid.uuid4().hex[:8]}"}
+
+    if action_type == "mouse_click":
+        normalized["type"] = "click"
+        normalized["x_pct"] = x_pct
+        normalized["y_pct"] = y_pct
+        normalized["button"] = action_params.get("button", "left")
+
+    elif action_type == "text_input":
+        normalized["type"] = "type"
+        text = action_params.get("text", "")
+        text = _substitute_variables(text, params, action_params.get("defaults", {}))
+        normalized["text"] = text
+        normalized["x_pct"] = x_pct
+        normalized["y_pct"] = y_pct
+
+    elif action_type == "key_press":
+        normalized["type"] = "key_combo"
+        keys = action_params.get("keys", [])
+        if not keys and action_params.get("key"):
+            keys = [action_params["key"]]
+        normalized["keys"] = keys
+
+    else:
+        logger.warning(f"Type d'action inconnu : {action_type}")
+        return []
+
+    # Attach the target_spec (role / text / context hints) for visual resolution
+    target_spec = {}
+    if target and target.by_role:
+        target_spec["by_role"] = target.by_role
+        normalized["target_role"] = target.by_role  # Debug compatibility
+    if target and target.by_text:
+        target_spec["by_text"] = target.by_text
+        normalized["target_text"] = target.by_text  # Debug compatibility
+    if target and hasattr(target, 'context_hints') and target.context_hints:
+        target_spec["context_hints"] = target.context_hints
+    if target_spec:
+        normalized["target_spec"] = target_spec
+        normalized["visual_mode"] = True  # Tells the agent to use visual resolution
+
+    return [normalized]
+
+
+def _substitute_variables(text: str, params: Dict[str, Any], defaults: Dict[str, Any]) -> str:
+    """Replace ``${var}`` placeholders in ``text`` (``params`` first, then ``defaults``).
+
+    Unknown variables stay verbatim; a ``None`` mapping is treated as empty.
+    """
+    user_values, fallback_values = params or {}, defaults or {}
+    def _resolve(match):
+        name = match.group(1)
+        return str(user_values.get(name, fallback_values.get(name, match.group(0))))
+
+    return re.sub(r'\$\{(\w+)\}', _resolve, text)
+
+
+def _expand_compound_steps(
+    steps: List[Dict[str, Any]], base: Dict[str, Any], params: Dict[str, Any]
+) -> List[Dict[str, Any]]:
+    """Expand the steps of a compound action into individual actions.
+
+    Every produced action starts from ``base`` plus a fresh ``action_id``.
+    Handled step types are key_press, text_input, wait and mouse_click;
+    any other type is logged at debug level and dropped.
+
+    Returns the actions in step order.
+    """
+    expanded = []
+    for step in steps:
+        kind = step.get("type", "unknown")
+        item = {**base, "action_id": f"act_{uuid.uuid4().hex[:8]}"}
+
+        if kind == "mouse_click":
+            item.update(
+                type="click",
+                x_pct=step.get("x_pct", 0.0),
+                y_pct=step.get("y_pct", 0.0),
+                button=step.get("button", "left"),
+            )
+        elif kind == "wait":
+            item.update(type="wait", duration_ms=step.get("duration_ms", 500))
+        elif kind == "text_input":
+            # No per-step defaults here: only the caller's params are substituted.
+            raw_text = step.get("text", "")
+            item.update(type="type", text=_substitute_variables(raw_text, params, {}))
+        elif kind == "key_press":
+            # A lone "key" becomes a one-element combo when "keys" is empty/missing.
+            combo = step.get("keys", [])
+            if not combo and step.get("key"):
+                combo = [step["key"]]
+            item.update(type="key_combo", keys=combo)
+        else:
+            logger.debug(f"Step compound inconnu : {kind}")
+            continue
+
+        expanded.append(item)
+
+    return expanded
+
+
+# =========================================================================
+# Pre-check écran — Vérification pré-action par embedding CLIP
+# =========================================================================
+
+
+def _pre_check_screen_state(
+    session_id: str,
+    expected_node_id: str,
+    current_screenshot_path: str,
+    active_processor,
+    replay_states: Dict[str, Dict[str, Any]],
+    replay_lock: threading.Lock,
+    precheck_threshold: float = 0.85,
+) -> Dict[str, Any]:
+    """Check that the current screen matches the expected state of the node.
+
+    Compares the current screenshot with the prototype of the expected node
+    through CLIP embedding cosine similarity.
+
+    Args:
+        session_id: ID of the replay session
+        expected_node_id: ID of the source node of the action (from_node)
+        current_screenshot_path: Path of the recent heartbeat screenshot
+        active_processor: StreamProcessor instance holding the CLIP embedder
+        replay_states: Shared dict of replay states
+        replay_lock: Lock guarding concurrent access to replay_states
+        precheck_threshold: Cosine similarity threshold
+
+    Returns:
+        {"match": bool, "similarity": float, "expected_node": str,
+         "popup_detected": bool, "reason": str (when skipped/mismatch/error)}
+    """
+    result: Dict[str, Any] = {
+        "match": True,
+        "similarity": 1.0,
+        "expected_node": expected_node_id,
+        "popup_detected": False,
+    }
+
+    try:
+        # 1. Find the running replay for this session
+        replay_state = None
+        workflow = None
+        with replay_lock:
+            for state in replay_states.values():
+                if state["session_id"] == session_id and state["status"] == "running":
+                    replay_state = state
+                    break
+
+        if not replay_state:
+            result["reason"] = "no_active_replay"
+            return result
+
+        workflow_id = replay_state.get("workflow_id", "")
+        with active_processor._data_lock:
+            workflow = active_processor._workflows.get(workflow_id)
+
+        if workflow is None:
+            result["reason"] = "workflow_not_found"
+            return result
+
+        # 2. Fetch the expected node
+        # Supports both Workflow objects and raw dicts
+        node = None
+        if hasattr(workflow, "get_node"):
+            node = workflow.get_node(expected_node_id)
+        elif isinstance(workflow, dict):
+            # Raw dict format (VWB/manual workflows)
+            for n in workflow.get("nodes", []):
+                if n.get("node_id") == expected_node_id:
+                    node = n
+                    break
+
+        if node is None:
+            result["reason"] = "node_not_found"
+            return result
+
+        # Extract the prototype vector
+        metadata = node.metadata if hasattr(node, "metadata") else node.get("metadata", {})
+        proto_list = metadata.get("_prototype_vector")
+        if not proto_list or not isinstance(proto_list, (list, tuple)):
+            result["reason"] = "no_prototype_vector"
+            return result
+
+        import numpy as np
+        prototype_vector = np.array(proto_list, dtype=np.float32)
+
+        # 3. CLIP embedding of the current screenshot (the with-block closes the file)
+        active_processor._ensure_initialized()
+        if active_processor._clip_embedder is None:
+            result["reason"] = "clip_embedder_unavailable"
+            return result
+
+        from PIL import Image
+        with Image.open(current_screenshot_path) as pil_image:
+            current_vector = active_processor._clip_embedder.embed_image(pil_image)
+
+        if current_vector is None or len(current_vector) == 0:
+            result["reason"] = "embedding_failed"
+            return result
+
+        # 4. Cosine similarity
+        current_vector = current_vector.flatten().astype(np.float32)
+        prototype_vector = prototype_vector.flatten().astype(np.float32)
+
+        norm_current = np.linalg.norm(current_vector)
+        norm_proto = np.linalg.norm(prototype_vector)
+        if norm_current < 1e-8 or norm_proto < 1e-8:
+            result["reason"] = "zero_norm_vector"
+            result["match"] = False
+            result["similarity"] = 0.0
+            return result
+
+        similarity = float(
+            np.dot(current_vector, prototype_vector) / (norm_current * norm_proto)
+        )
+        result["similarity"] = round(similarity, 4)
+        result["match"] = similarity >= precheck_threshold
+
+        if not result["match"]:
+            result["reason"] = "screen_mismatch"
+            logger.warning(
+                f"Pre-check MISMATCH pour session={session_id} "
+                f"node={expected_node_id}: similarity={similarity:.4f} "
+                f"< seuil={precheck_threshold}"
+            )
+
+            # 5. Popup detection through a window-title change
+            result["popup_detected"] = _detect_popup_hint(
+                session_id, workflow, expected_node_id, active_processor,
+            )
+
+    except Exception as e:
+        # Never block the replay because the pre-check itself failed
+        logger.error(f"Pre-check échoué (non bloquant): {e}")
+        result["match"] = True  # Permissive fallback
+        result["reason"] = f"precheck_error: {e}"
+
+    return result
+
+
+def _detect_popup_hint(
+    session_id: str,
+    workflow: Any,
+    expected_node_id: str,
+    processor_instance=None,
+) -> bool:
+    """Detect whether a popup or modal dialog is likely.
+
+    Compares the current window title (from the session's last_window_info)
+    with the title expected for the node in the workflow. A title change
+    suggests an unexpected popup/dialog.
+
+    Args:
+        session_id: Session ID
+        workflow: Workflow object or dict
+        expected_node_id: ID of the expected node
+        processor_instance: StreamProcessor used to reach the sessions
+
+    Returns:
+        True if a title change suggests a popup
+    """
+    try:
+        if processor_instance is None:
+            return False
+        # Current title, read from the session
+        session = processor_instance.session_manager.get_session(session_id)
+        if not session:
+            return False
+        current_title = session.last_window_info.get("title", "").strip().lower()
+        if not current_title or current_title == "unknown":
+            return False
+
+        # Expected title, read from the workflow node (template.window.title_contains)
+        expected_title = ""
+        if hasattr(workflow, "get_node"):
+            node = workflow.get_node(expected_node_id)
+            if node and hasattr(node, "template") and hasattr(node.template, "window"):
+                window_spec = node.template.window
+                if hasattr(window_spec, "title_contains") and window_spec.title_contains:
+                    expected_title = window_spec.title_contains.strip().lower()
+        elif isinstance(workflow, dict):
+            for n in workflow.get("nodes", []):
+                if n.get("node_id") == expected_node_id:
+                    template = n.get("template", {})
+                    window = template.get("window", {})
+                    expected_title = (window.get("title_contains") or "").strip().lower()
+                    break
+
+        if not expected_title:
+            return False
+
+        # The current title no longer contains the expected one: popup likely
+        if expected_title not in current_title:
+            logger.info(
+                f"Popup détectée: titre actuel='{current_title}' "
+                f"ne contient pas '{expected_title}'"
+            )
+            return True
+
+    except Exception as e:
+        logger.debug(f"Détection popup échouée: {e}")
+
+    return False
+
+
+# =========================================================================
+# Replay — État et retry
+# =========================================================================
+
+def _create_replay_state(
+    replay_id: str,
+    workflow_id: str,
+    session_id: str,
+    total_actions: int,
+    params: Optional[Dict[str, Any]] = None,
+    machine_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Build the initial tracking dict of a replay, in "running" status."""
+    # Identity, target machine and progress counters.
+    state: Dict[str, Any] = dict(
+        replay_id=replay_id,
+        workflow_id=workflow_id,
+        session_id=session_id,
+        machine_id=machine_id or "default",
+        status="running",
+        total_actions=total_actions,
+        completed_actions=0,
+        failed_actions=0,
+        current_action_index=0,
+        params=params or {},
+        results=[],  # per-action result history
+    )
+
+    # Error-tracking fields.
+    state.update(retried_actions=0, unverified_actions=0, error_log=[])
+    # Latest screenshot path, and the one taken before the last action.
+    state.update(last_screenshot=None, _last_screenshot_before=None)
+    # Supervised pause: context of the failing action and message for the user.
+    state.update(failed_action=None, pause_message=None)
+    return state
+
+
+def _schedule_retry(
+    session_id: str,
+    replay_state: Dict[str, Any],
+    action: Dict[str, Any],
+    current_retry: int,
+    reason: str,
+    replay_queues: Dict[str, List[Dict[str, Any]]],
+    retry_pending: Dict[str, Dict[str, Any]],
+    max_retries: int = 3,
+):
+    """
+    Schedule a retry for a failed action.
+
+    Strategy (max_retries is only used in the log message here):
+    - Retry 1: re-inject the action as is (visual re-resolution by the agent)
+    - Retry 2: inject a 2s wait before the action (a loading may be in progress)
+    - Retry 3: last attempt, injected as is
+
+    The retry is put at the head of the queue so that it is executed next.
+    The replay lock must be held by the caller.
+    """
+    next_retry = current_retry + 1
+    replay_state["retried_actions"] += 1
+
+    # Shallow-copy the action and give it a new action_id for tracking
+    retry_action = dict(action)
+    retry_action_id = f"{action.get('action_id', 'unknown')}_retry{next_retry}"
+    retry_action["action_id"] = retry_action_id
+
+    # Record the retry info, keyed by the new action_id, for the next result report
+    retry_pending[retry_action_id] = {
+        "action": action,
+        "retry_count": next_retry,
+        "replay_id": replay_state["replay_id"],
+        "reason": reason,
+    }
+
+    # Retry strategy depends on the attempt number
+    actions_to_insert = []
+
+    if next_retry == 2:
+        # Retry 2: inject a 2s wait before the action
+        wait_action = {
+            "action_id": f"wait_retry_{uuid.uuid4().hex[:6]}",
+            "type": "wait",
+            "duration_ms": 2000,
+        }
+        actions_to_insert.append(wait_action)
+
+    actions_to_insert.append(retry_action)
+
+    # Prepend to the queue (a new list object is stored under session_id)
+    queue = replay_queues.get(session_id, [])
+    replay_queues[session_id] = actions_to_insert + queue
+
+    logger.info(
+        f"Retry {next_retry}/{max_retries} programmé pour {action.get('action_id')} "
+        f"(raison: {reason}) | nouveau id: {retry_action_id}"
+    )
+
+
+def _notify_error_callback(
+    replay_state: Dict[str, Any],
+    action_id: str,
+    error: Optional[str],
+    error_callbacks: Dict[str, str],
+):
+    """
+    Notify the error callback registered for this replay, if any.
+
+    Sends an HTTP POST (JSON, 5s timeout) from a daemon thread; failures are logged only.
+    NOTE(review): the URL scheme is not checked and the payload reads replay_state in the thread.
+    """
+    replay_id = replay_state["replay_id"]
+    callback_url = error_callbacks.get(replay_id)
+    if not callback_url:
+        return
+
+    def _send_callback():
+        try:
+            import urllib.request
+            payload = json.dumps({
+                "replay_id": replay_id,
+                "workflow_id": replay_state.get("workflow_id"),
+                "session_id": replay_state.get("session_id"),
+                "action_id": action_id,
+                "error": error or "Erreur inconnue",
+                "retried_actions": replay_state.get("retried_actions", 0),
+                "error_log": replay_state.get("error_log", []),
+                "status": replay_state.get("status"),
+            }).encode("utf-8")
+
+            req = urllib.request.Request(
+                callback_url,
+                data=payload,
+                headers={"Content-Type": "application/json"},
+                method="POST",
+            )
+            with urllib.request.urlopen(req, timeout=5) as resp:
+                logger.info(
+                    f"Error callback envoyé à {callback_url}: {resp.status}"
+                )
+        except Exception as e:
+            logger.warning(
+                f"Échec envoi error callback à {callback_url}: {e}"
+            )
+
+    # Send in the background so the caller is not blocked
+    threading.Thread(target=_send_callback, daemon=True).start()
diff --git a/agent_v0/server_v1/resolve_engine.py b/agent_v0/server_v1/resolve_engine.py
new file mode 100644
index 000000000..ea90b57db
--- /dev/null
+++ b/agent_v0/server_v1/resolve_engine.py
@@ -0,0 +1,1953 @@
+# agent_v0/server_v1/resolve_engine.py
+"""
+Résolution visuelle des cibles UI pour le replay.
+
+Contient toutes les stratégies de résolution :
+- Template matching OpenCV (~100ms)
+- YOLO/OmniParser (~0.6-0.8s)
+- VLM Quick Find (~3-8s)
+- VLM Grounding Direct (~5-15s)
+- SomEngine + VLM (~5-15s)
+- Matching sémantique ScreenAnalyzer (~15-20s)
+- Pré-analyse écran (Observer — popup detection)
+
+Extrait de api_stream.py pour clarifier l'architecture.
+"""
+
+import base64
+import io
+import logging
+import os
+import re
+import tempfile
+import threading
+import time
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel
+
+logger = logging.getLogger("api_stream")
+
+
+# =========================================================================
+# Modèles Pydantic
+# =========================================================================
+
+class ResolveTargetRequest(BaseModel):
+    """Requête de résolution visuelle d'une cible."""
+    session_id: str
+    screenshot_b64: str  # Base64-encoded JPEG screenshot
+    target_spec: Dict[str, Any]  # {by_role, by_text, by_position, ...}
+    fallback_x_pct: float = 0.0  # Fallback coordinates
+    fallback_y_pct: float = 0.0
+    screen_width: int = 1920
+    screen_height: int = 1080
+    strict_mode: bool = False  # True for session replays (template threshold 0.90 + YOLO)
+
+
+class PreAnalyzeRequest(BaseModel):
+    """Requête de pré-analyse écran (Observer)."""
+    screenshot_b64: str
+    expected_state: str = ""       # Expected description of the screen state
+    window_title: str = ""         # Expected window title
+    screen_width: int = 1920
+    screen_height: int = 1080
+
+
+# =========================================================================
+# Template Matching
+# =========================================================================
+
+def _resolve_by_template_matching(
+    screenshot_path: str,
+    anchor_image_b64: str,
+    screen_width: int,
+    screen_height: int,
+    confidence_threshold: float = 0.7,
+) -> Optional[Dict[str, Any]]:
+    """Locate an anchor on the screenshot with OpenCV template matching.
+
+    Compares the anchor image (a crop) against the current screenshot with
+    cv2.matchTemplate (TM_CCOEFF_NORMED) over a fixed list of scales.
+
+    Args:
+        screenshot_path: Path of the current screenshot (read with cv2.imread)
+        anchor_image_b64: Base64-encoded anchor image (decoded with cv2.imdecode)
+        screen_width: Unused here; ratios are based on the screenshot's own size
+        screen_height: Unused here; ratios are based on the screenshot's own size
+        confidence_threshold: Minimum best score required to accept the match
+
+    Returns:
+        Dict with resolved=True and the match coordinates, or None if no match
+    """
+    try:
+        import cv2
+        import numpy as np
+    except ImportError:
+        logger.warning("OpenCV non disponible pour template matching")
+        return None
+
+    try:
+        # Load the screenshot
+        screenshot = cv2.imread(screenshot_path)
+        if screenshot is None:
+            logger.warning("Impossible de lire le screenshot : %s", screenshot_path)
+            return None
+
+        # Decode the anchor image from base64
+        anchor_bytes = base64.b64decode(anchor_image_b64)
+        anchor_array = np.frombuffer(anchor_bytes, dtype=np.uint8)
+        anchor_img = cv2.imdecode(anchor_array, cv2.IMREAD_COLOR)
+        if anchor_img is None:
+            logger.warning("Impossible de décoder l'image de l'ancre")
+            return None
+
+        # Matching is done on grayscale images
+        screenshot_gray = cv2.cvtColor(screenshot, cv2.COLOR_BGR2GRAY)
+        anchor_gray = cv2.cvtColor(anchor_img, cv2.COLOR_BGR2GRAY)
+
+        # The anchor must not be larger than the screenshot
+        sh, sw = screenshot_gray.shape[:2]
+        ah, aw = anchor_gray.shape[:2]
+        if ah > sh or aw > sw:
+            logger.warning(
+                "Ancre (%dx%d) plus grande que le screenshot (%dx%d)",
+                aw, ah, sw, sh,
+            )
+            return None
+
+        # Multi-scale template matching: scale 1.0 is tried first, then
+        # a few variations in case the resolution changed between runs.
+        # The 0.5x-2.0x range is meant to cover large gaps
+        # (e.g. recorded at 2560x1600, replayed at 1280x720 = ratio ~0.5x)
+        best_val = -1.0
+        best_loc = None
+        best_scale = 1.0
+        best_anchor_size = (aw, ah)
+
+        for scale in [1.0, 0.9, 1.1, 0.8, 1.2, 0.75, 1.25, 0.6, 1.5, 0.5, 1.75, 2.0]:
+            if scale != 1.0:
+                new_w = int(aw * scale)
+                new_h = int(ah * scale)
+                if new_w < 10 or new_h < 10 or new_w > sw or new_h > sh:
+                    continue
+                scaled_anchor = cv2.resize(anchor_gray, (new_w, new_h))
+            else:
+                scaled_anchor = anchor_gray
+                new_w, new_h = aw, ah
+
+            result = cv2.matchTemplate(screenshot_gray, scaled_anchor, cv2.TM_CCOEFF_NORMED)
+            _, max_val, _, max_loc = cv2.minMaxLoc(result)
+
+            if max_val > best_val:
+                best_val = max_val
+                best_loc = max_loc
+                best_scale = scale
+                best_anchor_size = (new_w, new_h)
+
+            # A very good match makes the remaining scales unnecessary
+            if best_val >= 0.95:
+                break
+
+        if best_val < confidence_threshold:
+            logger.info(
+                "Template matching : meilleur score=%.3f < seuil=%.3f (ancre %dx%d, écran %dx%d)",
+                best_val, confidence_threshold, aw, ah, sw, sh,
+            )
+            return None
+
+        # Centre of the matched box, in screenshot pixels
+        match_w, match_h = best_anchor_size
+        cx = best_loc[0] + match_w / 2.0
+        cy = best_loc[1] + match_h / 2.0
+
+        # Convert to ratios of the screenshot size
+        x_pct = round(cx / sw, 6) if sw > 0 else 0.0
+        y_pct = round(cy / sh, 6) if sh > 0 else 0.0
+
+        logger.info(
+            "Template matching OK : score=%.3f, échelle=%.2f, "
+            "centre=(%d, %d) → (%.4f, %.4f) sur %dx%d",
+            best_val, best_scale, int(cx), int(cy), x_pct, y_pct, sw, sh,
+        )
+
+        return {
+            "resolved": True,
+            "method": "template_matching",
+            "x_pct": x_pct,
+            "y_pct": y_pct,
+            "matched_element": {
+                "label": "anchor_template",
+                "type": "visual_anchor",
+                "role": "anchor",
+                "center": [int(cx), int(cy)],
+                "confidence": best_val,
+            },
+            "score": best_val,
+            "scale": best_scale,
+            "match_box": {
+                "x": best_loc[0],
+                "y": best_loc[1],
+                "width": match_w,
+                "height": match_h,
+            },
+        }
+
+    except Exception as e:
+        logger.error("Erreur template matching : %s", e)
+        return None
+
+
+def _validate_match_context(
+    result: Dict[str, Any],
+    original_x_pct: float,
+    original_y_pct: float,
+    target_spec: Dict[str, Any],
+    max_distance: float = 0.35,
+) -> bool:
+    """Check that the found position lies in the same area as the original one.
+
+    Guards against template-matching false positives: a visually similar button
+    located in a very different part of the screen.
+
+    Args:
+        result: Match result; x_pct / y_pct are read (0.0 when missing).
+        original_x_pct: Original X position (ratio, 0.0-1.0).
+        original_y_pct: Original Y position (ratio, 0.0-1.0).
+        target_spec: Target specification (currently unused; kept in the
+            signature for possible contextual rules).
+        max_distance: Maximum accepted Euclidean distance, in normalized units
+            (x and y are both ratios). Default 0.35.
+
+    Returns:
+        True when the distance is within max_distance, False otherwise.
+    """
+    found_x = result.get("x_pct", 0.0)
+    found_y = result.get("y_pct", 0.0)
+
+    # Euclidean distance between the two points, in ratio units
+    dx = found_x - original_x_pct
+    dy = found_y - original_y_pct
+    distance = (dx ** 2 + dy ** 2) ** 0.5
+
+    if distance > max_distance:
+        logger.debug(
+            "Context validation : distance=%.3f > max=%.3f "
+            "(found=(%.3f, %.3f), original=(%.3f, %.3f))",
+            distance, max_distance, found_x, found_y, original_x_pct, original_y_pct,
+        )
+        return False
+
+    return True
+
+
+# =========================================================================
+# YOLO/OmniParser — Résolution par détection d'éléments UI
+# =========================================================================
+
+# Lazily loaded OmniParser singleton, created by _get_omniparser() under the lock
+_omniparser_available: Optional[bool] = None  # None = not checked yet
+_omniparser_instance = None
+_omniparser_lock = threading.Lock()
+
+
+def _get_omniparser():
+    """Return the shared OmniParser instance (lazy loading, lock-guarded).
+
+    Returns:
+        The OmniParserAdapter, or None when it is unavailable.
+    """
+    global _omniparser_available, _omniparser_instance
+    if _omniparser_available is False:
+        return None
+    if _omniparser_instance is not None:
+        return _omniparser_instance
+
+    with _omniparser_lock:
+        if _omniparser_available is False:
+            return None
+        if _omniparser_instance is not None:
+            return _omniparser_instance
+        try:
+            from core.detection.omniparser_adapter import OmniParserAdapter
+            adapter = OmniParserAdapter()
+            if adapter.available:
+                _omniparser_instance = adapter
+                _omniparser_available = True
+                logger.info("OmniParser disponible pour la résolution YOLO")
+                return adapter
+            else:
+                _omniparser_available = False
+                logger.info("OmniParser : modèles non trouvés, YOLO désactivé")
+                return None
+        except ImportError:
+            _omniparser_available = False
+            logger.info("OmniParser non installé, YOLO désactivé")
+            return None
+        except Exception as e:
+            _omniparser_available = False
+            logger.warning("OmniParser init échouée : %s", e)
+            return None
+
+
+def _resolve_by_yolo(
+    screenshot_path: str,
+    anchor_image_b64: str,
+    screen_width: int,
+    screen_height: int,
+    target_spec: Dict[str, Any],
+) -> Optional[Dict[str, Any]]:
+    """Resolve a target via YOLO/OmniParser: detect every UI element, then
+    match the reference crop (anchor) against each detected element.
+
+    Strategy:
+    1. OmniParser detects the UI elements of the screenshot
+    2. Each detected element is template-matched locally against the anchor
+    3. A best match (score >= 0.50) that leads the runner-up by >= 0.10 wins
+    4. Two or more close matches are ambiguous: return None (left to the VLM)
+
+    Args:
+        screenshot_path: Path to the screenshot (any mode PIL can convert to RGB)
+        anchor_image_b64: Base64-encoded anchor image
+        screen_width: Unused here; ratios are based on the screenshot's own size
+        screen_height: Unused here; ratios are based on the screenshot's own size
+        target_spec: Target specification (not read by this function)
+
+    Returns:
+        Dict with resolved=True, x_pct, y_pct, score and match details, or None
+        when OmniParser is unavailable, nothing matches, or the match is ambiguous
+    """
+    try:
+        import cv2
+        import numpy as np
+    except ImportError:
+        return None
+
+    omniparser = _get_omniparser()
+    if omniparser is None:
+        return None
+
+    t0 = time.time()
+
+    try:
+        from PIL import Image as PILImage
+
+        # Load the screenshot with PIL; the context manager closes the file
+        # handle, and convert("RGB") normalises RGBA / palette / grayscale
+        # inputs that previously made the cvtColor call below raise.
+        with PILImage.open(screenshot_path) as raw_screenshot:
+            screenshot_pil = raw_screenshot.convert("RGB")
+        sw, sh = screenshot_pil.size
+
+        # Grayscale numpy array of the screenshot for the template matching
+        # (PIL delivers RGB, hence COLOR_RGB2GRAY rather than BGR2GRAY).
+        screenshot_gray = cv2.cvtColor(
+            np.array(screenshot_pil), cv2.COLOR_RGB2GRAY
+        )
+
+        # Decode the anchor from base64
+        anchor_bytes = base64.b64decode(anchor_image_b64)
+        anchor_array = np.frombuffer(anchor_bytes, dtype=np.uint8)
+        anchor_img = cv2.imdecode(anchor_array, cv2.IMREAD_COLOR)
+        if anchor_img is None:
+            logger.warning("YOLO resolve : impossible de décoder l'anchor")
+            return None
+        anchor_gray = cv2.cvtColor(anchor_img, cv2.COLOR_BGR2GRAY)
+        anchor_h, anchor_w = anchor_gray.shape[:2]
+
+        # Detect all UI elements with OmniParser
+        elements = omniparser.detect(screenshot_pil)
+        if not elements:
+            elapsed = time.time() - t0
+            logger.info("YOLO resolve : 0 éléments détectés (%.1fs)", elapsed)
+            return None
+
+        logger.info(
+            "YOLO resolve : %d éléments détectés, matching anchor %dx%d...",
+            len(elements), anchor_w, anchor_h,
+        )
+
+        # Match the anchor against every detected element
+        YOLO_MATCH_THRESHOLD = 0.50
+        matches = []
+
+        for elem in elements:
+            x1, y1, x2, y2 = elem.bbox
+            elem_w = x2 - x1
+            elem_h = y2 - y1
+
+            # Skip elements that are too small
+            if elem_w < 5 or elem_h < 5:
+                continue
+
+            # Extract the element crop from the screenshot
+            elem_crop = screenshot_gray[y1:y2, x1:x2]
+            if elem_crop.size == 0:
+                continue
+
+            # Local template matching: the anchor is resized to the element
+            # size, then (when it fits) also tried at its native size
+            try:
+                # Approach: resize the anchor to the crop size and compare
+                if elem_w > 0 and elem_h > 0:
+                    anchor_resized = cv2.resize(anchor_gray, (elem_w, elem_h))
+                    result = cv2.matchTemplate(
+                        elem_crop, anchor_resized, cv2.TM_CCOEFF_NORMED
+                    )
+                    _, max_val, _, _ = cv2.minMaxLoc(result)
+                else:
+                    continue
+
+                # Also slide the native-size anchor inside the crop when it fits
+                if elem_w >= anchor_w and elem_h >= anchor_h:
+                    result2 = cv2.matchTemplate(
+                        elem_crop, anchor_gray, cv2.TM_CCOEFF_NORMED
+                    )
+                    _, max_val2, _, _ = cv2.minMaxLoc(result2)
+                    max_val = max(max_val, max_val2)
+
+                if max_val >= YOLO_MATCH_THRESHOLD:
+                    matches.append((elem, max_val))
+
+            except cv2.error:
+                continue
+
+        elapsed = time.time() - t0
+
+        if not matches:
+            logger.info(
+                "YOLO resolve : aucun match >= %.2f parmi %d éléments (%.1fs)",
+                YOLO_MATCH_THRESHOLD, len(elements), elapsed,
+            )
+            return None
+
+        # Sort by descending score
+        matches.sort(key=lambda m: m[1], reverse=True)
+        best_elem, best_score = matches[0]
+
+        # Two or more matches with close scores (gap < 0.10) are ambiguous
+        # → leave the decision to the VLM
+        if len(matches) >= 2:
+            second_score = matches[1][1]
+            if best_score - second_score < 0.10:
+                logger.info(
+                    "YOLO resolve : %d matchs ambigus (best=%.3f, second=%.3f, "
+                    "écart=%.3f < 0.10), VLM requis (%.1fs)",
+                    len(matches), best_score, second_score,
+                    best_score - second_score, elapsed,
+                )
+                return None
+
+        # A single clear match → accept it
+        cx, cy = best_elem.center
+        x_pct = round(cx / sw, 6) if sw > 0 else 0.0
+        y_pct = round(cy / sh, 6) if sh > 0 else 0.0
+
+        logger.info(
+            "YOLO resolve OK : '%s' (%s) score=%.3f → (%.4f, %.4f) "
+            "parmi %d éléments, %d matchs (%.1fs)",
+            best_elem.label, best_elem.element_type, best_score,
+            x_pct, y_pct, len(elements), len(matches), elapsed,
+        )
+
+        return {
+            "resolved": True,
+            "method": "yolo_omniparser",
+            "x_pct": x_pct,
+            "y_pct": y_pct,
+            "matched_element": {
+                "label": best_elem.label,
+                "type": best_elem.element_type,
+                "role": "yolo_detected",
+                "center": [cx, cy],
+                "confidence": best_score,
+            },
+            "score": best_score,
+            "yolo_elements_count": len(elements),
+            "yolo_matches_count": len(matches),
+        }
+
+    except Exception as e:
+        elapsed = time.time() - t0
+        logger.warning("YOLO resolve : exception (%.1fs) — %s", elapsed, e)
+        return None
+
+
+# =========================================================================
+# VLM Quick Find — Fallback léger quand le template matching échoue
+# =========================================================================
+
+# Ollama client singleton (created on the first call, not at start-up)
+_vlm_client = None
+_vlm_client_lock = threading.Lock()
+
+# Dedicated timeout passed to the VLM Quick Find client
+_VLM_QUICK_FIND_TIMEOUT = 30  # seconds
+
+
+def _get_vlm_client():
+    """Return the shared Ollama client for the VLM Quick Find, creating it lazily.
+
+    The client is only built on the first call, not at server start-up
+    (the original author's rationale: avoid blocking when Ollama is down).
+    The model name comes from vlm_config.get_vlm_model().
+    """
+    global _vlm_client
+    if _vlm_client is not None:
+        return _vlm_client
+    with _vlm_client_lock:
+        if _vlm_client is not None:
+            return _vlm_client
+        try:
+            from core.detection.ollama_client import OllamaClient
+            from core.detection.vlm_config import get_vlm_model
+            _model = get_vlm_model()
+            _vlm_client = OllamaClient(
+                endpoint="http://localhost:11434",
+                model=_model,
+                timeout=_VLM_QUICK_FIND_TIMEOUT,
+            )
+            logger.info("VLM Quick Find : client Ollama initialisé (%s)", _model)
+        except Exception as e:
+            logger.warning("VLM Quick Find : impossible d'initialiser le client Ollama : %s", e)
+            return None
+    return _vlm_client
+
+
+def _build_target_description(target_spec: Dict[str, Any]) -> str:
+    """Build a textual description of the element to look for.
+
+    Intended for the VLM Quick Find prompt (caller not shown in this chunk).
+
+    Args:
+        target_spec: Target specification; by_text / by_role may be missing or None
+
+    Returns:
+        Natural-language (French) description, e.g. "un bouton contenant 'Valider'"
+    """
+    by_text = str(target_spec.get("by_text") or "").strip()
+    by_role = str(target_spec.get("by_role") or "").strip()
+
+    if by_text and by_role:
+        return f"un {by_role} contenant '{by_text}'"
+    elif by_text:
+        return f"élément contenant le texte '{by_text}'"
+    elif by_role:
+        return f"un {by_role}"
+    else:
+        return "l'élément interactif principal"
+
+
+def _vlm_quick_find(
+    screenshot_path: str,
+    target_description: str,
+    anchor_image_b64: Optional[str] = None,
+) -> Optional[Dict[str, Any]]:
+    """Ask the VLM to locate an element on the screenshot.
+
+    VLM-first strategy for replay (original author's rationale): the VLM reads
+    the screen context and may find an element whose appearance has changed.
+
+    Operating modes:
+    - anchor_image_b64 + description: multi-image (screenshot + reference crop).
+      The prompt presents the crop as the second image, plus the context text.
+    - description only: single image, the VLM searches from the text description.
+    - anchor_image_b64 only (no description): multi-image with a visual-only prompt.
+
+    Args:
+        screenshot_path: Path of the current screenshot
+        target_description: Rich description of the element to find.
+            E.g.: "Dans la fenêtre 'Exécuter', l'élément cliqué en bas au centre"
+        anchor_image_b64: Reference image (crop) in base64 (optional).
+            When given, it is passed to the client as extra_images_b64.
+
+    Returns:
+        Dict with resolved=True, method="vlm_quick_find", x_pct, y_pct,
+        matched_element and score; None if nothing is found or on error
+    """
+    client = _get_vlm_client()
+    if client is None:
+        logger.debug("VLM Quick Find : client Ollama non disponible, skip")
+        return None
+
+    t0 = time.time()
+
+    # Pick the prompt according to the information available
+    has_anchor = bool(anchor_image_b64)
+    has_description = bool(target_description and target_description.strip())
+
+    if has_anchor and has_description:
+        # Richest mode: screenshot + reference crop + text description
+        prompt = (
+            "The first image is the current screen. "
+            "The second image shows the element I want to click.\n\n"
+            f"Context: {target_description}\n\n"
+            "Find this exact element on the screen and return its CENTER coordinates "
+            "as percentage of the screen dimensions.\n"
+            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
+            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
+        )
+    elif has_anchor:
+        # Visual-only mode: screenshot + crop, no description
+        prompt = (
+            "The first image is the current screen. "
+            "The second image shows the element I want to click.\n\n"
+            "Find this exact element on the screen and return its CENTER coordinates "
+            "as percentage of the screen dimensions.\n"
+            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
+            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
+        )
+    else:
+        # Description-only mode
+        prompt = (
+            "Look at this screenshot carefully.\n\n"
+            f"{target_description}\n\n"
+            "Find this element and return its CENTER coordinates "
+            "as percentage of the image dimensions.\n"
+            'Return ONLY a JSON object: {"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}\n'
+            'If the element is not visible, return: {"x_pct": null, "y_pct": null, "confidence": 0.0}'
+        )
+
+    system_prompt = "You are a UI element locator. Output raw JSON only. No explanation."
+
+    try:
+        # Extra images for the request (the anchor crop, when present)
+        extra_images = [anchor_image_b64] if has_anchor else None
+
+        result = client.generate(
+            prompt=prompt,
+            image_path=screenshot_path,
+            system_prompt=system_prompt,
+            temperature=0.1,
+            max_tokens=200,
+            force_json=False,
+            extra_images_b64=extra_images,
+        )
+
+        elapsed = time.time() - t0
+
+        if not result.get("success"):
+            logger.info(
+                "VLM Quick Find : échec appel VLM (%.1fs) — %s",
+                elapsed, result.get("error", "?"),
+            )
+            return None
+
+        response_text = result.get("response", "").strip()
+        if not response_text:
+            logger.info("VLM Quick Find : réponse vide du VLM (%.1fs)", elapsed)
+            return None
+
+        # Parse the JSON reply with OllamaClient's own extractor (private helper)
+        parsed = client._extract_json_from_response(response_text)
+        if parsed is None:
+            logger.info(
+                "VLM Quick Find : réponse non-JSON (%.1fs) — %.80s",
+                elapsed, response_text,
+            )
+            return None
+
+        # Validate the coordinates
+        x_pct = parsed.get("x_pct")
+        y_pct = parsed.get("y_pct")
+        confidence = float(parsed.get("confidence", 0.0))
+
+        if x_pct is None or y_pct is None or confidence < 0.3:
+            logger.info(
+                "VLM Quick Find : élément non trouvé ou confiance trop basse "
+                "(%.1fs, confidence=%.2f) pour '%s'",
+                elapsed, confidence,
+                target_description[:80] if target_description else "(anchor only)",
+            )
+            return None
+
+        x_pct = float(x_pct)
+        y_pct = float(y_pct)
+
+        # Reject coordinates outside the [0, 1] range
+        if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
+            logger.info(
+                "VLM Quick Find : coordonnées hors bornes (%.4f, %.4f), ignoré",
+                x_pct, y_pct,
+            )
+            return None
+
+        mode_str = "multi-image" if has_anchor else "description"
+        desc_short = (target_description[:60] + "...") if target_description and len(target_description) > 60 else (target_description or "(anchor)")
+        logger.info(
+            "VLM Quick Find OK [%s] : '%s' → (%.4f, %.4f) confidence=%.2f en %.1fs",
+            mode_str, desc_short, x_pct, y_pct, confidence, elapsed,
+        )
+
+        return {
+            "resolved": True,
+            "method": "vlm_quick_find",
+            "x_pct": round(x_pct, 6),
+            "y_pct": round(y_pct, 6),
+            "matched_element": {
+                "label": target_description or "anchor_visual",
+                "type": "vlm_located",
+                "role": "vlm_quick_find",
+                "confidence": confidence,
+            },
+            "score": confidence,
+        }
+
+    except Exception as e:
+        elapsed = time.time() - t0
+        logger.warning(
+            "VLM Quick Find : exception (%.1fs) — %s", elapsed, e,
+        )
+        return None
+
+
+# ---------------------------------------------------------------------------
+# Direct VLM grounding resolution (Ollama model set via RPA_GROUNDING_MODEL)
+# ---------------------------------------------------------------------------
+
+
+def _resolve_by_grounding(
+    screenshot_path: str,
+    target_spec: Dict[str, Any],
+    screen_width: int,
+    screen_height: int,
+) -> Optional[Dict[str, Any]]:
+    """Résoudre une cible via grounding VLM direct.
+
+    Le modèle VLM (gemma4:e4b par défaut, configurable via RPA_VLM_MODEL)
+    reçoit le screenshot + une description textuelle et retourne
+    directement les coordonnées de l'élément. Pas de SomEngine,
+    pas de numérotation — le VLM fait du grounding UI natif.
+
+    Approche plus fiable que SomEngine+VLM pour les icônes et éléments
+    visuels sans texte (logo Windows, disquette, bouton fermer).
+    """
+    t0 = time.time()
+
+    # Construire la description de la cible
+    by_text = target_spec.get("by_text", "").strip()
+    vlm_desc = target_spec.get("vlm_description", "").strip()
+    window_title = target_spec.get("window_title", "").strip()
+
+    if by_text:
+        description = by_text
+    elif vlm_desc:
+        description = vlm_desc
+    else:
+        return None
+
+    # Utiliser la capture fenêtre si disponible (plus ciblée, moins de bruit)
+    # Sinon fallback sur le full screen
+    window_capture = target_spec.get("window_capture", {})
+    window_rect = window_capture.get("rect")  # [x1, y1, x2, y2] écran
+
+    try:
+        from PIL import Image as PILImage
+        from pathlib import Path
+
+        # Utiliser la fenêtre active : cropper depuis le screenshot full
+        # via window_rect (fonctionne au replay comme à l'enregistrement)
+        img = PILImage.open(screenshot_path)
+
+        if window_rect:
+            x1, y1, x2, y2 = window_rect
+            img = img.crop((x1, y1, x2, y2))
+            using_window = True
+            logger.debug("Grounding : crop fenêtre (%d,%d,%d,%d) → %dx%d", x1, y1, x2, y2, *img.size)
+        else:
+            using_window = False
+
+        orig_w, orig_h = img.size
+        small_w, small_h = orig_w, orig_h  # pas de redimensionnement
+
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=80)
+        shot_b64 = base64.b64encode(buf.getvalue()).decode()
+    except Exception as e:
+        logger.warning("Grounding : erreur chargement image — %s", e)
+        return None
+
+    # Prompt natif Qwen2.5-VL — format bbox_2d (le seul fiable)
+    # Ajouter la position relative pour désambiguïser (ex: deux "Rechercher" à l'écran)
+    original_pos = target_spec.get("original_position", {})
+    pos_hint = ""
+    y_rel = original_pos.get("y_relative", "")
+    x_rel = original_pos.get("x_relative", "")
+    if y_rel or x_rel:
+        pos_hint = f" located {y_rel} {x_rel} of the screen".strip()
+    prompt = f"Detect '{description}'{pos_hint} in this image with a bounding box."
+
+    # Le grounding nécessite un modèle entraîné pour les coordonnées (bbox_2d).
+    # Qwen2.5-VL est le seul qui retourne des positions précises.
+    # gemma4 comprend les images mais ne sait pas localiser en coordonnées.
+    _grounding_model = os.environ.get("RPA_GROUNDING_MODEL", "qwen2.5vl:7b")
+
+    # Appel VLM — vLLM (GPU, rapide) en priorité, Ollama en fallback
+    import requests as _requests
+    content = ""
+
+    # Port vLLM configurable via env
+    _vllm_port = os.environ.get("VLLM_PORT", "8100")
+    _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
+
+    # Essai 1 : vLLM (API OpenAI-compatible, GPU)
+    try:
+        vllm_resp = _requests.post(
+            f"http://localhost:{_vllm_port}/v1/chat/completions",
+            json={
+                "model": _vllm_model,
+                "messages": [
+                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
+                    {"role": "user", "content": [
+                        {"type": "text", "text": prompt},
+                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
+                    ]},
+                ],
+                "temperature": 0.1,
+                "max_tokens": 80,
+            },
+            timeout=30,
+        )
+        if vllm_resp.ok:
+            content = vllm_resp.json().get("choices", [{}])[0].get("message", {}).get("content", "")
+            if content:
+                logger.debug("Grounding via vLLM OK")
+    except Exception as e:
+        logger.debug("vLLM non disponible (%s), fallback Ollama", e)
+
+    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif)
+    if not content:
+        try:
+            resp = _requests.post("http://localhost:11434/api/chat", json={
+                "model": _grounding_model,
+                "messages": [
+                    {"role": "user", "content": prompt, "images": [shot_b64]},
+                ],
+                "stream": False,
+                "options": {"temperature": 0.1, "num_predict": 100},
+            }, timeout=60)
+            content = resp.json().get("message", {}).get("content", "")
+        except Exception as e:
+            logger.info("Grounding VLM timeout/erreur : %s", e)
+            return None
+
+    elapsed = time.time() - t0
+
+    # Parser la réponse — supporte bbox_2d en pixels, JSON %, arrays bruts
+    x_pct, y_pct = None, None
+
+    # Format 1 : bbox_2d en pixels [x, y] ou [x1, y1, x2, y2]
+    bbox_match = re.search(r'"bbox_2d"\s*:\s*\[([^\]]+)\]', content)
+    if bbox_match:
+        coords = [float(v.strip()) for v in bbox_match.group(1).split(",")]
+        if len(coords) == 2:
+            x_pct = coords[0] / small_w
+            y_pct = coords[1] / small_h
+        elif len(coords) >= 4:
+            x_pct = (coords[0] + coords[2]) / 2 / small_w
+            y_pct = (coords[1] + coords[3]) / 2 / small_h
+
+    # Format 2 : JSON {"x": 0.XX, "y": 0.YY}
+    if x_pct is None:
+        json_match = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content)
+        if json_match:
+            x_val, y_val = float(json_match.group(1)), float(json_match.group(2))
+            # Si > 1, c'est en pixels
+            if x_val > 1:
+                x_pct = x_val / small_w
+                y_pct = y_val / small_h
+            else:
+                x_pct = x_val
+                y_pct = y_val
+
+    # Format 3 : {"x_pct": 0.XX, "y_pct": 0.YY}
+    if x_pct is None:
+        pct_match = re.search(r'"x_pct"\s*:\s*([\d.]+).*?"y_pct"\s*:\s*([\d.]+)', content)
+        if pct_match:
+            x_pct = float(pct_match.group(1))
+            y_pct = float(pct_match.group(2))
+
+    # Format 4 : array brut [x1, y1, x2, y2] ou [x, y]
+    if x_pct is None:
+        arr_match = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content)
+        if arr_match:
+            vals = [float(v) for v in arr_match.groups() if v is not None]
+            if len(vals) >= 4:
+                x_pct = (vals[0] + vals[2]) / 2 / small_w
+                y_pct = (vals[1] + vals[3]) / 2 / small_h
+            elif len(vals) == 2:
+                x_pct = vals[0] / small_w
+                y_pct = vals[1] / small_h
+
+    if x_pct is None or y_pct is None:
+        # Fallback multi-image : screenshot + crop → grounding sans description
+        anchor_b64 = target_spec.get("anchor_image_base64", "")
+        if anchor_b64:
+            try:
+                prompt_mi = (
+                    "Image 1 is a screenshot. Image 2 shows a UI element.\n"
+                    "Find where Image 2 appears on Image 1.\n"
+                    'Return position: {"x": NNN, "y": NNN} in pixels of Image 1.'
+                )
+                resp2 = _requests.post("http://localhost:11434/api/chat", json={
+                    "model": _grounding_model,
+                    "messages": [
+                        {"role": "user", "content": prompt_mi, "images": [shot_b64, anchor_b64]},
+                    ],
+                    "stream": False,
+                    "options": {"temperature": 0.1, "num_predict": 50},
+                }, timeout=60)
+                content2 = resp2.json().get("message", {}).get("content", "")
+                elapsed = time.time() - t0
+
+                # Parser tous les formats
+                arr2 = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content2)
+                if arr2:
+                    vals = [float(v) for v in arr2.groups() if v is not None]
+                    if len(vals) >= 4:
+                        x_pct = (vals[0] + vals[2]) / 2 / small_w
+                        y_pct = (vals[1] + vals[3]) / 2 / small_h
+                    elif len(vals) == 2:
+                        x_pct = vals[0] / small_w
+                        y_pct = vals[1] / small_h
+                if x_pct is None:
+                    json2 = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content2)
+                    if json2:
+                        x_pct = float(json2.group(1)) / small_w
+                        y_pct = float(json2.group(2)) / small_h
+                if x_pct is not None:
+                    logger.info("Grounding multi-image OK (%.1fs)", elapsed)
+            except Exception as e:
+                logger.debug("Grounding multi-image erreur: %s", e)
+
+    if x_pct is None or y_pct is None:
+        logger.info(
+            "Grounding : réponse non parsable (%.1fs) — %s",
+            elapsed, content[:120],
+        )
+        return None
+
+    # Valider les bornes
+    if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
+        logger.info("Grounding : coordonnées hors bornes (%.3f, %.3f)", x_pct, y_pct)
+        return None
+
+    # Convertir coordonnées fenêtre → coordonnées écran
+    if using_window and window_rect:
+        win_x1, win_y1, win_x2, win_y2 = window_rect
+        win_w = win_x2 - win_x1
+        win_h = win_y2 - win_y1
+        # x_pct/y_pct sont relatifs à la fenêtre, convertir en relatif à l'écran
+        abs_x = win_x1 + x_pct * win_w
+        abs_y = win_y1 + y_pct * win_h
+        x_pct = abs_x / screen_width
+        y_pct = abs_y / screen_height
+        logger.info(
+            "Grounding OK [%s/window] : '%s' → (%.4f, %.4f) en %.1fs",
+            _grounding_model, description[:50], x_pct, y_pct, elapsed,
+        )
+    else:
+        logger.info(
+            "Grounding OK [%s/full] : '%s' → (%.4f, %.4f) en %.1fs",
+            _grounding_model, description[:50], x_pct, y_pct, elapsed,
+        )
+
+    return {
+        "resolved": True,
+        "method": "grounding_vlm",
+        "x_pct": round(x_pct, 6),
+        "y_pct": round(y_pct, 6),
+        "matched_element": {
+            "label": description[:60],
+            "type": "grounding",
+            "role": "grounding_vlm",
+            "confidence": 0.85,
+        },
+        "score": 0.85,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Résolution Set-of-Mark : SomEngine (détection) + VLM (identification)
+# ---------------------------------------------------------------------------
+
+def _get_som_engine_api():
+    """Return the shared SomEngine instance, or None if an ImportError occurs."""
+    try:
+        from core.detection.som_engine import get_shared_engine
+        return get_shared_engine()
+    except ImportError:
+        return None
+
+
+def _resolve_by_som(
+    screenshot_path: str,
+    target_spec: Dict[str, Any],
+    screen_width: int,
+    screen_height: int,
+) -> Optional[Dict[str, Any]]:
+    """Resolve a UI target via Set-of-Mark detection, with a VLM as last resort.
+
+    Strategies, tried in order (the first success returns):
+    1. Text match of som_element["label"] against the detected labels; several
+       hits are disambiguated by distance to som_element["center_norm"].
+    2. Template match of anchor_image_base64 on the screenshot, snapped to the
+       nearest detected element (skipped when cv2/numpy are not installed).
+    3. VLM: the annotated image (plus the anchor crop when present) is sent and
+       the answered mark id is mapped back to a detected element.
+
+    A malformed VLM answer (bad mark_id/confidence) yields None, not an exception.
+
+    Args:
+        screenshot_path: Path of the current screenshot.
+        target_spec: Target spec (som_element, vlm_description, by_text, ...).
+        screen_width: Screen width in pixels (unused by this function).
+        screen_height: Screen height in pixels (unused by this function).
+
+    Returns:
+        Dict with resolved=True and x_pct/y_pct; None if unavailable or unmatched.
+    """
+    engine = _get_som_engine_api()
+    if engine is None:
+        return None
+
+    client = _get_vlm_client()
+    if client is None:
+        return None
+
+    t0 = time.time()
+
+    # ── 1. Run SomEngine on the current screenshot ──
+    try:
+        from PIL import Image as PILImage
+        img = PILImage.open(screenshot_path).convert("RGB")
+        som_result = engine.analyze(img)
+    except Exception as e:
+        logger.warning("SoM resolve : erreur analyse — %s", e)
+        return None
+
+    if not som_result.elements:
+        logger.info("SoM resolve : 0 éléments détectés")
+        return None
+
+    # ── 2. Build the target description ──
+    som_element = target_spec.get("som_element", {})
+    vlm_description = target_spec.get("vlm_description", "")
+    anchor_label = som_element.get("label", "")
+
+    # Collect every available hint (label text, free-form description).
+    target_parts = []
+    if anchor_label:
+        target_parts.append(f"texte '{anchor_label}'")
+    if vlm_description:
+        target_parts.append(vlm_description)
+    if not target_parts:
+        # Without any description there is nothing to identify the element by.
+        logger.debug("SoM resolve : pas de description pour identifier l'élément")
+        return None
+
+    target_desc = ", ".join(target_parts)
+
+    # ── 2.5. Shortcut: when the label is known, search by text directly ──
+    # No VLM call is needed if the element's text is known.
+    if anchor_label and len(anchor_label) >= 2:
+        label_lower = anchor_label.lower()
+        # Exact match first, then partial (substring in either direction).
+        exact_matches = [
+            e for e in som_result.elements
+            if e.label and e.label.lower() == label_lower
+        ]
+        if not exact_matches:
+            exact_matches = [
+                e for e in som_result.elements
+                if e.label and len(e.label) >= 3 and (
+                    label_lower in e.label.lower()
+                    or e.label.lower() in label_lower
+                )
+            ]
+
+        if len(exact_matches) == 1:
+            # Single text match → no VLM call needed.
+            elem = exact_matches[0]
+            elapsed = time.time() - t0
+            cx_norm, cy_norm = elem.center_norm
+            logger.info(
+                "SoM resolve FAST : match texte unique '#%d %s' → (%.4f, %.4f) en %.1fs",
+                elem.id, elem.label, cx_norm, cy_norm, elapsed,
+            )
+            return {
+                "resolved": True,
+                "method": "som_text_match",
+                "x_pct": round(cx_norm, 6),
+                "y_pct": round(cy_norm, 6),
+                "matched_element": {
+                    "label": elem.label,
+                    "type": elem.source,
+                    "role": "som_text_match",
+                    "confidence": max(elem.confidence, 0.85),
+                    "som_id": elem.id,
+                },
+                "score": max(elem.confidence, 0.85),
+            }
+        elif len(exact_matches) > 1:
+            # Several text matches → disambiguate by proximity to the recorded position.
+            ref_center = som_element.get("center_norm", [])
+            if ref_center and len(ref_center) == 2:
+                ref_x, ref_y = ref_center
+                best = min(
+                    exact_matches,
+                    key=lambda e: (
+                        (e.center_norm[0] - ref_x) ** 2
+                        + (e.center_norm[1] - ref_y) ** 2
+                    ),
+                )
+                elapsed = time.time() - t0
+                cx_norm, cy_norm = best.center_norm
+                dist = ((cx_norm - ref_x) ** 2 + (cy_norm - ref_y) ** 2) ** 0.5
+                if dist < 0.15:  # Tolerance: 15% of the screen
+                    logger.info(
+                        "SoM resolve FAST : match texte proximité '#%d %s' (dist=%.3f) "
+                        "→ (%.4f, %.4f) en %.1fs",
+                        best.id, best.label, dist, cx_norm, cy_norm, elapsed,
+                    )
+                    return {
+                        "resolved": True,
+                        "method": "som_text_match",
+                        "x_pct": round(cx_norm, 6),
+                        "y_pct": round(cy_norm, 6),
+                        "matched_element": {
+                            "label": best.label,
+                            "type": best.source,
+                            "role": "som_text_match_proximity",
+                            "confidence": max(best.confidence, 0.80),
+                            "som_id": best.id,
+                        },
+                        "score": max(best.confidence, 0.80),
+                    }
+            logger.info(
+                "SoM resolve : %d matchs texte pour '%s', VLM nécessaire",
+                len(exact_matches), anchor_label,
+            )
+
+    # ── 2.7. Fallback: template-match the anchor, then snap to a SomEngine element ──
+    # For text-less icons: locate the reference crop on the screenshot and pick
+    # the detected element closest to the match.
+    anchor_b64 = target_spec.get("anchor_image_base64", "")
+    by_text = target_spec.get("by_text", "").strip()
+    if anchor_b64 and (not anchor_label or not by_text):
+        try:
+            import cv2
+            import numpy as np
+
+            # Decode the anchor crop
+            anc_bytes = base64.b64decode(anchor_b64)
+            anc_array = np.frombuffer(anc_bytes, dtype=np.uint8)
+            anc_img = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE)
+
+            # Load the screenshot with OpenCV
+            screenshot_cv = cv2.imread(screenshot_path, cv2.IMREAD_GRAYSCALE)
+
+            if anc_img is not None and screenshot_cv is not None:
+                # Template-match the anchor against the WHOLE screenshot
+                # (not per detected region — the anchor is often larger than a region)
+                anc_h, anc_w = anc_img.shape[:2]
+                if screenshot_cv.shape[0] >= anc_h and screenshot_cv.shape[1] >= anc_w:
+                    res = cv2.matchTemplate(screenshot_cv, anc_img, cv2.TM_CCOEFF_NORMED)
+                    _, max_score, _, max_loc = cv2.minMaxLoc(res)
+
+                    if max_score >= 0.5:
+                        # Center of the match
+                        match_cx = max_loc[0] + anc_w // 2
+                        match_cy = max_loc[1] + anc_h // 2
+
+                        # Find the SomEngine element closest to the match center
+                        best_elem = None
+                        best_dist = float("inf")
+                        for elem in som_result.elements:
+                            cx, cy = elem.center
+                            dist = ((match_cx - cx) ** 2 + (match_cy - cy) ** 2) ** 0.5
+                            if dist < best_dist:
+                                best_dist = dist
+                                best_elem = elem
+
+                        if best_elem and best_dist < 100:  # At most 100px away
+                            elapsed = time.time() - t0
+                            cx_norm, cy_norm = best_elem.center_norm
+                            logger.info(
+                                "SoM resolve ANCHOR : match crop score=%.3f → "
+                                "elem '#%d %s' (dist=%.0fpx) → (%.4f, %.4f) en %.1fs",
+                                max_score, best_elem.id, best_elem.label,
+                                best_dist, cx_norm, cy_norm, elapsed,
+                            )
+                            return {
+                                "resolved": True,
+                                "method": "som_anchor_match",
+                                "x_pct": round(cx_norm, 6),
+                                "y_pct": round(cy_norm, 6),
+                                "matched_element": {
+                                    "label": best_elem.label or f"icon #{best_elem.id}",
+                                    "type": best_elem.source,
+                                    "role": "som_anchor_match",
+                                    "confidence": max_score,
+                                    "som_id": best_elem.id,
+                                },
+                                "score": max_score,
+                            }
+        except ImportError:
+            pass
+        except Exception as e:
+            logger.debug("SoM anchor match erreur : %s", e)
+
+    # ── 3. Save the annotated SoM image to a temporary file ──
+    if som_result.som_image is None:
+        logger.debug("SoM resolve : pas d'image annotée, skip VLM")
+        return None
+
+    som_img_path = None
+    try:
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
+            som_img_path = tmp.name  # recorded first so a failed save can be cleaned up
+            som_result.som_image.save(tmp, format="JPEG", quality=85)
+    except Exception as e:
+        logger.warning("SoM resolve : erreur sauvegarde image annotée — %s", e)
+        if som_img_path is not None:
+            try:
+                os.unlink(som_img_path)
+            except OSError:
+                pass
+        return None
+
+    # ── 4. VLM: identify the mark number ──
+    # Only labelled elements are listed (at most 30) to keep the prompt concise.
+    labeled_elements = [e for e in som_result.elements if e.label][:30]
+    elements_list = "\n".join(f"  #{e.id}: '{e.label}'" for e in labeled_elements)
+    extra_images = [anchor_b64] if anchor_b64 else None  # anchor crop read in step 2.7
+
+    if extra_images:
+        prompt = (
+            "Image 1 shows the screen with numbered marks on each UI element.\n"
+            "Image 2 shows the element I'm looking for.\n\n"
+            f"Target: {target_desc}\n\n"
+            f"Detected elements:\n{elements_list}\n\n"
+            "Which mark number matches the target element in Image 2?\n"
+            'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
+        )
+    else:
+        prompt = (
+            f"I'm looking for: {target_desc}\n\n"
+            f"Detected elements:\n{elements_list}\n\n"
+            "Which number is the correct element?\n"
+            'Answer with JSON only: {"mark_id": N, "confidence": 0.9}'
+        )
+
+    system_prompt = "You identify UI elements by number. Output JSON only, no explanation."
+
+    try:
+        result = client.generate(
+            prompt=prompt,
+            image_path=som_img_path,
+            system_prompt=system_prompt,
+            temperature=0.1,
+            max_tokens=50,
+            force_json=False,
+            extra_images_b64=extra_images,
+        )
+    except Exception as e:
+        logger.warning("SoM resolve : erreur VLM — %s", e)
+        return None
+    finally:
+        try:
+            os.unlink(som_img_path)
+        except OSError:
+            pass
+
+    elapsed = time.time() - t0
+
+    if not result.get("success"):
+        logger.info("SoM resolve : VLM échoué (%.1fs)", elapsed)
+        return None
+
+    # ── 5. Parse the answer and return the coordinates ──
+    response_text = (result.get("response") or "").strip()
+    parsed = client._extract_json_from_response(response_text)
+    # Fallback when no JSON object was extracted: first bare integer that is a known mark.
+    if not isinstance(parsed, dict):
+        parsed = None
+        numbers = re.findall(r'\b(\d+)\b', response_text)
+        if numbers:
+            candidate = int(numbers[0])
+            if som_result.get_element_by_id(candidate) is not None:
+                parsed = {"mark_id": candidate, "confidence": 0.7}
+                logger.debug("SoM resolve : extraction numéro fallback → #%d", candidate)
+    if parsed is None:
+        logger.info("SoM resolve : réponse non-JSON (%.1fs) — %.80s", elapsed, response_text)
+        return None
+
+    # Untrusted VLM output: a non-numeric mark_id/confidence means "no mark", not a crash.
+    mark_id = parsed.get("mark_id")
+    try:
+        confidence = float(parsed.get("confidence", 0.0))
+        mark_id = None if mark_id is None else int(mark_id)
+    except (TypeError, ValueError, OverflowError):
+        mark_id, confidence = None, 0.0
+    if mark_id is None or confidence < 0.3:
+        logger.info(
+            "SoM resolve : mark non trouvé ou confiance trop basse (mark=%s, conf=%.2f, %.1fs)",
+            mark_id, confidence, elapsed,
+        )
+        return None
+
+    elem = som_result.get_element_by_id(mark_id)
+    if elem is None:
+        logger.warning("SoM resolve : mark #%d inexistant (%.1fs)", mark_id, elapsed)
+        return None
+
+    cx_norm, cy_norm = elem.center_norm
+    logger.info(
+        "SoM resolve OK : mark #%d '%s' → (%.4f, %.4f) conf=%.2f en %.1fs (%d éléments)",
+        mark_id, elem.label, cx_norm, cy_norm, confidence, elapsed, len(som_result.elements),
+    )
+
+    return {
+        "resolved": True,
+        "method": "som_vlm",
+        "x_pct": round(cx_norm, 6),
+        "y_pct": round(cy_norm, 6),
+        "matched_element": {
+            "label": elem.label or f"mark #{mark_id}",
+            "type": elem.source,
+            "role": "som_identified",
+            "confidence": confidence,
+            "som_id": mark_id,
+        },
+        "score": confidence,
+    }
+
+
+# =========================================================================
+# Orchestrateur — Résolution cible complète (synchrone)
+# =========================================================================
+
+def _resolve_target_sync(
+    screenshot_path: str,
+    target_spec: Dict[str, Any],
+    screen_width: int,
+    screen_height: int,
+    fallback_x_pct: float,
+    fallback_y_pct: float,
+    strict_mode: bool = False,
+    processor=None,
+) -> Dict[str, Any]:
+    """Résoudre la cible visuellement (exécuté dans un thread séparé).
+
+    Hiérarchie de résolution (strict_mode=True, replay sessions) — VLM-FIRST :
+    1. VLM Quick Find (~3-8s) — compréhension sémantique de l'écran, multi-image
+       (screenshot + crop de référence + description riche)
+    1.5. SoM + VLM (~5-15s) — SomEngine numérote les éléments, VLM identifie le bon
+    2. Template matching OpenCV (~100ms) — fallback pixel, seuil STRICT 0.90
+    3. resolved=False → STOP le replay
+
+    Le VLM comprend le contexte (titre de fenêtre, type d'élément, position)
+    et peut trouver un élément même si l'écran est différent de l'enregistrement.
+    Le template matching ne compare que des pixels et produit des faux positifs.
+
+    Hiérarchie classique (strict_mode=False, VWB et autres) — INCHANGÉE :
+    1. Template matching OpenCV (~100ms) — seuil 0.70
+    1.5. VLM Quick Find si template échoue et by_text/by_role dispo
+    2. by_text/by_role → VLM Quick Find puis ScreenAnalyzer
+    3. fallback coordonnées statiques
+    """
+    anchor_image_b64 = target_spec.get("anchor_image_base64", "")
+
+    # ===================================================================
+    # MODE STRICT (replay sessions) — Stratégie VLM-FIRST
+    # ===================================================================
+    if strict_mode and anchor_image_b64:
+        vlm_description = target_spec.get("vlm_description", "")
+        by_text_strict = target_spec.get("by_text", "").strip()
+
+        # Fallback : construire la description depuis by_text/by_role
+        if not vlm_description:
+            by_role = target_spec.get("by_role", "").strip()
+            if by_text_strict or by_role:
+                vlm_description = _build_target_description(target_spec)
+
+        # ---------------------------------------------------------------
+        # Étape -1 : Vérification CLIP (si embedding de référence fourni)
+        # Vérifie qu'on est dans la bonne application avant de chercher
+        # l'élément. Filet de sécurité contre les clics au mauvais endroit.
+        # ---------------------------------------------------------------
+        clip_embedding = target_spec.get("clip_embedding")
+        if clip_embedding:
+            try:
+                from core.embedding.clip_embedder import CLIPEmbedder
+                from PIL import Image as _PILImage
+                import numpy as _np
+
+                _clip = CLIPEmbedder()
+                # Embedding de l'écran actuel (fenêtre si possible)
+                window_capture = target_spec.get("window_capture", {})
+                window_rect = window_capture.get("rect")
+                current_img = _PILImage.open(screenshot_path)
+                if window_rect:
+                    current_img = current_img.crop(tuple(window_rect))
+
+                current_emb = _np.array(_clip.embed_image(current_img), dtype=_np.float32).flatten()
+                ref_emb = _np.array(clip_embedding, dtype=_np.float32).flatten()
+
+                clip_sim = float(_np.dot(current_emb, ref_emb) / (
+                    _np.linalg.norm(current_emb) * _np.linalg.norm(ref_emb)
+                ))
+                logger.info(f"CLIP vérification : similarité={clip_sim:.3f}")
+
+                if clip_sim < 0.75:
+                    logger.warning(
+                        f"CLIP MISMATCH : sim={clip_sim:.3f} < 0.75 — "
+                        f"écran actuel trop différent de l'enregistrement"
+                    )
+                    return {
+                        "resolved": False,
+                        "method": "clip_mismatch",
+                        "reason": f"clip_similarity_{clip_sim:.3f}",
+                        "x_pct": fallback_x_pct,
+                        "y_pct": fallback_y_pct,
+                    }
+            except Exception as e:
+                logger.debug(f"CLIP vérification erreur (non-bloquant) : {e}")
+
+        # ---------------------------------------------------------------
+        # Étape 0 : Choisir la stratégie selon le type d'élément
+        # ---------------------------------------------------------------
+        by_text_source = target_spec.get("by_text_source", "")
+
+        has_window = bool(target_spec.get("window_capture", {}).get("rect"))
+
+        if by_text_strict and by_text_source in ("ocr", "vlm") and has_window:
+            # Texte visible DANS une fenêtre → grounding VLM sur fenêtre croppée
+            grounding_result = _resolve_by_grounding(
+                screenshot_path=screenshot_path,
+                target_spec=target_spec,
+                screen_width=screen_width,
+                screen_height=screen_height,
+            )
+            if grounding_result and grounding_result.get("resolved"):
+                logger.info(
+                    "Strict resolve GROUNDING : OK (%.4f, %.4f) pour '%s'",
+                    grounding_result.get("x_pct", 0),
+                    grounding_result.get("y_pct", 0),
+                    by_text_strict[:50],
+                )
+                return grounding_result
+
+        if not by_text_strict or by_text_source not in ("ocr", "vlm"):
+            # Template matching pour les éléments sans texte (icônes pures)
+            window_capture = target_spec.get("window_capture", {})
+            window_rect = window_capture.get("rect")
+            from pathlib import Path as _Path
+            _full = _Path(screenshot_path)
+            _win = _full.parent / _full.name.replace("_full.png", "_window.png")
+            tm_path = str(_win) if _win.is_file() and window_rect else screenshot_path
+            tm_screen_w = (window_rect[2] - window_rect[0]) if window_rect and _win.is_file() else screen_width
+            tm_screen_h = (window_rect[3] - window_rect[1]) if window_rect and _win.is_file() else screen_height
+
+            result = _resolve_by_template_matching(
+                screenshot_path=tm_path,
+                anchor_image_b64=anchor_image_b64,
+                screen_width=tm_screen_w,
+                screen_height=tm_screen_h,
+                confidence_threshold=0.90,
+            )
+            if result and result.get("score", 0) >= 0.90:
+                x_tm, y_tm = result["x_pct"], result["y_pct"]
+                # Convertir coordonnées fenêtre → écran si nécessaire
+                if window_rect and _win.is_file():
+                    abs_x = window_rect[0] + x_tm * tm_screen_w
+                    abs_y = window_rect[1] + y_tm * tm_screen_h
+                    result["x_pct"] = round(abs_x / screen_width, 6)
+                    result["y_pct"] = round(abs_y / screen_height, 6)
+                logger.info(
+                    "Strict resolve TEMPLATE : icon match (score=%.3f)",
+                    result.get("score", 0),
+                )
+                return result
+
+        # ---------------------------------------------------------------
+        # Étape 1 : VLM Quick Find (fallback, multi-image)
+        # ---------------------------------------------------------------
+        if vlm_description or anchor_image_b64:
+            vlm_result = _vlm_quick_find(
+                screenshot_path=screenshot_path,
+                target_description=vlm_description,
+                anchor_image_b64=anchor_image_b64,
+            )
+            if vlm_result and vlm_result.get("resolved"):
+                if vlm_result.get("score", 0) >= 0.3:
+                    logger.info(
+                        "Strict resolve VLM-first : VLM OK (score=%.2f) pour '%s'",
+                        vlm_result.get("score", 0),
+                        vlm_description[:60] if vlm_description else "(anchor)",
+                    )
+                    return vlm_result
+                else:
+                    logger.info(
+                        "Strict resolve VLM-first : VLM score=%.2f trop bas, passage template",
+                        vlm_result.get("score", 0),
+                    )
+            else:
+                logger.info(
+                    "Strict resolve VLM-first : VLM échoué pour '%s', passage template matching",
+                    vlm_description[:60] if vlm_description else "(anchor)",
+                )
+
+        # ---------------------------------------------------------------
+        # Étape 1.5 : SoM + VLM (Set-of-Mark + identification)
+        # SomEngine numérote les éléments, VLM identifie le bon numéro.
+        # Plus fiable que le VLM direct car le VLM n'a qu'à identifier,
+        # pas localiser — et les coordonnées sont pixel-perfect.
+        # ---------------------------------------------------------------
+        som_element = target_spec.get("som_element", {})
+        if som_element or vlm_description:
+            som_result = _resolve_by_som(
+                screenshot_path=screenshot_path,
+                target_spec=target_spec,
+                screen_width=screen_width,
+                screen_height=screen_height,
+            )
+            if som_result and som_result.get("resolved"):
+                logger.info(
+                    "Strict resolve SoM+VLM : OK (score=%.2f, mark=#%s)",
+                    som_result.get("score", 0),
+                    som_result.get("matched_element", {}).get("som_id", "?"),
+                )
+                return som_result
+            else:
+                logger.info("Strict resolve SoM+VLM : échoué, passage template matching")
+
+        # ---------------------------------------------------------------
+        # Étape 2 : Template matching (fallback pixel) — seuil STRICT 0.90
+        # ---------------------------------------------------------------
+        result = _resolve_by_template_matching(
+            screenshot_path=screenshot_path,
+            anchor_image_b64=anchor_image_b64,
+            screen_width=screen_width,
+            screen_height=screen_height,
+            confidence_threshold=0.90,
+        )
+        if result:
+            score = result.get("score", 0)
+            # Score >= 0.95 : match quasi-parfait, pas besoin de valider le contexte
+            if score >= 0.95:
+                logger.info(
+                    "Strict resolve VLM-first : template matching fallback OK "
+                    "(score=%.3f >= 0.95, contexte skip — match quasi-parfait)",
+                    score,
+                )
+                return result
+            elif _validate_match_context(result, fallback_x_pct, fallback_y_pct, target_spec):
+                logger.info(
+                    "Strict resolve VLM-first : template matching fallback OK "
+                    "(score=%.3f >= 0.90, context OK)",
+                    score,
+                )
+                return result
+            else:
+                logger.warning(
+                    "Strict resolve VLM-first : template score=%.3f MAIS contexte invalide, rejeté",
+                    score,
+                )
+
+        # ---------------------------------------------------------------
+        # Étape 3 : RIEN ne fonctionne → resolved=False → STOP replay
+        # ---------------------------------------------------------------
+        return {
+            "resolved": False,
+            "method": "strict_vlm_template_failed",
+            "reason": "vlm_and_template_all_failed",
+            "x_pct": fallback_x_pct,
+            "y_pct": fallback_y_pct,
+        }
+
+    # ===================================================================
+    # MODE CLASSIQUE (VWB et autres) — Comportement existant
+    # ===================================================================
+
+    # ---------------------------------------------------------------
+    # Stratégie 1 : Template matching par image d'ancre (seuil 0.70)
+    # ---------------------------------------------------------------
+    if anchor_image_b64:
+        result = _resolve_by_template_matching(
+            screenshot_path=screenshot_path,
+            anchor_image_b64=anchor_image_b64,
+            screen_width=screen_width,
+            screen_height=screen_height,
+            confidence_threshold=0.7,
+        )
+        if result:
+            return result
+        logger.info(
+            "Template matching échoué pour ancre '%s', tentative VLM Quick Find",
+            target_spec.get("anchor_id", "?"),
+        )
+
+        # ---------------------------------------------------------------
+        # Stratégie 1.5 : VLM Quick Find (fallback léger après template matching)
+        # ---------------------------------------------------------------
+        by_text = target_spec.get("by_text", "").strip()
+        by_role = target_spec.get("by_role", "").strip()
+        if by_text or by_role:
+            vlm_desc = _build_target_description(target_spec)
+            vlm_result = _vlm_quick_find(
+                screenshot_path=screenshot_path,
+                target_description=vlm_desc,
+                anchor_image_b64=anchor_image_b64,
+            )
+            if vlm_result:
+                return vlm_result
+            logger.info(
+                "VLM Quick Find échoué pour ancre '%s', fallback coordonnées",
+                target_spec.get("anchor_id", "?"),
+            )
+
+        return {
+            "resolved": False,
+            "method": "fallback",
+            "reason": "template_matching_failed",
+            "x_pct": fallback_x_pct,
+            "y_pct": fallback_y_pct,
+        }
+
+    # ---------------------------------------------------------------
+    # Stratégie 2 : VLM Quick Find (léger, ~5-10s)
+    # ---------------------------------------------------------------
+    by_text = target_spec.get("by_text", "")
+    by_role = target_spec.get("by_role", "")
+
+    # Si aucun critère sémantique et pas d'ancre, fallback direct
+    if not by_text and not by_role and not anchor_image_b64:
+        return {
+            "resolved": False,
+            "method": "fallback",
+            "reason": "no_target_criteria",
+            "x_pct": fallback_x_pct,
+            "y_pct": fallback_y_pct,
+        }
+
+    # Tenter le VLM Quick Find AVANT ScreenAnalyzer (beaucoup plus rapide)
+    if by_text or by_role:
+        vlm_desc = _build_target_description(target_spec)
+        vlm_result = _vlm_quick_find(
+            screenshot_path=screenshot_path,
+            target_description=vlm_desc,
+        )
+        if vlm_result:
+            return vlm_result
+        logger.info(
+            "VLM Quick Find échoué pour '%s', fallback ScreenAnalyzer",
+            vlm_desc,
+        )
+
+    # ---------------------------------------------------------------
+    # Stratégie 3 : Matching sémantique via ScreenAnalyzer (~15-20s)
+    # ---------------------------------------------------------------
+    if processor is None:
+        return {
+            "resolved": False,
+            "method": "fallback",
+            "reason": "no_processor",
+            "x_pct": fallback_x_pct,
+            "y_pct": fallback_y_pct,
+        }
+
+    processor._ensure_initialized()
+
+    if processor._screen_analyzer is None:
+        return {
+            "resolved": False,
+            "method": "fallback",
+            "reason": "screen_analyzer_unavailable",
+            "x_pct": fallback_x_pct,
+            "y_pct": fallback_y_pct,
+        }
+
+    # Analyser le screenshot (Niveaux 1-3 : raw, OCR, UI elements)
+    try:
+        screen_state = processor._screen_analyzer.analyze(screenshot_path)
+    except Exception as e:
+        logger.warning(f"Analyse screenshot échouée: {e}")
+        return {
+            "resolved": False,
+            "method": "fallback",
+            "reason": f"analysis_failed: {e}",
+            "x_pct": fallback_x_pct,
+            "y_pct": fallback_y_pct,
+        }
+
+    ui_elements = screen_state.ui_elements or []
+    if not ui_elements:
+        logger.info("Aucun élément UI détecté, fallback coordonnées")
+        return {
+            "resolved": False,
+            "method": "fallback",
+            "reason": "no_ui_elements",
+            "x_pct": fallback_x_pct,
+            "y_pct": fallback_y_pct,
+        }
+
+    # Matching de la cible parmi les éléments détectés
+    candidates = []
+
+    for elem in ui_elements:
+        score = 0.0
+
+        # Score par texte (label)
+        if by_text and elem.label:
+            text_lower = by_text.lower()
+            label_lower = elem.label.lower()
+            if text_lower in label_lower or label_lower in text_lower:
+                score += 0.6
+            elif _fuzzy_match(text_lower, label_lower):
+                score += 0.3
+
+        # Score par rôle
+        if by_role:
+            role_lower = by_role.lower()
+            if elem.role and role_lower in elem.role.lower():
+                score += 0.3
+            if elem.type and role_lower in elem.type.lower():
+                score += 0.2
+
+        if score > 0:
+            candidates.append((elem, score))
+
+    if not candidates:
+        logger.info(
+            f"Aucun match visuel pour target(text='{by_text}', role='{by_role}') "
+            f"parmi {len(ui_elements)} éléments"
+        )
+        return {
+            "resolved": False,
+            "method": "fallback",
+            "reason": "no_match",
+            "x_pct": fallback_x_pct,
+            "y_pct": fallback_y_pct,
+            "ui_elements_count": len(ui_elements),
+        }
+
+    # Trier par score décroissant et prendre le meilleur
+    candidates.sort(key=lambda c: c[1], reverse=True)
+    best_elem, best_score = candidates[0]
+
+    # Convertir les coordonnées pixel en proportions
+    cx, cy = best_elem.center
+    x_pct = round(cx / screen_width, 6) if screen_width > 0 else 0.0
+    y_pct = round(cy / screen_height, 6) if screen_height > 0 else 0.0
+
+    logger.info(
+        f"Cible résolue visuellement: '{best_elem.label}' ({best_elem.type}/{best_elem.role}) "
+        f"score={best_score:.2f} → ({x_pct:.4f}, {y_pct:.4f})"
+    )
+
+    return {
+        "resolved": True,
+        "method": "visual",
+        "x_pct": x_pct,
+        "y_pct": y_pct,
+        "matched_element": {
+            "label": best_elem.label,
+            "type": best_elem.type,
+            "role": best_elem.role,
+            "center": list(best_elem.center),
+            "confidence": best_elem.label_confidence,
+        },
+        "score": best_score,
+        "candidates_count": len(candidates),
+        "ui_elements_count": len(ui_elements),
+    }
+
+
+def _fuzzy_match(a: str, b: str, threshold: float = 0.6) -> bool:
+    """Approximate match: count of ``a``'s characters found in ``b`` over the longer length."""
+    if not (a and b):
+        return False
+    shared = len([ch for ch in a if ch in b])
+    return shared / max(len(a), len(b)) >= threshold
+
+
+def _fallback_response(request: ResolveTargetRequest, reason: str, detail: str) -> Dict:
+    """Build the fallback payload used when visual resolution fails.
+
+    The reply is flagged unresolved and carries the request's fallback coordinates.
+    """
+    payload = {"resolved": False, "method": "fallback"}
+    payload.update(reason=reason, detail=detail)
+    payload["x_pct"] = request.fallback_x_pct
+    payload["y_pct"] = request.fallback_y_pct
+    return payload
+
+
+# =========================================================================
+# Observer — Pré-analyse écran avant résolution
+# =========================================================================
+
+def _pre_analyze_screen_sync(
+    screenshot_b64: str,
+    expected_state: str,
+    window_title: str,
+    screen_width: int,
+    screen_height: int,
+) -> Dict[str, Any]:
+    """Synchronously pre-analyse the screen with a VLM before target resolution.
+
+    Posts the screenshot to the gemma4 chat endpoint (localhost, ``GEMMA4_PORT``,
+    default 11435) and parses its ÉTAT/BOUTON/DÉTAIL reply. ``expected_state`` and
+    ``window_title`` are accepted but currently unused; the screen size is only
+    forwarded to ``_locate_popup_button``. Fails open: HTTP errors, timeouts and
+    other exceptions all report ``"ok"`` together with a ``detail`` string.
+
+    Returns ``{"screen_state": "ok"}`` or a dict with ``screen_state``
+    ("popup"/"unexpected"), ``detail``, ``elapsed_ms`` and, when a popup button
+    was named, ``popup_label`` (plus ``popup_coords`` if it could be located).
+    """
+    import requests as _requests
+
+    gemma4_port = os.environ.get("GEMMA4_PORT", "11435")
+    gemma4_url = f"http://localhost:{gemma4_port}/api/chat"
+    vlm_timeout_s = 30  # single source for the request timeout and its log message
+
+    # Load the domain (business) context for the Observer
+    from .domain_context import get_domain_context
+    domain = get_domain_context(os.environ.get("RPA_DOMAIN", "generic"))
+
+    # Concise popup-detection prompt (runtime text sent to the model, kept as is)
+    prompt = (
+        "Regarde cette capture d'écran.\n"
+        "Y a-t-il une popup, boîte de dialogue, message d'erreur, ou fenêtre modale visible ?\n\n"
+        "Réponds EXACTEMENT dans ce format :\n"
+        "ÉTAT: OK ou POPUP ou INATTENDU\n"
+        "BOUTON: texte du bouton à cliquer (si POPUP, sinon 'aucun')\n"
+        "DÉTAIL: description courte (1 ligne)"
+    )
+
+    # Chat messages, prefixed with the domain system prompt when one is defined
+    messages = []
+    if domain.system_prompt:
+        messages.append({"role": "system", "content": domain.system_prompt})
+    messages.append({"role": "user", "content": prompt, "images": [screenshot_b64]})
+
+    try:
+        t_start = time.time()
+        resp = _requests.post(
+            gemma4_url,
+            json={
+                "model": "gemma4:e4b",
+                "messages": messages,
+                "stream": False,
+                "think": True,
+                "options": {"temperature": 0.1, "num_predict": 800},
+            },
+            timeout=vlm_timeout_s,
+        )
+        elapsed_ms = (time.time() - t_start) * 1000
+
+        if not resp.ok:
+            logger.warning(f"Observer VLM HTTP {resp.status_code}")
+            return {"screen_state": "ok", "detail": f"VLM HTTP {resp.status_code}"}
+
+        content = resp.json().get("message", {}).get("content", "").strip()
+        logger.info(f"Observer VLM ({elapsed_ms:.0f}ms) : {content[:100]}")
+
+        # Parse the reply line by line; the full reply is the detail unless overridden
+        state = "ok"
+        button = ""
+        detail = content
+
+        for line in content.split("\n"):
+            line_clean = line.strip()
+            upper = line_clean.upper()
+            if upper.startswith(("ÉTAT:", "ETAT:")):
+                val = upper.split(":", 1)[1].strip()
+                if "POPUP" in val:
+                    state = "popup"
+                elif "INATTENDU" in val or "UNEXPECTED" in val:
+                    state = "unexpected"
+                else:
+                    state = "ok"
+            elif upper.startswith("BOUTON:"):
+                button = line_clean.split(":", 1)[1].strip().strip("'\"")
+                if button.lower() in ("aucun", "none", "n/a", ""):
+                    button = ""
+            elif upper.startswith(("DÉTAIL:", "DETAIL:")):
+                detail = line_clean.split(":", 1)[1].strip()
+
+        if state == "ok":
+            return {"screen_state": "ok"}
+
+        result = {"screen_state": state, "detail": detail, "elapsed_ms": round(elapsed_ms, 1)}
+
+        # Popup with a named button: try to locate it via VLM grounding (qwen2.5vl)
+        if state == "popup" and button:
+            result["popup_label"] = button
+            coords = _locate_popup_button(screenshot_b64, button, screen_width, screen_height)
+            if coords:
+                result["popup_coords"] = coords
+
+        return result
+
+    except _requests.Timeout:
+        logger.debug("Observer VLM timeout (%ss)", vlm_timeout_s)
+        return {"screen_state": "ok", "detail": "VLM timeout"}
+    except Exception as e:
+        logger.debug(f"Observer VLM erreur : {e}")
+        return {"screen_state": "ok", "detail": str(e)}
+
+
+def _locate_popup_button(
+    screenshot_b64: str, button_text: str,
+    screen_width: int, screen_height: int,
+) -> Optional[Dict[str, float]]:
+    """Locate a popup button on the screenshot via VLM grounding (qwen2.5vl).
+
+    Returns the centre of the model's ``bbox_2d`` box as ``{"x_pct", "y_pct"}`` screen
+    fractions, or ``None`` if the size is zero/missing, the call fails or no box fits.
+    """
+    # A zero or missing screen size cannot normalise anything: skip the VLM call
+    if not screen_width or not screen_height:
+        return None
+
+    import requests as _requests
+
+    ollama_url = "http://localhost:11434/api/chat"
+    prompt = f"Detect the button with text '{button_text}' with a bounding box."
+
+    try:
+        resp = _requests.post(
+            ollama_url,
+            json={
+                "model": "qwen2.5vl:7b",
+                "messages": [{"role": "user", "content": prompt, "images": [screenshot_b64]}],
+                "stream": False,
+                "options": {"temperature": 0.1, "num_predict": 50},
+            },
+            timeout=15,
+        )
+        if not resp.ok:
+            return None
+        content = resp.json().get("message", {}).get("content", "")
+
+        # bbox_2d is treated as pixel coordinates of the submitted image (not a
+        # 1000x1000 grid); expected JSON: [{"bbox_2d": [x1, y1, x2, y2], "label": "..."}]
+        bbox_match = re.search(
+            r'"bbox_2d"\s*:\s*\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]', content
+        )
+        if bbox_match:
+            x1, y1, x2, y2 = map(int, bbox_match.groups())
+            # Normalise the centre by the screen size (assumes image == screen size; TODO confirm)
+            cx = (x1 + x2) / 2 / screen_width
+            cy = (y1 + y2) / 2 / screen_height
+            if 0.0 <= cx <= 1.0 and 0.0 <= cy <= 1.0:
+                logger.info(f"Observer : bouton '{button_text}' localisé à ({cx:.3f}, {cy:.3f})")
+                return {"x_pct": cx, "y_pct": cy}
+    except Exception as e:
+        logger.debug(f"Observer grounding bouton erreur : {e}")
+
+    return None