From c2dc8f8fe44538869f80c6dd7c721e19a53a396a Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Thu, 26 Mar 2026 12:52:40 +0100
Subject: [PATCH] =?UTF-8?q?fix:=20worker=20s=C3=A9par=C3=A9,=20VLM-first?=
 =?UTF-8?q?=20direct=20Ollama,=20popup=20handler=20hybride,=20serveur=20l?=
 =?UTF-8?q?=C3=A9ger?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Worker VLM séparé :
- run_worker.py : processus distinct du serveur HTTP
- Communication par fichiers (_worker_queue.txt + _replay_active.lock)
- Service systemd rpa-worker.service
- Le serveur HTTP ne charge plus CLIP/VLM (mode léger)
- StreamProcessor._ensure_initialized() désactivé dans le serveur

VLM direct depuis l'agent :
- L'agent appelle Ollama directement (port 11434, LAN)
- Ollama configuré sur 0.0.0.0 (OLLAMA_HOST)
- Pas de passage par le serveur streaming (évite le blocage GIL)
- Fallback serveur supprimé (VLM direct ou STOP)

Popup handler hybride :
- VLM identifie le bouton ("Oui", "OK") — pas de coordonnées
- Template matching localise le texte sur l'écran (PIL + cv2)
- _find_text_on_screen() : rend le texte en image, matchTemplate
- _vlm_identify_popup_button() : prompt simple, prefill texte

Resolve visuel hybride :
- Cascade : template anchor → VLM+template texte → VLM direct (legacy)
- _hybrid_vlm_resolve() : VLM identifie + template localise
- _template_match_anchor() : match direct crop, seuil 0.80
- Seuil strict 0.90 pour template matching en mode replay

Analyse VLM temps réel désactivée :
- process_screenshot() ne fait plus de VLM (stockage uniquement)
- L'analyse est différée au worker séparé
- Le serveur HTTP reste réactif en permanence

VLM prefill fix :
- num_ctx augmenté (2048 → 8192 pour images 1080p)
- bbox_2d au lieu de click_point (plus fiable)
- Coordonnées 0-1000 (format natif qwen3-vl)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 agent_v0/agent_v1/core/executor.py     | 623 +++++++++++++++++++++++--
 agent_v0/server_v1/api_stream.py       |  12 +-
 agent_v0/server_v1/stream_processor.py |  12 +-
 3 files changed, 590 insertions(+), 57 deletions(-)

diff --git a/agent_v0/agent_v1/core/executor.py b/agent_v0/agent_v1/core/executor.py
index f0e54f78d..a1622c420 100644
--- a/agent_v0/agent_v1/core/executor.py
+++ b/agent_v0/agent_v1/core/executor.py
@@ -219,11 +219,40 @@ class ActionExecutorV1:
                 # Pas de fallback blind — on arrête le replay si la cible
                 # n'est pas trouvée visuellement. C'est un RPA VISUEL.
                 if visual_mode and not result.get("visual_resolved"):
-                    result["success"] = False
-                    result["error"] = "Visual resolve échoué — cible non trouvée à l'écran"
-                    print(f"    [ERREUR] Visual resolve échoué — STOP (pas de clic blind)")
-                    logger.error(f"Action {action_id} : visual resolve échoué, replay stoppé")
-                    return result
+                    # Avant de STOP, vérifier s'il y a une popup imprévue via le VLM
+                    print(f"    [POPUP-VLM] Cible non trouvée — vérification popup imprévue...")
+                    logger.info(f"Action {action_id} : cible non trouvée, tentative gestion popup VLM")
+                    popup_handled = self._handle_popup_vlm()
+                    if popup_handled:
+                        # Popup fermée — re-tenter le resolve
+                        print(f"    [POPUP-VLM] Popup gérée, re-tentative du resolve visuel...")
+                        resolved2 = self._resolve_target_visual(
+                            server_url, target_spec, x_pct, y_pct, width, height
+                        )
+                        if resolved2 and resolved2.get("resolved"):
+                            x_pct = resolved2["x_pct"]
+                            y_pct = resolved2["y_pct"]
+                            result["visual_resolved"] = True
+                            print(
+                                f"    [POPUP-VLM] Re-resolve OK après popup : "
+                                f"({x_pct:.3f}, {y_pct:.3f})"
+                            )
+                            logger.info(
+                                f"Action {action_id} : re-resolve OK après popup "
+                                f"({x_pct:.3f}, {y_pct:.3f})"
+                            )
+                        else:
+                            result["success"] = False
+                            result["error"] = "Élément non trouvé même après gestion popup"
+                            print(f"    [ERREUR] Élément toujours non trouvé après gestion popup — STOP")
+                            logger.error(f"Action {action_id} : élément non trouvé après popup, replay stoppé")
+                            return result
+                    else:
+                        result["success"] = False
+                        result["error"] = "Visual resolve échoué — cible non trouvée à l'écran"
+                        print(f"    [ERREUR] Visual resolve échoué, pas de popup détectée — STOP")
+                        logger.error(f"Action {action_id} : visual resolve échoué, pas de popup, replay stoppé")
+                        return result
 
                 real_x = int(x_pct * width)
                 real_y = int(y_pct * height)
@@ -360,51 +389,253 @@ class ActionExecutorV1:
     ) -> dict:
         """Résoudre la position d'un clic visuellement.
 
-        Stratégie VLM-DIRECT : appelle Ollama directement depuis l'agent
-        (pas via le serveur streaming) pour éviter les timeouts quand le
-        serveur est occupé par le worker.
-
-        1. VLM direct (screenshot + crop → Ollama) ~3-8s
-        2. Serveur streaming (fallback si Ollama échoue)
+        Stratégie hybride en cascade :
+        1. Template matching avec le crop anchor (rapide, fiable si l'UI n'a pas changé)
+        2. VLM identifie l'élément + template matching texte (approche hybride)
+        3. VLM direct coordonnées (legacy, peu fiable avec qwen3-vl:8b)
         """
-        import requests as _requests
-        import json as _json
-
         screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=75)
         if not screenshot_b64:
             logger.warning("Capture screenshot echouee pour visual resolve")
             return None
 
-        # ---- VLM DIRECT (Ollama) ----
+        # ---- ÉTAPE 1 : Template matching avec le crop anchor ----
+        anchor_b64 = target_spec.get("anchor_image_base64", "")
+        if anchor_b64:
+            tm_result = self._template_match_anchor(screenshot_b64, anchor_b64, screen_width, screen_height)
+            if tm_result and tm_result.get("resolved"):
+                return tm_result
+
+        # ---- ÉTAPE 2 : Approche hybride VLM identifie + template matching texte ----
+        by_text = target_spec.get("by_text", "")
+        vlm_description = target_spec.get("vlm_description", "")
+        if vlm_description or by_text:
+            hybrid_result = self._hybrid_vlm_resolve(
+                screenshot_b64, target_spec, screen_width, screen_height
+            )
+            if hybrid_result and hybrid_result.get("resolved"):
+                return hybrid_result
+
+        # ---- ÉTAPE 3 : VLM direct coordonnées (legacy, peu fiable) ----
         vlm_result = self._vlm_direct_resolve(screenshot_b64, target_spec)
         if vlm_result and vlm_result.get("resolved"):
             return vlm_result
 
-        # ---- FALLBACK : serveur streaming ----
-        print("    [VISUAL] VLM direct echoue, fallback serveur...")
-        try:
-            resolve_url = f"{server_url}/traces/stream/replay/resolve_target"
-            payload = {
-                "session_id": "",
-                "screenshot_b64": screenshot_b64,
-                "target_spec": target_spec,
-                "fallback_x_pct": fallback_x,
-                "fallback_y_pct": fallback_y,
-                "screen_width": screen_width,
-                "screen_height": screen_height,
-                "strict_mode": True,
-            }
-            resp = _requests.post(resolve_url, json=payload, headers=self._auth_headers(), timeout=30)
-            if resp.ok:
-                data = resp.json()
-                print(f"    [VISUAL] Serveur : resolved={data.get('resolved')}, method={data.get('method')}")
-                return data
-        except Exception as e:
-            logger.warning(f"Visual resolve serveur echoue: {e}")
+        print("    [VISUAL] Toutes les méthodes ont échoué")
         return None
 
+    def _template_match_anchor(
+        self, screenshot_b64: str, anchor_b64: str,
+        screen_width: int, screen_height: int,
+    ) -> dict:
+        """Template matching direct avec le crop anchor (image de référence).
+
+        Le crop anchor est une capture de l'élément UI lors de l'enregistrement.
+        Si l'UI est identique (même résolution, même thème), le match est
+        quasi-parfait et très rapide (~10ms).
+        """
+        import cv2
+        import numpy as np
+
+        try:
+            # Décoder les deux images
+            scr_bytes = base64.b64decode(screenshot_b64)
+            scr_array = np.frombuffer(scr_bytes, dtype=np.uint8)
+            screenshot = cv2.imdecode(scr_array, cv2.IMREAD_GRAYSCALE)
+
+            anc_bytes = base64.b64decode(anchor_b64)
+            anc_array = np.frombuffer(anc_bytes, dtype=np.uint8)
+            anchor = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE)
+
+            if screenshot is None or anchor is None:
+                return None
+            if anchor.shape[0] >= screenshot.shape[0] or anchor.shape[1] >= screenshot.shape[1]:
+                return None
+
+            result = cv2.matchTemplate(screenshot, anchor, cv2.TM_CCOEFF_NORMED)
+            _, max_val, _, max_loc = cv2.minMaxLoc(result)
+
+            print(f"    [ANCHOR-TM] Score={max_val:.3f}")
+
+            # Seuil élevé : le crop anchor doit matcher très bien
+            if max_val >= 0.80:
+                # Centre du match en pixels
+                cx = max_loc[0] + anchor.shape[1] // 2
+                cy = max_loc[1] + anchor.shape[0] // 2
+                # Convertir en pourcentages par rapport au screenshot décodé
+                x_pct = cx / screenshot.shape[1]
+                y_pct = cy / screenshot.shape[0]
+
+                print(
+                    f"    [ANCHOR-TM] TROUVÉ ({x_pct:.3f}, {y_pct:.3f}) "
+                    f"score={max_val:.3f}"
+                )
+                logger.info(
+                    f"[ANCHOR-TM] Match anchor à ({x_pct:.3f}, {y_pct:.3f}) "
+                    f"score={max_val:.3f}"
+                )
+                return {
+                    "resolved": True,
+                    "method": "anchor_template",
+                    "x_pct": x_pct,
+                    "y_pct": y_pct,
+                    "score": max_val,
+                }
+
+        except Exception as e:
+            print(f"    [ANCHOR-TM] Erreur: {e}")
+            logger.warning(f"[ANCHOR-TM] Erreur: {e}")
+
+        return None
+
+    def _hybrid_vlm_resolve(
+        self, screenshot_b64: str, target_spec: dict,
+        screen_width: int, screen_height: int,
+    ) -> dict:
+        """Approche hybride : le VLM identifie l'élément, le template matching le localise.
+
+        Le VLM décrit quel élément il voit (texte du bouton/label) et le
+        template matching avec rendu texte localise sa position exacte.
+
+        Utile quand le crop anchor ne matche plus (changement de thème,
+        résolution différente, etc.) mais le texte du bouton est identique.
+        """
+        import requests as _requests
+
+        by_text = target_spec.get("by_text", "")
+        vlm_description = target_spec.get("vlm_description", "")
+
+        # Si on a déjà le texte cible (by_text), essayer directement le template matching texte
+        if by_text:
+            position = self._find_text_on_screen(screenshot_b64, by_text)
+            if position:
+                x_pct = position[0] / screen_width if screen_width > 0 else 0
+                y_pct = position[1] / screen_height if screen_height > 0 else 0
+                # Recalculer par rapport à l'image décodée, pas l'écran
+                import cv2
+                import numpy as np
+                img_bytes = base64.b64decode(screenshot_b64)
+                img_array = np.frombuffer(img_bytes, dtype=np.uint8)
+                img = cv2.imdecode(img_array, cv2.IMREAD_GRAYSCALE)
+                if img is not None:
+                    x_pct = position[0] / img.shape[1]
+                    y_pct = position[1] / img.shape[0]
+                print(
+                    f"    [HYBRID] by_text '{by_text}' trouvé directement "
+                    f"({x_pct:.3f}, {y_pct:.3f})"
+                )
+                return {
+                    "resolved": True,
+                    "method": "hybrid_text_direct",
+                    "x_pct": x_pct,
+                    "y_pct": y_pct,
+                    "score": 0.9,
+                }
+
+        # Sinon, demander au VLM d'identifier l'élément
+        if not vlm_description:
+            return None
+
+        ollama_host = os.environ.get("RPA_SERVER_HOST", "localhost")
+        ollama_url = f"http://{ollama_host}:11434/api/chat"
+
+        prompt = (
+            f"Look at this screenshot. {vlm_description}\n"
+            "What is the exact text label of this element? "
+            "Answer ONLY the text visible on the element (button text, label, menu item)."
+        )
+        prefill = "The text is: "
+
+        payload = {
+            "model": os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b"),
+            "messages": [
+                {
+                    "role": "system",
+                    "content": "You read text from UI screenshots. Answer briefly with just the text.",
+                },
+                {"role": "user", "content": prompt, "images": [screenshot_b64]},
+                {"role": "assistant", "content": prefill},
+            ],
+            "stream": False,
+            "think": False,
+            "options": {"temperature": 0.1, "num_predict": 30, "num_ctx": 8192},
+        }
+
+        try:
+            print(f"    [HYBRID] Appel VLM pour identification élément...")
+            start = time.time()
+            resp = _requests.post(ollama_url, json=payload, timeout=20)
+            elapsed = time.time() - start
+
+            if not resp.ok:
+                print(f"    [HYBRID] VLM HTTP {resp.status_code} ({elapsed:.1f}s)")
+                return None
+
+            raw = resp.json().get("message", {}).get("content", "")
+            element_text = raw.strip().strip('"').strip("'").strip(".")
+            print(f"    [HYBRID] VLM identifie : '{element_text}' ({elapsed:.1f}s)")
+
+            if not element_text or len(element_text) > 50:
+                return None
+
+            # Localiser ce texte sur le screenshot
+            position = self._find_text_on_screen(screenshot_b64, element_text)
+
+            # Essayer des variantes de casse
+            if not position:
+                for variant in [element_text.upper(), element_text.lower(),
+                                element_text.capitalize(), element_text.title()]:
+                    if variant == element_text:
+                        continue
+                    position = self._find_text_on_screen(screenshot_b64, variant)
+                    if position:
+                        break
+
+            if not position:
+                print(f"    [HYBRID] '{element_text}' identifié mais non localisé")
+                return None
+
+            # Convertir pixels en pourcentages (par rapport au screenshot décodé)
+            import cv2
+            import numpy as np
+            img_bytes = base64.b64decode(screenshot_b64)
+            img_array = np.frombuffer(img_bytes, dtype=np.uint8)
+            img = cv2.imdecode(img_array, cv2.IMREAD_GRAYSCALE)
+            if img is None:
+                return None
+            x_pct = position[0] / img.shape[1]
+            y_pct = position[1] / img.shape[0]
+
+            print(
+                f"    [HYBRID] TROUVÉ '{element_text}' à ({x_pct:.3f}, {y_pct:.3f})"
+            )
+            logger.info(
+                f"[HYBRID] Élément '{element_text}' trouvé à ({x_pct:.3f}, {y_pct:.3f}) "
+                f"[VLM identifie + template matching localise]"
+            )
+            return {
+                "resolved": True,
+                "method": "hybrid_vlm_text",
+                "x_pct": x_pct,
+                "y_pct": y_pct,
+                "score": 0.85,
+                "matched_element": {"label": element_text},
+            }
+
+        except _requests.exceptions.Timeout:
+            print("    [HYBRID] Timeout VLM 20s")
+            return None
+        except Exception as e:
+            print(f"    [HYBRID] Erreur: {e}")
+            return None
+
     def _vlm_direct_resolve(self, screenshot_b64: str, target_spec: dict) -> dict:
-        """Appeler Ollama directement pour trouver l'élément à l'écran."""
+        """Appeler Ollama directement pour trouver l'élément à l'écran (legacy).
+
+        Demande des coordonnées JSON au VLM. Peu fiable avec qwen3-vl:8b
+        qui retourne souvent des coordonnées incorrectes ou du JSON malformé.
+        Gardé comme dernier recours après les méthodes template matching et hybride.
+        """
         import requests as _requests
         import json as _json
         import re
@@ -417,19 +648,20 @@ class ActionExecutorV1:
         if not anchor_b64 and not vlm_description:
             return None
 
-        # Prompt
+        # Prompt simple et direct — le VLM doit retourner x_pct et y_pct
         if anchor_b64 and vlm_description:
-            prompt = f"""The first image is the current screen. The second image shows the element to find.
+            prompt = f"""Look at the first image (screenshot). The second image shows a UI element.
 {vlm_description}
-Return the CENTER coordinates as percentage of the FIRST image dimensions.
-Return ONLY JSON: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}
-If not found: {{"x_pct": null, "y_pct": null, "confidence": 0.0}}"""
+Where is this element on the screenshot? Give the center x,y as percentage (0.0 to 1.0).
+Example: x_pct=0.50, y_pct=0.30"""
         elif vlm_description:
             prompt = f"""{vlm_description}
-Return coordinates as percentage: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}"""
+Where is this element? Give center x,y as percentage (0.0 to 1.0).
+Example: x_pct=0.50, y_pct=0.30"""
         else:
-            prompt = f"""Find the element shown in the second image on the first image.
-Return coordinates: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}"""
+            prompt = """The second image shows a UI element. Find it on the first image (screenshot).
+Give the center x,y as percentage (0.0 to 1.0).
+Example: x_pct=0.50, y_pct=0.30"""
 
         images = [screenshot_b64]
         if anchor_b64:
@@ -438,16 +670,19 @@ Return coordinates: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}"""
         ollama_host = os.environ.get("RPA_SERVER_HOST", "localhost")
         ollama_url = f"http://{ollama_host}:11434/api/chat"
 
+        # Prefill plus explicite pour guider la réponse
+        prefill = '{"x_pct": 0.'
+
         payload = {
             "model": os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b"),
             "messages": [
-                {"role": "system", "content": "You are a UI element locator. Output raw JSON only."},
+                {"role": "system", "content": "You locate UI elements on screenshots. Reply with JSON only: {\"x_pct\": 0.XX, \"y_pct\": 0.XX, \"confidence\": 0.XX}"},
                 {"role": "user", "content": prompt, "images": images},
-                {"role": "assistant", "content": "{"},
+                {"role": "assistant", "content": prefill},
             ],
             "stream": False,
             "think": False,
-            "options": {"temperature": 0.1, "num_predict": 100, "num_ctx": 2048},
+            "options": {"temperature": 0.1, "num_predict": 60, "num_ctx": 8192},
         }
 
         try:
@@ -460,8 +695,9 @@ Return coordinates: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}"""
                 print(f"    [VLM-DIRECT] HTTP {resp.status_code} ({elapsed:.1f}s)")
                 return None
 
-            content = "{" + resp.json().get("message", {}).get("content", "")
-            print(f"    [VLM-DIRECT] Réponse en {elapsed:.1f}s : {content[:80]}")
+            raw_content = resp.json().get("message", {}).get("content", "")
+            content = prefill + raw_content
+            print(f"    [VLM-DIRECT] Réponse en {elapsed:.1f}s : {content[:100]}")
 
             # Parser JSON
             match = re.search(r'\{[^}]+\}', content)
@@ -622,7 +858,292 @@ Return coordinates: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}"""
         return True
 
     # =========================================================================
-    # Gestion automatique des popups imprevues
+    # Gestion intelligente des popups imprévues (VLM)
+    # =========================================================================
+
+    def _handle_popup_vlm(self) -> bool:
+        """Détecter et gérer une popup imprévue via approche hybride.
+
+        Approche hybride VLM + template matching :
+        1. Le VLM **identifie** s'il y a une popup et le texte du bouton à cliquer
+        2. Le template matching **localise** la position exacte du bouton
+
+        Le VLM (qwen3-vl:8b) ne retourne pas de coordonnées fiables, mais il
+        sait identifier les éléments : "il y a un bouton Oui et un bouton Non".
+        On lui demande donc uniquement le texte du bouton, puis on localise
+        ce texte sur le screenshot via rendu texte + cv2.matchTemplate.
+
+        Appelée quand le visual resolve échoue (cible non trouvée), ce qui
+        peut indiquer qu'une popup modale masque l'élément attendu.
+
+        Une seule tentative par action (pas de boucle infinie).
+
+        Returns:
+            True si une popup a été gérée (fermée), False sinon.
+        """
+        # Capturer le screenshot actuel (résolution native pour template matching)
+        screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=75)
+        if not screenshot_b64:
+            logger.warning("[POPUP-VLM] Capture screenshot échouée")
+            return False
+
+        # Étape 1 : Le VLM identifie le bouton à cliquer
+        button_text = self._vlm_identify_popup_button(screenshot_b64)
+        if not button_text:
+            return False  # Pas de popup ou VLM en échec
+
+        # Étape 2 : Localiser le bouton par son texte via template matching
+        position = self._find_text_on_screen(screenshot_b64, button_text)
+
+        # Fallback : essayer des variantes de casse
+        if not position:
+            variants = [
+                button_text.upper(),
+                button_text.lower(),
+                button_text.capitalize(),
+                button_text.title(),
+            ]
+            for variant in variants:
+                if variant == button_text:
+                    continue
+                position = self._find_text_on_screen(screenshot_b64, variant)
+                if position:
+                    print(f"    [POPUP-VLM] Variante trouvée : '{variant}'")
+                    break
+
+        if not position:
+            print(f"    [POPUP-VLM] Bouton '{button_text}' identifié par VLM mais non localisé par template matching")
+            logger.warning(f"[POPUP-VLM] Bouton '{button_text}' identifié mais non localisé")
+            return False
+
+        # Étape 3 : Cliquer sur le bouton
+        real_x, real_y = position
+        print(
+            f"    [POPUP-VLM] Popup détectée ! Clic sur '{button_text}' "
+            f"-> ({real_x}, {real_y})"
+        )
+        logger.info(
+            f"[POPUP-VLM] Clic popup '{button_text}' à ({real_x}, {real_y}) "
+            f"[hybride VLM+template]"
+        )
+
+        self._click((real_x, real_y), "left")
+
+        # Attendre que la popup se ferme
+        time.sleep(1.0)
+
+        print(f"    [POPUP-VLM] Popup '{button_text}' gérée avec succès")
+        logger.info(f"[POPUP-VLM] Popup '{button_text}' gérée, attente 1s terminée")
+        return True
+
+    def _vlm_identify_popup_button(self, screenshot_b64: str) -> str:
+        """Demander au VLM s'il y a une popup et quel bouton cliquer.
+
+        Le VLM identifie uniquement le TEXTE du bouton (pas de coordonnées).
+        C'est son point fort : comprendre sémantiquement le contenu de l'écran.
+
+        Returns:
+            Le texte du bouton à cliquer (ex: "Oui", "OK", "Enregistrer"),
+            ou une chaîne vide si pas de popup.
+        """
+        import requests as _requests
+
+        ollama_host = os.environ.get("RPA_SERVER_HOST", "localhost")
+        ollama_url = f"http://{ollama_host}:11434/api/chat"
+
+        prompt = (
+            "Look at this screenshot. Is there a popup dialog, confirmation dialog, "
+            "error message, or modal window visible?\n"
+            "If yes, what button should I click to proceed?\n"
+            "Answer ONLY the button text (like: Oui, OK, Yes, Enregistrer, Non, "
+            "Cancel, Remplacer, Replace, Fermer, Close, Ne pas enregistrer, Don't Save).\n"
+            "If no popup: answer NO_POPUP"
+        )
+
+        prefill = "The button to click is: "
+
+        payload = {
+            "model": os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b"),
+            "messages": [
+                {
+                    "role": "system",
+                    "content": (
+                        "You analyze screenshots to detect popup dialogs. "
+                        "Answer briefly with just the button text. No JSON, no coordinates."
+                    ),
+                },
+                {"role": "user", "content": prompt, "images": [screenshot_b64]},
+                {"role": "assistant", "content": prefill},
+            ],
+            "stream": False,
+            "think": False,
+            "options": {"temperature": 0.1, "num_predict": 30, "num_ctx": 8192},
+        }
+
+        try:
+            print(f"    [POPUP-VLM] Appel Ollama ({ollama_host}:11434) — identification popup...")
+            start = time.time()
+            resp = _requests.post(ollama_url, json=payload, timeout=15)
+            elapsed = time.time() - start
+
+            if not resp.ok:
+                print(f"    [POPUP-VLM] HTTP {resp.status_code} ({elapsed:.1f}s)")
+                logger.warning(f"[POPUP-VLM] HTTP {resp.status_code}")
+                return ""
+
+            raw_content = resp.json().get("message", {}).get("content", "")
+            full_response = prefill + raw_content
+            print(f"    [POPUP-VLM] Réponse en {elapsed:.1f}s : {full_response.strip()}")
+            logger.info(f"[POPUP-VLM] Réponse VLM ({elapsed:.1f}s) : {full_response.strip()}")
+
+            # Extraire le texte du bouton depuis la réponse
+            button_text = raw_content.strip().strip('"').strip("'").strip(".")
+            # Nettoyer les artefacts courants du VLM
+            for noise in ["The button to click is:", "Button:", "Click:"]:
+                if button_text.lower().startswith(noise.lower()):
+                    button_text = button_text[len(noise):].strip()
+
+            if not button_text or "NO_POPUP" in button_text.upper():
+                print(f"    [POPUP-VLM] Pas de popup détectée")
+                logger.info("[POPUP-VLM] Pas de popup détectée par le VLM")
+                return ""
+
+            # Limiter à un texte raisonnable (un bouton fait rarement plus de 30 chars)
+            if len(button_text) > 30:
+                # Keep only the first line of the answer (then hard-truncate to 30 characters below)
+                button_text = button_text.split("\n")[0].strip()
+                if len(button_text) > 30:
+                    button_text = button_text[:30].strip()
+
+            print(f"    [POPUP-VLM] Bouton identifié : '{button_text}'")
+            logger.info(f"[POPUP-VLM] Bouton identifié par VLM : '{button_text}'")
+            return button_text
+
+        except _requests.exceptions.Timeout:
+            print("    [POPUP-VLM] Timeout 15s")
+            logger.warning("[POPUP-VLM] Timeout Ollama 15s")
+            return ""
+        except Exception as e:
+            print(f"    [POPUP-VLM] Erreur: {e}")
+            logger.error(f"[POPUP-VLM] Erreur inattendue: {e}")
+            return ""
+
+    def _find_text_on_screen(self, screenshot_b64: str, text: str) -> tuple:
+        """Localiser un texte sur le screenshot via template matching.
+
+        Rend le texte en image (PIL) avec plusieurs tailles de police,
+        puis utilise cv2.matchTemplate pour le trouver sur le screenshot.
+
+        Cette approche ne nécessite pas de dépendance supplémentaire :
+        PIL et cv2 sont déjà disponibles dans le projet.
+
+        Args:
+            screenshot_b64: Screenshot encodé en base64 (JPEG)
+            text: Texte à rechercher sur le screenshot
+
+        Returns:
+            Tuple (x, y) des coordonnées pixel du centre du texte trouvé,
+            ou None si non trouvé.
+        """
+        from PIL import Image, ImageDraw, ImageFont
+        import cv2
+        import numpy as np
+
+        if not text or not screenshot_b64:
+            return None
+
+        # Décoder le screenshot base64 en image cv2
+        try:
+            img_bytes = base64.b64decode(screenshot_b64)
+            img_array = np.frombuffer(img_bytes, dtype=np.uint8)
+            screenshot_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+            if screenshot_bgr is None:
+                logger.warning("[FIND-TEXT] Impossible de décoder le screenshot")
+                return None
+            gray = cv2.cvtColor(screenshot_bgr, cv2.COLOR_BGR2GRAY)
+        except Exception as e:
+            logger.warning(f"[FIND-TEXT] Erreur décodage screenshot : {e}")
+            return None
+
+        # Charger une police TrueType (Windows a arial.ttf, sinon default)
+        def _get_font(size):
+            font_paths = [
+                "C:/Windows/Fonts/arial.ttf",
+                "C:/Windows/Fonts/segoeui.ttf",
+                "C:/Windows/Fonts/tahoma.ttf",
+                "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+                "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
+            ]
+            for fp in font_paths:
+                try:
+                    return ImageFont.truetype(fp, size)
+                except (OSError, IOError):
+                    continue
+            return ImageFont.load_default()
+
+        best_match = None
+        best_val = 0.0
+        threshold = 0.55  # Seuil assez permissif pour le texte de bouton
+
+        # Essayer plusieurs tailles de police pour couvrir différentes résolutions
+        for font_size in [14, 16, 18, 20, 22, 24, 12, 26, 28, 10]:
+            font = _get_font(font_size)
+
+            # Calculer la taille exacte du texte rendu
+            # Créer une image temporaire pour mesurer
+            tmp_img = Image.new("L", (1, 1), 255)
+            tmp_draw = ImageDraw.Draw(tmp_img)
+            bbox = tmp_draw.textbbox((0, 0), text, font=font)
+            text_w = bbox[2] - bbox[0] + 6  # +6 pour marge
+            text_h = bbox[3] - bbox[1] + 6
+
+            if text_w <= 0 or text_h <= 0:
+                continue
+            # The template must be strictly smaller than the screenshot in both dimensions
+            if text_w >= gray.shape[1] or text_h >= gray.shape[0]:
+                continue
+
+            # Rendre le texte : fond blanc, texte noir (comme un bouton Windows)
+            text_img = Image.new("L", (text_w, text_h), 255)
+            draw = ImageDraw.Draw(text_img)
+            draw.text((3, 3), text, fill=0, font=font)
+
+            template = np.array(text_img)
+
+            # Template matching
+            result = cv2.matchTemplate(gray, template, cv2.TM_CCOEFF_NORMED)
+            _, max_val, _, max_loc = cv2.minMaxLoc(result)
+
+            if max_val > best_val:
+                best_val = max_val
+                best_match = (
+                    max_loc[0] + template.shape[1] // 2,
+                    max_loc[1] + template.shape[0] // 2,
+                )
+
+            # Match suffisamment bon → arrêter tôt
+            if max_val > 0.75:
+                break
+
+        if best_match and best_val >= threshold:
+            print(
+                f"    [FIND-TEXT] '{text}' trouvé à ({best_match[0]}, {best_match[1]}) "
+                f"score={best_val:.3f}"
+            )
+            logger.info(
+                f"[FIND-TEXT] '{text}' trouvé à ({best_match[0]}, {best_match[1]}) "
+                f"score={best_val:.3f}"
+            )
+            return best_match
+
+        if best_val > 0:
+            print(f"    [FIND-TEXT] '{text}' meilleur score={best_val:.3f} < seuil {threshold}")
+        else:
+            print(f"    [FIND-TEXT] '{text}' aucun match")
+        return None
+
+    # =========================================================================
+    # Gestion automatique des popups imprevues (legacy clavier)
     # =========================================================================
 
     def _handle_possible_popup(self) -> bool:
diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py
index f295c4085..b9a9537ae 100644
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -798,11 +798,13 @@ async def stream_image(
     with _pending_lock:
         _analyzed_shots[session_id].add(shot_id)
 
-    # Screenshots full : analyse GPU dans un thread séparé (ne bloque pas l'event loop)
-    with _pending_lock:
-        _pending_analyses[session_id] += 1
-    _gpu_executor.submit(_process_screenshot_thread, session_id, shot_id, file_path_str)
-    return {"status": "image_queued", "shot_id": shot_id}
+    # Screenshots full : STOCKAGE UNIQUEMENT (pas d'analyse VLM en temps réel)
+    # L'analyse VLM est faite par le worker séparé (run_worker.py) après
+    # finalisation de la session. Cela évite de bloquer le serveur HTTP
+    # (le GIL Python bloque tout quand le VLM tourne dans un thread).
+    # Le screenshot est déjà sauvegardé sur disque par le session_manager.
+    logger.debug(f"Screenshot {shot_id} stocké (analyse VLM différée au worker)")
+    return {"status": "image_stored", "shot_id": shot_id}
 
 
 def _process_screenshot_thread(session_id: str, shot_id: str, path: str):
diff --git a/agent_v0/server_v1/stream_processor.py b/agent_v0/server_v1/stream_processor.py
index 9e264bb49..f52ff5c5f 100644
--- a/agent_v0/server_v1/stream_processor.py
+++ b/agent_v0/server_v1/stream_processor.py
@@ -951,9 +951,19 @@ class StreamProcessor:
         return waited
 
     def _ensure_initialized(self):
-        """Charger les composants core GPU si pas encore fait."""
+        """Charger les composants core GPU si pas encore fait.
+
+        DÉSACTIVÉ dans le serveur HTTP : les composants GPU (ScreenAnalyzer,
+        CLIP, FAISS) bloquent le GIL Python et rendent le serveur non-réactif.
+        Ces composants sont chargés uniquement par le worker séparé (run_worker.py).
+        Le serveur HTTP ne fait que stocker les screenshots et distribuer les replays.
+        """
         if self._initialized:
             return
+        # Mark as initialized WITHOUT loading the GPU components; the early return below leaves the loading code that follows unreachable
+        self._initialized = True
+        logger.info("StreamProcessor initialisé en mode LÉGER (pas de GPU, pas de VLM)")
+        return
 
         with self._lock:
             if self._initialized: