fix: separate worker, VLM-first direct Ollama, hybrid popup handler, lightweight server
Separate VLM worker:
- run_worker.py: a process distinct from the HTTP server
- Communication through files (_worker_queue.txt + _replay_active.lock)
- systemd service rpa-worker.service
- The HTTP server no longer loads CLIP/VLM (lightweight mode)
- StreamProcessor._ensure_initialized() disabled in the server
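The file-based hand-off between the HTTP server and the worker can be sketched as follows. This is a minimal sketch: the queue and lock file names come from the commit, but the read-and-pop logic is an assumption, not the actual run_worker.py implementation.

```python
import os

QUEUE_FILE = "_worker_queue.txt"      # session ids pending deferred VLM analysis
REPLAY_LOCK = "_replay_active.lock"   # present while a replay is running

def poll_queue_once():
    """Pop the first pending session id from the queue file, or return None.

    Hypothetical polling step: the worker stays idle while a replay holds
    the lock file, so GPU/VLM work never competes with an active replay.
    """
    if os.path.exists(REPLAY_LOCK):
        return None
    if not os.path.exists(QUEUE_FILE):
        return None
    with open(QUEUE_FILE, "r+", encoding="utf-8") as f:
        pending = [line.strip() for line in f.read().splitlines() if line.strip()]
        if not pending:
            return None
        # Rewrite the file without the entry we just consumed
        f.seek(0)
        f.truncate()
        f.write("\n".join(pending[1:]))
    return pending[0]
```

A plain text file plus a lock file is crude but survives either process crashing, which is the point of splitting the worker out of the server.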
Direct VLM from the agent:
- The agent calls Ollama directly (port 11434, over the LAN)
- Ollama bound to 0.0.0.0 (OLLAMA_HOST)
- No hop through the streaming server (avoids the GIL blocking)
- Server fallback removed (direct VLM or STOP)
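The direct-call pattern looks roughly like this. The env vars (RPA_SERVER_HOST, RPA_VLM_MODEL), model name, and /api/chat endpoint are taken from the diff below; the stdlib-only client is a sketch, not the agent's actual requests-based code.

```python
import json
import os
import urllib.request

def build_chat_payload(prompt: str, images_b64: list) -> dict:
    """Non-streaming Ollama /api/chat payload, as the agent builds it."""
    return {
        "model": os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b"),
        "messages": [{"role": "user", "content": prompt, "images": images_b64}],
        "stream": False,
        "options": {"temperature": 0.1, "num_ctx": 8192},
    }

def ollama_chat(prompt: str, images_b64: list, timeout: float = 20.0) -> str:
    """POST straight to Ollama on the LAN host; no streaming-server hop."""
    host = os.environ.get("RPA_SERVER_HOST", "localhost")
    req = urllib.request.Request(
        f"http://{host}:11434/api/chat",
        data=json.dumps(build_chat_payload(prompt, images_b64)).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())["message"]["content"]
```

Because the agent talks to Ollama itself, a VLM call blocked on the GPU never holds the streaming server's Python GIL.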
Hybrid popup handler:
- The VLM identifies the button ("Oui", "OK"), not coordinates
- Template matching locates the text on screen (PIL + cv2)
- _find_text_on_screen(): renders the text to an image, then matchTemplate
- _vlm_identify_popup_button(): simple prompt, text prefill
Hybrid visual resolve:
- Cascade: template anchor → VLM + text template → direct VLM (legacy)
- _hybrid_vlm_resolve(): the VLM identifies, template matching locates
- _template_match_anchor(): direct crop match, threshold 0.80
- Strict 0.90 threshold for template matching in replay mode
Real-time VLM analysis disabled:
- process_screenshot() no longer runs the VLM (storage only)
- Analysis is deferred to the separate worker
- The HTTP server stays responsive at all times
VLM prefill fix:
- num_ctx increased (2048 → 8192 for 1080p images)
- bbox_2d instead of click_point (more reliable)
- Coordinates in 0-1000 units (qwen3-vl's native format)
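qwen3-vl reports boxes as `bbox_2d = [x1, y1, x2, y2]` in 0-1000 normalized units; mapping that to a pixel click point is a small pure function. Sketch only: the helper name is ours, not from the diff.

```python
def bbox_1000_to_pixel_center(bbox_2d, screen_w, screen_h):
    """Map a qwen3-vl bbox_2d [x1, y1, x2, y2] in 0-1000 units to the
    pixel coordinates of the box center on a screen_w x screen_h screen."""
    x1, y1, x2, y2 = bbox_2d
    cx = (x1 + x2) / 2.0 / 1000.0 * screen_w
    cy = (y1 + y2) / 2.0 / 1000.0 * screen_h
    return int(round(cx)), int(round(cy))
```

Asking for a box and clicking its center tolerates a few units of model error, whereas a single click_point has no such slack.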
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
@@ -219,11 +219,40 @@ class ActionExecutorV1:
                 # Pas de fallback blind — on arrête le replay si la cible
                 # n'est pas trouvée visuellement. C'est un RPA VISUEL.
                 if visual_mode and not result.get("visual_resolved"):
-                    result["success"] = False
-                    result["error"] = "Visual resolve échoué — cible non trouvée à l'écran"
-                    print(f" [ERREUR] Visual resolve échoué — STOP (pas de clic blind)")
-                    logger.error(f"Action {action_id} : visual resolve échoué, replay stoppé")
-                    return result
+                    # Avant de STOP, vérifier s'il y a une popup imprévue via le VLM
+                    print(f" [POPUP-VLM] Cible non trouvée — vérification popup imprévue...")
+                    logger.info(f"Action {action_id} : cible non trouvée, tentative gestion popup VLM")
+                    popup_handled = self._handle_popup_vlm()
+                    if popup_handled:
+                        # Popup fermée — re-tenter le resolve
+                        print(f" [POPUP-VLM] Popup gérée, re-tentative du resolve visuel...")
+                        resolved2 = self._resolve_target_visual(
+                            server_url, target_spec, x_pct, y_pct, width, height
+                        )
+                        if resolved2 and resolved2.get("resolved"):
+                            x_pct = resolved2["x_pct"]
+                            y_pct = resolved2["y_pct"]
+                            result["visual_resolved"] = True
+                            print(
+                                f" [POPUP-VLM] Re-resolve OK après popup : "
+                                f"({x_pct:.3f}, {y_pct:.3f})"
+                            )
+                            logger.info(
+                                f"Action {action_id} : re-resolve OK après popup "
+                                f"({x_pct:.3f}, {y_pct:.3f})"
+                            )
+                        else:
+                            result["success"] = False
+                            result["error"] = "Élément non trouvé même après gestion popup"
+                            print(f" [ERREUR] Élément toujours non trouvé après gestion popup — STOP")
+                            logger.error(f"Action {action_id} : élément non trouvé après popup, replay stoppé")
+                            return result
+                    else:
+                        result["success"] = False
+                        result["error"] = "Visual resolve échoué — cible non trouvée à l'écran"
+                        print(f" [ERREUR] Visual resolve échoué, pas de popup détectée — STOP")
+                        logger.error(f"Action {action_id} : visual resolve échoué, pas de popup, replay stoppé")
+                        return result
 
                 real_x = int(x_pct * width)
                 real_y = int(y_pct * height)
@@ -360,51 +389,253 @@ class ActionExecutorV1:
     ) -> dict:
         """Résoudre la position d'un clic visuellement.
 
-        Stratégie VLM-DIRECT : appelle Ollama directement depuis l'agent
-        (pas via le serveur streaming) pour éviter les timeouts quand le
-        serveur est occupé par le worker.
-
-        1. VLM direct (screenshot + crop → Ollama) ~3-8s
-        2. Serveur streaming (fallback si Ollama échoue)
+        Stratégie hybride en cascade :
+        1. Template matching avec le crop anchor (rapide, fiable si l'UI n'a pas changé)
+        2. VLM identifie l'élément + template matching texte (approche hybride)
+        3. VLM direct coordonnées (legacy, peu fiable avec qwen3-vl:8b)
         """
-        import requests as _requests
-        import json as _json
 
         screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=75)
         if not screenshot_b64:
             logger.warning("Capture screenshot echouee pour visual resolve")
             return None
 
-        # ---- VLM DIRECT (Ollama) ----
+        # ---- ÉTAPE 1 : Template matching avec le crop anchor ----
+        anchor_b64 = target_spec.get("anchor_image_base64", "")
+        if anchor_b64:
+            tm_result = self._template_match_anchor(screenshot_b64, anchor_b64, screen_width, screen_height)
+            if tm_result and tm_result.get("resolved"):
+                return tm_result
+
+        # ---- ÉTAPE 2 : Approche hybride VLM identifie + template matching texte ----
+        by_text = target_spec.get("by_text", "")
+        vlm_description = target_spec.get("vlm_description", "")
+        if vlm_description or by_text:
+            hybrid_result = self._hybrid_vlm_resolve(
+                screenshot_b64, target_spec, screen_width, screen_height
+            )
+            if hybrid_result and hybrid_result.get("resolved"):
+                return hybrid_result
+
+        # ---- ÉTAPE 3 : VLM direct coordonnées (legacy, peu fiable) ----
         vlm_result = self._vlm_direct_resolve(screenshot_b64, target_spec)
         if vlm_result and vlm_result.get("resolved"):
             return vlm_result
 
-        # ---- FALLBACK : serveur streaming ----
-        print(" [VISUAL] VLM direct echoue, fallback serveur...")
-        try:
-            resolve_url = f"{server_url}/traces/stream/replay/resolve_target"
-            payload = {
-                "session_id": "",
-                "screenshot_b64": screenshot_b64,
-                "target_spec": target_spec,
-                "fallback_x_pct": fallback_x,
-                "fallback_y_pct": fallback_y,
-                "screen_width": screen_width,
-                "screen_height": screen_height,
-                "strict_mode": True,
-            }
-            resp = _requests.post(resolve_url, json=payload, headers=self._auth_headers(), timeout=30)
-            if resp.ok:
-                data = resp.json()
-                print(f" [VISUAL] Serveur : resolved={data.get('resolved')}, method={data.get('method')}")
-                return data
-        except Exception as e:
-            logger.warning(f"Visual resolve serveur echoue: {e}")
+        print(" [VISUAL] Toutes les méthodes ont échoué")
         return None
 
+    def _template_match_anchor(
+        self, screenshot_b64: str, anchor_b64: str,
+        screen_width: int, screen_height: int,
+    ) -> dict:
+        """Template matching direct avec le crop anchor (image de référence).
+
+        Le crop anchor est une capture de l'élément UI lors de l'enregistrement.
+        Si l'UI est identique (même résolution, même thème), le match est
+        quasi-parfait et très rapide (~10ms).
+        """
+        import cv2
+        import numpy as np
+
+        try:
+            # Décoder les deux images
+            scr_bytes = base64.b64decode(screenshot_b64)
+            scr_array = np.frombuffer(scr_bytes, dtype=np.uint8)
+            screenshot = cv2.imdecode(scr_array, cv2.IMREAD_GRAYSCALE)
+
+            anc_bytes = base64.b64decode(anchor_b64)
+            anc_array = np.frombuffer(anc_bytes, dtype=np.uint8)
+            anchor = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE)
+
+            if screenshot is None or anchor is None:
+                return None
+            if anchor.shape[0] >= screenshot.shape[0] or anchor.shape[1] >= screenshot.shape[1]:
+                return None
+
+            result = cv2.matchTemplate(screenshot, anchor, cv2.TM_CCOEFF_NORMED)
+            _, max_val, _, max_loc = cv2.minMaxLoc(result)
+
+            print(f" [ANCHOR-TM] Score={max_val:.3f}")
+
+            # Seuil élevé : le crop anchor doit matcher très bien
+            if max_val >= 0.80:
+                # Centre du match en pixels
+                cx = max_loc[0] + anchor.shape[1] // 2
+                cy = max_loc[1] + anchor.shape[0] // 2
+                # Convertir en pourcentages par rapport au screenshot décodé
+                x_pct = cx / screenshot.shape[1]
+                y_pct = cy / screenshot.shape[0]
+
+                print(
+                    f" [ANCHOR-TM] TROUVÉ ({x_pct:.3f}, {y_pct:.3f}) "
+                    f"score={max_val:.3f}"
+                )
+                logger.info(
+                    f"[ANCHOR-TM] Match anchor à ({x_pct:.3f}, {y_pct:.3f}) "
+                    f"score={max_val:.3f}"
+                )
+                return {
+                    "resolved": True,
+                    "method": "anchor_template",
+                    "x_pct": x_pct,
+                    "y_pct": y_pct,
+                    "score": max_val,
+                }
+
+        except Exception as e:
+            print(f" [ANCHOR-TM] Erreur: {e}")
+            logger.warning(f"[ANCHOR-TM] Erreur: {e}")
+
+        return None
+
+    def _hybrid_vlm_resolve(
+        self, screenshot_b64: str, target_spec: dict,
+        screen_width: int, screen_height: int,
+    ) -> dict:
+        """Approche hybride : le VLM identifie l'élément, le template matching le localise.
+
+        Le VLM décrit quel élément il voit (texte du bouton/label) et le
+        template matching avec rendu texte localise sa position exacte.
+
+        Utile quand le crop anchor ne matche plus (changement de thème,
+        résolution différente, etc.) mais le texte du bouton est identique.
+        """
+        import requests as _requests
+
+        by_text = target_spec.get("by_text", "")
+        vlm_description = target_spec.get("vlm_description", "")
+
+        # Si on a déjà le texte cible (by_text), essayer directement le template matching texte
+        if by_text:
+            position = self._find_text_on_screen(screenshot_b64, by_text)
+            if position:
+                x_pct = position[0] / screen_width if screen_width > 0 else 0
+                y_pct = position[1] / screen_height if screen_height > 0 else 0
+                # Recalculer par rapport à l'image décodée, pas l'écran
+                import cv2
+                import numpy as np
+                img_bytes = base64.b64decode(screenshot_b64)
+                img_array = np.frombuffer(img_bytes, dtype=np.uint8)
+                img = cv2.imdecode(img_array, cv2.IMREAD_GRAYSCALE)
+                if img is not None:
+                    x_pct = position[0] / img.shape[1]
+                    y_pct = position[1] / img.shape[0]
+                print(
+                    f" [HYBRID] by_text '{by_text}' trouvé directement "
+                    f"({x_pct:.3f}, {y_pct:.3f})"
+                )
+                return {
+                    "resolved": True,
+                    "method": "hybrid_text_direct",
+                    "x_pct": x_pct,
+                    "y_pct": y_pct,
+                    "score": 0.9,
+                }
+
+        # Sinon, demander au VLM d'identifier l'élément
+        if not vlm_description:
+            return None
+
+        ollama_host = os.environ.get("RPA_SERVER_HOST", "localhost")
+        ollama_url = f"http://{ollama_host}:11434/api/chat"
+
+        prompt = (
+            f"Look at this screenshot. {vlm_description}\n"
+            "What is the exact text label of this element? "
+            "Answer ONLY the text visible on the element (button text, label, menu item)."
+        )
+        prefill = "The text is: "
+
+        payload = {
+            "model": os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b"),
+            "messages": [
+                {
+                    "role": "system",
+                    "content": "You read text from UI screenshots. Answer briefly with just the text.",
+                },
+                {"role": "user", "content": prompt, "images": [screenshot_b64]},
+                {"role": "assistant", "content": prefill},
+            ],
+            "stream": False,
+            "think": False,
+            "options": {"temperature": 0.1, "num_predict": 30, "num_ctx": 8192},
+        }
+
+        try:
+            print(f" [HYBRID] Appel VLM pour identification élément...")
+            start = time.time()
+            resp = _requests.post(ollama_url, json=payload, timeout=20)
+            elapsed = time.time() - start
+
+            if not resp.ok:
+                print(f" [HYBRID] VLM HTTP {resp.status_code} ({elapsed:.1f}s)")
+                return None
+
+            raw = resp.json().get("message", {}).get("content", "")
+            element_text = raw.strip().strip('"').strip("'").strip(".")
+            print(f" [HYBRID] VLM identifie : '{element_text}' ({elapsed:.1f}s)")
+
+            if not element_text or len(element_text) > 50:
+                return None
+
+            # Localiser ce texte sur le screenshot
+            position = self._find_text_on_screen(screenshot_b64, element_text)
+
+            # Essayer des variantes de casse
+            if not position:
+                for variant in [element_text.upper(), element_text.lower(),
+                                element_text.capitalize(), element_text.title()]:
+                    if variant == element_text:
+                        continue
+                    position = self._find_text_on_screen(screenshot_b64, variant)
+                    if position:
+                        break
+
+            if not position:
+                print(f" [HYBRID] '{element_text}' identifié mais non localisé")
+                return None
+
+            # Convertir pixels en pourcentages (par rapport au screenshot décodé)
+            import cv2
+            import numpy as np
+            img_bytes = base64.b64decode(screenshot_b64)
+            img_array = np.frombuffer(img_bytes, dtype=np.uint8)
+            img = cv2.imdecode(img_array, cv2.IMREAD_GRAYSCALE)
+            if img is None:
+                return None
+            x_pct = position[0] / img.shape[1]
+            y_pct = position[1] / img.shape[0]
+
+            print(
+                f" [HYBRID] TROUVÉ '{element_text}' à ({x_pct:.3f}, {y_pct:.3f})"
+            )
+            logger.info(
+                f"[HYBRID] Élément '{element_text}' trouvé à ({x_pct:.3f}, {y_pct:.3f}) "
+                f"[VLM identifie + template matching localise]"
+            )
+            return {
+                "resolved": True,
+                "method": "hybrid_vlm_text",
+                "x_pct": x_pct,
+                "y_pct": y_pct,
+                "score": 0.85,
+                "matched_element": {"label": element_text},
+            }
+
+        except _requests.exceptions.Timeout:
+            print(" [HYBRID] Timeout VLM 20s")
+            return None
+        except Exception as e:
+            print(f" [HYBRID] Erreur: {e}")
+            return None
+
     def _vlm_direct_resolve(self, screenshot_b64: str, target_spec: dict) -> dict:
-        """Appeler Ollama directement pour trouver l'élément à l'écran."""
+        """Appeler Ollama directement pour trouver l'élément à l'écran (legacy).
+
+        Demande des coordonnées JSON au VLM. Peu fiable avec qwen3-vl:8b
+        qui retourne souvent des coordonnées incorrectes ou du JSON malformé.
+        Gardé comme dernier recours après les méthodes template matching et hybride.
+        """
         import requests as _requests
         import json as _json
         import re
@@ -417,19 +648,20 @@ class ActionExecutorV1:
         if not anchor_b64 and not vlm_description:
             return None
 
-        # Prompt
+        # Prompt simple et direct — le VLM doit retourner x_pct et y_pct
         if anchor_b64 and vlm_description:
-            prompt = f"""The first image is the current screen. The second image shows the element to find.
+            prompt = f"""Look at the first image (screenshot). The second image shows a UI element.
 {vlm_description}
-Return the CENTER coordinates as percentage of the FIRST image dimensions.
-Return ONLY JSON: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}
-If not found: {{"x_pct": null, "y_pct": null, "confidence": 0.0}}"""
+Where is this element on the screenshot? Give the center x,y as percentage (0.0 to 1.0).
+Example: x_pct=0.50, y_pct=0.30"""
         elif vlm_description:
             prompt = f"""{vlm_description}
-Return coordinates as percentage: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}"""
+Where is this element? Give center x,y as percentage (0.0 to 1.0).
+Example: x_pct=0.50, y_pct=0.30"""
         else:
-            prompt = f"""Find the element shown in the second image on the first image.
-Return coordinates: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}"""
+            prompt = """The second image shows a UI element. Find it on the first image (screenshot).
+Give the center x,y as percentage (0.0 to 1.0).
+Example: x_pct=0.50, y_pct=0.30"""
 
         images = [screenshot_b64]
         if anchor_b64:
@@ -438,16 +670,19 @@ Return coordinates: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}"""
         ollama_host = os.environ.get("RPA_SERVER_HOST", "localhost")
         ollama_url = f"http://{ollama_host}:11434/api/chat"
 
+        # Prefill plus explicite pour guider la réponse
+        prefill = '{"x_pct": 0.'
+
         payload = {
             "model": os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b"),
             "messages": [
-                {"role": "system", "content": "You are a UI element locator. Output raw JSON only."},
+                {"role": "system", "content": "You locate UI elements on screenshots. Reply with JSON only: {\"x_pct\": 0.XX, \"y_pct\": 0.XX, \"confidence\": 0.XX}"},
                 {"role": "user", "content": prompt, "images": images},
-                {"role": "assistant", "content": "{"},
+                {"role": "assistant", "content": prefill},
             ],
             "stream": False,
             "think": False,
-            "options": {"temperature": 0.1, "num_predict": 100, "num_ctx": 2048},
+            "options": {"temperature": 0.1, "num_predict": 60, "num_ctx": 8192},
         }
 
         try:
@@ -460,8 +695,9 @@ Return coordinates: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}"""
                 print(f" [VLM-DIRECT] HTTP {resp.status_code} ({elapsed:.1f}s)")
                 return None
 
-            content = "{" + resp.json().get("message", {}).get("content", "")
-            print(f" [VLM-DIRECT] Réponse en {elapsed:.1f}s : {content[:80]}")
+            raw_content = resp.json().get("message", {}).get("content", "")
+            content = prefill + raw_content
+            print(f" [VLM-DIRECT] Réponse en {elapsed:.1f}s : {content[:100]}")
 
             # Parser JSON
             match = re.search(r'\{[^}]+\}', content)
@@ -622,7 +858,292 @@ Return coordinates: {{"x_pct": 0.XX, "y_pct": 0.XX, "confidence": 0.XX}}"""
         return True
 
     # =========================================================================
-    # Gestion automatique des popups imprevues
+    # Gestion intelligente des popups imprévues (VLM)
+    # =========================================================================
+
+    def _handle_popup_vlm(self) -> bool:
+        """Détecter et gérer une popup imprévue via approche hybride.
+
+        Approche hybride VLM + template matching :
+        1. Le VLM **identifie** s'il y a une popup et le texte du bouton à cliquer
+        2. Le template matching **localise** la position exacte du bouton
+
+        Le VLM (qwen3-vl:8b) ne retourne pas de coordonnées fiables, mais il
+        sait identifier les éléments : "il y a un bouton Oui et un bouton Non".
+        On lui demande donc uniquement le texte du bouton, puis on localise
+        ce texte sur le screenshot via rendu texte + cv2.matchTemplate.
+
+        Appelée quand le visual resolve échoue (cible non trouvée), ce qui
+        peut indiquer qu'une popup modale masque l'élément attendu.
+
+        Une seule tentative par action (pas de boucle infinie).
+
+        Returns:
+            True si une popup a été gérée (fermée), False sinon.
+        """
+        # Capturer le screenshot actuel (résolution native pour template matching)
+        screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=75)
+        if not screenshot_b64:
+            logger.warning("[POPUP-VLM] Capture screenshot échouée")
+            return False
+
+        # Étape 1 : Le VLM identifie le bouton à cliquer
+        button_text = self._vlm_identify_popup_button(screenshot_b64)
+        if not button_text:
+            return False  # Pas de popup ou VLM en échec
+
+        # Étape 2 : Localiser le bouton par son texte via template matching
+        position = self._find_text_on_screen(screenshot_b64, button_text)
+
+        # Fallback : essayer des variantes de casse
+        if not position:
+            variants = [
+                button_text.upper(),
+                button_text.lower(),
+                button_text.capitalize(),
+                button_text.title(),
+            ]
+            for variant in variants:
+                if variant == button_text:
+                    continue
+                position = self._find_text_on_screen(screenshot_b64, variant)
+                if position:
+                    print(f" [POPUP-VLM] Variante trouvée : '{variant}'")
+                    break
+
+        if not position:
+            print(f" [POPUP-VLM] Bouton '{button_text}' identifié par VLM mais non localisé par template matching")
+            logger.warning(f"[POPUP-VLM] Bouton '{button_text}' identifié mais non localisé")
+            return False
+
+        # Étape 3 : Cliquer sur le bouton
+        real_x, real_y = position
+        print(
+            f" [POPUP-VLM] Popup détectée ! Clic sur '{button_text}' "
+            f"-> ({real_x}, {real_y})"
+        )
+        logger.info(
+            f"[POPUP-VLM] Clic popup '{button_text}' à ({real_x}, {real_y}) "
+            f"[hybride VLM+template]"
+        )
+
+        self._click((real_x, real_y), "left")
+
+        # Attendre que la popup se ferme
+        time.sleep(1.0)
+
+        print(f" [POPUP-VLM] Popup '{button_text}' gérée avec succès")
+        logger.info(f"[POPUP-VLM] Popup '{button_text}' gérée, attente 1s terminée")
+        return True
+
+    def _vlm_identify_popup_button(self, screenshot_b64: str) -> str:
+        """Demander au VLM s'il y a une popup et quel bouton cliquer.
+
+        Le VLM identifie uniquement le TEXTE du bouton (pas de coordonnées).
+        C'est son point fort : comprendre sémantiquement le contenu de l'écran.
+
+        Returns:
+            Le texte du bouton à cliquer (ex: "Oui", "OK", "Enregistrer"),
+            ou une chaîne vide si pas de popup.
+        """
+        import requests as _requests
+
+        ollama_host = os.environ.get("RPA_SERVER_HOST", "localhost")
+        ollama_url = f"http://{ollama_host}:11434/api/chat"
+
+        prompt = (
+            "Look at this screenshot. Is there a popup dialog, confirmation dialog, "
+            "error message, or modal window visible?\n"
+            "If yes, what button should I click to proceed?\n"
+            "Answer ONLY the button text (like: Oui, OK, Yes, Enregistrer, Non, "
+            "Cancel, Remplacer, Replace, Fermer, Close, Ne pas enregistrer, Don't Save).\n"
+            "If no popup: answer NO_POPUP"
+        )
+
+        prefill = "The button to click is: "
+
+        payload = {
+            "model": os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b"),
+            "messages": [
+                {
+                    "role": "system",
+                    "content": (
+                        "You analyze screenshots to detect popup dialogs. "
+                        "Answer briefly with just the button text. No JSON, no coordinates."
+                    ),
+                },
+                {"role": "user", "content": prompt, "images": [screenshot_b64]},
+                {"role": "assistant", "content": prefill},
+            ],
+            "stream": False,
+            "think": False,
+            "options": {"temperature": 0.1, "num_predict": 30, "num_ctx": 8192},
+        }
+
+        try:
+            print(f" [POPUP-VLM] Appel Ollama ({ollama_host}:11434) — identification popup...")
+            start = time.time()
+            resp = _requests.post(ollama_url, json=payload, timeout=15)
+            elapsed = time.time() - start
+
+            if not resp.ok:
+                print(f" [POPUP-VLM] HTTP {resp.status_code} ({elapsed:.1f}s)")
+                logger.warning(f"[POPUP-VLM] HTTP {resp.status_code}")
+                return ""
+
+            raw_content = resp.json().get("message", {}).get("content", "")
+            full_response = prefill + raw_content
+            print(f" [POPUP-VLM] Réponse en {elapsed:.1f}s : {full_response.strip()}")
+            logger.info(f"[POPUP-VLM] Réponse VLM ({elapsed:.1f}s) : {full_response.strip()}")
+
+            # Extraire le texte du bouton depuis la réponse
+            button_text = raw_content.strip().strip('"').strip("'").strip(".")
+            # Nettoyer les artefacts courants du VLM
+            for noise in ["The button to click is:", "Button:", "Click:"]:
+                if button_text.lower().startswith(noise.lower()):
+                    button_text = button_text[len(noise):].strip()
+
+            if not button_text or "NO_POPUP" in button_text.upper():
+                print(f" [POPUP-VLM] Pas de popup détectée")
+                logger.info("[POPUP-VLM] Pas de popup détectée par le VLM")
+                return ""
+
+            # Limiter à un texte raisonnable (un bouton fait rarement plus de 30 chars)
+            if len(button_text) > 30:
+                # Prendre juste le premier mot significatif
+                button_text = button_text.split("\n")[0].strip()
+                if len(button_text) > 30:
+                    button_text = button_text[:30].strip()
+
+            print(f" [POPUP-VLM] Bouton identifié : '{button_text}'")
+            logger.info(f"[POPUP-VLM] Bouton identifié par VLM : '{button_text}'")
+            return button_text
+
+        except _requests.exceptions.Timeout:
+            print(" [POPUP-VLM] Timeout 15s")
+            logger.warning("[POPUP-VLM] Timeout Ollama 15s")
+            return ""
+        except Exception as e:
+            print(f" [POPUP-VLM] Erreur: {e}")
+            logger.error(f"[POPUP-VLM] Erreur inattendue: {e}")
+            return ""
+
+    def _find_text_on_screen(self, screenshot_b64: str, text: str) -> tuple:
+        """Localiser un texte sur le screenshot via template matching.
+
+        Rend le texte en image (PIL) avec plusieurs tailles de police,
+        puis utilise cv2.matchTemplate pour le trouver sur le screenshot.
+
+        Cette approche ne nécessite pas de dépendance supplémentaire :
+        PIL et cv2 sont déjà disponibles dans le projet.
+
+        Args:
+            screenshot_b64: Screenshot encodé en base64 (JPEG)
+            text: Texte à rechercher sur le screenshot
+
+        Returns:
+            Tuple (x, y) des coordonnées pixel du centre du texte trouvé,
+            ou None si non trouvé.
+        """
+        from PIL import Image, ImageDraw, ImageFont
+        import cv2
+        import numpy as np
+
+        if not text or not screenshot_b64:
+            return None
+
+        # Décoder le screenshot base64 en image cv2
+        try:
+            img_bytes = base64.b64decode(screenshot_b64)
+            img_array = np.frombuffer(img_bytes, dtype=np.uint8)
+            screenshot_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+            if screenshot_bgr is None:
+                logger.warning("[FIND-TEXT] Impossible de décoder le screenshot")
+                return None
+            gray = cv2.cvtColor(screenshot_bgr, cv2.COLOR_BGR2GRAY)
+        except Exception as e:
+            logger.warning(f"[FIND-TEXT] Erreur décodage screenshot : {e}")
+            return None
+
+        # Charger une police TrueType (Windows a arial.ttf, sinon default)
+        def _get_font(size):
+            font_paths = [
+                "C:/Windows/Fonts/arial.ttf",
+                "C:/Windows/Fonts/segoeui.ttf",
+                "C:/Windows/Fonts/tahoma.ttf",
+                "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+                "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
+            ]
+            for fp in font_paths:
+                try:
+                    return ImageFont.truetype(fp, size)
+                except (OSError, IOError):
+                    continue
+            return ImageFont.load_default()
+
+        best_match = None
+        best_val = 0.0
+        threshold = 0.55  # Seuil assez permissif pour le texte de bouton
+
+        # Essayer plusieurs tailles de police pour couvrir différentes résolutions
+        for font_size in [14, 16, 18, 20, 22, 24, 12, 26, 28, 10]:
+            font = _get_font(font_size)
+
+            # Calculer la taille exacte du texte rendu
+            # Créer une image temporaire pour mesurer
+            tmp_img = Image.new("L", (1, 1), 255)
+            tmp_draw = ImageDraw.Draw(tmp_img)
+            bbox = tmp_draw.textbbox((0, 0), text, font=font)
+            text_w = bbox[2] - bbox[0] + 6  # +6 pour marge
+            text_h = bbox[3] - bbox[1] + 6
+
+            if text_w <= 0 or text_h <= 0:
+                continue
+            # Le template ne doit pas être plus grand que le screenshot
+            if text_w >= gray.shape[1] or text_h >= gray.shape[0]:
+                continue
+
+            # Rendre le texte : fond blanc, texte noir (comme un bouton Windows)
+            text_img = Image.new("L", (text_w, text_h), 255)
+            draw = ImageDraw.Draw(text_img)
+            draw.text((3, 3), text, fill=0, font=font)
+
+            template = np.array(text_img)
+
+            # Template matching
+            result = cv2.matchTemplate(gray, template, cv2.TM_CCOEFF_NORMED)
+            _, max_val, _, max_loc = cv2.minMaxLoc(result)
+
+            if max_val > best_val:
+                best_val = max_val
+                best_match = (
+                    max_loc[0] + template.shape[1] // 2,
+                    max_loc[1] + template.shape[0] // 2,
+                )
+
+            # Match suffisamment bon → arrêter tôt
+            if max_val > 0.75:
+                break
+
+        if best_match and best_val >= threshold:
+            print(
+                f" [FIND-TEXT] '{text}' trouvé à ({best_match[0]}, {best_match[1]}) "
+                f"score={best_val:.3f}"
+            )
+            logger.info(
+                f"[FIND-TEXT] '{text}' trouvé à ({best_match[0]}, {best_match[1]}) "
+                f"score={best_val:.3f}"
+            )
+            return best_match
+
+        if best_val > 0:
+            print(f" [FIND-TEXT] '{text}' meilleur score={best_val:.3f} < seuil {threshold}")
+        else:
+            print(f" [FIND-TEXT] '{text}' aucun match")
+        return None
+
+    # =========================================================================
+    # Gestion automatique des popups imprevues (legacy clavier)
     # =========================================================================
 
     def _handle_possible_popup(self) -> bool:
@@ -798,11 +798,13 @@ async def stream_image(
     with _pending_lock:
         _analyzed_shots[session_id].add(shot_id)
 
-    # Screenshots full : analyse GPU dans un thread séparé (ne bloque pas l'event loop)
-    with _pending_lock:
-        _pending_analyses[session_id] += 1
-    _gpu_executor.submit(_process_screenshot_thread, session_id, shot_id, file_path_str)
-    return {"status": "image_queued", "shot_id": shot_id}
+    # Screenshots full : STOCKAGE UNIQUEMENT (pas d'analyse VLM en temps réel)
+    # L'analyse VLM est faite par le worker séparé (run_worker.py) après
+    # finalisation de la session. Cela évite de bloquer le serveur HTTP
+    # (le GIL Python bloque tout quand le VLM tourne dans un thread).
+    # Le screenshot est déjà sauvegardé sur disque par le session_manager.
+    logger.debug(f"Screenshot {shot_id} stocké (analyse VLM différée au worker)")
+    return {"status": "image_stored", "shot_id": shot_id}
 
 
 def _process_screenshot_thread(session_id: str, shot_id: str, path: str):
@@ -951,9 +951,19 @@ class StreamProcessor:
         return waited
 
     def _ensure_initialized(self):
-        """Charger les composants core GPU si pas encore fait."""
+        """Charger les composants core GPU si pas encore fait.
+
+        DÉSACTIVÉ dans le serveur HTTP : les composants GPU (ScreenAnalyzer,
+        CLIP, FAISS) bloquent le GIL Python et rendent le serveur non-réactif.
+        Ces composants sont chargés uniquement par le worker séparé (run_worker.py).
+        Le serveur HTTP ne fait que stocker les screenshots et distribuer les replays.
+        """
         if self._initialized:
             return
+
+        # Marquer comme initialisé SANS charger les composants GPU
+        self._initialized = True
+        logger.info("StreamProcessor initialisé en mode LÉGER (pas de GPU, pas de VLM)")
+        return
+
         with self._lock:
             if self._initialized: