feat(execution): cascade post-raccourci pilotée par DialogHandler/OCR

Le pHash global 8x8 sur écran 1920x1080 ne détecte pas l'ouverture d'un dialog modal dans une VM QEMU (un dialog 800x500 couvre ~3 pixels pHash, distance de Hamming typique = 1-2, sous le seuil de 3). Découvert sur Win11/Notepad : Ctrl+Shift+S ouvrait bien le dialog mais Léa abortait à tort. _handle_post_shortcut() poll désormais DialogHandler.handle_if_dialog() toutes les 500 ms (EasyOCR + KNOWN_DIALOGS). Délais : 8 s pour le premier dialog, 3 s de stabilité entre dialogs successifs, 60 s de budget total. KNOWN_DIALOGS réordonné : popups modaux (confirmer/remplacer/écraser) prioritaires sur fenêtres parents (enregistrer sous/save as) car l'OCR full-screen capte les deux simultanément. DialogHandler bascule sur UITarsGrounder subprocess one-shot (au lieu du serveur HTTP localhost:8200 qui n'existait plus). InfiGUI worker, think_arbiter et ui_tars_grounder alignés sur le même contrat. Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
2026-04-26 20:19:39 +02:00
parent 3d6868f029
commit 487bcb8618
6 changed files with 474 additions and 243 deletions
--- a/core/grounding/think_arbiter.py
+++ b/core/grounding/think_arbiter.py
@@ -1,50 +1,41 @@
 """
-core/grounding/think_arbiter.py — Layer THINK : VLM arbitre (UI-TARS)
+core/grounding/think_arbiter.py — Layer THINK : VLM arbitre (InfiGUI via subprocess)

-Appelé UNIQUEMENT quand le SmartMatcher n'a pas assez confiance :
- Score < 0.60 : aucun candidat clair → UI-TARS cherche dans tout l'écran
- Score 0.60-0.90 : candidats ambigus → UI-TARS confirme/infirme
-
-Le VLM tourne dans un process séparé (serveur FastAPI port 8200).
-Ce module est un CLIENT HTTP — il ne charge aucun modèle en VRAM.
+Appelé UNIQUEMENT quand le SmartMatcher n'a pas assez confiance.
+Utilise le subprocess worker InfiGUI (pas de serveur HTTP).

 Utilisation :
    from core.grounding.think_arbiter import ThinkArbiter

    arbiter = ThinkArbiter()
-    if arbiter.available:
-        result = arbiter.arbitrate(target, candidates, screenshot)
+    result = arbiter.arbitrate(target, candidates, screenshot)
 """

 from __future__ import annotations

-import base64
-import io
 import time
 from typing import Any, Dict, List, Optional

-from core.grounding.fast_types import DetectedUIElement, LocateResult, MatchCandidate
+from core.grounding.fast_types import LocateResult, MatchCandidate
 from core.grounding.target import GroundingTarget


 class ThinkArbiter:
-    """Arbitre VLM pour les cas ambigus — appelle le serveur UI-TARS."""
+    """Arbitre VLM — appelle InfiGUI via subprocess worker."""

-    DEFAULT_URL = "http://localhost:8200"
+    def __init__(self):
+        self._grounder = None

-    def __init__(self, server_url: str = DEFAULT_URL, timeout: int = 30):
-        self.server_url = server_url
-        self.timeout = timeout
+    def _get_grounder(self):
+        if self._grounder is None:
+            from core.grounding.ui_tars_grounder import UITarsGrounder
+            self._grounder = UITarsGrounder.get_instance()
+        return self._grounder

    @property
    def available(self) -> bool:
-        """Vérifie si le serveur de grounding est accessible."""
-        try:
-            import requests
-            resp = requests.get(f"{self.server_url}/health", timeout=3)
-            return resp.status_code == 200 and resp.json().get("model_loaded", False)
-        except Exception:
-            return False
+        """Toujours disponible — le worker se lance à la demande."""
+        return True

    def arbitrate(
        self,
@@ -54,62 +45,57 @@ class ThinkArbiter:
    ) -> Optional[LocateResult]:
        """Demande au VLM de trancher.

-        Args:
-            target: Ce qu'on cherche.
-            candidates: Candidats SMART (peut être vide).
-            screenshot_pil: Screenshot PIL. Si None, le serveur capture lui-même.
-
-        Returns:
-            LocateResult ou None si le VLM ne trouve pas non plus.
+        Si target.template_b64 est fourni, on bascule en mode fusionné :
+        le crop est passé comme image de référence à InfiGUI, ce qui évite
+        une description Ollama qwen2.5vl coûteuse en VRAM.
        """
        t0 = time.time()

+        # Décodage du crop d'ancre si disponible (mode fusionné)
+        anchor_pil = None
+        if target.template_b64:
+            try:
+                import base64
+                import io
+                from PIL import Image
+
+                raw_b64 = target.template_b64
+                if ',' in raw_b64:
+                    raw_b64 = raw_b64.split(',', 1)[1]
+                anchor_pil = Image.open(io.BytesIO(base64.b64decode(raw_b64))).convert("RGB")
+            except Exception as ex:
+                print(f"⚠️ [THINK] Décodage anchor échoué: {ex}")
+                anchor_pil = None
+
        try:
-            import requests
-
-            # Construire le payload
-            payload: Dict[str, Any] = {
-                "target_text": target.text or "",
-                "target_description": target.description or "",
-            }
-
-            # Envoyer l'image si disponible
-            if screenshot_pil is not None:
-                buf = io.BytesIO()
-                screenshot_pil.save(buf, format="JPEG", quality=85)
-                payload["image_b64"] = base64.b64encode(buf.getvalue()).decode("utf-8")
-
-            # Appel au serveur
-            resp = requests.post(
-                f"{self.server_url}/ground",
-                json=payload,
-                timeout=self.timeout,
+            grounder = self._get_grounder()
+            result = grounder.ground(
+                target_text=target.text or "",
+                target_description=target.description or "",
+                screen_pil=screenshot_pil,
+                anchor_pil=anchor_pil,
            )

            dt = (time.time() - t0) * 1000

-            if resp.status_code != 200:
-                print(f"🤔 [THINK] Serveur HTTP {resp.status_code}")
+            if result is None:
+                label = target.text or "<crop>"
+                print(f"🤔 [THINK] VLM n'a pas trouvé '{label}' ({dt:.0f}ms)")
                return None

-            data = resp.json()
-
-            if data.get("x") is None:
-                print(f"🤔 [THINK] VLM n'a pas trouvé '{target.text}' ({dt:.0f}ms)")
-                return None
-
-            result = LocateResult(
-                x=data["x"],
-                y=data["y"],
-                confidence=data.get("confidence", 0.85),
-                method="think_vlm",
+            method = "think_vlm_fused" if anchor_pil is not None else "think_vlm"
+            locate = LocateResult(
+                x=result.x,
+                y=result.y,
+                confidence=result.confidence,
+                method=method,
                time_ms=dt,
                tier="think",
                candidates_count=len(candidates),
            )

-            print(f"🤔 [THINK] VLM → ({result.x}, {result.y}) conf={result.confidence:.2f} ({dt:.0f}ms)")
-            return result
+            print(f"🤔 [THINK/{method}] ({result.x}, {result.y}) conf={result.confidence:.2f} ({dt:.0f}ms)")
+            return locate

        except Exception as ex:
            dt = (time.time() - t0) * 1000