# rpa_vision_v3/core/grounding/think_arbiter.py

"""
core/grounding/think_arbiter.py — Layer THINK : VLM arbitre (UI-TARS)

Appelé UNIQUEMENT quand le SmartMatcher n'a pas assez confiance :
- Score < 0.60 : aucun candidat clair → UI-TARS cherche dans tout l'écran
- Score 0.60-0.90 : candidats ambigus → UI-TARS confirme/infirme

Le VLM tourne dans un process séparé (serveur FastAPI port 8200).
Ce module est un CLIENT HTTP — il ne charge aucun modèle en VRAM.

Utilisation :
    from core.grounding.think_arbiter import ThinkArbiter

    arbiter = ThinkArbiter()
    if arbiter.available:
        result = arbiter.arbitrate(target, candidates, screenshot)
"""

from __future__ import annotations

import base64
import io
import time
from typing import Any, Dict, List, Optional

from core.grounding.fast_types import DetectedUIElement, LocateResult, MatchCandidate
from core.grounding.target import GroundingTarget


class ThinkArbiter:
    """VLM arbiter for ambiguous cases — HTTP client of the grounding server.

    This class loads no model itself: it only talks over HTTP to the server
    at ``server_url``, using the ``/health`` and ``/ground`` endpoints.
    """

    DEFAULT_URL = "http://localhost:8200"

    # Seconds allowed for the /health probe; kept short so that checking
    # availability never blocks the caller for long.
    _HEALTH_TIMEOUT = 3
    # Confidence used when the server's answer carries no usable value.
    _DEFAULT_CONFIDENCE = 0.85
    # Image modes sent to the JPEG encoder untouched; every other mode
    # (RGBA, P, LA, ...) is converted to RGB first.
    _JPEG_MODES = ("RGB", "L", "CMYK")

    def __init__(self, server_url: str = DEFAULT_URL, timeout: int = 30):
        """Store the connection settings; no network access happens here.

        Args:
            server_url: Base URL of the grounding server. Trailing slashes
                are dropped so endpoint URLs never contain ``//``.
            timeout: Timeout in seconds for the ``/ground`` request.
        """
        self.server_url = server_url.rstrip("/")
        self.timeout = timeout

    @property
    def available(self) -> bool:
        """Return True when the server answers /health and reports a loaded model.

        Every failure (``requests`` not importable, connection error, invalid
        JSON, unexpected payload shape) is reported as ``False``.
        """
        try:
            import requests

            resp = requests.get(
                f"{self.server_url}/health",
                timeout=self._HEALTH_TIMEOUT,
            )
            if resp.status_code != 200:
                return False
            # bool() so the property honours its annotation even when the
            # server sends a truthy non-boolean value.
            return bool(resp.json().get("model_loaded", False))
        except Exception:
            return False

    @staticmethod
    def _encode_screenshot(screenshot_pil: Any) -> str:
        """Return the screenshot as a base64-encoded JPEG (quality 85).

        Images whose ``mode`` is not in ``_JPEG_MODES`` are converted to RGB
        first; without this, saving e.g. an RGBA screenshot as JPEG raises.
        Objects without a ``mode`` attribute are saved as they are.
        """
        image = screenshot_pil
        if getattr(image, "mode", "RGB") not in ThinkArbiter._JPEG_MODES:
            image = image.convert("RGB")
        buf = io.BytesIO()
        image.save(buf, format="JPEG", quality=85)
        return base64.b64encode(buf.getvalue()).decode("ascii")

    def arbitrate(
        self,
        target: GroundingTarget,
        candidates: List[MatchCandidate],
        screenshot_pil: Optional[Any] = None,
    ) -> Optional[LocateResult]:
        """Ask the VLM server to locate the target.

        Args:
            target: What is being searched for; its ``text`` and
                ``description`` are sent to the server.
            candidates: Candidates from the previous stage (may be empty).
                Only their count is used, for the result's bookkeeping.
            screenshot_pil: Screenshot as a PIL image. When None, no image is
                sent with the request.

        Returns:
            A LocateResult, or None when the server answers with a non-200
            status, reports no coordinates, or any error occurs. Errors are
            printed, never raised.
        """
        # Monotonic clock: the measured duration is immune to wall-clock
        # adjustments.
        t0 = time.perf_counter()

        try:
            import requests

            payload: Dict[str, Any] = {
                "target_text": target.text or "",
                "target_description": target.description or "",
            }

            # Attach the image only when the caller supplied one.
            if screenshot_pil is not None:
                payload["image_b64"] = self._encode_screenshot(screenshot_pil)

            resp = requests.post(
                f"{self.server_url}/ground",
                json=payload,
                timeout=self.timeout,
            )

            dt = (time.perf_counter() - t0) * 1000

            if resp.status_code != 200:
                print(f"🤔 [THINK] Serveur HTTP {resp.status_code}")
                return None

            data = resp.json()

            # A usable answer is a JSON object with both coordinates set;
            # anything else counts as "not found".
            if (
                not isinstance(data, dict)
                or data.get("x") is None
                or data.get("y") is None
            ):
                print(f"🤔 [THINK] VLM n'a pas trouvé '{target.text}' ({dt:.0f}ms)")
                return None

            # An explicit JSON null bypasses dict.get's default, so fall back
            # by hand.
            confidence = data.get("confidence")
            if confidence is None:
                confidence = self._DEFAULT_CONFIDENCE

            result = LocateResult(
                x=data["x"],
                y=data["y"],
                confidence=confidence,
                method="think_vlm",
                time_ms=dt,
                tier="think",
                candidates_count=len(candidates or ()),
            )

            print(f"🤔 [THINK] VLM → ({result.x}, {result.y}) conf={result.confidence:.2f} ({dt:.0f}ms)")
            return result

        except Exception as ex:
            # Best-effort boundary: a failing arbiter must never crash the
            # caller, so every error is reported and mapped to None.
            dt = (time.perf_counter() - t0) * 1000
            print(f"⚠️ [THINK] Erreur: {ex} ({dt:.0f}ms)")
            return None