rpa_vision_v3/core/extraction/vlm_client.py

"""Client vLLM serveur : (image_path, prompt) -> texte de réponse.

Petit client réutilisable pour la lecture d'écran (extraction de dossier). Le
grounder (`resolve_engine`) fait déjà un POST vers vLLM:8001 mais en INLINE, non
exposé ; on factorise ici un client propre, configurable et testable.

- Image downscalée (largeur max) avant envoi : la fenêtre vLLM est limitée
  (`max_model_len`), un écran plein déborde sinon (vu 30/06 : 6193+2000 > 8192).
- `thinking` désactivé (vérifié : think=on -> sortie vide/lente sur ce modèle).
- `post_fn` injectable -> testable sans vLLM réel.

Branche feat/push-log-dgx.
"""
from __future__ import annotations

import base64
import os
from io import BytesIO
from typing import Callable, Optional

# Shape shared by every VLM client: (image_path, prompt) -> response text.
VlmClient = Callable[[str, str], str]

# Endpoint and model defaults. Both can be overridden through the VLLM_PORT
# and VLLM_MODEL environment variables, which are read once at import time.
_DEFAULT_PORT = os.getenv("VLLM_PORT", "8001")
DEFAULT_URL = "http://localhost:" + _DEFAULT_PORT + "/v1/chat/completions"
DEFAULT_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen3-VL-4B-Instruct")


def img_data_url(image_path: str, max_w: int = 1280) -> str:
    """Encode an image file as a base64 PNG data URL, downscaled to `max_w`.

    The image is converted to RGB. When it is wider than `max_w` it is resized
    to exactly `max_w` pixels wide with LANCZOS resampling, keeping the aspect
    ratio; narrower images are left at their original size (never upscaled).

    Args:
        image_path: Path of the image file to read.
        max_w: Maximum width, in pixels, of the encoded image.

    Returns:
        A `data:image/png;base64,...` URL holding the re-encoded image.
    """
    # Imported lazily so this module can be loaded without Pillow installed.
    from PIL import Image

    # The context manager releases the file handle deterministically;
    # convert() returns a separate copy that remains usable afterwards.
    with Image.open(image_path) as source:
        img = source.convert("RGB")
    if img.width > max_w:
        # int() truncates to 0 for extremely wide, flat images, and a
        # zero-sized target makes resize() fail, so keep at least one pixel.
        h = max(1, int(img.height * max_w / img.width))
        img = img.resize((max_w, h), Image.LANCZOS)
    buf = BytesIO()
    img.save(buf, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()


def build_chat_body(
    image_path: str,
    prompt: str,
    model: str = DEFAULT_MODEL,
    max_tokens: int = 1500,
    max_w: int = 1280,
) -> dict:
    """Assemble the chat/completions request body for one image plus a prompt.

    The body holds a single user message whose content is the image (as a
    data URL produced by `img_data_url`) followed by the text prompt.
    Temperature is fixed at 0.0 and thinking is switched off through
    `chat_template_kwargs`.

    Args:
        image_path: Path of the image to embed in the request.
        prompt: Text instruction sent alongside the image.
        model: Model name placed in the `model` field.
        max_tokens: Value of the `max_tokens` field.
        max_w: Maximum image width forwarded to `img_data_url`.

    Returns:
        The request body as a plain dict, ready to be sent as JSON.
    """
    image_part = {
        "type": "image_url",
        "image_url": {"url": img_data_url(image_path, max_w)},
    }
    text_part = {"type": "text", "text": prompt}
    user_message = {"role": "user", "content": [image_part, text_part]}

    # Keys are inserted in the same order as before so the serialized JSON
    # stays identical.
    body = {"model": model, "messages": [user_message]}
    body["temperature"] = 0.0
    body["max_tokens"] = max_tokens
    body["chat_template_kwargs"] = {"enable_thinking": False}
    return body


def make_vllm_client(
    url: str = DEFAULT_URL,
    model: str = DEFAULT_MODEL,
    max_tokens: int = 1500,
    max_w: int = 1280,
    timeout: float = 120,
    post_fn: Optional[Callable] = None,
) -> VlmClient:
    """Build a `(image_path, prompt) -> text` client bound to a vLLM endpoint.

    Args:
        url: chat/completions endpoint the request is POSTed to.
        model: Model name forwarded to `build_chat_body`.
        max_tokens: Completion token limit forwarded to `build_chat_body`.
        max_w: Maximum image width forwarded to `build_chat_body`.
        timeout: Timeout passed to the POST call.
        post_fn: Callable used instead of `requests.post` (called with the
            same `url, json=, headers=, timeout=` arguments); injectable so
            the client can be tested without a real server.

    Returns:
        A callable taking `(image_path, prompt)` and returning the content of
        the first choice's message.

    Raises:
        RuntimeError: Raised by the returned client when the server answers
            with a status other than 200 (the message carries the status and
            the first 300 characters of the response text), or when a 200
            response cannot be decoded into the expected
            `choices[0].message.content` structure (the message names only
            the failure type, never the response body).
    """
    def client(image_path: str, prompt: str) -> str:
        body = build_chat_body(image_path, prompt, model=model, max_tokens=max_tokens, max_w=max_w)
        poster = post_fn
        if poster is None:
            # Imported on first use so `requests` is only needed when no
            # custom `post_fn` is supplied.
            import requests
            poster = requests.post
        r = poster(url, json=body, headers={}, timeout=timeout)
        if r.status_code != 200:
            raise RuntimeError(f"vLLM {r.status_code}: {str(getattr(r, 'text', ''))[:300]}")
        try:
            return r.json()["choices"][0]["message"]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as exc:
            # A 200 with an unexpected payload (invalid JSON, missing or empty
            # `choices`) is reported through the same RuntimeError channel as
            # other server failures. The body is deliberately not echoed
            # here: on success paths it may contain extracted screen content.
            raise RuntimeError(
                f"vLLM 200: malformed response ({type(exc).__name__})"
            ) from exc
    return client