"""Client vLLM serveur : (image_path, prompt) -> texte de réponse. Petit client réutilisable pour la lecture d'écran (extraction de dossier). Le grounder (`resolve_engine`) fait déjà un POST vers vLLM:8001 mais en INLINE, non exposé ; on factorise ici un client propre, configurable et testable. - Image downscalée (largeur max) avant envoi : la fenêtre vLLM est limitée (`max_model_len`), un écran plein déborde sinon (vu 30/06 : 6193+2000 > 8192). - `thinking` désactivé (vérifié : think=on -> sortie vide/lente sur ce modèle). - `post_fn` injectable -> testable sans vLLM réel. Branche feat/push-log-dgx. """ from __future__ import annotations import base64 import os from io import BytesIO from typing import Callable, Optional VlmClient = Callable[[str, str], str] _DEFAULT_PORT = os.environ.get("VLLM_PORT", "8001") DEFAULT_URL = f"http://localhost:{_DEFAULT_PORT}/v1/chat/completions" DEFAULT_MODEL = os.environ.get("VLLM_MODEL", "Qwen/Qwen3-VL-4B-Instruct") def img_data_url(image_path: str, max_w: int = 1280) -> str: """Encode l'image en data-URL PNG base64, downscalée à `max_w` si plus large.""" from PIL import Image img = Image.open(image_path).convert("RGB") if img.width > max_w: h = int(img.height * max_w / img.width) img = img.resize((max_w, h), Image.LANCZOS) buf = BytesIO() img.save(buf, format="PNG") return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode() def build_chat_body( image_path: str, prompt: str, model: str = DEFAULT_MODEL, max_tokens: int = 1500, max_w: int = 1280, ) -> dict: """Construit le body chat/completions (image + prompt, thinking off).""" return { "model": model, "messages": [{ "role": "user", "content": [ {"type": "image_url", "image_url": {"url": img_data_url(image_path, max_w)}}, {"type": "text", "text": prompt}, ], }], "temperature": 0.0, "max_tokens": max_tokens, "chat_template_kwargs": {"enable_thinking": False}, } def make_vllm_client( url: str = DEFAULT_URL, model: str = DEFAULT_MODEL, max_tokens: int = 1500, max_w: int = 1280, timeout: float = 120, post_fn: Optional[Callable] = None, ) -> VlmClient: """Construit un client `(image_path, prompt) -> texte`, branché sur vLLM. `post_fn` (signature `requests.post`) est injectable pour les tests. Lève `RuntimeError` si le serveur ne répond pas 200 (message technique, sans PII). """ def client(image_path: str, prompt: str) -> str: body = build_chat_body(image_path, prompt, model=model, max_tokens=max_tokens, max_w=max_w) poster = post_fn if poster is None: import requests poster = requests.post r = poster(url, json=body, headers={}, timeout=timeout) if r.status_code != 200: raise RuntimeError(f"vLLM {r.status_code}: {str(getattr(r, 'text', ''))[:300]}") return r.json()["choices"][0]["message"]["content"] return client