Factorise un client propre pour la lecture d'écran : downscale image (fenêtre max_model_len), thinking off, post_fn injectable (testable sans vLLM). Sert de vlm_client à extract_dossier_from_image dans le handler runtime. 4 tests. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
87 lines
3.0 KiB
Python
87 lines
3.0 KiB
Python
"""Client vLLM serveur : (image_path, prompt) -> texte de réponse.
|
|
|
|
Petit client réutilisable pour la lecture d'écran (extraction de dossier). Le
|
|
grounder (`resolve_engine`) fait déjà un POST vers vLLM:8001 mais en INLINE, non
|
|
exposé ; on factorise ici un client propre, configurable et testable.
|
|
|
|
- Image downscalée (largeur max) avant envoi : la fenêtre vLLM est limitée
|
|
(`max_model_len`), un écran plein déborde sinon (vu 30/06 : 6193+2000 > 8192).
|
|
- `thinking` désactivé (vérifié : think=on -> sortie vide/lente sur ce modèle).
|
|
- `post_fn` injectable -> testable sans vLLM réel.
|
|
|
|
Branche feat/push-log-dgx.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import os
|
|
from io import BytesIO
|
|
from typing import Callable, Optional
|
|
|
|
# Signature shared by every VLM client: (image_path, prompt) -> response text.
VlmClient = Callable[[str, str], str]

# Endpoint and model are read from the environment once, at import time.
# VLLM_PORT overrides the port of the local vLLM server (default 8001).
_DEFAULT_PORT = os.getenv("VLLM_PORT", "8001")
DEFAULT_URL = f"http://localhost:{_DEFAULT_PORT}/v1/chat/completions"
# VLLM_MODEL overrides the model name sent in each request body.
DEFAULT_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen3-VL-4B-Instruct")
|
|
|
|
|
|
def img_data_url(image_path: str, max_w: int = 1280) -> str:
    """Encode an image as a base64 PNG data URL, downscaled to ``max_w`` if wider.

    Args:
        image_path: Path of the image file to read.
        max_w: Maximum width in pixels. A wider image is resized to exactly
            this width with its aspect ratio preserved; a narrower image
            keeps its original size.

    Returns:
        A ``data:image/png;base64,...`` string holding the (possibly resized)
        image converted to RGB and re-encoded as PNG.
    """
    # Function-scope import: Pillow is only needed when this function runs.
    from PIL import Image

    # Open through a context manager so the file handle is released right
    # away; `convert` returns a separate image that stays usable afterwards.
    with Image.open(image_path) as src:
        img = src.convert("RGB")

    if img.width > max_w:
        # Clamp to 1 px: for a very wide, very short image the scaled height
        # would otherwise truncate to 0, which `resize` does not accept.
        new_h = max(1, int(img.height * max_w / img.width))
        img = img.resize((max_w, new_h), Image.LANCZOS)

    buf = BytesIO()
    img.save(buf, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()
|
|
|
|
|
|
def build_chat_body(
    image_path: str,
    prompt: str,
    model: str = DEFAULT_MODEL,
    max_tokens: int = 1500,
    max_w: int = 1280,
) -> dict:
    """Assemble the chat/completions request body (image + prompt, thinking off).

    Args:
        image_path: Path of the image to attach; it is encoded through
            ``img_data_url`` with ``max_w`` as the width limit.
        prompt: Text sent after the image in the same user message.
        model: Model name placed in the ``model`` field.
        max_tokens: Value of the ``max_tokens`` field.
        max_w: Maximum image width forwarded to ``img_data_url``.

    Returns:
        A dict with a single user message (image part, then text part),
        ``temperature`` 0.0 and ``enable_thinking`` set to False in
        ``chat_template_kwargs``.
    """
    image_part = {
        "type": "image_url",
        "image_url": {"url": img_data_url(image_path, max_w)},
    }
    text_part = {"type": "text", "text": prompt}
    user_message = {"role": "user", "content": [image_part, text_part]}

    return {
        "model": model,
        "messages": [user_message],
        "temperature": 0.0,
        "max_tokens": max_tokens,
        "chat_template_kwargs": {"enable_thinking": False},
    }
|
|
|
|
|
|
def make_vllm_client(
    url: str = DEFAULT_URL,
    model: str = DEFAULT_MODEL,
    max_tokens: int = 1500,
    max_w: int = 1280,
    timeout: float = 120,
    post_fn: Optional[Callable] = None,
) -> VlmClient:
    """Build an ``(image_path, prompt) -> text`` client bound to a vLLM endpoint.

    Args:
        url: Chat-completions endpoint the client posts to.
        model: Model name forwarded to ``build_chat_body``.
        max_tokens: Completion token limit forwarded to ``build_chat_body``.
        max_w: Maximum image width forwarded to ``build_chat_body``.
        timeout: Timeout passed to the POST call.
        post_fn: Callable with the ``requests.post`` signature, injectable for
            tests. When None, ``requests.post`` is imported and used on each call.

    Returns:
        A callable taking ``(image_path, prompt)`` and returning the text of
        the first choice's message.

    Raises:
        RuntimeError: Raised by the returned client when the server does not
            answer 200, or answers 200 with a body that is not valid JSON,
            lacks ``choices[0].message.content``, or carries a non-string
            content.
    """
    def client(image_path: str, prompt: str) -> str:
        body = build_chat_body(
            image_path, prompt, model=model, max_tokens=max_tokens, max_w=max_w
        )
        poster = post_fn
        if poster is None:
            # Function-scope import: `requests` is only needed when no
            # `post_fn` was injected.
            import requests
            poster = requests.post

        r = poster(url, json=body, headers={}, timeout=timeout)
        if r.status_code != 200:
            raise RuntimeError(f"vLLM {r.status_code}: {str(getattr(r, 'text', ''))[:300]}")

        try:
            content = r.json()["choices"][0]["message"]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as exc:
            # Report only the failure type: the response body may contain text
            # read from the screenshot, so it is kept out of the message.
            raise RuntimeError(
                f"vLLM 200: malformed response ({type(exc).__name__})"
            ) from exc
        if not isinstance(content, str):
            # A null or structured content would violate the `-> str` contract
            # and fail later, far from the cause.
            raise RuntimeError(
                f"vLLM 200: unexpected content type ({type(content).__name__})"
            )
        return content

    return client
|