diff --git a/core/extraction/vlm_client.py b/core/extraction/vlm_client.py
new file mode 100644
index 000000000..302f5cbb5
--- /dev/null
+++ b/core/extraction/vlm_client.py
@@ -0,0 +1,101 @@
+"""Server-side vLLM client: (image_path, prompt) -> response text.
+
+Small reusable client for screen reading (dossier extraction). The grounder
+(`resolve_engine`) already POSTs to vLLM:8001, but INLINE and not exposed; this
+module factors that call out into a clean, configurable and testable client.
+
+- The image is downscaled (max width) before sending: the vLLM window is limited
+  (`max_model_len`) and a full screen overflows it otherwise (seen 30/06:
+  6193+2000 > 8192).
+- `thinking` is disabled (verified: think=on -> empty/slow output on this model).
+- `post_fn` is injectable -> testable without a real vLLM server.
+
+Branch feat/push-log-dgx.
+"""
+from __future__ import annotations
+
+import base64
+import os
+from io import BytesIO
+from typing import Callable, Optional
+
+VlmClient = Callable[[str, str], str]
+
+_DEFAULT_PORT = os.environ.get("VLLM_PORT", "8001")
+DEFAULT_URL = f"http://localhost:{_DEFAULT_PORT}/v1/chat/completions"
+DEFAULT_MODEL = os.environ.get("VLLM_MODEL", "Qwen/Qwen3-VL-4B-Instruct")
+
+
+def img_data_url(image_path: str, max_w: int = 1280) -> str:
+    """Encode the image as a base64 PNG data URL, downscaled to `max_w` if wider.
+
+    The image is converted to RGB; images no wider than `max_w` keep their size.
+    """
+    from PIL import Image
+
+    # Context manager so the file handle is released as soon as pixels are read.
+    with Image.open(image_path) as src:
+        img = src.convert("RGB")
+    if img.width > max_w:
+        # Keep the aspect ratio; clamp to 1 px so a very wide, very short image
+        # never yields a zero-height resize target.
+        h = max(1, int(img.height * max_w / img.width))
+        img = img.resize((max_w, h), Image.LANCZOS)
+    buf = BytesIO()
+    img.save(buf, format="PNG")
+    return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()
+
+
+def build_chat_body(
+    image_path: str,
+    prompt: str,
+    model: str = DEFAULT_MODEL,
+    max_tokens: int = 1500,
+    max_w: int = 1280,
+) -> dict:
+    """Build the chat/completions body (image + prompt, thinking off)."""
+    return {
+        "model": model,
+        "messages": [{
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": img_data_url(image_path, max_w)}},
+                {"type": "text", "text": prompt},
+            ],
+        }],
+        "temperature": 0.0,
+        "max_tokens": max_tokens,
+        "chat_template_kwargs": {"enable_thinking": False},
+    }
+
+
+def make_vllm_client(
+    url: str = DEFAULT_URL,
+    model: str = DEFAULT_MODEL,
+    max_tokens: int = 1500,
+    max_w: int = 1280,
+    timeout: float = 120,
+    post_fn: Optional[Callable] = None,
+) -> VlmClient:
+    """Build an `(image_path, prompt) -> text` client wired to vLLM.
+
+    `post_fn` (same signature as `requests.post`) is injectable for tests.
+    Raises `RuntimeError` if the server does not answer 200, or answers 200 with
+    a payload that has no `choices[0].message.content` (technical message, no PII).
+    """
+    def client(image_path: str, prompt: str) -> str:
+        body = build_chat_body(image_path, prompt, model=model, max_tokens=max_tokens, max_w=max_w)
+        poster = post_fn
+        if poster is None:
+            # `requests` is only imported when no `post_fn` was injected.
+            import requests
+            poster = requests.post
+        r = poster(url, json=body, headers={}, timeout=timeout)
+        if r.status_code != 200:
+            raise RuntimeError(f"vLLM {r.status_code}: {str(getattr(r, 'text', ''))[:300]}")
+        try:
+            return r.json()["choices"][0]["message"]["content"]
+        except (KeyError, IndexError, TypeError, ValueError) as exc:
+            # Only the exception type is reported: the payload may echo screen content.
+            raise RuntimeError(f"vLLM 200: malformed response ({type(exc).__name__})") from exc
+    return client
diff --git a/tests/unit/test_vlm_client.py b/tests/unit/test_vlm_client.py
new file mode 100644
index 000000000..083cdda9e
--- /dev/null
+++ b/tests/unit/test_vlm_client.py
@@ -0,0 +1,104 @@
+"""Tests for the server-side vLLM client (image + prompt -> text).
+
+The network POST is injectable (`post_fn`) -> testable without vLLM. Serves as the
+`vlm_client` for `extract_dossier_from_image` in the runtime handler.
+"""
+import base64
+from io import BytesIO
+
+import pytest
+
+from core.extraction.vlm_client import build_chat_body, img_data_url, make_vllm_client
+
+
+def _png(tmp_path, w=2000, h=1000):
+    """Write a blank white `w` x `h` PNG under `tmp_path` and return its path."""
+    from PIL import Image
+    p = tmp_path / "x.png"
+    Image.new("RGB", (w, h), (255, 255, 255)).save(p)
+    return str(p)
+
+
+def _decoded_size(url):
+    """Return the (width, height) of the PNG carried by a base64 data URL."""
+    from PIL import Image
+    prefix = "data:image/png;base64,"
+    assert url.startswith(prefix)
+    with Image.open(BytesIO(base64.b64decode(url[len(prefix):]))) as img:
+        return img.size
+
+
+class _Resp:
+    """Minimal stand-in for a `requests` response: status code, JSON payload, text."""
+
+    def __init__(self, code, payload=None, text=""):
+        self.status_code = code
+        self._p = payload or {}
+        self.text = text
+
+    def json(self):
+        return self._p
+
+
+def test_img_data_url_downscale(tmp_path):
+    url = img_data_url(_png(tmp_path), max_w=1280)
+    assert url.startswith("data:image/png;base64,")
+    # 2000x1000 is wider than max_w: width is capped and the aspect ratio kept.
+    assert _decoded_size(url) == (1280, 640)
+
+
+def test_img_data_url_no_upscale(tmp_path):
+    # An image narrower than max_w is sent at its original size.
+    url = img_data_url(_png(tmp_path, w=800, h=600), max_w=1280)
+    assert _decoded_size(url) == (800, 600)
+
+
+def test_img_data_url_min_height(tmp_path):
+    # 2000x1 would scale to height 0; the height is clamped to 1 px instead.
+    url = img_data_url(_png(tmp_path, w=2000, h=1), max_w=1280)
+    assert _decoded_size(url) == (1280, 1)
+
+
+def test_build_chat_body_structure(tmp_path):
+    body = build_chat_body(_png(tmp_path), "PROMPT", model="M", max_tokens=1500, max_w=1280)
+    assert body["model"] == "M"
+    assert body["max_tokens"] == 1500
+    # thinking disabled (verified yesterday: think=on -> empty/slow)
+    assert body["chat_template_kwargs"]["enable_thinking"] is False
+    content = body["messages"][0]["content"]
+    assert any(c["type"] == "image_url" for c in content)
+    assert any(c["type"] == "text" and c["text"] == "PROMPT" for c in content)
+
+
+def test_client_retourne_content(tmp_path):
+    captured = {}
+
+    def fake_post(url, json=None, headers=None, timeout=None):
+        captured["url"] = url
+        captured["body"] = json
+        return _Resp(200, {"choices": [{"message": {"content": "REPONSE"}}]})
+
+    client = make_vllm_client(model="M", post_fn=fake_post)
+    out = client(_png(tmp_path), "PROMPT")
+    assert out == "REPONSE"
+    assert "/v1/chat/completions" in captured["url"]
+    assert captured["body"]["messages"][0]["content"][1]["text"] == "PROMPT"
+
+
+def test_client_erreur_status_leve(tmp_path):
+    def fake_post(url, json=None, headers=None, timeout=None):
+        return _Resp(500, text="boom")
+
+    client = make_vllm_client(post_fn=fake_post)
+    with pytest.raises(RuntimeError):
+        client(_png(tmp_path), "PROMPT")
+
+
+def test_client_malformed_200_raises(tmp_path):
+    # A 200 without `choices` surfaces as RuntimeError, not a bare KeyError.
+    def fake_post(url, json=None, headers=None, timeout=None):
+        return _Resp(200, {"unexpected": True})
+
+    client = make_vllm_client(post_fn=fake_post)
+    with pytest.raises(RuntimeError):
+        client(_png(tmp_path), "PROMPT")