feat(extraction): client vLLM serveur (image+prompt -> texte, post_fn injectable)

Factorise un client propre pour la lecture d'écran : downscale image (fenêtre
max_model_len), thinking off, post_fn injectable (testable sans vLLM). Sert de
vlm_client à extract_dossier_from_image dans le handler runtime. 4 tests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-06-30 18:03:26 +02:00
parent 6a78a0059b
commit f09b8b8cfd
2 changed files with 151 additions and 0 deletions

View File

@@ -0,0 +1,86 @@
"""Client vLLM serveur : (image_path, prompt) -> texte de réponse.
Petit client réutilisable pour la lecture d'écran (extraction de dossier). Le
grounder (`resolve_engine`) fait déjà un POST vers vLLM:8001 mais en INLINE, non
exposé ; on factorise ici un client propre, configurable et testable.
- Image downscalée (largeur max) avant envoi : la fenêtre vLLM est limitée
(`max_model_len`), un écran plein déborde sinon (vu 30/06 : 6193+2000 > 8192).
- `thinking` désactivé (vérifié : think=on -> sortie vide/lente sur ce modèle).
- `post_fn` injectable -> testable sans vLLM réel.
Branche feat/push-log-dgx.
"""
from __future__ import annotations
import base64
import os
from io import BytesIO
from typing import Callable, Optional
# Callable contract of a client: (image_path, prompt) -> response text.
VlmClient = Callable[[str, str], str]
# Port used to build the default URL; read once at import time from the
# VLLM_PORT environment variable, falling back to "8001".
_DEFAULT_PORT = os.environ.get("VLLM_PORT", "8001")
# Default chat-completions endpoint on localhost, using the port above.
DEFAULT_URL = f"http://localhost:{_DEFAULT_PORT}/v1/chat/completions"
# Default model name placed in the request body; read once at import time from
# the VLLM_MODEL environment variable.
DEFAULT_MODEL = os.environ.get("VLLM_MODEL", "Qwen/Qwen3-VL-4B-Instruct")
def img_data_url(image_path: str, max_w: int = 1280) -> str:
    """Encode an image as a base64 PNG data URL, downscaled to `max_w` if wider.

    The image is converted to RGB. When its width exceeds `max_w` it is resized
    to exactly `max_w` pixels wide with LANCZOS resampling, scaling the height
    proportionally (never below 1 pixel). Narrower images keep their size.

    Args:
        image_path: Path of the image file to read.
        max_w: Maximum width, in pixels, of the encoded image. Must be positive.

    Returns:
        A string starting with ``data:image/png;base64,`` followed by the
        base64-encoded PNG bytes.

    Raises:
        ValueError: If `max_w` is not positive.
    """
    # Validate first so a bad width fails with a clear message instead of an
    # opaque error raised from inside the resize call.
    if max_w <= 0:
        raise ValueError(f"max_w must be positive, got {max_w}")

    from PIL import Image  # lazy import: Pillow is only needed when encoding

    # Close the source file deterministically; `convert` returns an independent
    # image that remains usable after the context manager exits.
    with Image.open(image_path) as src:
        img = src.convert("RGB")

    if img.width > max_w:
        # Clamp to 1 px: for very wide, flat images the proportional height
        # truncates to 0, which `resize` rejects.
        h = max(1, int(img.height * max_w / img.width))
        img = img.resize((max_w, h), Image.LANCZOS)

    buf = BytesIO()
    img.save(buf, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()
def build_chat_body(
    image_path: str,
    prompt: str,
    model: str = DEFAULT_MODEL,
    max_tokens: int = 1500,
    max_w: int = 1280,
) -> dict:
    """Assemble the chat/completions request body (image + prompt, thinking off).

    Args:
        image_path: Path of the image, embedded as a data URL via `img_data_url`.
        prompt: Text sent alongside the image in the same user message.
        model: Model name placed in the body.
        max_tokens: Value of the `max_tokens` field.
        max_w: Maximum image width forwarded to `img_data_url`.

    Returns:
        A dict with a single user message (image part first, text part second),
        temperature 0.0, and `enable_thinking` set to False in
        `chat_template_kwargs`.
    """
    image_part = {
        "type": "image_url",
        "image_url": {"url": img_data_url(image_path, max_w)},
    }
    text_part = {"type": "text", "text": prompt}
    user_message = {"role": "user", "content": [image_part, text_part]}

    return dict(
        model=model,
        messages=[user_message],
        temperature=0.0,
        max_tokens=max_tokens,
        chat_template_kwargs={"enable_thinking": False},
    )
def make_vllm_client(
    url: str = DEFAULT_URL,
    model: str = DEFAULT_MODEL,
    max_tokens: int = 1500,
    max_w: int = 1280,
    timeout: float = 120,
    post_fn: Optional[Callable] = None,
) -> VlmClient:
    """Build an `(image_path, prompt) -> text` client wired to vLLM.

    Args:
        url: Chat-completions endpoint the client posts to.
        model: Model name placed in the request body.
        max_tokens: `max_tokens` value placed in the request body.
        max_w: Maximum image width used when encoding the image.
        timeout: Timeout forwarded to the POST call.
        post_fn: Optional replacement for `requests.post`, called as
            `post_fn(url, json=body, headers={}, timeout=timeout)`; injectable
            for tests. When None, `requests.post` is imported lazily on call.

    Returns:
        A callable taking `(image_path, prompt)` and returning the text content
        of the first choice of the response.

    The returned client raises `RuntimeError` when the server does not answer
    200, or answers 200 with a payload that does not carry a text content
    (technical message only; the response body is not echoed in that case).
    """

    def client(image_path: str, prompt: str) -> str:
        body = build_chat_body(image_path, prompt, model=model, max_tokens=max_tokens, max_w=max_w)
        poster = post_fn
        if poster is None:
            # Lazy import keeps the module importable (and testable) without requests.
            import requests
            poster = requests.post
        r = poster(url, json=body, headers={}, timeout=timeout)
        if r.status_code != 200:
            raise RuntimeError(f"vLLM {r.status_code}: {str(getattr(r, 'text', ''))[:300]}")
        # A 200 with a malformed or non-JSON body must surface as the same
        # RuntimeError as other server failures, not as a stray
        # KeyError/IndexError/TypeError/ValueError.
        try:
            content = r.json()["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError, ValueError) as err:
            raise RuntimeError(
                f"vLLM 200: unexpected response payload ({type(err).__name__})"
            ) from err
        if not isinstance(content, str):
            # Honour the `-> str` contract instead of silently returning None.
            raise RuntimeError(
                f"vLLM 200: missing text content ({type(content).__name__})"
            )
        return content

    return client

View File

@@ -0,0 +1,65 @@
"""Tests du client vLLM serveur (image + prompt -> texte).
Le POST réseau est injectable (`post_fn`) → testable sans vLLM. Sert de
`vlm_client` à `extract_dossier_from_image` dans le handler runtime.
"""
import pytest
from core.extraction.vlm_client import build_chat_body, img_data_url, make_vllm_client
def _png(tmp_path, w=2000, h=1000):
    """Write a white RGB image of `w` x `h` pixels to ``x.png`` under `tmp_path`.

    Returns the path of the written file as a string.
    """
    from PIL import Image

    target = tmp_path / "x.png"
    white = (255, 255, 255)
    canvas = Image.new("RGB", (w, h), white)
    canvas.save(target)
    return str(target)
class _Resp:
    """Minimal stand-in for an HTTP response: `status_code`, `text` and `json()`."""

    def __init__(self, code, payload=None, text=""):
        self.text = text
        # An omitted or falsy payload is stored as an empty dict.
        self._p = payload if payload else {}
        self.status_code = code

    def json(self):
        """Return the payload stored at construction time."""
        return self._p
def test_img_data_url_downscale(tmp_path):
    """A 2000x1000 image comes back as a PNG data URL downscaled to 1280x640."""
    import base64
    from io import BytesIO

    from PIL import Image

    prefix = "data:image/png;base64,"
    url = img_data_url(_png(tmp_path), max_w=1280)
    assert url.startswith(prefix)
    # Decode the payload to check that the downscale really happened: a
    # prefix-only assertion would pass even if the image were sent full size.
    with Image.open(BytesIO(base64.b64decode(url[len(prefix):]))) as decoded:
        assert decoded.format == "PNG"
        assert decoded.size == (1280, 640)
def test_build_chat_body_structure(tmp_path):
    """The body carries the model, the token limit, thinking off, and both content parts."""
    image = _png(tmp_path)
    body = build_chat_body(image, "PROMPT", model="M", max_tokens=1500, max_w=1280)

    assert body["model"] == "M"
    assert body["max_tokens"] == 1500
    # thinking disabled (verified the day before: think=on -> empty/slow output)
    assert body["chat_template_kwargs"]["enable_thinking"] is False

    parts = body["messages"][0]["content"]
    image_parts = [p for p in parts if p["type"] == "image_url"]
    prompt_parts = [p for p in parts if p["type"] == "text" and p["text"] == "PROMPT"]
    assert image_parts
    assert prompt_parts
def test_client_retourne_content(tmp_path):
    """The client returns the message content and posts the prompt to the chat endpoint."""
    calls = []

    def recording_post(url, json=None, headers=None, timeout=None):
        # Record what the client sent so it can be inspected after the call.
        calls.append({"url": url, "body": json})
        return _Resp(200, {"choices": [{"message": {"content": "REPONSE"}}]})

    ask = make_vllm_client(model="M", post_fn=recording_post)
    answer = ask(_png(tmp_path), "PROMPT")
    assert answer == "REPONSE"

    last = calls[-1]
    assert "/v1/chat/completions" in last["url"]
    text_part = last["body"]["messages"][0]["content"][1]
    assert text_part["text"] == "PROMPT"
def test_client_erreur_status_leve(tmp_path):
    """A non-200 response makes the client raise RuntimeError."""

    def failing_post(url, json=None, headers=None, timeout=None):
        return _Resp(500, text="boom")

    image = _png(tmp_path)
    ask = make_vllm_client(post_fn=failing_post)
    with pytest.raises(RuntimeError):
        ask(image, "PROMPT")