feat(extraction): client vLLM serveur (image+prompt -> texte, post_fn injectable)

Factorise un client propre pour la lecture d'écran : downscale image (fenêtre max_model_len), thinking off, post_fn injectable (testable sans vLLM). Sert de vlm_client à extract_dossier_from_image dans le handler runtime. 4 tests. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 18:03:26 +02:00
parent 6a78a0059b
commit f09b8b8cfd
2 changed files with 151 additions and 0 deletions
--- a/core/extraction/vlm_client.py
+++ b/core/extraction/vlm_client.py
@@ -0,0 +1,86 @@
+"""Client vLLM serveur : (image_path, prompt) -> texte de réponse.
+
+Petit client réutilisable pour la lecture d'écran (extraction de dossier). Le
+grounder (`resolve_engine`) fait déjà un POST vers vLLM:8001 mais en INLINE, non
+exposé ; on factorise ici un client propre, configurable et testable.
+
+- Image downscalée (largeur max) avant envoi : la fenêtre vLLM est limitée
+  (`max_model_len`), un écran plein déborde sinon (vu 30/06 : 6193+2000 > 8192).
+- `thinking` désactivé (vérifié : think=on -> sortie vide/lente sur ce modèle).
+- `post_fn` injectable -> testable sans vLLM réel.
+
+Branche feat/push-log-dgx.
+"""
+from __future__ import annotations
+
+import base64
+import os
+from io import BytesIO
+from typing import Callable, Optional
+
+# Call signature shared by every VLM client: (image_path, prompt) -> response text.
+VlmClient = Callable[[str, str], str]
+
+# Endpoint and model defaults. VLLM_PORT / VLLM_MODEL are read once, at import time.
+_DEFAULT_PORT = os.getenv("VLLM_PORT", "8001")
+DEFAULT_URL = f"http://localhost:{_DEFAULT_PORT}/v1/chat/completions"
+DEFAULT_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen3-VL-4B-Instruct")
+
+
+def img_data_url(image_path: str, max_w: int = 1280) -> str:
+    """Encode an image file as a base64 PNG data URL, downscaled if too wide.
+
+    Args:
+        image_path: path of the image file to read.
+        max_w: maximum width in pixels; wider images are resized to this
+            width with the aspect ratio preserved, narrower ones are kept as is.
+
+    Returns:
+        A `data:image/png;base64,...` string holding the RGB image as PNG.
+    """
+    from PIL import Image
+    # Close the source file as soon as the pixels are copied by `convert`.
+    with Image.open(image_path) as src:
+        img = src.convert("RGB")
+    if img.width > max_w:
+        # Clamp to 1 px: a very wide, very short image would otherwise
+        # truncate to a zero height, which `resize` rejects.
+        h = max(1, int(img.height * max_w / img.width))
+        img = img.resize((max_w, h), Image.LANCZOS)
+    buf = BytesIO()
+    img.save(buf, format="PNG")
+    return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()
+
+
+def build_chat_body(
+    image_path: str,
+    prompt: str,
+    model: str = DEFAULT_MODEL,
+    max_tokens: int = 1500,
+    max_w: int = 1280,
+) -> dict:
+    """Assemble the chat/completions request body for one image and one prompt.
+
+    The single user message carries the image (as a data URL produced by
+    `img_data_url`, limited to `max_w` pixels wide) followed by the prompt text.
+    Temperature is fixed at 0.0 and thinking is switched off through
+    `chat_template_kwargs`.
+
+    Args:
+        image_path: path of the image to embed.
+        prompt: text sent alongside the image.
+        model: model name placed in the body.
+        max_tokens: completion token cap placed in the body.
+        max_w: maximum image width passed to `img_data_url`.
+
+    Returns:
+        The request body as a dict.
+    """
+    image_part = {"type": "image_url", "image_url": {"url": img_data_url(image_path, max_w)}}
+    text_part = {"type": "text", "text": prompt}
+    user_message = {"role": "user", "content": [image_part, text_part]}
+    payload = {
+        "model": model,
+        "messages": [user_message],
+        "temperature": 0.0,
+        "max_tokens": max_tokens,
+        "chat_template_kwargs": {"enable_thinking": False},
+    }
+    return payload
+
+
+def make_vllm_client(
+    url: str = DEFAULT_URL,
+    model: str = DEFAULT_MODEL,
+    max_tokens: int = 1500,
+    max_w: int = 1280,
+    timeout: float = 120,
+    post_fn: Optional[Callable] = None,
+) -> VlmClient:
+    """Build an `(image_path, prompt) -> text` client bound to a vLLM server.
+
+    Args:
+        url: chat/completions endpoint that receives the POST.
+        model: model name placed in the request body.
+        max_tokens: completion token cap placed in the request body.
+        max_w: maximum image width in pixels before downscaling.
+        timeout: timeout forwarded to the POST call.
+        post_fn: callable with the `requests.post` signature, injectable for
+            tests; when None, `requests.post` is imported at call time.
+
+    Returns:
+        A callable that posts the image and prompt and returns the content of
+        the first choice's message.
+
+    Raises:
+        RuntimeError: raised by the returned client when the server does not
+            answer 200, or answers 200 with a body that lacks a string at
+            `choices[0].message.content`. The malformed-response message
+            carries only exception/type names, never the response body.
+    """
+    def client(image_path: str, prompt: str) -> str:
+        body = build_chat_body(image_path, prompt, model=model, max_tokens=max_tokens, max_w=max_w)
+        poster = post_fn
+        if poster is None:
+            # Imported only when no post_fn is injected, so `requests` is not
+            # needed when a fake poster is supplied.
+            import requests
+            poster = requests.post
+        r = poster(url, json=body, headers={}, timeout=timeout)
+        if r.status_code != 200:
+            raise RuntimeError(f"vLLM {r.status_code}: {str(getattr(r, 'text', ''))[:300]}")
+        # A 200 with an unexpected body must surface as the documented
+        # RuntimeError rather than a raw KeyError/IndexError/TypeError/ValueError.
+        try:
+            content = r.json()["choices"][0]["message"]["content"]
+        except (ValueError, KeyError, IndexError, TypeError) as exc:
+            raise RuntimeError(f"vLLM: malformed 200 response ({type(exc).__name__})") from exc
+        if not isinstance(content, str):
+            # Honour the `-> str` contract instead of handing None downstream.
+            raise RuntimeError(
+                f"vLLM: malformed 200 response (content is {type(content).__name__})"
+            )
+        return content
+    return client