feat: support vLLM (GPU) as the grounding engine, with Ollama as fallback

_resolve_by_grounding() now tries vLLM first (OpenAI-compatible API,
port 8100), then falls back to Ollama if vLLM is unreachable. vLLM serves
Qwen2.5-VL-7B-AWQ on GPU (~2-3s per grounding call) vs Ollama on CPU (~16s).

Configurable via env vars: VLLM_PORT (default 8100) and VLLM_MODEL.
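For illustration only, a minimal reachability probe (a hypothetical helper,
not part of this diff) that mirrors the same env-var default; it assumes
vLLM's OpenAI-compatible server, which exposes GET /v1/models:

    import os
    import requests

    def vllm_available(timeout: float = 2.0) -> bool:
        # Probe the OpenAI-compatible model-listing endpoint served by vLLM.
        port = os.environ.get("VLLM_PORT", "8100")
        try:
            return requests.get(f"http://localhost:{port}/v1/models", timeout=timeout).ok
        except requests.RequestException:
            return False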

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
commit 394342be7e (parent 6724f43950)
Author: Dom
Date: 2026-03-31 23:37:12 +02:00


@@ -3428,9 +3428,42 @@ def _resolve_by_grounding(
        'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
    )
    # VLM call (Qwen2.5-VL for grounding)
    try:
        # VLM call: vLLM (GPU, fast) first, Ollama (CPU) as fallback
        import requests as _requests
        content = ""
        # vLLM port configurable via env
        _vllm_port = os.environ.get("VLLM_PORT", "8100")
        _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
        # Attempt 1: vLLM (OpenAI-compatible API, GPU)
        try:
            vllm_resp = _requests.post(
                f"http://localhost:{_vllm_port}/v1/chat/completions",
                json={
                    "model": _vllm_model,
                    "messages": [
                        {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
                        {"role": "user", "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
                        ]},
                    ],
                    "temperature": 0.1,
                    "max_tokens": 80,
                },
                timeout=30,
            )
            if vllm_resp.ok:
                content = vllm_resp.json().get("choices", [{}])[0].get("message", {}).get("content", "")
                if content:
                    logger.debug("Grounding via vLLM OK")
        except Exception as e:
            logger.debug("vLLM unavailable (%s), falling back to Ollama", e)
        # Attempt 2: Ollama (CPU, slower)
        if not content:
            try:
                resp = _requests.post("http://localhost:11434/api/chat", json={
                    "model": "qwen2.5vl:7b",
                    "messages": [