feat: support vLLM (GPU) as the grounding engine, with Ollama as fallback

_resolve_by_grounding() now tries vLLM first (OpenAI-compatible API, port 8100), then falls back to Ollama. vLLM runs Qwen2.5-VL-7B-AWQ on GPU (~2-3 s) vs. Ollama on CPU (~16 s). Configured via env vars: VLLM_PORT (default 8100), VLLM_MODEL.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
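For a quick local check of which backend the resolver will actually reach, a minimal probe sketch follows (not part of this commit; the helper name grounding_backend is hypothetical, and the endpoints assume default local installs). vLLM's OpenAI-compatible server exposes GET /v1/models, and Ollama exposes GET /api/tags:

import os
import requests

def grounding_backend() -> str:
    """Probe vLLM first, then Ollama, mirroring the order used by _resolve_by_grounding()."""
    vllm_port = os.environ.get("VLLM_PORT", "8100")
    try:
        # vLLM serves the OpenAI-compatible model listing
        requests.get(f"http://localhost:{vllm_port}/v1/models", timeout=2).raise_for_status()
        return "vllm"
    except requests.RequestException:
        pass
    try:
        # Ollama's native API lists locally pulled models
        requests.get("http://localhost:11434/api/tags", timeout=2).raise_for_status()
        return "ollama"
    except requests.RequestException:
        return "none"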
@@ -3428,22 +3428,55 @@ def _resolve_by_grounding(
             'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
     )
 
-    # VLM call (Qwen2.5-VL for grounding)
+    # VLM call: vLLM (GPU, fast) first, Ollama (CPU) as fallback
+    import requests as _requests
+    content = ""
+
+    # vLLM port configurable via env
+    _vllm_port = os.environ.get("VLLM_PORT", "8100")
+    _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
+
+    # Attempt 1: vLLM (OpenAI-compatible API, GPU)
     try:
-        import requests as _requests
-        resp = _requests.post("http://localhost:11434/api/chat", json={
-            "model": "qwen2.5vl:7b",
-            "messages": [
-                {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
-                {"role": "user", "content": prompt, "images": [shot_b64]},
-            ],
-            "stream": False,
-            "options": {"temperature": 0.1, "num_predict": 80},
-        }, timeout=60)
-        content = resp.json().get("message", {}).get("content", "")
+        vllm_resp = _requests.post(
+            f"http://localhost:{_vllm_port}/v1/chat/completions",
+            json={
+                "model": _vllm_model,
+                "messages": [
+                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
+                    {"role": "user", "content": [
+                        {"type": "text", "text": prompt},
+                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
+                    ]},
+                ],
+                "temperature": 0.1,
+                "max_tokens": 80,
+            },
+            timeout=30,
+        )
+        if vllm_resp.ok:
+            content = vllm_resp.json().get("choices", [{}])[0].get("message", {}).get("content", "")
+            if content:
+                logger.debug("Grounding via vLLM OK")
     except Exception as e:
-        logger.info("Grounding VLM timeout/error: %s", e)
-        return None
+        logger.debug("vLLM unavailable (%s), falling back to Ollama", e)
+
+    # Attempt 2: Ollama (CPU, slower)
+    if not content:
+        try:
+            resp = _requests.post("http://localhost:11434/api/chat", json={
+                "model": "qwen2.5vl:7b",
+                "messages": [
+                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
+                    {"role": "user", "content": prompt, "images": [shot_b64]},
+                ],
+                "stream": False,
+                "options": {"temperature": 0.1, "num_predict": 80},
+            }, timeout=60)
+            content = resp.json().get("message", {}).get("content", "")
+        except Exception as e:
+            logger.info("Grounding VLM timeout/error: %s", e)
+            return None
 
     elapsed = time.time() - t0
||||