feat: prise en charge de vLLM (GPU) comme moteur de grounding, Ollama en fallback
_resolve_by_grounding() essaie vLLM d'abord (API OpenAI-compatible, port 8100) puis Ollama en fallback. vLLM utilise Qwen2.5-VL-7B-AWQ sur GPU (~2-3s) vs Ollama sur CPU (~16s). Config via env vars : VLLM_PORT (défaut 8100), VLLM_MODEL. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3428,9 +3428,42 @@ def _resolve_by_grounding(
|
|||||||
'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
|
'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
|
||||||
)
|
)
|
||||||
|
|
||||||
# Appel VLM (Qwen2.5-VL pour le grounding)
|
# Appel VLM — vLLM (GPU, rapide) en priorité, Ollama (CPU) en fallback
|
||||||
try:
|
|
||||||
import requests as _requests
|
import requests as _requests
|
||||||
|
content = ""
|
||||||
|
|
||||||
|
# Port vLLM configurable via env
|
||||||
|
_vllm_port = os.environ.get("VLLM_PORT", "8100")
|
||||||
|
_vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
|
||||||
|
|
||||||
|
# Essai 1 : vLLM (API OpenAI-compatible, GPU)
|
||||||
|
try:
|
||||||
|
vllm_resp = _requests.post(
|
||||||
|
f"http://localhost:{_vllm_port}/v1/chat/completions",
|
||||||
|
json={
|
||||||
|
"model": _vllm_model,
|
||||||
|
"messages": [
|
||||||
|
{"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
|
||||||
|
{"role": "user", "content": [
|
||||||
|
{"type": "text", "text": prompt},
|
||||||
|
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
|
||||||
|
]},
|
||||||
|
],
|
||||||
|
"temperature": 0.1,
|
||||||
|
"max_tokens": 80,
|
||||||
|
},
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
if vllm_resp.ok:
|
||||||
|
content = vllm_resp.json().get("choices", [{}])[0].get("message", {}).get("content", "")
|
||||||
|
if content:
|
||||||
|
logger.debug("Grounding via vLLM OK")
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("vLLM non disponible (%s), fallback Ollama", e)
|
||||||
|
|
||||||
|
# Essai 2 : Ollama (CPU, plus lent)
|
||||||
|
if not content:
|
||||||
|
try:
|
||||||
resp = _requests.post("http://localhost:11434/api/chat", json={
|
resp = _requests.post("http://localhost:11434/api/chat", json={
|
||||||
"model": "qwen2.5vl:7b",
|
"model": "qwen2.5vl:7b",
|
||||||
"messages": [
|
"messages": [
|
||||||
|
|||||||
Reference in New Issue
Block a user