feat: prise en charge de vLLM (GPU) comme moteur de grounding, Ollama en fallback
_resolve_by_grounding() essaie vLLM d'abord (API OpenAI-compatible, port 8100) puis Ollama en fallback. vLLM utilise Qwen2.5-VL-7B-AWQ sur GPU (~2-3s) vs Ollama sur CPU (~16s). Config via env vars : VLLM_PORT (défaut 8100), VLLM_MODEL. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3428,9 +3428,42 @@ def _resolve_by_grounding(
|
|||||||
'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
|
'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
|
||||||
)
|
)
|
||||||
|
|
||||||
# Appel VLM (Qwen2.5-VL pour le grounding)
|
# Appel VLM — vLLM (GPU, rapide) en priorité, Ollama (CPU) en fallback
|
||||||
try:
|
|
||||||
import requests as _requests
|
import requests as _requests
|
||||||
|
content = ""
|
||||||
|
|
||||||
|
# Port vLLM configurable via env
|
||||||
|
_vllm_port = os.environ.get("VLLM_PORT", "8100")
|
||||||
|
_vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
|
||||||
|
|
||||||
|
# Essai 1 : vLLM (API OpenAI-compatible, GPU)
|
||||||
|
try:
|
||||||
|
vllm_resp = _requests.post(
|
||||||
|
f"http://localhost:{_vllm_port}/v1/chat/completions",
|
||||||
|
json={
|
||||||
|
"model": _vllm_model,
|
||||||
|
"messages": [
|
||||||
|
{"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
|
||||||
|
{"role": "user", "content": [
|
||||||
|
{"type": "text", "text": prompt},
|
||||||
|
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
|
||||||
|
]},
|
||||||
|
],
|
||||||
|
"temperature": 0.1,
|
||||||
|
"max_tokens": 80,
|
||||||
|
},
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
if vllm_resp.ok:
|
||||||
|
content = vllm_resp.json().get("choices", [{}])[0].get("message", {}).get("content", "")
|
||||||
|
if content:
|
||||||
|
logger.debug("Grounding via vLLM OK")
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("vLLM non disponible (%s), fallback Ollama", e)
|
||||||
|
|
||||||
|
# Essai 2 : Ollama (CPU, plus lent)
|
||||||
|
if not content:
|
||||||
|
try:
|
||||||
resp = _requests.post("http://localhost:11434/api/chat", json={
|
resp = _requests.post("http://localhost:11434/api/chat", json={
|
||||||
"model": "qwen2.5vl:7b",
|
"model": "qwen2.5vl:7b",
|
||||||
"messages": [
|
"messages": [
|
||||||
|
|||||||
Reference in New Issue
Block a user