feat: support vLLM (GPU) as the grounding engine, with Ollama as fallback

_resolve_by_grounding() now tries vLLM first (OpenAI-compatible API,
port 8100), then falls back to Ollama if vLLM is unreachable. vLLM serves
Qwen2.5-VL-7B-AWQ on GPU (~2-3s per grounding call) vs Ollama on CPU (~16s).

Configurable via env vars: VLLM_PORT (default 8100) and VLLM_MODEL.
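For illustration only, a minimal reachability probe (a hypothetical helper,
not part of this diff) that mirrors the same env-var default; it assumes
vLLM's OpenAI-compatible server, which exposes GET /v1/models:

    import os
    import requests

    def vllm_available(timeout: float = 2.0) -> bool:
        # Probe the OpenAI-compatible model-listing endpoint served by vLLM.
        port = os.environ.get("VLLM_PORT", "8100")
        try:
            return requests.get(f"http://localhost:{port}/v1/models", timeout=timeout).ok
        except requests.RequestException:
            return False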

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
commit 394342be7e (parent 6724f43950)
Author: Dom
Date: 2026-03-31 23:37:12 +02:00


@@ -3428,9 +3428,42 @@ def _resolve_by_grounding(
        'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
    )
    # VLM call (Qwen2.5-VL for grounding)
    try:
        # VLM call: vLLM (GPU, fast) first, Ollama (CPU) as fallback
        import requests as _requests
        content = ""
        # vLLM port configurable via env
        _vllm_port = os.environ.get("VLLM_PORT", "8100")
        _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
        # Attempt 1: vLLM (OpenAI-compatible API, GPU)
        try:
            vllm_resp = _requests.post(
                f"http://localhost:{_vllm_port}/v1/chat/completions",
                json={
                    "model": _vllm_model,
                    "messages": [
                        {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
                        {"role": "user", "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
                        ]},
                    ],
                    "temperature": 0.1,
                    "max_tokens": 80,
                },
                timeout=30,
            )
            if vllm_resp.ok:
                content = vllm_resp.json().get("choices", [{}])[0].get("message", {}).get("content", "")
                if content:
                    logger.debug("Grounding via vLLM OK")
        except Exception as e:
            logger.debug("vLLM unavailable (%s), falling back to Ollama", e)
        # Attempt 2: Ollama (CPU, slower)
        if not content:
            try:
                resp = _requests.post("http://localhost:11434/api/chat", json={
                    "model": "qwen2.5vl:7b",
                    "messages": [