diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py
index 7da9e3e59..cec5c278e 100644
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -3428,22 +3428,55 @@ def _resolve_by_grounding(
         'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
     )
 
-    # VLM call (Qwen2.5-VL for grounding)
+    # VLM call: vLLM (GPU, fast) first, Ollama (CPU) as fallback
+    import requests as _requests
+    content = ""
+
+    # vLLM port and model are configurable via environment variables
+    _vllm_port = os.environ.get("VLLM_PORT", "8100")
+    _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
+
+    # Attempt 1: vLLM (OpenAI-compatible API, GPU)
     try:
-        import requests as _requests
-        resp = _requests.post("http://localhost:11434/api/chat", json={
-            "model": "qwen2.5vl:7b",
-            "messages": [
-                {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
-                {"role": "user", "content": prompt, "images": [shot_b64]},
-            ],
-            "stream": False,
-            "options": {"temperature": 0.1, "num_predict": 80},
-        }, timeout=60)
-        content = resp.json().get("message", {}).get("content", "")
+        vllm_resp = _requests.post(
+            f"http://localhost:{_vllm_port}/v1/chat/completions",
+            json={
+                "model": _vllm_model,
+                "messages": [
+                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
+                    {"role": "user", "content": [
+                        {"type": "text", "text": prompt},
+                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
+                    ]},
+                ],
+                "temperature": 0.1,
+                "max_tokens": 80,
+            },
+            timeout=30,
+        )
+        if vllm_resp.ok:
+            content = vllm_resp.json().get("choices", [{}])[0].get("message", {}).get("content", "")
+            if content:
+                logger.debug("Grounding via vLLM OK")
     except Exception as e:
-        logger.info("Grounding VLM timeout/error: %s", e)
-        return None
+        logger.debug("vLLM unavailable (%s), falling back to Ollama", e)
+
+    # Attempt 2: Ollama (CPU, slower)
+    if not content:
+        try:
+            resp = _requests.post("http://localhost:11434/api/chat", json={
+                "model": "qwen2.5vl:7b",
+                "messages": [
+                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
+                    {"role": "user", "content": prompt, "images": [shot_b64]},
+                ],
+                "stream": False,
+                "options": {"temperature": 0.1, "num_predict": 80},
+            }, timeout=60)
+            content = resp.json().get("message", {}).get("content", "")
+        except Exception as e:
+            logger.info("Grounding VLM timeout/error: %s", e)
+            return None
 
     elapsed = time.time() - t0
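
For review, a minimal standalone smoke test of the new fallback path, mirroring the two request shapes in the hunk above. This script is hypothetical (not part of the patch): the filename, the exact prompt wording apart from the final JSON instruction, and the CLI arguments are assumptions; the endpoints, env vars, defaults, and payload shapes are taken from the diff.

#!/usr/bin/env python3
"""Smoke test for the vLLM -> Ollama grounding fallback.

Hypothetical helper, not part of the patch. Usage:
    python smoke_grounding.py screenshot.jpg "the blue Submit button"
"""
import base64
import os
import sys
from pathlib import Path

import requests

SYSTEM = "You locate UI elements on screenshots. Return coordinates."

def main() -> None:
    shot_path, target = sys.argv[1], sys.argv[2]
    shot_b64 = base64.b64encode(Path(shot_path).read_bytes()).decode("ascii")
    # Prompt wording is an assumption; only the JSON instruction comes from the diff.
    prompt = (
        f"Locate this UI element on the screenshot: {target}. "
        'Answer ONLY with JSON: {"x": 0.XX, "y": 0.YY}'
    )

    content = ""
    # Attempt 1: vLLM, same OpenAI-compatible request shape as the patch.
    port = os.environ.get("VLLM_PORT", "8100")
    model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
    try:
        r = requests.post(
            f"http://localhost:{port}/v1/chat/completions",
            json={
                "model": model,
                "messages": [
                    {"role": "system", "content": SYSTEM},
                    {"role": "user", "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url",
                         "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
                    ]},
                ],
                "temperature": 0.1,
                "max_tokens": 80,
            },
            timeout=30,
        )
        if r.ok:
            content = r.json()["choices"][0]["message"]["content"]
            print("vLLM answered:", content)
    except requests.RequestException as e:
        print("vLLM unavailable, falling back to Ollama:", e)

    # Attempt 2: Ollama, same request shape as the patch.
    if not content:
        r = requests.post(
            "http://localhost:11434/api/chat",
            json={
                "model": "qwen2.5vl:7b",
                "messages": [
                    {"role": "system", "content": SYSTEM},
                    {"role": "user", "content": prompt, "images": [shot_b64]},
                ],
                "stream": False,
                "options": {"temperature": 0.1, "num_predict": 80},
            },
            timeout=60,
        )
        content = r.json().get("message", {}).get("content", "")
        print("Ollama answered:", content)

if __name__ == "__main__":
    main()

Running it with only one of the two backends up should print the other backend's answer, which is the behavior the patch is meant to guarantee.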