From ba7ed0ac06ea48481683423c024df9139b377c18 Mon Sep 17 00:00:00 2001 From: Thomas Dhome-Casanova Date: Sat, 1 Feb 2025 17:29:34 -0800 Subject: [PATCH] qwen2.5vl --- .../gradio/agent/llm_utils/groqclient.py | 2 +- .../gradio/agent/llm_utils/oai.py | 13 +++++------- computer_use_demo/gradio/agent/vlm_agent.py | 20 +++++++++++++++++-- computer_use_demo/gradio/app.py | 8 +++++--- computer_use_demo/gradio/loop.py | 4 ++-- 5 files changed, 31 insertions(+), 16 deletions(-) diff --git a/computer_use_demo/gradio/agent/llm_utils/groqclient.py b/computer_use_demo/gradio/agent/llm_utils/groqclient.py index 812d929..c31a502 100644 --- a/computer_use_demo/gradio/agent/llm_utils/groqclient.py +++ b/computer_use_demo/gradio/agent/llm_utils/groqclient.py @@ -2,7 +2,7 @@ from groq import Groq import os from .utils import is_image_path -def run_groq_interleaved(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0.6): +def run_groq_interleaved(messages: list, system: str, model_name: str, api_key: str, max_tokens=256, temperature=0.6): """ Run a chat completion through Groq's API, ignoring any images in the messages. """ diff --git a/computer_use_demo/gradio/agent/llm_utils/oai.py b/computer_use_demo/gradio/agent/llm_utils/oai.py index e2daba7..64eb21d 100644 --- a/computer_use_demo/gradio/agent/llm_utils/oai.py +++ b/computer_use_demo/gradio/agent/llm_utils/oai.py @@ -4,11 +4,7 @@ import base64 import requests from .utils import is_image_path, encode_image -def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0): - api_key = api_key or os.environ.get("OPENAI_API_KEY") - if not api_key: - raise ValueError("OPENAI_API_KEY is not set") - +def run_oai_interleaved(messages: list, system: str, model_name: str, api_key: str, max_tokens=256, temperature=0, provider_base_url: str = "https://api.openai.com/v1"): headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"} @@ -43,20 +39,21 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max final_messages = [{"role": "user", "content": messages}] payload = { - "model": llm, + "model": model_name, "messages": final_messages, "max_tokens": max_tokens, "temperature": temperature } response = requests.post( - "https://api.openai.com/v1/chat/completions", headers=headers, json=payload + f"{provider_base_url}/chat/completions", headers=headers, json=payload ) + try: text = response.json()['choices'][0]['message']['content'] token_usage = int(response.json()['usage']['total_tokens']) return text, token_usage except Exception as e: - print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ") + print(f"Error in interleaved openAI: {e}. This may due to your invalid API key. Please check the response: {response.json()} ") return response.json() \ No newline at end of file diff --git a/computer_use_demo/gradio/agent/vlm_agent.py b/computer_use_demo/gradio/agent/vlm_agent.py index 0ae68e4..3c4ea73 100644 --- a/computer_use_demo/gradio/agent/vlm_agent.py +++ b/computer_use_demo/gradio/agent/vlm_agent.py @@ -42,6 +42,8 @@ class VLMAgent: self.model = "gpt-4o-2024-11-20" elif model == "omniparser + R1": self.model = "deepseek-r1-distill-llama-70b" + elif model == "omniparser + qwen2.5vl": + self.model = "qwen2.5-vl-72b-instruct" else: raise ValueError(f"Model {model} not supported") @@ -93,9 +95,10 @@ class VLMAgent: vlm_response, token_usage = run_oai_interleaved( messages=planner_messages, system=system, - llm=self.model, + model_name=self.model, api_key=self.api_key, max_tokens=self.max_tokens, + provider_base_url="https://api.openai.com/v1", temperature=0, ) print(f"oai token usage: {token_usage}") @@ -106,13 +109,26 @@ class VLMAgent: vlm_response, token_usage = run_groq_interleaved( messages=planner_messages, system=system, - llm=self.model, + model_name=self.model, api_key=self.api_key, max_tokens=self.max_tokens, ) print(f"groq token usage: {token_usage}") self.total_token_usage += token_usage self.total_cost += (token_usage * 0.99 / 1000000) + elif "qwen" in self.model: + vlm_response, token_usage = run_oai_interleaved( + messages=planner_messages, + system=system, + model_name=self.model, + api_key=self.api_key, + max_tokens=min(2048, self.max_tokens), + provider_base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + temperature=0, + ) + print(f"qwen token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a else: raise ValueError(f"Model {self.model} not supported") latency_vlm = time.time() - start diff --git a/computer_use_demo/gradio/app.py b/computer_use_demo/gradio/app.py index 7ca2a22..9c4ad5c 100644 --- a/computer_use_demo/gradio/app.py +++ b/computer_use_demo/gradio/app.py @@ -28,7 +28,7 @@ API_KEY_FILE = CONFIG_DIR / "api_key" INTRO_TEXT = ''' 🚀🤖✨ It's Play Time! -Welcome to the OmniParser+X Computer Use Demo! X = [GPT-4o, R1, Claude]. Let OmniParser turn your general purpose vision-langauge model to an AI agent. +Welcome to the OmniParser+X Computer Use Demo! X = [GPT-4o, R1, Qwen2.5VL, Claude]. Let OmniParser turn your general purpose vision-langauge model to an AI agent. Type a message and press submit to start OmniParser+X. Press the trash icon in the chat to clear the message history. ''' @@ -189,7 +189,7 @@ def valid_params(user_input, state): """Validate all requirements and return a list of error messages.""" errors = [] - for server_name, url in [('Windows Host', args.windows_host_url), ('OmniParser Server', args.omniparser_server_url)]: + for server_name, url in [('Windows Host', 'localhost:5000'), ('OmniParser Server', args.omniparser_server_url)]: try: url = f'http://{url}/probe' response = requests.get(url, timeout=3) @@ -270,7 +270,7 @@ with gr.Blocks(theme=gr.themes.Default()) as demo: with gr.Column(): model = gr.Dropdown( label="Model", - choices=["omniparser + gpt-4o", "omniparser + R1", "claude-3-5-sonnet-20241022"], + choices=["omniparser + gpt-4o", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022"], value="omniparser + gpt-4o", interactive=True, ) @@ -326,6 +326,8 @@ with gr.Blocks(theme=gr.themes.Default()) as demo: provider_choices = ["openai"] elif model_selection == "omniparser + R1": provider_choices = ["groq"] + elif model_selection == "omniparser + qwen2.5vl": + provider_choices = ["dashscope"] else: provider_choices = [option.value for option in APIProvider] default_provider_value = provider_choices[0] diff --git a/computer_use_demo/gradio/loop.py b/computer_use_demo/gradio/loop.py index 15041cf..f6d4aa6 100644 --- a/computer_use_demo/gradio/loop.py +++ b/computer_use_demo/gradio/loop.py @@ -64,7 +64,7 @@ def sampling_loop_sync( max_tokens=max_tokens, only_n_most_recent_images=only_n_most_recent_images ) - elif model == "omniparser + gpt-4o" or model == "omniparser + R1": + elif model == "omniparser + gpt-4o" or model == "omniparser + R1" or model == "omniparser + qwen2.5vl": actor = VLMAgent( model=model, provider=provider, @@ -100,7 +100,7 @@ def sampling_loop_sync( messages.append({"content": tool_result_content, "role": "user"}) - elif model == "omniparser + gpt-4o" or model == "omniparser + R1": + elif model == "omniparser + gpt-4o" or model == "omniparser + R1" or model == "omniparser + qwen2.5vl": while True: parsed_screen = omniparser_client() tools_use_needed, vlm_response_json = actor(messages=messages, parsed_screen=parsed_screen)