diff --git a/computer_use_demo/gradio/agent/llm_utils/oai.py b/computer_use_demo/gradio/agent/llm_utils/oaiclient.py
similarity index 86%
rename from computer_use_demo/gradio/agent/llm_utils/oai.py
rename to computer_use_demo/gradio/agent/llm_utils/oaiclient.py
index 64eb21d..ad42110 100644
--- a/computer_use_demo/gradio/agent/llm_utils/oai.py
+++ b/computer_use_demo/gradio/agent/llm_utils/oaiclient.py
@@ -7,7 +7,6 @@ from .utils import is_image_path, encode_image
 def run_oai_interleaved(messages: list, system: str, model_name: str, api_key: str,
                         max_tokens=256, temperature=0, provider_base_url: str = "https://api.openai.com/v1"):
     headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
-
     final_messages = [{"role": "system", "content": system}]
 
     if type(messages) == list:
@@ -16,7 +15,8 @@ def run_oai_interleaved(messages: list, system: str, model_name: str, api_key: s
             if isinstance(item, dict):
                 for cnt in item["content"]:
                     if isinstance(cnt, str):
-                        if is_image_path(cnt):
+                        if is_image_path(cnt) and 'o3-mini' not in model_name:
+                            # o3-mini does not support images
                             base64_image = encode_image(cnt)
                             content = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                         else:
@@ -41,9 +41,12 @@ def run_oai_interleaved(messages: list, system: str, model_name: str, api_key: s
     payload = {
         "model": model_name,
         "messages": final_messages,
-        "max_tokens": max_tokens,
-        "temperature": temperature
     }
+    if 'o1' in model_name or 'o3-mini' in model_name:
+        payload['reasoning_effort'] = 'low'
+        payload['max_completion_tokens'] = max_tokens
+    else:
+        payload['max_tokens'] = max_tokens
 
     response = requests.post(
         f"{provider_base_url}/chat/completions", headers=headers, json=payload
diff --git a/computer_use_demo/gradio/agent/vlm_agent.py b/computer_use_demo/gradio/agent/vlm_agent.py
index 173c664..7599c39 100644
--- a/computer_use_demo/gradio/agent/vlm_agent.py
+++ b/computer_use_demo/gradio/agent/vlm_agent.py
@@ -10,7 +10,7 @@ from anthropic import APIResponse
 from anthropic.types import ToolResultBlockParam
 from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, BetaMessageParam, BetaUsage
 
-from agent.llm_utils.oai import run_oai_interleaved
+from agent.llm_utils.oaiclient import run_oai_interleaved
 from agent.llm_utils.groqclient import run_groq_interleaved
 from agent.llm_utils.utils import is_image_path
 import time
@@ -45,6 +45,10 @@ class VLMAgent:
             self.model = "deepseek-r1-distill-llama-70b"
         elif model == "omniparser + qwen2.5vl":
             self.model = "qwen2.5-vl-72b-instruct"
+        elif model == "omniparser + o1":
+            self.model = "o1"
+        elif model == "omniparser + o3-mini":
+            self.model = "o3-mini"
         else:
             raise ValueError(f"Model {model} not supported")
 
@@ -69,9 +73,6 @@ class VLMAgent:
         latency_omniparser = parsed_screen['latency']
         self.output_callback(f'-- Step {self.step_count}: --', sender="bot")
         screen_info = str(parsed_screen['screen_info'])
-
-
-
         screenshot_uuid = parsed_screen['screenshot_uuid']
         screen_width, screen_height = parsed_screen['width'], parsed_screen['height']
 
@@ -90,7 +91,7 @@ class VLMAgent:
             planner_messages[-1]["content"].append(f"{OUTPUT_DIR}/screenshot_som_{screenshot_uuid}.png")
 
         start = time.time()
-        if "gpt" in self.model:
+        if "gpt" in self.model or "o1" in self.model or "o3-mini" in self.model:
             vlm_response, token_usage = run_oai_interleaved(
                 messages=planner_messages,
                 system=system,
@@ -102,7 +103,12 @@ class VLMAgent:
             )
             print(f"oai token usage: {token_usage}")
             self.total_token_usage += token_usage
-            self.total_cost += (token_usage * 2.5 / 1000000)  # https://openai.com/api/pricing/
+            if 'gpt' in self.model:
+                self.total_cost += (token_usage * 2.5 / 1000000)  # https://openai.com/api/pricing/
+            elif 'o1' in self.model:
+                self.total_cost += (token_usage * 15 / 1000000)  # https://openai.com/api/pricing/
+            elif 'o3-mini' in self.model:
+                self.total_cost += (token_usage * 1.1 / 1000000)  # https://openai.com/api/pricing/
         elif "r1" in self.model:
             vlm_response, token_usage = run_groq_interleaved(
                 messages=planner_messages,
diff --git a/computer_use_demo/gradio/app.py b/computer_use_demo/gradio/app.py
index 0128471..92ea689 100644
--- a/computer_use_demo/gradio/app.py
+++ b/computer_use_demo/gradio/app.py
@@ -242,6 +242,7 @@ def process_input(user_input, state):
         api_response_callback=partial(_api_response_callback, response_state=state["responses"]),
         api_key=state["api_key"],
         only_n_most_recent_images=state["only_n_most_recent_images"],
+        max_tokens=16384,
        omniparser_url=args.omniparser_server_url
     ):
         if loop_msg is None or state.get("stop"):
@@ -280,7 +281,7 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
             with gr.Column():
                 model = gr.Dropdown(
                     label="Model",
-                    choices=["omniparser + gpt-4o", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022"],
+                    choices=["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022"],
                     value="omniparser + gpt-4o",
                     interactive=True,
                 )
@@ -334,7 +335,7 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
 
     if model_selection == "claude-3-5-sonnet-20241022":
         provider_choices = [option.value for option in APIProvider if option.value != "openai"]
-    elif model_selection == "omniparser + gpt-4o":
+    elif model_selection in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini"]):
         provider_choices = ["openai"]
     elif model_selection == "omniparser + R1":
         provider_choices = ["groq"]
diff --git a/computer_use_demo/gradio/loop.py b/computer_use_demo/gradio/loop.py
index f6d4aa6..8d7b75f 100644
--- a/computer_use_demo/gradio/loop.py
+++ b/computer_use_demo/gradio/loop.py
@@ -64,13 +64,15 @@ def sampling_loop_sync(
             max_tokens=max_tokens,
             only_n_most_recent_images=only_n_most_recent_images
         )
-    elif model == "omniparser + gpt-4o" or model == "omniparser + R1" or model == "omniparser + qwen2.5vl":
+    elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl"]):
         actor = VLMAgent(
             model=model,
             provider=provider,
             api_key=api_key,
             api_response_callback=api_response_callback,
             output_callback=output_callback,
+            max_tokens=max_tokens,
+            only_n_most_recent_images=only_n_most_recent_images
         )
     else:
         raise ValueError(f"Model {model} not supported")
@@ -100,7 +102,7 @@ def sampling_loop_sync(
             messages.append({"content": tool_result_content, "role": "user"})
 
 
-    elif model == "omniparser + gpt-4o" or model == "omniparser + R1" or model == "omniparser + qwen2.5vl":
+    elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl"]):
         while True:
             parsed_screen = omniparser_client()
             tools_use_needed, vlm_response_json = actor(messages=messages, parsed_screen=parsed_screen)
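
Reviewer note: the payload in oaiclient.py is now built conditionally because the o-series reasoning models do not accept "max_tokens" or a tunable "temperature"; they take "max_completion_tokens" and an optional "reasoning_effort" instead. A minimal standalone sketch of the two request shapes the patched run_oai_interleaved produces (the helper name and values here are illustrative, not part of the patch):

    # Sketch of the conditional payload logic, mirroring the oaiclient.py hunk above.
    def build_payload(model_name: str, final_messages: list, max_tokens: int) -> dict:
        payload = {"model": model_name, "messages": final_messages}
        if 'o1' in model_name or 'o3-mini' in model_name:
            # Reasoning models: budget goes in max_completion_tokens,
            # with reasoning effort pinned low as in the patch.
            payload['reasoning_effort'] = 'low'
            payload['max_completion_tokens'] = max_tokens
        else:
            # Chat models keep the classic parameter.
            payload['max_tokens'] = max_tokens
        return payload

    assert 'max_completion_tokens' in build_payload('o3-mini', [], 16384)
    assert 'max_tokens' in build_payload('gpt-4o', [], 16384)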
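And a hedged usage sketch of the renamed client together with the per-model cost accounting that vlm_agent.py now applies; the API key, system prompt, and message content are placeholders, and the flat $1.10-per-1M rate is the one this patch uses for o3-mini:

    from agent.llm_utils.oaiclient import run_oai_interleaved

    messages = [{"role": "user", "content": ["Click the Submit button."]}]  # placeholder task
    text, token_usage = run_oai_interleaved(
        messages=messages,
        system="You are a GUI agent.",  # placeholder system prompt
        model_name="o3-mini",
        api_key="sk-...",               # placeholder key
        max_tokens=16384,               # same budget app.py now passes into the loop
    )
    # Mirrors the vlm_agent.py hunk: o3-mini billed at $1.10 per 1M tokens.
    cost = token_usage * 1.1 / 1_000_000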