o1 (has vision) and o3-mini (no vision)
@@ -7,7 +7,6 @@ from .utils import is_image_path, encode_image

 def run_oai_interleaved(messages: list, system: str, model_name: str, api_key: str, max_tokens=256, temperature=0, provider_base_url: str = "https://api.openai.com/v1"):
     headers = {"Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}"}

     final_messages = [{"role": "system", "content": system}]

     if type(messages) == list:
@@ -16,7 +15,8 @@ def run_oai_interleaved(messages: list, system: str, model_name: str, api_key: s
             if isinstance(item, dict):
                 for cnt in item["content"]:
                     if isinstance(cnt, str):
-                        if is_image_path(cnt):
+                        if is_image_path(cnt) and 'o3-mini' not in model_name:
+                            # o3-mini does not support images
                             base64_image = encode_image(cnt)
                             content = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                         else:
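
Note: with this gate, an o3-mini request never gets an image_url part; an image path falls through to the else branch (presumably sent as plain text). A minimal standalone sketch of the behavior, assuming encode_image base64-encodes the file as in the diff and approximating is_image_path with an extension check (helper names here are illustrative):

import base64

def encode_image(image_path: str) -> str:
    # Assumed behavior of .utils.encode_image: file bytes -> base64 string.
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

def to_content_part(cnt: str, model_name: str) -> dict:
    # Image paths become data-URL image_url parts, except for o3-mini,
    # which has no vision support and gets the raw string as text instead.
    is_image = cnt.lower().endswith((".png", ".jpg", ".jpeg"))
    if is_image and "o3-mini" not in model_name:
        b64 = encode_image(cnt)
        return {"type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
    return {"type": "text", "text": cnt}
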
@@ -41,9 +41,12 @@ def run_oai_interleaved(messages: list, system: str, model_name: str, api_key: s
     payload = {
         "model": model_name,
         "messages": final_messages,
-        "max_tokens": max_tokens,
-        "temperature": temperature
     }
+    if 'o1' in model_name or 'o3-mini' in model_name:
+        payload['reasoning_effort'] = 'low'
+        payload['max_completion_tokens'] = max_tokens
+    else:
+        payload['max_tokens'] = max_tokens

     response = requests.post(
         f"{provider_base_url}/chat/completions", headers=headers, json=payload
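
Note: o1 and o3-mini reject the classic max_tokens parameter in favor of max_completion_tokens, and this hunk also stops sending temperature. The two resulting payload shapes, sketched with illustrative values:

final_messages = [{"role": "system", "content": "You are a helpful agent."}]  # illustrative

payload_reasoning = {                 # 'o1' or 'o3-mini' in model_name
    "model": "o3-mini",
    "messages": final_messages,
    "reasoning_effort": "low",
    "max_completion_tokens": 256,
}
payload_default = {                   # everything else, e.g. gpt-4o
    "model": "gpt-4o",
    "messages": final_messages,
    "max_tokens": 256,
}
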
@@ -10,7 +10,7 @@ from anthropic import APIResponse
 from anthropic.types import ToolResultBlockParam
 from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, BetaMessageParam, BetaUsage

-from agent.llm_utils.oai import run_oai_interleaved
+from agent.llm_utils.oaiclient import run_oai_interleaved
 from agent.llm_utils.groqclient import run_groq_interleaved
 from agent.llm_utils.utils import is_image_path
 import time
@@ -45,6 +45,10 @@ class VLMAgent:
             self.model = "deepseek-r1-distill-llama-70b"
         elif model == "omniparser + qwen2.5vl":
             self.model = "qwen2.5-vl-72b-instruct"
+        elif model == "omniparser + o1":
+            self.model = "o1"
+        elif model == "omniparser + o3-mini":
+            self.model = "o3-mini"
         else:
             raise ValueError(f"Model {model} not supported")

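
Note: the elif chain boils down to a label-to-model-id table. A standalone sketch (the dict stands in for the chain; the "omniparser + R1" label is assumed, since its elif sits outside this hunk):

def resolve_model(label: str) -> str:
    # UI-facing "omniparser + X" labels mapped to API model ids.
    mapping = {
        "omniparser + R1": "deepseek-r1-distill-llama-70b",  # assumed label
        "omniparser + qwen2.5vl": "qwen2.5-vl-72b-instruct",
        "omniparser + o1": "o1",
        "omniparser + o3-mini": "o3-mini",
    }
    if label not in mapping:
        raise ValueError(f"Model {label} not supported")
    return mapping[label]
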
@@ -69,9 +73,6 @@ class VLMAgent:
         latency_omniparser = parsed_screen['latency']
         self.output_callback(f'-- Step {self.step_count}: --', sender="bot")
         screen_info = str(parsed_screen['screen_info'])
         screenshot_uuid = parsed_screen['screenshot_uuid']
         screen_width, screen_height = parsed_screen['width'], parsed_screen['height']

@@ -90,7 +91,7 @@ class VLMAgent:
         planner_messages[-1]["content"].append(f"{OUTPUT_DIR}/screenshot_som_{screenshot_uuid}.png")

         start = time.time()
-        if "gpt" in self.model:
+        if "gpt" in self.model or "o1" in self.model or "o3-mini" in self.model:
             vlm_response, token_usage = run_oai_interleaved(
                 messages=planner_messages,
                 system=system,
@@ -102,7 +103,12 @@ class VLMAgent:
             )
             print(f"oai token usage: {token_usage}")
             self.total_token_usage += token_usage
-            self.total_cost += (token_usage * 2.5 / 1000000) # https://openai.com/api/pricing/
+            if 'gpt' in self.model:
+                self.total_cost += (token_usage * 2.5 / 1000000) # https://openai.com/api/pricing/
+            elif 'o1' in self.model:
+                self.total_cost += (token_usage * 15 / 1000000) # https://openai.com/api/pricing/
+            elif 'o3-mini' in self.model:
+                self.total_cost += (token_usage * 1.1 / 1000000) # https://openai.com/api/pricing/
         elif "r1" in self.model:
             vlm_response, token_usage = run_groq_interleaved(
                 messages=planner_messages,
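
Note: the rates correspond to OpenAI's per-million-token input prices for gpt-4o ($2.50), o1 ($15), and o3-mini ($1.10); the commit applies one flat rate to the combined token count rather than pricing input and output tokens separately. The arithmetic, as a standalone sketch:

PRICE_PER_MTOK = {"gpt": 2.5, "o1": 15.0, "o3-mini": 1.1}  # USD per 1M tokens, from the diff

def step_cost(model: str, token_usage: int) -> float:
    # Same substring dispatch as the elif chain above ('gpt' is checked first).
    for key, rate in PRICE_PER_MTOK.items():
        if key in model:
            return token_usage * rate / 1_000_000
    return 0.0

print(step_cost("o1", 10_000))       # 0.15: a 10k-token o1 step costs $0.15
print(step_cost("o3-mini", 10_000))  # 0.011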