Naming conventions
This commit is contained in:
59
omnitool/gradio/agent/llm_utils/groqclient.py
Normal file
59
omnitool/gradio/agent/llm_utils/groqclient.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from groq import Groq
|
||||
import os
|
||||
from .utils import is_image_path
|
||||
|
||||
def run_groq_interleaved(messages: list, system: str, model_name: str, api_key: str, max_tokens=256, temperature=0.6):
    """
    Run a chat completion through Groq's API, ignoring any images in the messages.

    Args:
        messages: Either a single prompt string, or a list whose items are
            plain strings or dicts with a "content" entry. A dict's content
            may be one string or an iterable of parts; string parts that
            look like image paths are dropped and non-string parts are
            converted with ``str()``.
        system: Instruction text. It is sent as the first *user* message
            rather than with the system role.
        model_name: Groq model identifier. Falls back to
            "deepseek-r1-distill-llama-70b" when empty.
        api_key: Groq API key. Falls back to the GROQ_API_KEY environment
            variable when empty.
        max_tokens: Upper bound on completion tokens.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        A ``(final_answer, token_usage)`` tuple. ``final_answer`` has any
        ``<think>...</think>`` reasoning prefix and ``<output>`` tags removed.
        If the request or response handling fails, ``(error message, 0)`` is
        returned instead of raising.

    Raises:
        ValueError: If no API key is supplied and GROQ_API_KEY is not set.
    """
    api_key = api_key or os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise ValueError("GROQ_API_KEY is not set")

    client = Groq(api_key=api_key)
    # avoid using system messages for R1
    final_messages = [{"role": "user", "content": system}]

    if isinstance(messages, str):
        final_messages.append({"role": "user", "content": messages})
    elif isinstance(messages, list):
        for item in messages:
            if isinstance(item, dict):
                parts = item["content"]
                # A bare string would otherwise be iterated character by character.
                if isinstance(parts, str):
                    parts = [parts]
                # Keep every text part; image paths are skipped entirely.
                text_contents = [
                    str(part)
                    for part in parts
                    if not (isinstance(part, str) and is_image_path(part))
                ]
                if text_contents:  # Only add if there's text content
                    final_messages.append({"role": "user", "content": " ".join(text_contents)})
            else:  # str
                final_messages.append({"role": "user", "content": item})

    try:
        completion = client.chat.completions.create(
            # Honour the caller's model; keep the previous hard-coded model as fallback.
            model=model_name or "deepseek-r1-distill-llama-70b",
            messages=final_messages,
            temperature=temperature,
            max_completion_tokens=max_tokens,
            top_p=0.95,
            stream=False,
            reasoning_format="raw",
        )

        # Content may be None; treat that as an empty answer.
        response = completion.choices[0].message.content or ""
        if "</think>" in response:
            # Drop everything up to the last closing think tag, whether or not
            # a newline follows it.
            final_answer = response.rpartition("</think>")[2]
            if final_answer.startswith("\n"):
                final_answer = final_answer[1:]
        else:
            final_answer = response
        final_answer = final_answer.replace("<output>", "").replace("</output>", "")
        token_usage = completion.usage.total_tokens

        return final_answer, token_usage
    except Exception as e:
        # Best-effort boundary: report the failure as the answer text with zero usage.
        print(f"Error in interleaved Groq: {e}")

        return str(e), 0
|
||||
62
omnitool/gradio/agent/llm_utils/oaiclient.py
Normal file
62
omnitool/gradio/agent/llm_utils/oaiclient.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import os
|
||||
import logging
|
||||
import base64
|
||||
import requests
|
||||
from .utils import is_image_path, encode_image
|
||||
|
||||
def run_oai_interleaved(messages: list, system: str, model_name: str, api_key: str, max_tokens=256, temperature=0, provider_base_url: str = "https://api.openai.com/v1"):
    """
    Run a chat completion against an OpenAI-compatible ``/chat/completions`` endpoint.

    Args:
        messages: Either a single prompt string, or a list whose items are
            plain strings or dicts with a "content" entry. A dict's content
            may be one string or an iterable of parts. String parts that look
            like image paths are sent as base64 ``image_url`` parts (unless
            the model name contains "o3-mini"); every other part is sent as
            text, with non-string parts converted via ``str()``.
        system: System prompt, sent as the first message with the system role.
        model_name: Model identifier placed in the request payload.
        api_key: Bearer token for the Authorization header.
        max_tokens: Completion token limit. Sent as ``max_completion_tokens``
            for model names containing "o1" or "o3-mini", else ``max_tokens``.
        temperature: Sampling temperature. Only sent for model names that do
            not contain "o1" or "o3-mini".
        provider_base_url: Base URL of the provider, without the
            ``/chat/completions`` suffix.

    Returns:
        A ``(text, token_usage)`` tuple. If the response does not have the
        expected shape (for example an error body), the response body is
        returned as the text with a token usage of 0.
    """
    headers = {"Content-Type": "application/json",
               "Authorization": f"Bearer {api_key}"}
    final_messages = [{"role": "system", "content": system}]
    # o3-mini does not support images
    supports_images = 'o3-mini' not in model_name

    if isinstance(messages, list):
        for item in messages:
            contents = []
            if isinstance(item, dict):
                parts = item["content"]
                # A bare string would otherwise be iterated character by character.
                if isinstance(parts, str):
                    parts = [parts]
                for cnt in parts:
                    if isinstance(cnt, str):
                        if is_image_path(cnt) and supports_images:
                            base64_image = encode_image(cnt)
                            content = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                        else:
                            content = {"type": "text", "text": cnt}
                    else:
                        # in this case it is a text block from anthropic
                        content = {"type": "text", "text": str(cnt)}
                    contents.append(content)
            else:  # str
                contents.append({"type": "text", "text": item})

            final_messages.append({"role": "user", "content": contents})
    elif isinstance(messages, str):
        # Append so the system prompt is kept, as in the list branch.
        final_messages.append({"role": "user", "content": messages})

    payload = {
        "model": model_name,
        "messages": final_messages,
    }
    if 'o1' in model_name or 'o3-mini' in model_name:
        payload['reasoning_effort'] = 'low'
        payload['max_completion_tokens'] = max_tokens
    else:
        payload['max_tokens'] = max_tokens
        # Forward the caller's temperature; previously it was silently ignored.
        payload['temperature'] = temperature

    response = requests.post(
        f"{provider_base_url}/chat/completions", headers=headers, json=payload
    )

    # Parse the body once; fall back to raw text so a non-JSON error page
    # cannot raise from inside the error handler below.
    try:
        body = response.json()
    except ValueError:
        body = response.text

    try:
        text = body['choices'][0]['message']['content']
        token_usage = int(body['usage']['total_tokens'])
        return text, token_usage
    except Exception as e:
        print(f"Error in interleaved openAI: {e}. This may due to your invalid API key. Please check the response: {body} ")
        # Keep the (text, token_usage) shape so callers that unpack the result still work.
        return str(body), 0
|
||||
44
omnitool/gradio/agent/llm_utils/omniparserclient.py
Normal file
44
omnitool/gradio/agent/llm_utils/omniparserclient.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import requests
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from tools.screen_capture import get_screenshot
|
||||
from agent.llm_utils.utils import encode_image
|
||||
|
||||
OUTPUT_DIR = "./tmp/outputs"
|
||||
|
||||
class OmniParserClient:
    """Callable client that sends a screenshot to an OmniParser HTTP endpoint."""

    def __init__(self, url: str) -> None:
        """Remember the endpoint ``url`` that screenshots are posted to."""
        self.url = url

    def __call__(self):
        """Capture a screenshot, have it parsed remotely, and return the enriched response.

        The screenshot is base64-encoded and posted as JSON to ``self.url``.
        The annotated image from the response ('som_image_base64') is written
        to OUTPUT_DIR, and the response dict is extended with 'width',
        'height', 'original_screenshot_base64', 'screenshot_uuid' and
        'screen_info' before being returned.
        """
        # get_screenshot() is unpacked as (image, path); the image must expose `.size`.
        shot, shot_path = get_screenshot()
        shot_path = str(shot_path)
        encoded_shot = encode_image(shot_path)

        parsed = requests.post(self.url, json={"base64_image": encoded_shot}).json()
        print('omniparser latency:', parsed['latency'])

        # Decode before touching the filesystem so a bad payload creates no file.
        som_bytes = base64.b64decode(parsed['som_image_base64'])
        shot_uuid = Path(shot_path).stem.replace("screenshot_", "")
        som_path = f"{OUTPUT_DIR}/screenshot_som_{shot_uuid}.png"
        with open(som_path, "wb") as out_file:
            out_file.write(som_bytes)

        parsed.update({
            'width': shot.size[0],
            'height': shot.size[1],
            'original_screenshot_base64': encoded_shot,
            'screenshot_uuid': shot_uuid,
        })
        return self.reformat_messages(parsed)

    def reformat_messages(self, response_json: dict):
        """Number the parsed elements and add a textual 'screen_info' summary.

        Every element of ``response_json["parsed_content_list"]`` gets an
        'idx' key holding its position. Elements of type 'text' or 'icon'
        contribute one line each to 'screen_info'; other types are numbered
        but not listed. The same (mutated) dict is returned.
        """
        summary_lines = []
        for position, entry in enumerate(response_json["parsed_content_list"]):
            entry['idx'] = position
            kind = entry['type']
            if kind == 'text':
                summary_lines.append(f'ID: {position}, Text: {entry["content"]}\n')
            elif kind == 'icon':
                summary_lines.append(f'ID: {position}, Icon: {entry["content"]}\n')
        response_json['screen_info'] = "".join(summary_lines)
        return response_json
|
||||
13
omnitool/gradio/agent/llm_utils/utils.py
Normal file
13
omnitool/gradio/agent/llm_utils/utils.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import base64
|
||||
|
||||
def is_image_path(text):
    """Return True when ``text`` ends with a known image file extension.

    The comparison is case-insensitive, so "shot.PNG" is recognised the same
    way as "shot.png". ``text`` must be a string.
    """
    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
    # Lower-case first so upper- or mixed-case extensions are not mistaken for plain text.
    return text.lower().endswith(image_extensions)
|
||||
|
||||
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
||||
Reference in New Issue
Block a user