code cleanup
This commit is contained in:
@@ -160,23 +160,3 @@ def _maybe_filter_to_n_most_recent_images(
|
|||||||
continue
|
continue
|
||||||
new_content.append(content)
|
new_content.append(content)
|
||||||
tool_result["content"] = new_content
|
tool_result["content"] = new_content
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
pass
|
|
||||||
# client = Anthropic(api_key="")
|
|
||||||
# response = client.beta.messages.with_raw_response.create(
|
|
||||||
# max_tokens=4096,
|
|
||||||
# model="claude-3-5-sonnet-20241022",
|
|
||||||
# system=SYSTEM_PROMPT,
|
|
||||||
# # tools=ToolCollection(
|
|
||||||
# # ComputerTool(),
|
|
||||||
# # ).to_params(),
|
|
||||||
# betas=["computer-use-2024-10-22"],
|
|
||||||
# messages=[
|
|
||||||
# {"role": "user", "content": "click on (199, 199)."}
|
|
||||||
# ],
|
|
||||||
# )
|
|
||||||
|
|
||||||
# print(f"AnthropicActor response: {response.parse().usage.input_tokens+response.parse().usage.output_tokens}")
|
|
||||||
@@ -1,11 +1,9 @@
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import base64
|
import base64
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
def is_image_path(text):
|
def is_image_path(text):
|
||||||
# Checking if the input text ends with typical image file extensions
|
|
||||||
image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
|
image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
|
||||||
if text.endswith(image_extensions):
|
if text.endswith(image_extensions):
|
||||||
return True
|
return True
|
||||||
@@ -28,7 +26,6 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
|
|||||||
|
|
||||||
final_messages = [{"role": "system", "content": system}]
|
final_messages = [{"role": "system", "content": system}]
|
||||||
|
|
||||||
# image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
|
||||||
if type(messages) == list:
|
if type(messages) == list:
|
||||||
for item in messages:
|
for item in messages:
|
||||||
contents = []
|
contents = []
|
||||||
@@ -56,7 +53,6 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
|
|||||||
|
|
||||||
elif isinstance(messages, str):
|
elif isinstance(messages, str):
|
||||||
final_messages = [{"role": "user", "content": messages}]
|
final_messages = [{"role": "user", "content": messages}]
|
||||||
# import pdb; pdb.set_trace()
|
|
||||||
|
|
||||||
print("[oai] sending messages:", {"role": "user", "content": messages})
|
print("[oai] sending messages:", {"role": "user", "content": messages})
|
||||||
|
|
||||||
@@ -64,12 +60,9 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
|
|||||||
"model": llm,
|
"model": llm,
|
||||||
"messages": final_messages,
|
"messages": final_messages,
|
||||||
"max_tokens": max_tokens,
|
"max_tokens": max_tokens,
|
||||||
"temperature": temperature,
|
"temperature": temperature
|
||||||
# "stop": stop,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# from IPython.core.debugger import Pdb; Pdb().set_trace()
|
|
||||||
|
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
"https://api.openai.com/v1/chat/completions", headers=headers, json=payload
|
"https://api.openai.com/v1/chat/completions", headers=headers, json=payload
|
||||||
)
|
)
|
||||||
@@ -78,30 +71,6 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
|
|||||||
text = response.json()['choices'][0]['message']['content']
|
text = response.json()['choices'][0]['message']['content']
|
||||||
token_usage = int(response.json()['usage']['total_tokens'])
|
token_usage = int(response.json()['usage']['total_tokens'])
|
||||||
return text, token_usage
|
return text, token_usage
|
||||||
|
|
||||||
# return error message if the response is not successful
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
|
print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
|
|
||||||
api_key = os.environ.get("OPENAI_API_KEY")
|
|
||||||
if not api_key:
|
|
||||||
raise ValueError("OPENAI_API_KEY is not set")
|
|
||||||
|
|
||||||
text, token_usage = run_oai_interleaved(
|
|
||||||
messages= [{"content": [
|
|
||||||
"What is in the screenshot?",
|
|
||||||
"./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"],
|
|
||||||
"role": "user"
|
|
||||||
}],
|
|
||||||
llm="gpt-4o-mini",
|
|
||||||
system="You are a helpful assistant",
|
|
||||||
api_key=api_key,
|
|
||||||
max_tokens=256,
|
|
||||||
temperature=0)
|
|
||||||
|
|
||||||
print(text, token_usage)
|
|
||||||
# There is an introduction describing the Calyx... 36986
|
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
Entrypoint for Gradio, see https://gradio.app/
|
python app.py --windows_host_url localhost:8006/ --omniparser_server_url localhost:8000
|
||||||
python app.py --windows_host_url xxxx:8006/ --omniparser_host_url localhost:8000
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@@ -35,13 +34,9 @@ Type a message and press submit to start OmniParser+X. Press the trash icon in t
|
|||||||
def parse_arguments():
|
def parse_arguments():
|
||||||
parser = argparse.ArgumentParser(description="Gradio App")
|
parser = argparse.ArgumentParser(description="Gradio App")
|
||||||
parser.add_argument("--windows_host_url", type=str, default='localhost:8006')
|
parser.add_argument("--windows_host_url", type=str, default='localhost:8006')
|
||||||
parser.add_argument("--omniparser_host_url", type=str, default="localhost:8000")
|
parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000")
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
args = parse_arguments()
|
args = parse_arguments()
|
||||||
windows_host_url = args.windows_host_url
|
|
||||||
omniparser_host_url = args.omniparser_host_url
|
|
||||||
print(f"Windows host URL: {windows_host_url}")
|
|
||||||
print(f"OmniParser host URL: {omniparser_host_url}")
|
|
||||||
|
|
||||||
|
|
||||||
class Sender(StrEnum):
|
class Sender(StrEnum):
|
||||||
@@ -140,7 +135,6 @@ def chatbot_output_callback(message, chatbot_state, hide_images=False, sender="b
|
|||||||
is_tool_result = not isinstance(message, str) and (
|
is_tool_result = not isinstance(message, str) and (
|
||||||
isinstance(message, ToolResult)
|
isinstance(message, ToolResult)
|
||||||
or message.__class__.__name__ == "ToolResult"
|
or message.__class__.__name__ == "ToolResult"
|
||||||
or message.__class__.__name__ == "CLIResult"
|
|
||||||
)
|
)
|
||||||
if not message or (
|
if not message or (
|
||||||
is_tool_result
|
is_tool_result
|
||||||
@@ -214,7 +208,7 @@ def process_input(user_input, state):
|
|||||||
api_response_callback=partial(_api_response_callback, response_state=state["responses"]),
|
api_response_callback=partial(_api_response_callback, response_state=state["responses"]),
|
||||||
api_key=state["api_key"],
|
api_key=state["api_key"],
|
||||||
only_n_most_recent_images=state["only_n_most_recent_images"],
|
only_n_most_recent_images=state["only_n_most_recent_images"],
|
||||||
omniparser_url=omniparser_host_url
|
omniparser_url=args.omniparser_server_url
|
||||||
):
|
):
|
||||||
if loop_msg is None:
|
if loop_msg is None:
|
||||||
yield state['chatbot_messages']
|
yield state['chatbot_messages']
|
||||||
@@ -289,20 +283,11 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
|
|||||||
with gr.Column(scale=1):
|
with gr.Column(scale=1):
|
||||||
chatbot = gr.Chatbot(label="Chatbot History", autoscroll=True, height=580)
|
chatbot = gr.Chatbot(label="Chatbot History", autoscroll=True, height=580)
|
||||||
with gr.Column(scale=3):
|
with gr.Column(scale=3):
|
||||||
if not windows_host_url:
|
iframe = gr.HTML(
|
||||||
iframe = gr.HTML(
|
f'<iframe src="http://{args.windows_host_url}/vnc.html?view_only=1&autoconnect=1&resize=scale" width="100%" height="580" allow="fullscreen"></iframe>',
|
||||||
f'<iframe src="http://localhost:8006/vnc.html?view_only=1&autoconnect=1&resize=scale" width="100%" height="580" allow="fullscreen"></iframe>',
|
container=False,
|
||||||
container=False,
|
elem_classes="no-padding"
|
||||||
elem_classes="no-padding"
|
)
|
||||||
)
|
|
||||||
else:
|
|
||||||
# machine_fqdn = socket.getfqdn()
|
|
||||||
# print('machine_fqdn:', machine_fqdn)
|
|
||||||
iframe = gr.HTML(
|
|
||||||
f'<iframe src="http://{windows_host_url}/vnc.html?view_only=1&autoconnect=1&resize=scale" width="100%" height="580" allow="fullscreen"></iframe>',
|
|
||||||
container=False,
|
|
||||||
elem_classes="no-padding"
|
|
||||||
)
|
|
||||||
|
|
||||||
def update_model(model_selection, state):
|
def update_model(model_selection, state):
|
||||||
state["model"] = model_selection
|
state["model"] = model_selection
|
||||||
|
|||||||
@@ -96,9 +96,7 @@ def sampling_loop_sync(
|
|||||||
if model == "claude-3-5-sonnet-20241022": # Anthropic loop
|
if model == "claude-3-5-sonnet-20241022": # Anthropic loop
|
||||||
while True:
|
while True:
|
||||||
parsed_screen = omniparser_client() # parsed_screen: {"som_image_base64": dino_labled_img, "parsed_content_list": parsed_content_list, "screen_info"}
|
parsed_screen = omniparser_client() # parsed_screen: {"som_image_base64": dino_labled_img, "parsed_content_list": parsed_content_list, "screen_info"}
|
||||||
import pdb; pdb.set_trace()
|
|
||||||
screen_info_block = TextBlock(text='Below is the structured accessibility information of the current UI screen, which includes text and icons you can operate on, take these information into account when you are making the prediction for the next action. Note you will still need to take screenshot to get the image: \n' + parsed_screen['screen_info'], type='text')
|
screen_info_block = TextBlock(text='Below is the structured accessibility information of the current UI screen, which includes text and icons you can operate on, take these information into account when you are making the prediction for the next action. Note you will still need to take screenshot to get the image: \n' + parsed_screen['screen_info'], type='text')
|
||||||
# # messages[-1]['content'].append(screen_info_block)
|
|
||||||
screen_info_dict = {"role": "user", "content": [screen_info_block]}
|
screen_info_dict = {"role": "user", "content": [screen_info_block]}
|
||||||
messages.append(screen_info_dict)
|
messages.append(screen_info_dict)
|
||||||
tools_use_needed = actor(messages=messages)
|
tools_use_needed = actor(messages=messages)
|
||||||
@@ -121,9 +119,3 @@ def sampling_loop_sync(
|
|||||||
|
|
||||||
if not tool_result_content:
|
if not tool_result_content:
|
||||||
return messages
|
return messages
|
||||||
|
|
||||||
# import pdb; pdb.set_trace()
|
|
||||||
# messages.append({"role": "user",
|
|
||||||
# "content": ["History plan:\n" + str(vlm_response_json['Reasoning'])]})
|
|
||||||
|
|
||||||
# messages.append({"content": tool_result_content, "role": "user"})
|
|
||||||
@@ -1,10 +1,9 @@
|
|||||||
from .base import CLIResult, ToolResult
|
from .base import ToolResult
|
||||||
from .collection import ToolCollection
|
from .collection import ToolCollection
|
||||||
from .computer import ComputerTool
|
from .computer import ComputerTool
|
||||||
from .screen_capture import get_screenshot
|
from .screen_capture import get_screenshot
|
||||||
|
|
||||||
__ALL__ = [
|
__ALL__ = [
|
||||||
CLIResult,
|
|
||||||
ComputerTool,
|
ComputerTool,
|
||||||
ToolCollection,
|
ToolCollection,
|
||||||
ToolResult,
|
ToolResult,
|
||||||
|
|||||||
@@ -54,10 +54,6 @@ class ToolResult:
|
|||||||
return replace(self, **kwargs)
|
return replace(self, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class CLIResult(ToolResult):
|
|
||||||
"""A ToolResult that can be rendered as a CLI output."""
|
|
||||||
|
|
||||||
|
|
||||||
class ToolFailure(ToolResult):
|
class ToolFailure(ToolResult):
|
||||||
"""A ToolResult that represents a failure."""
|
"""A ToolResult that represents a failure."""
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user