From b2d6bc5c3e237e4850ad2c8cb5dc9e880703232c Mon Sep 17 00:00:00 2001 From: Thomas Dhome-Casanova Date: Wed, 29 Jan 2025 23:01:14 -0800 Subject: [PATCH] code cleanup --- .../gradio/agent/anthropic_agent.py | 22 +----------- .../gradio/agent/llm_utils/oai.py | 35 ++----------------- computer_use_demo/gradio/app.py | 31 +++++----------- computer_use_demo/gradio/loop.py | 10 +----- computer_use_demo/gradio/tools/__init__.py | 3 +- computer_use_demo/gradio/tools/base.py | 4 --- 6 files changed, 13 insertions(+), 92 deletions(-) diff --git a/computer_use_demo/gradio/agent/anthropic_agent.py b/computer_use_demo/gradio/agent/anthropic_agent.py index d891f1f..b1c744e 100644 --- a/computer_use_demo/gradio/agent/anthropic_agent.py +++ b/computer_use_demo/gradio/agent/anthropic_agent.py @@ -159,24 +159,4 @@ def _maybe_filter_to_n_most_recent_images( images_to_remove -= 1 continue new_content.append(content) - tool_result["content"] = new_content - - - -if __name__ == "__main__": - pass - # client = Anthropic(api_key="") - # response = client.beta.messages.with_raw_response.create( - # max_tokens=4096, - # model="claude-3-5-sonnet-20241022", - # system=SYSTEM_PROMPT, - # # tools=ToolCollection( - # # ComputerTool(), - # # ).to_params(), - # betas=["computer-use-2024-10-22"], - # messages=[ - # {"role": "user", "content": "click on (199, 199)."} - # ], - # ) - - # print(f"AnthropicActor response: {response.parse().usage.input_tokens+response.parse().usage.output_tokens}") \ No newline at end of file + tool_result["content"] = new_content \ No newline at end of file diff --git a/computer_use_demo/gradio/agent/llm_utils/oai.py b/computer_use_demo/gradio/agent/llm_utils/oai.py index 0ab3760..2d613f1 100644 --- a/computer_use_demo/gradio/agent/llm_utils/oai.py +++ b/computer_use_demo/gradio/agent/llm_utils/oai.py @@ -1,11 +1,9 @@ - import os import logging import base64 import requests def is_image_path(text): - # Checking if the input text ends with typical image file extensions image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif") if text.endswith(image_extensions): return True @@ -28,7 +26,6 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max final_messages = [{"role": "system", "content": system}] - # image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" if type(messages) == list: for item in messages: contents = [] @@ -56,7 +53,6 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max elif isinstance(messages, str): final_messages = [{"role": "user", "content": messages}] - # import pdb; pdb.set_trace() print("[oai] sending messages:", {"role": "user", "content": messages}) @@ -64,12 +60,9 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max "model": llm, "messages": final_messages, "max_tokens": max_tokens, - "temperature": temperature, - # "stop": stop, + "temperature": temperature } - # from IPython.core.debugger import Pdb; Pdb().set_trace() - response = requests.post( "https://api.openai.com/v1/chat/completions", headers=headers, json=payload ) @@ -78,30 +71,6 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max text = response.json()['choices'][0]['message']['content'] token_usage = int(response.json()['usage']['total_tokens']) return text, token_usage - - # return error message if the response is not successful except Exception as e: print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ") - return response.json() - - -if __name__ == "__main__": - - api_key = os.environ.get("OPENAI_API_KEY") - if not api_key: - raise ValueError("OPENAI_API_KEY is not set") - - text, token_usage = run_oai_interleaved( - messages= [{"content": [ - "What is in the screenshot?", - "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"], - "role": "user" - }], - llm="gpt-4o-mini", - system="You are a helpful assistant", - api_key=api_key, - max_tokens=256, - temperature=0) - - print(text, token_usage) - # There is an introduction describing the Calyx... 36986 + return response.json() \ No newline at end of file diff --git a/computer_use_demo/gradio/app.py b/computer_use_demo/gradio/app.py index 5e216c6..69c4efa 100644 --- a/computer_use_demo/gradio/app.py +++ b/computer_use_demo/gradio/app.py @@ -1,6 +1,5 @@ """ -Entrypoint for Gradio, see https://gradio.app/ -python app.py --windows_host_url xxxx:8006/ --omniparser_host_url localhost:8000 +python app.py --windows_host_url localhost:8006/ --omniparser_server_url localhost:8000 """ import os @@ -35,13 +34,9 @@ Type a message and press submit to start OmniParser+X. Press the trash icon in t def parse_arguments(): parser = argparse.ArgumentParser(description="Gradio App") parser.add_argument("--windows_host_url", type=str, default='localhost:8006') - parser.add_argument("--omniparser_host_url", type=str, default="localhost:8000") + parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000") return parser.parse_args() args = parse_arguments() -windows_host_url = args.windows_host_url -omniparser_host_url = args.omniparser_host_url -print(f"Windows host URL: {windows_host_url}") -print(f"OmniParser host URL: {omniparser_host_url}") class Sender(StrEnum): @@ -140,7 +135,6 @@ def chatbot_output_callback(message, chatbot_state, hide_images=False, sender="b is_tool_result = not isinstance(message, str) and ( isinstance(message, ToolResult) or message.__class__.__name__ == "ToolResult" - or message.__class__.__name__ == "CLIResult" ) if not message or ( is_tool_result @@ -214,7 +208,7 @@ def process_input(user_input, state): api_response_callback=partial(_api_response_callback, response_state=state["responses"]), api_key=state["api_key"], only_n_most_recent_images=state["only_n_most_recent_images"], - omniparser_url=omniparser_host_url + omniparser_url=args.omniparser_server_url ): if loop_msg is None: yield state['chatbot_messages'] @@ -289,20 +283,11 @@ with gr.Blocks(theme=gr.themes.Default()) as demo: with gr.Column(scale=1): chatbot = gr.Chatbot(label="Chatbot History", autoscroll=True, height=580) with gr.Column(scale=3): - if not windows_host_url: - iframe = gr.HTML( - f'', - container=False, - elem_classes="no-padding" - ) - else: - # machine_fqdn = socket.getfqdn() - # print('machine_fqdn:', machine_fqdn) - iframe = gr.HTML( - f'', - container=False, - elem_classes="no-padding" - ) + iframe = gr.HTML( + f'', + container=False, + elem_classes="no-padding" + ) def update_model(model_selection, state): state["model"] = model_selection diff --git a/computer_use_demo/gradio/loop.py b/computer_use_demo/gradio/loop.py index f989ef8..f809d3e 100644 --- a/computer_use_demo/gradio/loop.py +++ b/computer_use_demo/gradio/loop.py @@ -96,9 +96,7 @@ def sampling_loop_sync( if model == "claude-3-5-sonnet-20241022": # Anthropic loop while True: parsed_screen = omniparser_client() # parsed_screen: {"som_image_base64": dino_labled_img, "parsed_content_list": parsed_content_list, "screen_info"} - import pdb; pdb.set_trace() screen_info_block = TextBlock(text='Below is the structured accessibility information of the current UI screen, which includes text and icons you can operate on, take these information into account when you are making the prediction for the next action. Note you will still need to take screenshot to get the image: \n' + parsed_screen['screen_info'], type='text') - # # messages[-1]['content'].append(screen_info_block) screen_info_dict = {"role": "user", "content": [screen_info_block]} messages.append(screen_info_dict) tools_use_needed = actor(messages=messages) @@ -120,10 +118,4 @@ def sampling_loop_sync( yield message if not tool_result_content: - return messages - - # import pdb; pdb.set_trace() - # messages.append({"role": "user", - # "content": ["History plan:\n" + str(vlm_response_json['Reasoning'])]}) - - # messages.append({"content": tool_result_content, "role": "user"}) \ No newline at end of file + return messages \ No newline at end of file diff --git a/computer_use_demo/gradio/tools/__init__.py b/computer_use_demo/gradio/tools/__init__.py index bb051ae..2726f25 100644 --- a/computer_use_demo/gradio/tools/__init__.py +++ b/computer_use_demo/gradio/tools/__init__.py @@ -1,10 +1,9 @@ -from .base import CLIResult, ToolResult +from .base import ToolResult from .collection import ToolCollection from .computer import ComputerTool from .screen_capture import get_screenshot __ALL__ = [ - CLIResult, ComputerTool, ToolCollection, ToolResult, diff --git a/computer_use_demo/gradio/tools/base.py b/computer_use_demo/gradio/tools/base.py index d6f1371..8a82c1b 100644 --- a/computer_use_demo/gradio/tools/base.py +++ b/computer_use_demo/gradio/tools/base.py @@ -54,10 +54,6 @@ class ToolResult: return replace(self, **kwargs) -class CLIResult(ToolResult): - """A ToolResult that can be rendered as a CLI output.""" - - class ToolFailure(ToolResult): """A ToolResult that represents a failure."""