diff --git a/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py b/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py index e7b2071..0e49850 100644 --- a/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py +++ b/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py @@ -31,32 +31,19 @@ from io import BytesIO import gradio as gr from typing import Dict - BETA_FLAG = "computer-use-2024-10-22" - class APIProvider(StrEnum): ANTHROPIC = "anthropic" BEDROCK = "bedrock" VERTEX = "vertex" - -PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = { - APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022", - APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0", - APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022", -} - - -# Check OS - SYSTEM_PROMPT = f""" * You are utilizing a Windows system with internet access. * The current date is {datetime.today().strftime('%A, %B %d, %Y')}. """ - class AnthropicActor: def __init__( self, diff --git a/demo/gradio/computer_use_demo/gui_agent/llm_utils/llm_utils.py b/demo/gradio/computer_use_demo/gui_agent/llm_utils/llm_utils.py deleted file mode 100644 index 3c01c56..0000000 --- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/llm_utils.py +++ /dev/null @@ -1,109 +0,0 @@ -import os -import re -import ast -import base64 - - -def is_image_path(text): - # Checking if the input text ends with typical image file extensions - image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif") - if text.endswith(image_extensions): - return True - else: - return False - - -def encode_image(image_path): - """Encode image file to base64.""" - with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode("utf-8") - - -def is_url_or_filepath(input_string): - # Check if input_string is a URL - url_pattern = re.compile( - r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" - ) - if url_pattern.match(input_string): - return "URL" - - # Check if input_string is a file path - file_path = os.path.abspath(input_string) - if os.path.exists(file_path): - return "File path" - - return "Invalid" - - -def extract_data(input_string, data_type): - # Regular expression to extract content starting from '```python' until the end if there are no closing backticks - pattern = f"```{data_type}" + r"(.*?)(```|$)" - # Extract content - # re.DOTALL allows '.' to match newlines as well - matches = re.findall(pattern, input_string, re.DOTALL) - # Return the first match if exists, trimming whitespace and ignoring potential closing backticks - return matches[0][0].strip() if matches else input_string - - -def parse_input(code): - """Use AST to parse the input string and extract the function name, arguments, and keyword arguments.""" - - def get_target_names(target): - """Recursively get all variable names from the assignment target.""" - if isinstance(target, ast.Name): - return [target.id] - elif isinstance(target, ast.Tuple): - names = [] - for elt in target.elts: - names.extend(get_target_names(elt)) - return names - return [] - - def extract_value(node): - """提取 AST 节点的实际值""" - if isinstance(node, ast.Constant): - return node.value - elif isinstance(node, ast.Name): - # TODO: a better way to handle variables - raise ValueError( - f"Arguments should be a Constant, got a variable {node.id} instead." - ) - # 添加其他需要处理的 AST 节点类型 - return None - - try: - tree = ast.parse(code) - for node in ast.walk(tree): - if isinstance(node, ast.Assign): - targets = [] - for t in node.targets: - targets.extend(get_target_names(t)) - if isinstance(node.value, ast.Call): - func_name = node.value.func.id - args = [ast.dump(arg) for arg in node.value.args] - kwargs = { - kw.arg: extract_value(kw.value) for kw in node.value.keywords - } - print(f"Input: {code.strip()}") - print(f"Output Variables: {targets}") - print(f"Function Name: {func_name}") - print(f"Arguments: {args}") - print(f"Keyword Arguments: {kwargs}") - elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call): - targets = [] - func_name = extract_value(node.value.func) - args = [extract_value(arg) for arg in node.value.args] - kwargs = {kw.arg: extract_value(kw.value) for kw in node.value.keywords} - - except SyntaxError: - print(f"Input: {code.strip()}") - print("No match found") - - return targets, func_name, args, kwargs - - -if __name__ == "__main__": - import json - s='{"Reasoning": "The Docker icon has been successfully clicked, and the Docker application should now be opening. No further actions are required.", "Next Action": None}' - json_str = json.loads(s) - print(json_str) \ No newline at end of file diff --git a/demo/gradio/computer_use_demo/gui_agent/llm_utils/qwen.py b/demo/gradio/computer_use_demo/gui_agent/llm_utils/qwen.py deleted file mode 100644 index 0d570cd..0000000 --- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/qwen.py +++ /dev/null @@ -1,107 +0,0 @@ - -import os -import logging -import base64 -import requests - -import dashscope -# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image - -def is_image_path(text): - return False - -def encode_image(image_path): - return "" - - -def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0): - - api_key = api_key or os.environ.get("QWEN_API_KEY") - if not api_key: - raise ValueError("QWEN_API_KEY is not set") - - dashscope.api_key = api_key - - # from IPython.core.debugger import Pdb; Pdb().set_trace() - - final_messages = [{"role": "system", "content": [{"text": system}]}] - # image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - if type(messages) == list: - for item in messages: - contents = [] - if isinstance(item, dict): - for cnt in item["content"]: - if isinstance(cnt, str): - if is_image_path(cnt): - # base64_image = encode_image(cnt) - content = [{"image": cnt}] - # content = {"type": "image_url", "image_url": {"url": image_url}} - else: - content = {"text": cnt} - contents.append(content) - - message = {"role": item["role"], "content": contents} - else: # str - contents.append({"text": item}) - message = {"role": "user", "content": contents} - - final_messages.append(message) - - print("[qwen-vl] sending messages:", final_messages) - - response = dashscope.MultiModalConversation.call( - model='qwen-vl-max-0809', - messages=final_messages - ) - - # from IPython.core.debugger import Pdb; Pdb().set_trace() - - try: - text = response.output.choices[0].message.content[0]['text'] - usage = response.usage - - if "total_tokens" not in usage: - token_usage = int(usage["input_tokens"] + usage["output_tokens"]) - else: - token_usage = int(usage["total_tokens"]) - - return text, token_usage - # return response.json()['choices'][0]['message']['content'] - # return error message if the response is not successful - except Exception as e: - print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ") - return response.json() - - - -if __name__ == "__main__": - api_key = os.environ.get("QWEN_API_KEY") - if not api_key: - raise ValueError("QWEN_API_KEY is not set") - - dashscope.api_key = api_key - - final_messages = [{"role": "user", - "content": [ - {"text": "What is in the screenshot?"}, - {"image": "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"} - ] - } - ] - response = dashscope.MultiModalConversation.call(model='qwen-vl-max-0809', messages=final_messages) - - print(response) - - text = response.output.choices[0].message.content[0]['text'] - usage = response.usage - - if "total_tokens" not in usage: - if "image_tokens" in usage: - token_usage = usage["input_tokens"] + usage["output_tokens"] + usage["image_tokens"] - else: - token_usage = usage["input_tokens"] + usage["output_tokens"] - else: - token_usage = usage["total_tokens"] - - print(text, token_usage) - # The screenshot is from a video game... 1387 \ No newline at end of file diff --git a/demo/gradio/computer_use_demo/gui_agent/llm_utils/run_llm.py b/demo/gradio/computer_use_demo/gui_agent/llm_utils/run_llm.py deleted file mode 100644 index a1de8ba..0000000 --- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/run_llm.py +++ /dev/null @@ -1,44 +0,0 @@ -import base64 -import logging -from .oai import run_oai_interleaved -from .gemini import run_gemini_interleaved - -def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None): - log_prompt(prompt) - - # turn string prompt into list - if isinstance(prompt, str): - prompt = [prompt] - elif isinstance(prompt, list): - pass - else: - raise ValueError(f"Invalid prompt type: {type(prompt)}") - - if llm.startswith("gpt"): # gpt series - out = run_oai_interleaved( - prompt, - llm, - max_tokens, - temperature, - stop - ) - elif llm.startswith("gemini"): # gemini series - out = run_gemini_interleaved( - prompt, - llm, - max_tokens, - temperature, - stop - ) - else: - raise ValueError(f"Invalid llm: {llm}") - logging.info( - f"========Output for {llm}=======\n{out}\n============================") - return out - -def log_prompt(prompt): - prompt_display = [prompt] if isinstance(prompt, str) else prompt - prompt_display = "\n\n".join(prompt_display) - logging.info( - f"========Prompt=======\n{prompt_display}\n============================") - \ No newline at end of file diff --git a/demo/gradio/computer_use_demo/loop.py b/demo/gradio/computer_use_demo/loop.py index 2a7c076..449e37e 100644 --- a/demo/gradio/computer_use_demo/loop.py +++ b/demo/gradio/computer_use_demo/loop.py @@ -1,76 +1,40 @@ """ Agentic sampling loop that calls the Anthropic API and local implenmentation of anthropic-defined computer use tools. """ -import time -import json -import asyncio -import platform from collections.abc import Callable -from datetime import datetime from enum import StrEnum -from typing import Any, cast, Dict -from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse +from anthropic import APIResponse from anthropic.types import ( - ToolResultBlockParam, TextBlock, ) from anthropic.types.beta import ( BetaContentBlock, - BetaContentBlockParam, - BetaImageBlockParam, BetaMessage, - BetaMessageParam, - BetaTextBlockParam, - BetaToolResultBlockParam, + BetaMessageParam ) from computer_use_demo.tools import ToolResult -import torch - from computer_use_demo.gui_agent.anthropic_agent import AnthropicActor from computer_use_demo.executor.anthropic_executor import AnthropicExecutor from computer_use_demo.omniparser_agent.vlm_agent import OmniParser, VLMAgent -from computer_use_demo.colorful_text import colorful_text_vlm -from computer_use_demo.tools.screen_capture import get_screenshot -from computer_use_demo.gui_agent.llm_utils.oai import encode_image - BETA_FLAG = "computer-use-2024-10-22" - class APIProvider(StrEnum): ANTHROPIC = "anthropic" BEDROCK = "bedrock" VERTEX = "vertex" OPENAI = "openai" - QWEN = "qwen" PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = { APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022", APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0", APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022", - # APIProvider.OPENAI: "gpt-4o", - # APIProvider.QWEN: "qwen2vl", + APIProvider.OPENAI: "gpt-4o", } - -# This system prompt is optimized for the Docker environment in this repository and -# specific tool combinations enabled. -# We encourage modifying this system prompt to ensure the model has context for the -# environment it is running in, and to provide any additional information that may be -# helpful for the task at hand. -SYSTEM_PROMPT = f""" -* You are utilizing a Windows system with internet access. -* The current date is {datetime.today().strftime('%A, %B %d, %Y')}. - -""" - -import base64 -from PIL import Image -from io import BytesIO - def sampling_loop_sync( *, model: str, diff --git a/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py b/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py index 26d5190..421e196 100644 --- a/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py +++ b/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py @@ -18,13 +18,21 @@ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, B from computer_use_demo.tools.screen_capture import get_screenshot from computer_use_demo.gui_agent.llm_utils.oai import run_oai_interleaved, encode_image -from computer_use_demo.gui_agent.llm_utils.qwen import run_qwen -from computer_use_demo.gui_agent.llm_utils.llm_utils import extract_data from computer_use_demo.colorful_text import colorful_text_vlm import time +import re OUTPUT_DIR = "./tmp/outputs" +def extract_data(input_string, data_type): + # Regular expression to extract content starting from '```python' until the end if there are no closing backticks + pattern = f"```{data_type}" + r"(.*?)(```|$)" + # Extract content + # re.DOTALL allows '.' to match newlines as well + matches = re.findall(pattern, input_string, re.DOTALL) + # Return the first match if exists, trimming whitespace and ignoring potential closing backticks + return matches[0][0].strip() if matches else input_string + class OmniParser: def __init__(self, url: str, @@ -165,19 +173,6 @@ class VLMAgent: print(f"oai token usage: {token_usage}") self.total_token_usage += token_usage self.total_cost += (token_usage * 0.15 / 1000000) # https://openai.com/api/pricing/ - - elif "qwen" in self.model: - vlm_response, token_usage = run_qwen( - messages=planner_messages, - system=system, - llm=self.model, - api_key=self.api_key, - max_tokens=self.max_tokens, - temperature=0, - ) - print(f"qwen token usage: {token_usage}") - self.total_token_usage += token_usage - self.total_cost += (token_usage * 0.02 / 7.25 / 1000) # 1USD=7.25CNY, https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api elif "phi" in self.model: pass # TODO else: