diff --git a/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py b/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py
index e7b2071..0e49850 100644
--- a/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py
+++ b/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py
@@ -31,32 +31,19 @@ from io import BytesIO
import gradio as gr
from typing import Dict
-
BETA_FLAG = "computer-use-2024-10-22"
-
class APIProvider(StrEnum):
ANTHROPIC = "anthropic"
BEDROCK = "bedrock"
VERTEX = "vertex"
-
-PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
- APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
- APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
- APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
-}
-
-
-# Check OS
-
SYSTEM_PROMPT = f"""
* You are utilizing a Windows system with internet access.
* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
"""
-
class AnthropicActor:
def __init__(
self,
diff --git a/demo/gradio/computer_use_demo/gui_agent/llm_utils/llm_utils.py b/demo/gradio/computer_use_demo/gui_agent/llm_utils/llm_utils.py
deleted file mode 100644
index 3c01c56..0000000
--- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/llm_utils.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import os
-import re
-import ast
-import base64
-
-
-def is_image_path(text):
- # Checking if the input text ends with typical image file extensions
- image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
- if text.endswith(image_extensions):
- return True
- else:
- return False
-
-
-def encode_image(image_path):
- """Encode image file to base64."""
- with open(image_path, "rb") as image_file:
- return base64.b64encode(image_file.read()).decode("utf-8")
-
-
-def is_url_or_filepath(input_string):
- # Check if input_string is a URL
- url_pattern = re.compile(
- r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
- )
- if url_pattern.match(input_string):
- return "URL"
-
- # Check if input_string is a file path
- file_path = os.path.abspath(input_string)
- if os.path.exists(file_path):
- return "File path"
-
- return "Invalid"
-
-
-def extract_data(input_string, data_type):
- # Regular expression to extract content starting from '```python' until the end if there are no closing backticks
- pattern = f"```{data_type}" + r"(.*?)(```|$)"
- # Extract content
- # re.DOTALL allows '.' to match newlines as well
- matches = re.findall(pattern, input_string, re.DOTALL)
- # Return the first match if exists, trimming whitespace and ignoring potential closing backticks
- return matches[0][0].strip() if matches else input_string
-
-
-def parse_input(code):
- """Use AST to parse the input string and extract the function name, arguments, and keyword arguments."""
-
- def get_target_names(target):
- """Recursively get all variable names from the assignment target."""
- if isinstance(target, ast.Name):
- return [target.id]
- elif isinstance(target, ast.Tuple):
- names = []
- for elt in target.elts:
- names.extend(get_target_names(elt))
- return names
- return []
-
- def extract_value(node):
- """提取 AST 节点的实际值"""
- if isinstance(node, ast.Constant):
- return node.value
- elif isinstance(node, ast.Name):
- # TODO: a better way to handle variables
- raise ValueError(
- f"Arguments should be a Constant, got a variable {node.id} instead."
- )
- # 添加其他需要处理的 AST 节点类型
- return None
-
- try:
- tree = ast.parse(code)
- for node in ast.walk(tree):
- if isinstance(node, ast.Assign):
- targets = []
- for t in node.targets:
- targets.extend(get_target_names(t))
- if isinstance(node.value, ast.Call):
- func_name = node.value.func.id
- args = [ast.dump(arg) for arg in node.value.args]
- kwargs = {
- kw.arg: extract_value(kw.value) for kw in node.value.keywords
- }
- print(f"Input: {code.strip()}")
- print(f"Output Variables: {targets}")
- print(f"Function Name: {func_name}")
- print(f"Arguments: {args}")
- print(f"Keyword Arguments: {kwargs}")
- elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
- targets = []
- func_name = extract_value(node.value.func)
- args = [extract_value(arg) for arg in node.value.args]
- kwargs = {kw.arg: extract_value(kw.value) for kw in node.value.keywords}
-
- except SyntaxError:
- print(f"Input: {code.strip()}")
- print("No match found")
-
- return targets, func_name, args, kwargs
-
-
-if __name__ == "__main__":
- import json
- s='{"Reasoning": "The Docker icon has been successfully clicked, and the Docker application should now be opening. No further actions are required.", "Next Action": None}'
- json_str = json.loads(s)
- print(json_str)
\ No newline at end of file
diff --git a/demo/gradio/computer_use_demo/gui_agent/llm_utils/qwen.py b/demo/gradio/computer_use_demo/gui_agent/llm_utils/qwen.py
deleted file mode 100644
index 0d570cd..0000000
--- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/qwen.py
+++ /dev/null
@@ -1,107 +0,0 @@
-
-import os
-import logging
-import base64
-import requests
-
-import dashscope
-# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image
-
-def is_image_path(text):
- return False
-
-def encode_image(image_path):
- return ""
-
-
-def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
-
- api_key = api_key or os.environ.get("QWEN_API_KEY")
- if not api_key:
- raise ValueError("QWEN_API_KEY is not set")
-
- dashscope.api_key = api_key
-
- # from IPython.core.debugger import Pdb; Pdb().set_trace()
-
- final_messages = [{"role": "system", "content": [{"text": system}]}]
- # image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
- if type(messages) == list:
- for item in messages:
- contents = []
- if isinstance(item, dict):
- for cnt in item["content"]:
- if isinstance(cnt, str):
- if is_image_path(cnt):
- # base64_image = encode_image(cnt)
- content = [{"image": cnt}]
- # content = {"type": "image_url", "image_url": {"url": image_url}}
- else:
- content = {"text": cnt}
- contents.append(content)
-
- message = {"role": item["role"], "content": contents}
- else: # str
- contents.append({"text": item})
- message = {"role": "user", "content": contents}
-
- final_messages.append(message)
-
- print("[qwen-vl] sending messages:", final_messages)
-
- response = dashscope.MultiModalConversation.call(
- model='qwen-vl-max-0809',
- messages=final_messages
- )
-
- # from IPython.core.debugger import Pdb; Pdb().set_trace()
-
- try:
- text = response.output.choices[0].message.content[0]['text']
- usage = response.usage
-
- if "total_tokens" not in usage:
- token_usage = int(usage["input_tokens"] + usage["output_tokens"])
- else:
- token_usage = int(usage["total_tokens"])
-
- return text, token_usage
- # return response.json()['choices'][0]['message']['content']
- # return error message if the response is not successful
- except Exception as e:
- print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
- return response.json()
-
-
-
-if __name__ == "__main__":
- api_key = os.environ.get("QWEN_API_KEY")
- if not api_key:
- raise ValueError("QWEN_API_KEY is not set")
-
- dashscope.api_key = api_key
-
- final_messages = [{"role": "user",
- "content": [
- {"text": "What is in the screenshot?"},
- {"image": "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"}
- ]
- }
- ]
- response = dashscope.MultiModalConversation.call(model='qwen-vl-max-0809', messages=final_messages)
-
- print(response)
-
- text = response.output.choices[0].message.content[0]['text']
- usage = response.usage
-
- if "total_tokens" not in usage:
- if "image_tokens" in usage:
- token_usage = usage["input_tokens"] + usage["output_tokens"] + usage["image_tokens"]
- else:
- token_usage = usage["input_tokens"] + usage["output_tokens"]
- else:
- token_usage = usage["total_tokens"]
-
- print(text, token_usage)
- # The screenshot is from a video game... 1387
\ No newline at end of file
diff --git a/demo/gradio/computer_use_demo/gui_agent/llm_utils/run_llm.py b/demo/gradio/computer_use_demo/gui_agent/llm_utils/run_llm.py
deleted file mode 100644
index a1de8ba..0000000
--- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/run_llm.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import base64
-import logging
-from .oai import run_oai_interleaved
-from .gemini import run_gemini_interleaved
-
-def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None):
- log_prompt(prompt)
-
- # turn string prompt into list
- if isinstance(prompt, str):
- prompt = [prompt]
- elif isinstance(prompt, list):
- pass
- else:
- raise ValueError(f"Invalid prompt type: {type(prompt)}")
-
- if llm.startswith("gpt"): # gpt series
- out = run_oai_interleaved(
- prompt,
- llm,
- max_tokens,
- temperature,
- stop
- )
- elif llm.startswith("gemini"): # gemini series
- out = run_gemini_interleaved(
- prompt,
- llm,
- max_tokens,
- temperature,
- stop
- )
- else:
- raise ValueError(f"Invalid llm: {llm}")
- logging.info(
- f"========Output for {llm}=======\n{out}\n============================")
- return out
-
-def log_prompt(prompt):
- prompt_display = [prompt] if isinstance(prompt, str) else prompt
- prompt_display = "\n\n".join(prompt_display)
- logging.info(
- f"========Prompt=======\n{prompt_display}\n============================")
-
\ No newline at end of file
diff --git a/demo/gradio/computer_use_demo/loop.py b/demo/gradio/computer_use_demo/loop.py
index 2a7c076..449e37e 100644
--- a/demo/gradio/computer_use_demo/loop.py
+++ b/demo/gradio/computer_use_demo/loop.py
@@ -1,76 +1,40 @@
"""
Agentic sampling loop that calls the Anthropic API and local implenmentation of anthropic-defined computer use tools.
"""
-import time
-import json
-import asyncio
-import platform
from collections.abc import Callable
-from datetime import datetime
from enum import StrEnum
-from typing import Any, cast, Dict
-from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
+from anthropic import APIResponse
from anthropic.types import (
- ToolResultBlockParam,
TextBlock,
)
from anthropic.types.beta import (
BetaContentBlock,
- BetaContentBlockParam,
- BetaImageBlockParam,
BetaMessage,
- BetaMessageParam,
- BetaTextBlockParam,
- BetaToolResultBlockParam,
+ BetaMessageParam
)
from computer_use_demo.tools import ToolResult
-import torch
-
from computer_use_demo.gui_agent.anthropic_agent import AnthropicActor
from computer_use_demo.executor.anthropic_executor import AnthropicExecutor
from computer_use_demo.omniparser_agent.vlm_agent import OmniParser, VLMAgent
-from computer_use_demo.colorful_text import colorful_text_vlm
-from computer_use_demo.tools.screen_capture import get_screenshot
-from computer_use_demo.gui_agent.llm_utils.oai import encode_image
-
BETA_FLAG = "computer-use-2024-10-22"
-
class APIProvider(StrEnum):
ANTHROPIC = "anthropic"
BEDROCK = "bedrock"
VERTEX = "vertex"
OPENAI = "openai"
- QWEN = "qwen"
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
- # APIProvider.OPENAI: "gpt-4o",
- # APIProvider.QWEN: "qwen2vl",
+ APIProvider.OPENAI: "gpt-4o",
}
-
-# This system prompt is optimized for the Docker environment in this repository and
-# specific tool combinations enabled.
-# We encourage modifying this system prompt to ensure the model has context for the
-# environment it is running in, and to provide any additional information that may be
-# helpful for the task at hand.
-SYSTEM_PROMPT = f"""
-* You are utilizing a Windows system with internet access.
-* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
-
-"""
-
-import base64
-from PIL import Image
-from io import BytesIO
-
def sampling_loop_sync(
*,
model: str,
diff --git a/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py b/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py
index 26d5190..421e196 100644
--- a/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py
+++ b/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py
@@ -18,13 +18,21 @@ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, B
from computer_use_demo.tools.screen_capture import get_screenshot
from computer_use_demo.gui_agent.llm_utils.oai import run_oai_interleaved, encode_image
-from computer_use_demo.gui_agent.llm_utils.qwen import run_qwen
-from computer_use_demo.gui_agent.llm_utils.llm_utils import extract_data
from computer_use_demo.colorful_text import colorful_text_vlm
import time
+import re
OUTPUT_DIR = "./tmp/outputs"
+def extract_data(input_string, data_type):
+    # Regular expression to extract content starting from '```{data_type}' until the end if there are no closing backticks
+ pattern = f"```{data_type}" + r"(.*?)(```|$)"
+ # Extract content
+ # re.DOTALL allows '.' to match newlines as well
+ matches = re.findall(pattern, input_string, re.DOTALL)
+ # Return the first match if exists, trimming whitespace and ignoring potential closing backticks
+ return matches[0][0].strip() if matches else input_string
+
class OmniParser:
def __init__(self,
url: str,
@@ -165,19 +173,6 @@ class VLMAgent:
print(f"oai token usage: {token_usage}")
self.total_token_usage += token_usage
self.total_cost += (token_usage * 0.15 / 1000000) # https://openai.com/api/pricing/
-
- elif "qwen" in self.model:
- vlm_response, token_usage = run_qwen(
- messages=planner_messages,
- system=system,
- llm=self.model,
- api_key=self.api_key,
- max_tokens=self.max_tokens,
- temperature=0,
- )
- print(f"qwen token usage: {token_usage}")
- self.total_token_usage += token_usage
- self.total_cost += (token_usage * 0.02 / 7.25 / 1000) # 1USD=7.25CNY, https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api
elif "phi" in self.model:
pass # TODO
else: