unused file cleanup
This commit is contained in:
@@ -31,32 +31,19 @@ from io import BytesIO
|
|||||||
import gradio as gr
|
import gradio as gr
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
|
|
||||||
BETA_FLAG = "computer-use-2024-10-22"
|
BETA_FLAG = "computer-use-2024-10-22"
|
||||||
|
|
||||||
|
|
||||||
class APIProvider(StrEnum):
|
class APIProvider(StrEnum):
|
||||||
ANTHROPIC = "anthropic"
|
ANTHROPIC = "anthropic"
|
||||||
BEDROCK = "bedrock"
|
BEDROCK = "bedrock"
|
||||||
VERTEX = "vertex"
|
VERTEX = "vertex"
|
||||||
|
|
||||||
|
|
||||||
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
|
|
||||||
APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
|
|
||||||
APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
|
|
||||||
APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
# Check OS
|
|
||||||
|
|
||||||
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
|
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
|
||||||
* You are utilizing a Windows system with internet access.
|
* You are utilizing a Windows system with internet access.
|
||||||
* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
|
* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
|
||||||
</SYSTEM_CAPABILITY>
|
</SYSTEM_CAPABILITY>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class AnthropicActor:
|
class AnthropicActor:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -1,109 +0,0 @@
|
|||||||
import os
|
|
||||||
import re
|
|
||||||
import ast
|
|
||||||
import base64
|
|
||||||
|
|
||||||
|
|
||||||
def is_image_path(text):
    """Return True when *text* ends with a common image file extension.

    The comparison is case-insensitive, so ``"shot.PNG"`` is accepted just
    like ``"shot.png"``. Only the suffix is inspected; the path is not
    checked for existence.

    Args:
        text: Candidate path or file name.

    Returns:
        bool: True if the suffix is one of the known image extensions.
    """
    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
    # str.endswith accepts a tuple, so one call covers every extension.
    return text.lower().endswith(image_extensions)
|
|
||||||
|
|
||||||
|
|
||||||
def encode_image(image_path):
    """Read a file and return its contents as a base64 string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        str: Base64 encoding of the file's bytes, decoded as UTF-8 text.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
|
|
||||||
|
|
||||||
|
|
||||||
def is_url_or_filepath(input_string):
    """Classify a string as an http(s) URL, an existing file path, or neither.

    Args:
        input_string: Text to classify.

    Returns:
        str: ``"URL"`` if the text starts with an http/https URL,
        ``"File path"`` if it names something that exists on disk,
        otherwise ``"Invalid"``. Empty or ``None`` input is ``"Invalid"``.
    """
    # Guard first: os.path.abspath("") resolves to the current directory,
    # which exists, so an empty string would otherwise be a "File path".
    if not input_string:
        return "Invalid"

    # Check if input_string is a URL
    url_pattern = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    if url_pattern.match(input_string):
        return "URL"

    # Check if input_string is a file path
    if os.path.exists(os.path.abspath(input_string)):
        return "File path"

    return "Invalid"
|
|
||||||
|
|
||||||
|
|
||||||
def extract_data(input_string, data_type):
    """Extract the body of the first fenced block tagged with *data_type*.

    The fence opens with three backticks followed by ``data_type`` and runs
    to the next three backticks, or to the end of the text when the closing
    fence is missing.

    Args:
        input_string: Text that may contain a fenced block.
        data_type: Fence tag to look for, e.g. ``"python"`` or ``"json"``.
            It is matched literally, so tags such as ``"c++"`` are safe.

    Returns:
        str: The stripped block body, or ``input_string`` unchanged when no
        fence with that tag is present.
    """
    # re.escape keeps regex metacharacters in the tag from changing the pattern.
    pattern = f"```{re.escape(data_type)}" + r"(.*?)(```|$)"
    # re.DOTALL allows '.' to match newlines as well
    matches = re.findall(pattern, input_string, re.DOTALL)
    # Each match is (body, terminator); only the body of the first is returned.
    return matches[0][0].strip() if matches else input_string
|
|
||||||
|
|
||||||
|
|
||||||
def parse_input(code):
    """Use AST to parse a call statement into its name and arguments.

    Two statement shapes are recognized: an assignment whose right-hand side
    is a call (``a, b = f(x=1)``) and a bare call expression (``f(x=1)``).
    When several statements match, the last one visited by ``ast.walk`` wins.

    Args:
        code: Python source text to inspect.

    Returns:
        tuple: ``(targets, func_name, args, kwargs)`` where

        * ``targets`` is the list of assigned names (empty for a bare call);
        * ``func_name`` is the called name, dotted for attribute access
          (``"pkg.func"``), or ``None`` when it cannot be determined;
        * ``args`` holds positional arguments: ``ast.dump`` strings for the
          assignment form, constant values for the bare-call form;
        * ``kwargs`` maps keyword names to their constant values.

        When ``code`` is not valid Python or contains no call,
        ``([], None, [], {})`` is returned.

    Raises:
        ValueError: If an argument that must be a constant is a variable.
    """

    def get_target_names(target):
        """Recursively get all variable names from the assignment target."""
        if isinstance(target, ast.Name):
            return [target.id]
        if isinstance(target, ast.Tuple):
            names = []
            for elt in target.elts:
                names.extend(get_target_names(elt))
            return names
        return []

    def extract_value(node):
        """Return the literal value carried by an AST node."""
        if isinstance(node, ast.Constant):
            return node.value
        if isinstance(node, ast.Name):
            # TODO: a better way to handle variables
            raise ValueError(
                f"Arguments should be a Constant, got a variable {node.id} instead."
            )
        # Other AST node types that need handling can be added here.
        return None

    def get_func_name(func):
        """Return the called function's name, dotted for attribute access."""
        if isinstance(func, ast.Name):
            return func.id
        if isinstance(func, ast.Attribute):
            return ast.unparse(func)
        return None

    # Defaults so every path, including parse failure, returns a full tuple.
    targets, func_name, args, kwargs = [], None, [], {}

    try:
        tree = ast.parse(code)
    except SyntaxError:
        print(f"Input: {code.strip()}")
        print("No match found")
        return targets, func_name, args, kwargs

    for node in ast.walk(tree):
        if isinstance(node, ast.Assign) and isinstance(node.value, ast.Call):
            call = node.value
            targets = []
            for t in node.targets:
                targets.extend(get_target_names(t))
            func_name = get_func_name(call.func)
            # Positional arguments are kept as AST dumps in the assignment form.
            args = [ast.dump(arg) for arg in call.args]
            kwargs = {kw.arg: extract_value(kw.value) for kw in call.keywords}
            print(f"Input: {code.strip()}")
            print(f"Output Variables: {targets}")
            print(f"Function Name: {func_name}")
            print(f"Arguments: {args}")
            print(f"Keyword Arguments: {kwargs}")
        elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
            call = node.value
            targets = []
            func_name = get_func_name(call.func)
            args = [extract_value(arg) for arg in call.args]
            kwargs = {kw.arg: extract_value(kw.value) for kw in call.keywords}

    return targets, func_name, args, kwargs
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Manual check that a model reply of this shape parses as JSON.
    import json

    # JSON has no `None` literal; the missing action must be spelled `null`
    # or json.loads raises JSONDecodeError.
    s = '{"Reasoning": "The Docker icon has been successfully clicked, and the Docker application should now be opening. No further actions are required.", "Next Action": null}'
    parsed = json.loads(s)
    print(parsed)
|
|
||||||
@@ -1,107 +0,0 @@
|
|||||||
|
|
||||||
import os
|
|
||||||
import logging
|
|
||||||
import base64
|
|
||||||
import requests
|
|
||||||
|
|
||||||
import dashscope
|
|
||||||
# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image
|
|
||||||
|
|
||||||
def is_image_path(text):
    """Placeholder image-path check that always reports "not an image".

    Stands in for the helper from ``llm_utils`` (see the commented-out
    import above); every input is treated as plain text.

    Args:
        text: Candidate path (ignored).

    Returns:
        bool: Always False.
    """
    return False
|
|
||||||
|
|
||||||
def encode_image(image_path):
    """Placeholder encoder that returns an empty string for any path.

    Args:
        image_path: Path of the image (ignored; nothing is read).

    Returns:
        str: Always ``""``.
    """
    return ""
|
|
||||||
|
|
||||||
|
|
||||||
def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
    """Send a conversation to DashScope's multimodal Qwen endpoint.

    Args:
        messages: Conversation items. Each item is either a plain string
            (sent as a user text message) or a dict with ``"role"`` and
            ``"content"`` keys, where ``"content"`` lists text strings
            and/or image paths.
        system: System prompt placed first in the request.
        llm: Accepted for signature compatibility; the request currently
            always uses the ``qwen-vl-max-0809`` model.
        api_key: DashScope key; falls back to the ``QWEN_API_KEY``
            environment variable when empty.
        max_tokens: Accepted for signature compatibility; not forwarded.
        temperature: Accepted for signature compatibility; not forwarded.

    Returns:
        tuple: ``(text, token_usage)``. If the reply cannot be read, the
        problem is printed and ``(str(response), 0)`` is returned so that
        callers unpacking two values keep working.

    Raises:
        ValueError: If no API key is supplied or found in the environment.
    """
    api_key = api_key or os.environ.get("QWEN_API_KEY")
    if not api_key:
        raise ValueError("QWEN_API_KEY is not set")

    dashscope.api_key = api_key

    final_messages = _build_qwen_messages(messages, system)
    print("[qwen-vl] sending messages:", final_messages)

    response = dashscope.MultiModalConversation.call(
        model='qwen-vl-max-0809',
        messages=final_messages,
    )

    try:
        text = response.output.choices[0].message.content[0]['text']
        token_usage = _count_qwen_tokens(response.usage)
    except Exception as e:
        # Best effort: report the failure instead of crashing the caller.
        print(f"Error in run_qwen: {e}. This may be due to an invalid QWEN_API_KEY. Please check the response: {response} ")
        return str(response), 0

    return text, token_usage


def _build_qwen_messages(messages, system):
    """Convert caller messages into the list-of-parts format sent to DashScope."""
    final_messages = [{"role": "system", "content": [{"text": system}]}]
    if not isinstance(messages, list):
        return final_messages

    for item in messages:
        if not isinstance(item, dict):
            # A bare string is sent as a user text message.
            final_messages.append({"role": "user", "content": [{"text": item}]})
            continue

        raw_content = item["content"]
        if isinstance(raw_content, str):
            # Avoid iterating a lone string character by character.
            raw_content = [raw_content]

        contents = []
        for cnt in raw_content:
            if not isinstance(cnt, str):
                # Assumed to already be a DashScope content part - TODO confirm.
                contents.append(cnt)
            elif is_image_path(cnt):
                contents.append({"image": cnt})
            else:
                contents.append({"text": cnt})
        final_messages.append({"role": item["role"], "content": contents})

    return final_messages


def _count_qwen_tokens(usage):
    """Return the total token count reported in a DashScope usage mapping."""
    if "total_tokens" in usage:
        return int(usage["total_tokens"])
    total = usage["input_tokens"] + usage["output_tokens"]
    # Image tokens are reported separately when the request contained images.
    if "image_tokens" in usage:
        total += usage["image_tokens"]
    return int(total)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Manual smoke test: ask the model to describe a saved screenshot.
    api_key = os.environ.get("QWEN_API_KEY")
    if not api_key:
        raise ValueError("QWEN_API_KEY is not set")

    dashscope.api_key = api_key

    final_messages = [
        {
            "role": "user",
            "content": [
                {"text": "What is in the screenshot?"},
                {"image": "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"},
            ],
        }
    ]
    response = dashscope.MultiModalConversation.call(model='qwen-vl-max-0809', messages=final_messages)

    print(response)

    text = response.output.choices[0].message.content[0]['text']
    usage = response.usage

    # Prefer the reported total; otherwise add up the individual counters.
    if "total_tokens" in usage:
        token_usage = usage["total_tokens"]
    else:
        token_usage = usage["input_tokens"] + usage["output_tokens"]
        if "image_tokens" in usage:
            token_usage += usage["image_tokens"]

    print(text, token_usage)
    # Example output: The screenshot is from a video game... 1387
|
|
||||||
@@ -1,44 +0,0 @@
|
|||||||
import base64
|
|
||||||
import logging
|
|
||||||
from .oai import run_oai_interleaved
|
|
||||||
from .gemini import run_gemini_interleaved
|
|
||||||
|
|
||||||
def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None):
    """Dispatch a prompt to the backend selected by the model name prefix.

    Args:
        prompt: A single string or a list of prompt parts.
        llm: Model name; names starting with ``"gpt"`` go to
            ``run_oai_interleaved`` and names starting with ``"gemini"``
            go to ``run_gemini_interleaved``.
        max_tokens: Forwarded to the backend.
        temperature: Forwarded to the backend.
        stop: Forwarded to the backend.

    Returns:
        Whatever the selected backend returns.

    Raises:
        ValueError: If ``prompt`` is neither a string nor a list, or if
            ``llm`` matches no known backend.
    """
    # Validate and normalize before logging so a bad prompt type surfaces as
    # the intended ValueError rather than a TypeError from the log helper.
    if isinstance(prompt, str):
        prompt = [prompt]
    elif not isinstance(prompt, list):
        raise ValueError(f"Invalid prompt type: {type(prompt)}")

    log_prompt(prompt)

    if llm.startswith("gpt"):  # gpt series
        backend = run_oai_interleaved
    elif llm.startswith("gemini"):  # gemini series
        backend = run_gemini_interleaved
    else:
        raise ValueError(f"Invalid llm: {llm}")

    out = backend(prompt, llm, max_tokens, temperature, stop)
    logging.info(
        f"========Output for {llm}=======\n{out}\n============================")
    return out
|
|
||||||
|
|
||||||
def log_prompt(prompt):
    """Log a prompt at INFO level, one blank line between its parts.

    Args:
        prompt: A single string or an iterable of prompt parts. Parts that
            are not strings are rendered with ``str()`` so that mixed
            content does not make logging fail.
    """
    parts = [prompt] if isinstance(prompt, str) else prompt
    prompt_display = "\n\n".join(str(part) for part in parts)
    logging.info(
        f"========Prompt=======\n{prompt_display}\n============================")
|
|
||||||
|
|
||||||
@@ -1,76 +1,40 @@
|
|||||||
"""
|
"""
|
||||||
Agentic sampling loop that calls the Anthropic API and local implementation of anthropic-defined computer use tools.
|
Agentic sampling loop that calls the Anthropic API and local implementation of anthropic-defined computer use tools.
|
||||||
"""
|
"""
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import asyncio
|
|
||||||
import platform
|
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
from datetime import datetime
|
|
||||||
from enum import StrEnum
|
from enum import StrEnum
|
||||||
from typing import Any, cast, Dict
|
|
||||||
|
|
||||||
from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
|
from anthropic import APIResponse
|
||||||
from anthropic.types import (
|
from anthropic.types import (
|
||||||
ToolResultBlockParam,
|
|
||||||
TextBlock,
|
TextBlock,
|
||||||
)
|
)
|
||||||
from anthropic.types.beta import (
|
from anthropic.types.beta import (
|
||||||
BetaContentBlock,
|
BetaContentBlock,
|
||||||
BetaContentBlockParam,
|
|
||||||
BetaImageBlockParam,
|
|
||||||
BetaMessage,
|
BetaMessage,
|
||||||
BetaMessageParam,
|
BetaMessageParam
|
||||||
BetaTextBlockParam,
|
|
||||||
BetaToolResultBlockParam,
|
|
||||||
)
|
)
|
||||||
from computer_use_demo.tools import ToolResult
|
from computer_use_demo.tools import ToolResult
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from computer_use_demo.gui_agent.anthropic_agent import AnthropicActor
|
from computer_use_demo.gui_agent.anthropic_agent import AnthropicActor
|
||||||
from computer_use_demo.executor.anthropic_executor import AnthropicExecutor
|
from computer_use_demo.executor.anthropic_executor import AnthropicExecutor
|
||||||
from computer_use_demo.omniparser_agent.vlm_agent import OmniParser, VLMAgent
|
from computer_use_demo.omniparser_agent.vlm_agent import OmniParser, VLMAgent
|
||||||
from computer_use_demo.colorful_text import colorful_text_vlm
|
|
||||||
from computer_use_demo.tools.screen_capture import get_screenshot
|
|
||||||
from computer_use_demo.gui_agent.llm_utils.oai import encode_image
|
|
||||||
|
|
||||||
|
|
||||||
BETA_FLAG = "computer-use-2024-10-22"
|
BETA_FLAG = "computer-use-2024-10-22"
|
||||||
|
|
||||||
|
|
||||||
class APIProvider(StrEnum):
|
class APIProvider(StrEnum):
|
||||||
ANTHROPIC = "anthropic"
|
ANTHROPIC = "anthropic"
|
||||||
BEDROCK = "bedrock"
|
BEDROCK = "bedrock"
|
||||||
VERTEX = "vertex"
|
VERTEX = "vertex"
|
||||||
OPENAI = "openai"
|
OPENAI = "openai"
|
||||||
QWEN = "qwen"
|
|
||||||
|
|
||||||
|
|
||||||
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
|
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
|
||||||
APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
|
APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
|
||||||
APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
|
APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
|
||||||
APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
|
APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
|
||||||
# APIProvider.OPENAI: "gpt-4o",
|
APIProvider.OPENAI: "gpt-4o",
|
||||||
# APIProvider.QWEN: "qwen2vl",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# This system prompt is optimized for the Docker environment in this repository and
|
|
||||||
# specific tool combinations enabled.
|
|
||||||
# We encourage modifying this system prompt to ensure the model has context for the
|
|
||||||
# environment it is running in, and to provide any additional information that may be
|
|
||||||
# helpful for the task at hand.
|
|
||||||
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
|
|
||||||
* You are utilizing a Windows system with internet access.
|
|
||||||
* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
|
|
||||||
</SYSTEM_CAPABILITY>
|
|
||||||
"""
|
|
||||||
|
|
||||||
import base64
|
|
||||||
from PIL import Image
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
def sampling_loop_sync(
|
def sampling_loop_sync(
|
||||||
*,
|
*,
|
||||||
model: str,
|
model: str,
|
||||||
|
|||||||
@@ -18,13 +18,21 @@ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, B
|
|||||||
|
|
||||||
from computer_use_demo.tools.screen_capture import get_screenshot
|
from computer_use_demo.tools.screen_capture import get_screenshot
|
||||||
from computer_use_demo.gui_agent.llm_utils.oai import run_oai_interleaved, encode_image
|
from computer_use_demo.gui_agent.llm_utils.oai import run_oai_interleaved, encode_image
|
||||||
from computer_use_demo.gui_agent.llm_utils.qwen import run_qwen
|
|
||||||
from computer_use_demo.gui_agent.llm_utils.llm_utils import extract_data
|
|
||||||
from computer_use_demo.colorful_text import colorful_text_vlm
|
from computer_use_demo.colorful_text import colorful_text_vlm
|
||||||
import time
|
import time
|
||||||
|
import re
|
||||||
|
|
||||||
OUTPUT_DIR = "./tmp/outputs"
|
OUTPUT_DIR = "./tmp/outputs"
|
||||||
|
|
||||||
|
def extract_data(input_string, data_type):
    """Return the contents of the first fenced block labelled *data_type*.

    A block starts at three backticks immediately followed by ``data_type``
    and ends at the next three backticks; if the closing fence is missing,
    the block extends to the end of the text.

    Args:
        input_string: Text to search, typically a model reply.
        data_type: Fence label such as ``"json"`` or ``"python"``. The
            label is matched literally, so regex metacharacters are safe.

    Returns:
        str: The block contents with surrounding whitespace removed, or
        ``input_string`` itself when no such block exists.
    """
    # Escape the label so characters like '+' or '.' are not read as regex.
    fence = f"```{re.escape(data_type)}"
    pattern = fence + r"(.*?)(```|$)"
    # re.DOTALL allows '.' to match newlines as well
    found = re.findall(pattern, input_string, re.DOTALL)
    if not found:
        return input_string
    # found[0] is (contents, terminator); the terminator is discarded.
    return found[0][0].strip()
|
||||||
|
|
||||||
class OmniParser:
|
class OmniParser:
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
url: str,
|
url: str,
|
||||||
@@ -165,19 +173,6 @@ class VLMAgent:
|
|||||||
print(f"oai token usage: {token_usage}")
|
print(f"oai token usage: {token_usage}")
|
||||||
self.total_token_usage += token_usage
|
self.total_token_usage += token_usage
|
||||||
self.total_cost += (token_usage * 0.15 / 1000000) # https://openai.com/api/pricing/
|
self.total_cost += (token_usage * 0.15 / 1000000) # https://openai.com/api/pricing/
|
||||||
|
|
||||||
elif "qwen" in self.model:
|
|
||||||
vlm_response, token_usage = run_qwen(
|
|
||||||
messages=planner_messages,
|
|
||||||
system=system,
|
|
||||||
llm=self.model,
|
|
||||||
api_key=self.api_key,
|
|
||||||
max_tokens=self.max_tokens,
|
|
||||||
temperature=0,
|
|
||||||
)
|
|
||||||
print(f"qwen token usage: {token_usage}")
|
|
||||||
self.total_token_usage += token_usage
|
|
||||||
self.total_cost += (token_usage * 0.02 / 7.25 / 1000) # 1USD=7.25CNY, https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api
|
|
||||||
elif "phi" in self.model:
|
elif "phi" in self.model:
|
||||||
pass # TODO
|
pass # TODO
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user