unused file cleanup
This commit is contained in:
@@ -31,32 +31,19 @@ from io import BytesIO
|
||||
import gradio as gr
|
||||
from typing import Dict
|
||||
|
||||
|
||||
BETA_FLAG = "computer-use-2024-10-22"
|
||||
|
||||
|
||||
class APIProvider(StrEnum):
    """Supported model-hosting backends; the str values double as config keys."""

    ANTHROPIC = "anthropic"
    BEDROCK = "bedrock"
    VERTEX = "vertex"
|
||||
|
||||
|
||||
# Default model identifier used when the caller does not pick one explicitly,
# keyed by API provider.
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
    APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
    APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
    APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
}
|
||||
|
||||
|
||||
# Check OS
|
||||
|
||||
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
|
||||
* You are utilizing a Windows system with internet access.
|
||||
* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
|
||||
</SYSTEM_CAPABILITY>
|
||||
"""
|
||||
|
||||
|
||||
class AnthropicActor:
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
import os
|
||||
import re
|
||||
import ast
|
||||
import base64
|
||||
|
||||
|
||||
def is_image_path(text):
    """Return True if *text* ends with a common image file extension.

    This is a plain suffix check: it treats bare filenames and full paths the
    same way and does not verify that the file exists.
    """
    # str.endswith accepts a tuple, so one call covers every extension and the
    # boolean result can be returned directly (no if/else needed).
    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
    return text.endswith(image_extensions)
|
||||
|
||||
|
||||
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes base64-encoded as str."""
    with open(image_path, "rb") as fh:
        raw = fh.read()
    return base64.b64encode(raw).decode("utf-8")
|
||||
|
||||
|
||||
def is_url_or_filepath(input_string):
    """Classify *input_string* as a "URL", a "File path", or "Invalid".

    URLs are detected by an http/https pattern anchored at the start of the
    string; file paths are detected by existence on disk after abspath
    resolution.
    """
    url_pattern = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    # Guard-clause chain instead of the original sequential checks.
    if url_pattern.match(input_string):
        return "URL"
    if os.path.exists(os.path.abspath(input_string)):
        return "File path"
    return "Invalid"
|
||||
|
||||
|
||||
def extract_data(input_string, data_type):
    """Pull the body of a ```<data_type> fenced block out of *input_string*.

    Returns the stripped content of the first fenced block, tolerating a
    missing closing fence; if no block is found, the input is returned
    unchanged.
    """
    fence = f"```{data_type}" + r"(.*?)(```|$)"
    # re.DOTALL lets '.' span newlines so multi-line blocks are captured.
    found = re.findall(fence, input_string, re.DOTALL)
    if not found:
        return input_string
    return found[0][0].strip()
|
||||
|
||||
|
||||
def parse_input(code):
    """Use AST to parse *code* and extract assignment targets, the called
    function's name, its positional args, and keyword args.

    Returns a 4-tuple ``(targets, func_name, args, kwargs)``.  When *code*
    does not parse or contains no call, defaults of ``([], None, [], {})``
    are returned for the parts that could not be determined (the original
    raised NameError from unbound locals in those cases).
    """

    def get_target_names(target):
        """Recursively collect all variable names from an assignment target."""
        if isinstance(target, ast.Name):
            return [target.id]
        elif isinstance(target, ast.Tuple):
            names = []
            for elt in target.elts:
                names.extend(get_target_names(elt))
            return names
        return []

    def extract_value(node):
        """Extract the concrete value held by an AST node."""
        if isinstance(node, ast.Constant):
            return node.value
        elif isinstance(node, ast.Name):
            # TODO: a better way to handle variables
            raise ValueError(
                f"Arguments should be a Constant, got a variable {node.id} instead."
            )
        # Other AST node types are not handled and fall through to None.
        return None

    # Defaults so the return statement never hits unbound locals, even when
    # the input fails to parse or contains no call expression.
    targets, func_name, args, kwargs = [], None, [], {}

    try:
        tree = ast.parse(code)
        for node in ast.walk(tree):
            if isinstance(node, ast.Assign):
                targets = []
                for t in node.targets:
                    targets.extend(get_target_names(t))
                if isinstance(node.value, ast.Call):
                    func = node.value.func
                    # Only simple names carry .id; attribute calls
                    # (obj.method(...)) would raise AttributeError otherwise.
                    func_name = func.id if isinstance(func, ast.Name) else None
                    args = [ast.dump(arg) for arg in node.value.args]
                    kwargs = {
                        kw.arg: extract_value(kw.value) for kw in node.value.keywords
                    }
                    print(f"Input: {code.strip()}")
                    print(f"Output Variables: {targets}")
                    print(f"Function Name: {func_name}")
                    print(f"Arguments: {args}")
                    print(f"Keyword Arguments: {kwargs}")
            elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
                targets = []
                func = node.value.func
                # Bug fix: the original passed the func node to extract_value,
                # which raises ValueError for every plain-name call; read .id
                # directly, mirroring the Assign branch.
                func_name = func.id if isinstance(func, ast.Name) else None
                args = [extract_value(arg) for arg in node.value.args]
                kwargs = {kw.arg: extract_value(kw.value) for kw in node.value.keywords}

    except SyntaxError:
        print(f"Input: {code.strip()}")
        print("No match found")

    return targets, func_name, args, kwargs
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import ast

    # The sample string is a Python literal (it contains ``None``), not valid
    # JSON (which would require ``null``), so json.loads raised
    # json.JSONDecodeError here; ast.literal_eval parses it safely instead.
    s = '{"Reasoning": "The Docker icon has been successfully clicked, and the Docker application should now be opening. No further actions are required.", "Next Action": None}'
    parsed = ast.literal_eval(s)
    print(parsed)
|
||||
@@ -1,107 +0,0 @@
|
||||
|
||||
import os
|
||||
import logging
|
||||
import base64
|
||||
import requests
|
||||
|
||||
import dashscope
|
||||
# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image
|
||||
|
||||
def is_image_path(text):
    """Return True if *text* ends with a common image file extension.

    Replaces a stub that always returned False, which silently made every
    screenshot path below be forwarded to the model as plain text instead of
    as an image part.
    """
    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
    return text.endswith(image_extensions)
|
||||
|
||||
def encode_image(image_path):
    """Encode the image file at *image_path* to a base64 string.

    Replaces a stub that returned "" for every input.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
|
||||
|
||||
|
||||
def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
    """Send *messages* with a *system* prompt to Qwen-VL via dashscope.

    Args:
        messages: list of message dicts ({"role": ..., "content": [...]}) or
            plain strings (treated as user text).
        system: system prompt text.
        llm: model identifier; currently unused — the model is hard-coded below.
        api_key: dashscope API key; falls back to the QWEN_API_KEY env var.
        max_tokens / temperature: accepted for interface parity with the other
            run_* helpers; the dashscope call below does not forward them.

    Returns:
        ``(text, token_usage)`` on success, or the raw response JSON when the
        response cannot be parsed.

    Raises:
        ValueError: if no API key is available.
    """
    api_key = api_key or os.environ.get("QWEN_API_KEY")
    if not api_key:
        raise ValueError("QWEN_API_KEY is not set")

    dashscope.api_key = api_key

    # dashscope's multimodal format: each message's "content" is a flat list
    # of {"text": ...} / {"image": ...} parts.
    final_messages = [{"role": "system", "content": [{"text": system}]}]
    if isinstance(messages, list):
        for item in messages:
            contents = []
            if isinstance(item, dict):
                for cnt in item["content"]:
                    if isinstance(cnt, str):
                        if is_image_path(cnt):
                            # Bug fix: the original appended a one-element list
                            # here, nesting the part as [[{"image": ...}]] inside
                            # "content"; append the part dict itself.
                            content = {"image": cnt}
                        else:
                            content = {"text": cnt}
                        contents.append(content)
                message = {"role": item["role"], "content": contents}
            else:  # plain string becomes a single user text part
                contents.append({"text": item})
                message = {"role": "user", "content": contents}

            final_messages.append(message)

    print("[qwen-vl] sending messages:", final_messages)

    response = dashscope.MultiModalConversation.call(
        model='qwen-vl-max-0809',
        messages=final_messages
    )

    try:
        text = response.output.choices[0].message.content[0]['text']
        usage = response.usage

        # Some responses report only input/output tokens; derive the total.
        if "total_tokens" not in usage:
            token_usage = int(usage["input_tokens"] + usage["output_tokens"])
        else:
            token_usage = int(usage["total_tokens"])

        return text, token_usage
    except Exception as e:
        # Bug fix: the original message blamed "interleaved openAI" and
        # OPENAI_API_KEY; this helper talks to Qwen via dashscope.
        print(f"Error in qwen call: {e}. This may be due to an invalid QWEN_API_KEY. Please check the response: {response.json()} ")
        return response.json()
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke-test script: sends one screenshot question to Qwen-VL and prints
    # the answer plus the token count. Requires QWEN_API_KEY and network.
    qwen_key = os.environ.get("QWEN_API_KEY")
    if not qwen_key:
        raise ValueError("QWEN_API_KEY is not set")

    dashscope.api_key = qwen_key

    demo_messages = [
        {
            "role": "user",
            "content": [
                {"text": "What is in the screenshot?"},
                {"image": "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"},
            ],
        }
    ]
    response = dashscope.MultiModalConversation.call(model='qwen-vl-max-0809', messages=demo_messages)

    print(response)

    text = response.output.choices[0].message.content[0]['text']
    usage = response.usage

    # Prefer the reported total; otherwise sum the component counts,
    # including image tokens when present.
    if "total_tokens" in usage:
        token_usage = usage["total_tokens"]
    elif "image_tokens" in usage:
        token_usage = usage["input_tokens"] + usage["output_tokens"] + usage["image_tokens"]
    else:
        token_usage = usage["input_tokens"] + usage["output_tokens"]

    print(text, token_usage)
    # Example output: "The screenshot is from a video game..." 1387
|
||||
@@ -1,44 +0,0 @@
|
||||
import base64
|
||||
import logging
|
||||
from .oai import run_oai_interleaved
|
||||
from .gemini import run_gemini_interleaved
|
||||
|
||||
def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None):
    """Dispatch *prompt* to the OpenAI or Gemini interleaved runner by model name.

    Accepts a single string or a list of prompt parts; raises ValueError for a
    prompt of any other type, or for an unrecognized model name.
    """
    log_prompt(prompt)

    # Normalize the prompt to a list of parts.
    if isinstance(prompt, str):
        prompt = [prompt]
    elif not isinstance(prompt, list):
        raise ValueError(f"Invalid prompt type: {type(prompt)}")

    # Route on the model-name prefix, then make a single uniform call.
    if llm.startswith("gpt"):  # gpt series
        runner = run_oai_interleaved
    elif llm.startswith("gemini"):  # gemini series
        runner = run_gemini_interleaved
    else:
        raise ValueError(f"Invalid llm: {llm}")
    out = runner(prompt, llm, max_tokens, temperature, stop)

    logging.info(
        f"========Output for {llm}=======\n{out}\n============================")
    return out
|
||||
|
||||
def log_prompt(prompt):
    """Log *prompt* (a string or list of strings) at INFO level."""
    if isinstance(prompt, str):
        parts = [prompt]
    else:
        parts = prompt
    joined = "\n\n".join(parts)
    logging.info(
        f"========Prompt=======\n{joined}\n============================")
|
||||
|
||||
@@ -1,76 +1,40 @@
|
||||
"""
|
||||
Agentic sampling loop that calls the Anthropic API and local implenmentation of anthropic-defined computer use tools.
|
||||
"""
|
||||
import time
|
||||
import json
|
||||
import asyncio
|
||||
import platform
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
from enum import StrEnum
|
||||
from typing import Any, cast, Dict
|
||||
|
||||
from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
|
||||
from anthropic import APIResponse
|
||||
from anthropic.types import (
|
||||
ToolResultBlockParam,
|
||||
TextBlock,
|
||||
)
|
||||
from anthropic.types.beta import (
|
||||
BetaContentBlock,
|
||||
BetaContentBlockParam,
|
||||
BetaImageBlockParam,
|
||||
BetaMessage,
|
||||
BetaMessageParam,
|
||||
BetaTextBlockParam,
|
||||
BetaToolResultBlockParam,
|
||||
BetaMessageParam
|
||||
)
|
||||
from computer_use_demo.tools import ToolResult
|
||||
|
||||
import torch
|
||||
|
||||
from computer_use_demo.gui_agent.anthropic_agent import AnthropicActor
|
||||
from computer_use_demo.executor.anthropic_executor import AnthropicExecutor
|
||||
from computer_use_demo.omniparser_agent.vlm_agent import OmniParser, VLMAgent
|
||||
from computer_use_demo.colorful_text import colorful_text_vlm
|
||||
from computer_use_demo.tools.screen_capture import get_screenshot
|
||||
from computer_use_demo.gui_agent.llm_utils.oai import encode_image
|
||||
|
||||
|
||||
BETA_FLAG = "computer-use-2024-10-22"
|
||||
|
||||
|
||||
class APIProvider(StrEnum):
    """Supported model-hosting backends; the str values double as config keys."""

    ANTHROPIC = "anthropic"
    BEDROCK = "bedrock"
    VERTEX = "vertex"
    OPENAI = "openai"
    QWEN = "qwen"
|
||||
|
||||
|
||||
# Default model for each provider.  Every APIProvider member must have an
# entry here so lookups by provider never raise KeyError.
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
    APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
    APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
    APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
    APIProvider.OPENAI: "gpt-4o",
    # Bug fix: QWEN previously had no default model (this entry was commented
    # out), so selecting the QWEN provider raised KeyError on lookup.
    # NOTE(review): "qwen2vl" comes from the previously commented-out line —
    # confirm it is the intended default model identifier.
    APIProvider.QWEN: "qwen2vl",
}
|
||||
|
||||
|
||||
# This system prompt is optimized for the Docker environment in this repository and
|
||||
# specific tool combinations enabled.
|
||||
# We encourage modifying this system prompt to ensure the model has context for the
|
||||
# environment it is running in, and to provide any additional information that may be
|
||||
# helpful for the task at hand.
|
||||
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
|
||||
* You are utilizing a Windows system with internet access.
|
||||
* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
|
||||
</SYSTEM_CAPABILITY>
|
||||
"""
|
||||
|
||||
import base64
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
|
||||
def sampling_loop_sync(
|
||||
*,
|
||||
model: str,
|
||||
|
||||
@@ -18,13 +18,21 @@ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, B
|
||||
|
||||
from computer_use_demo.tools.screen_capture import get_screenshot
|
||||
from computer_use_demo.gui_agent.llm_utils.oai import run_oai_interleaved, encode_image
|
||||
from computer_use_demo.gui_agent.llm_utils.qwen import run_qwen
|
||||
from computer_use_demo.gui_agent.llm_utils.llm_utils import extract_data
|
||||
from computer_use_demo.colorful_text import colorful_text_vlm
|
||||
import time
|
||||
import re
|
||||
|
||||
OUTPUT_DIR = "./tmp/outputs"
|
||||
|
||||
def extract_data(input_string, data_type):
    """Return the body of the first ```<data_type> fenced block in *input_string*.

    A missing closing fence is tolerated; when no block matches, the input is
    returned unchanged.
    """
    pattern = f"```{data_type}" + r"(.*?)(```|$)"
    # re.DOTALL makes '.' match newlines, so multi-line blocks are captured.
    match = re.search(pattern, input_string, re.DOTALL)
    return match.group(1).strip() if match else input_string
|
||||
|
||||
class OmniParser:
|
||||
def __init__(self,
|
||||
url: str,
|
||||
@@ -165,19 +173,6 @@ class VLMAgent:
|
||||
print(f"oai token usage: {token_usage}")
|
||||
self.total_token_usage += token_usage
|
||||
self.total_cost += (token_usage * 0.15 / 1000000) # https://openai.com/api/pricing/
|
||||
|
||||
elif "qwen" in self.model:
|
||||
vlm_response, token_usage = run_qwen(
|
||||
messages=planner_messages,
|
||||
system=system,
|
||||
llm=self.model,
|
||||
api_key=self.api_key,
|
||||
max_tokens=self.max_tokens,
|
||||
temperature=0,
|
||||
)
|
||||
print(f"qwen token usage: {token_usage}")
|
||||
self.total_token_usage += token_usage
|
||||
self.total_cost += (token_usage * 0.02 / 7.25 / 1000) # 1USD=7.25CNY, https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api
|
||||
elif "phi" in self.model:
|
||||
pass # TODO
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user