unused file cleanup

This commit is contained in:
Thomas Dhome Casanova (from Dev Box)
2025-01-22 21:14:21 -08:00
parent c29ac5064a
commit 9db016b52f
6 changed files with 13 additions and 327 deletions

View File

@@ -31,32 +31,19 @@ from io import BytesIO
import gradio as gr
from typing import Dict
# Identifier of the dated computer-use beta feature set.
# NOTE(review): presumably sent to the Anthropic API as a beta flag — confirm
# against the call site, which is not visible in this hunk.
BETA_FLAG = "computer-use-2024-10-22"
class APIProvider(StrEnum):
    """Names of the API backends a model can be served from.

    Members are ``StrEnum`` values, so each one compares equal to its
    lowercase string form.
    """

    ANTHROPIC = "anthropic"
    BEDROCK = "bedrock"
    VERTEX = "vertex"
# Default model name to request from each provider.
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
    APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
    APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
    APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
}

# System prompt describing the host environment to the model. The operating
# system is hard-coded as Windows; nothing here inspects the actual platform.
# NOTE: the date is interpolated once, when this module is imported, so a
# long-running process keeps reporting the day it was started.
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
* You are utilizing a Windows system with internet access.
* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
</SYSTEM_CAPABILITY>
"""
class AnthropicActor:
def __init__(
self,

View File

@@ -1,109 +0,0 @@
import os
import re
import ast
import base64
def is_image_path(text):
    """Return True when *text* ends with a common image file extension.

    The comparison is case-insensitive, so ``photo.PNG`` is recognised just
    like ``photo.png``. Only the suffix is inspected; the path is not checked
    for existence.

    Args:
        text: Candidate path or file name.

    Returns:
        bool: Whether the suffix is one of the known image extensions.
    """
    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
    # str.endswith accepts a tuple, so a single call covers every extension.
    return text.lower().endswith(image_extensions)
def encode_image(image_path):
    """Read the file at *image_path* and return its contents as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def is_url_or_filepath(input_string):
    """Classify *input_string* as a URL, an existing file path, or neither.

    Args:
        input_string: Text to classify.

    Returns:
        str: ``"URL"`` if the text starts with an http(s) URL, ``"File path"``
        if it names something that exists on disk, otherwise ``"Invalid"``.
    """
    # An empty string would resolve to the current working directory below
    # and be misreported as an existing file path.
    if not input_string:
        return "Invalid"

    # Check if input_string is a URL
    url_pattern = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    if url_pattern.match(input_string):
        return "URL"

    # Check if input_string is a file path (relative paths resolve against cwd)
    file_path = os.path.abspath(input_string)
    if os.path.exists(file_path):
        return "File path"

    return "Invalid"
def extract_data(input_string, data_type):
    """Return the body of the first fenced block tagged *data_type*.

    Looks for a fence that opens with three backticks followed by
    *data_type* and captures everything up to the closing fence, or up to
    the end of the input when the fence is never closed.

    Args:
        input_string: Text that may contain a fenced block.
        data_type: Fence tag to look for (e.g. ``"python"``, ``"json"``).
            It is matched literally, so tags such as ``"c++"`` are safe.

    Returns:
        str: The stripped block contents, or *input_string* unchanged when no
        matching fence is present.
    """
    # Escape the tag so regex metacharacters in it are not interpreted.
    pattern = f"```{re.escape(data_type)}" + r"(.*?)(```|$)"
    # re.DOTALL allows '.' to match newlines as well
    matches = re.findall(pattern, input_string, re.DOTALL)
    # Each match is (content, terminator); only the content is wanted.
    return matches[0][0].strip() if matches else input_string
def parse_input(code):
    """Parse a Python snippet and describe the function call it contains.

    Both ``name = func(...)`` assignments and bare ``func(...)`` expression
    statements are recognised. When several statements are present, values
    from later statements overwrite those from earlier ones.

    Args:
        code: Source text to parse.

    Returns:
        tuple: ``(targets, func_name, args, kwargs)`` where ``targets`` is the
        list of assigned variable names, ``func_name`` is the (possibly
        dotted) name of the called function or ``None``, ``args`` holds the
        positional arguments (``ast.dump`` strings for assignments, constant
        values for bare calls) and ``kwargs`` maps keyword names to constant
        values. If the code does not parse or contains no call, the defaults
        ``([], None, [], {})`` are returned for the missing parts.

    Raises:
        ValueError: If an extracted argument is a variable, not a constant.
    """

    def get_target_names(target):
        """Recursively get all variable names from the assignment target."""
        if isinstance(target, ast.Name):
            return [target.id]
        if isinstance(target, (ast.Tuple, ast.List)):
            names = []
            for elt in target.elts:
                names.extend(get_target_names(elt))
            return names
        return []

    def get_func_name(func):
        """Return the dotted name of a call target, or None if it has none."""
        if isinstance(func, ast.Name):
            return func.id
        if isinstance(func, ast.Attribute):
            owner = get_func_name(func.value)
            return f"{owner}.{func.attr}" if owner else func.attr
        return None

    def extract_value(node):
        """Extract the actual value of an AST node."""
        if isinstance(node, ast.Constant):
            return node.value
        if isinstance(node, ast.Name):
            # TODO: a better way to handle variables
            raise ValueError(
                f"Arguments should be a Constant, got a variable {node.id} instead."
            )
        # Add other AST node types that need handling here.
        return None

    # Defaults guarantee the return statement never hits an unbound name.
    targets, func_name, args, kwargs = [], None, [], {}

    try:
        tree = ast.parse(code)
    except SyntaxError:
        print(f"Input: {code.strip()}")
        print("No match found")
        return targets, func_name, args, kwargs

    for node in ast.walk(tree):
        if isinstance(node, ast.Assign):
            targets = []
            for t in node.targets:
                targets.extend(get_target_names(t))
            if isinstance(node.value, ast.Call):
                func_name = get_func_name(node.value.func)
                args = [ast.dump(arg) for arg in node.value.args]
                kwargs = {
                    kw.arg: extract_value(kw.value) for kw in node.value.keywords
                }
                print(f"Input: {code.strip()}")
                print(f"Output Variables: {targets}")
                print(f"Function Name: {func_name}")
                print(f"Arguments: {args}")
                print(f"Keyword Arguments: {kwargs}")
        elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
            targets = []
            # The callee is a name, not an argument, so it must not go
            # through extract_value (which rejects ast.Name nodes).
            func_name = get_func_name(node.value.func)
            args = [extract_value(arg) for arg in node.value.args]
            kwargs = {kw.arg: extract_value(kw.value) for kw in node.value.keywords}

    return targets, func_name, args, kwargs
if __name__ == "__main__":
    # Manual smoke test: parse a sample model reply and print the result.
    import json

    # JSON spells the null value "null"; the Python literal "None" is not
    # valid JSON and made json.loads raise before anything was printed.
    s = '{"Reasoning": "The Docker icon has been successfully clicked, and the Docker application should now be opening. No further actions are required.", "Next Action": null}'
    parsed = json.loads(s)
    print(parsed)

View File

@@ -1,107 +0,0 @@
import os
import logging
import base64
import requests
import dashscope
# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image
def is_image_path(text: str) -> bool:
    """Stub for the shared helper: reports every input as "not an image".

    The import of the real implementation is commented out just above, so
    within this module every string is treated as plain text.
    """
    return False
def encode_image(image_path: str) -> str:
    """Stub for the shared helper: ignores *image_path* and returns ``""``."""
    return ""
def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
    """Send a conversation to DashScope's multimodal endpoint and return the reply.

    Args:
        messages: Conversation items. Each item is either a dict with
            ``"role"`` and ``"content"`` keys (``"content"`` being an iterable
            of strings or ready-made part dicts) or any other value, which is
            sent as the text of a user turn.
        system: System prompt, sent as the first message.
        llm: Currently unused; the request always targets
            ``qwen-vl-max-0809``.
        api_key: DashScope API key. Falls back to the ``QWEN_API_KEY``
            environment variable when empty.
        max_tokens: Currently unused.
        temperature: Currently unused.

    Returns:
        ``(text, token_usage)`` on success. If the reply cannot be read, the
        error is printed and ``response.json()`` is returned instead, so
        callers that unpack two values should be prepared for that shape.

    Raises:
        ValueError: If no API key is supplied and ``QWEN_API_KEY`` is unset.
    """
    api_key = api_key or os.environ.get("QWEN_API_KEY")
    if not api_key:
        raise ValueError("QWEN_API_KEY is not set")

    dashscope.api_key = api_key

    final_messages = [{"role": "system", "content": [{"text": system}]}]

    if isinstance(messages, list):
        for item in messages:
            if isinstance(item, dict):
                contents = []
                for cnt in item["content"]:
                    if isinstance(cnt, str):
                        # Every part is a flat dict; wrapping the image part
                        # in a list would nest a list inside "content".
                        part_key = "image" if is_image_path(cnt) else "text"
                        contents.append({part_key: cnt})
                    elif isinstance(cnt, dict):
                        # NOTE(review): assumes dict parts are already in
                        # DashScope's {"text": ...}/{"image": ...} shape.
                        contents.append(cnt)
                    # Other part types are skipped instead of re-appending
                    # whatever the previous iteration produced.
                message = {"role": item["role"], "content": contents}
            else:  # str
                message = {"role": "user", "content": [{"text": item}]}

            final_messages.append(message)

    print("[qwen-vl] sending messages:", final_messages)

    response = dashscope.MultiModalConversation.call(
        model='qwen-vl-max-0809',
        messages=final_messages
    )

    try:
        text = response.output.choices[0].message.content[0]['text']
        usage = response.usage

        if "total_tokens" not in usage:
            token_usage = int(usage["input_tokens"] + usage["output_tokens"])
        else:
            token_usage = int(usage["total_tokens"])

        return text, token_usage
    # Best effort: report the failure and hand back the raw response payload.
    except Exception as e:
        payload = response.json()
        print(f"Error in run_qwen: {e}. This may be due to an invalid QWEN_API_KEY. Please check the response: {payload} ")
        return payload
if __name__ == "__main__":
    # Manual smoke test: ask the model to describe a saved screenshot and
    # print the reply together with the number of tokens consumed.
    api_key = os.environ.get("QWEN_API_KEY")
    if not api_key:
        raise ValueError("QWEN_API_KEY is not set")
    dashscope.api_key = api_key

    question_part = {"text": "What is in the screenshot?"}
    image_part = {"image": "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"}
    final_messages = [{"role": "user", "content": [question_part, image_part]}]

    response = dashscope.MultiModalConversation.call(model='qwen-vl-max-0809', messages=final_messages)
    print(response)

    text = response.output.choices[0].message.content[0]['text']
    usage = response.usage

    # Prefer the reported total; otherwise add up the individual counters.
    if "total_tokens" in usage:
        token_usage = usage["total_tokens"]
    elif "image_tokens" in usage:
        token_usage = usage["input_tokens"] + usage["output_tokens"] + usage["image_tokens"]
    else:
        token_usage = usage["input_tokens"] + usage["output_tokens"]

    print(text, token_usage)
    # The screenshot is from a video game... 1387

View File

@@ -1,44 +0,0 @@
import base64
import logging
from .oai import run_oai_interleaved
from .gemini import run_gemini_interleaved
def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None):
    """Dispatch *prompt* to the backend selected by the *llm* name prefix.

    Args:
        prompt: A single string or a list of prompt parts.
        llm: Model name; names starting with ``"gpt"`` or ``"gemini"`` are
            supported.
        max_tokens: Forwarded to the backend.
        temperature: Forwarded to the backend.
        stop: Forwarded to the backend.

    Returns:
        Whatever the selected backend returns.

    Raises:
        ValueError: If *prompt* is neither a string nor a list, or if *llm*
            matches no known backend.
    """
    log_prompt(prompt)

    # Normalise a bare string into a one-element list; reject anything else.
    if isinstance(prompt, str):
        prompt = [prompt]
    elif not isinstance(prompt, list):
        raise ValueError(f"Invalid prompt type: {type(prompt)}")

    # Pick the backend from the model-name prefix.
    if llm.startswith("gpt"):  # gpt series
        backend = run_oai_interleaved
    elif llm.startswith("gemini"):  # gemini series
        backend = run_gemini_interleaved
    else:
        raise ValueError(f"Invalid llm: {llm}")

    out = backend(prompt, llm, max_tokens, temperature, stop)
    logging.info(
        f"========Output for {llm}=======\n{out}\n============================")
    return out
def log_prompt(prompt):
    """Log *prompt* at INFO level, with parts separated by blank lines.

    Args:
        prompt: A single prompt or a list/tuple of prompt parts. Parts that
            are not strings are logged via ``str()`` so that logging never
            fails on an unexpected prompt.
    """
    parts = prompt if isinstance(prompt, (list, tuple)) else [prompt]
    # str.join only accepts strings, so coerce every part first.
    prompt_display = "\n\n".join(str(part) for part in parts)
    logging.info(
        "========Prompt=======\n%s\n============================", prompt_display)

View File

@@ -1,76 +1,40 @@
"""
Agentic sampling loop that calls the Anthropic API and a local implementation of the Anthropic-defined computer use tools.
"""
import time
import json
import asyncio
import platform
from collections.abc import Callable
from datetime import datetime
from enum import StrEnum
from typing import Any, cast, Dict
from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
from anthropic import APIResponse
from anthropic.types import (
ToolResultBlockParam,
TextBlock,
)
from anthropic.types.beta import (
BetaContentBlock,
BetaContentBlockParam,
BetaImageBlockParam,
BetaMessage,
BetaMessageParam,
BetaTextBlockParam,
BetaToolResultBlockParam,
BetaMessageParam
)
from computer_use_demo.tools import ToolResult
import torch
from computer_use_demo.gui_agent.anthropic_agent import AnthropicActor
from computer_use_demo.executor.anthropic_executor import AnthropicExecutor
from computer_use_demo.omniparser_agent.vlm_agent import OmniParser, VLMAgent
from computer_use_demo.colorful_text import colorful_text_vlm
from computer_use_demo.tools.screen_capture import get_screenshot
from computer_use_demo.gui_agent.llm_utils.oai import encode_image
# Identifier of the dated computer-use beta feature set.
# NOTE(review): presumably sent to the Anthropic API as a beta flag — confirm
# against the call site, which is not visible in this hunk.
BETA_FLAG = "computer-use-2024-10-22"
class APIProvider(StrEnum):
    """Names of the API backends a model can be served from.

    Members are ``StrEnum`` values, so each one compares equal to its
    lowercase string form.
    """

    ANTHROPIC = "anthropic"
    BEDROCK = "bedrock"
    VERTEX = "vertex"
    OPENAI = "openai"
    QWEN = "qwen"
# Default model name to request from each provider. Only the providers listed
# as keys have a default; a lookup for any other provider raises KeyError.
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
    APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
    APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
    APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
    # APIProvider.OPENAI: "gpt-4o",
    # APIProvider.QWEN: "qwen2vl",
    APIProvider.OPENAI: "gpt-4o",
}

# This system prompt is optimized for the Docker environment in this repository and
# specific tool combinations enabled.
# We encourage modifying this system prompt to ensure the model has context for the
# environment it is running in, and to provide any additional information that may be
# helpful for the task at hand.
# NOTE(review): the text below tells the model it is on Windows, which does not
# match the Docker wording above — confirm which environment is intended.
# NOTE: the date is interpolated once, when this module is imported, so a
# long-running process keeps reporting the day it was started.
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
* You are utilizing a Windows system with internet access.
* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
</SYSTEM_CAPABILITY>
"""
import base64
from PIL import Image
from io import BytesIO
def sampling_loop_sync(
*,
model: str,

View File

@@ -18,13 +18,21 @@ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, B
from computer_use_demo.tools.screen_capture import get_screenshot
from computer_use_demo.gui_agent.llm_utils.oai import run_oai_interleaved, encode_image
from computer_use_demo.gui_agent.llm_utils.qwen import run_qwen
from computer_use_demo.gui_agent.llm_utils.llm_utils import extract_data
from computer_use_demo.colorful_text import colorful_text_vlm
import time
import re
# Relative directory path constant; resolved against the current working
# directory when used.
# NOTE(review): presumably where screenshots and other artifacts are written —
# confirm against the code that uses it.
OUTPUT_DIR = "./tmp/outputs"
def extract_data(input_string, data_type):
    """Return the body of the first fenced block tagged *data_type*.

    Looks for a fence that opens with three backticks followed by
    *data_type* and captures everything up to the closing fence, or up to
    the end of the input when the fence is never closed.

    Args:
        input_string: Text that may contain a fenced block.
        data_type: Fence tag to look for (e.g. ``"python"``, ``"json"``).
            It is matched literally, so tags such as ``"c++"`` are safe.

    Returns:
        str: The stripped block contents, or *input_string* unchanged when no
        matching fence is present.
    """
    # Escape the tag so regex metacharacters in it are not interpreted.
    pattern = f"```{re.escape(data_type)}" + r"(.*?)(```|$)"
    # re.DOTALL allows '.' to match newlines as well
    matches = re.findall(pattern, input_string, re.DOTALL)
    # Each match is (content, terminator); only the content is wanted.
    return matches[0][0].strip() if matches else input_string
class OmniParser:
def __init__(self,
url: str,
@@ -165,19 +173,6 @@ class VLMAgent:
print(f"oai token usage: {token_usage}")
self.total_token_usage += token_usage
self.total_cost += (token_usage * 0.15 / 1000000) # https://openai.com/api/pricing/
elif "qwen" in self.model:
vlm_response, token_usage = run_qwen(
messages=planner_messages,
system=system,
llm=self.model,
api_key=self.api_key,
max_tokens=self.max_tokens,
temperature=0,
)
print(f"qwen token usage: {token_usage}")
self.total_token_usage += token_usage
self.total_cost += (token_usage * 0.02 / 7.25 / 1000) # 1USD=7.25CNY, https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api
elif "phi" in self.model:
pass # TODO
else: