docker demo, migration, speedup inference using cv2

This commit is contained in:
yadonglu
2025-01-04 20:06:33 -08:00
parent d0c163cd02
commit b9d3cb715b
36 changed files with 5842 additions and 2456 deletions

View File

@@ -0,0 +1,207 @@
"""
Agentic sampling loop that calls the Anthropic API and local implenmentation of anthropic-defined computer use tools.
"""
import asyncio
import platform
from collections.abc import Callable
from datetime import datetime
from enum import StrEnum
from typing import Any, cast
from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
from anthropic.types import (
ToolResultBlockParam,
)
from anthropic.types.beta import (
BetaContentBlock,
BetaContentBlockParam,
BetaImageBlockParam,
BetaMessage,
BetaMessageParam,
BetaTextBlockParam,
BetaToolResultBlockParam,
)
from anthropic.types import TextBlock
from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
from tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
from PIL import Image
from io import BytesIO
import gradio as gr
from typing import Dict
BETA_FLAG = "computer-use-2024-10-22"
class APIProvider(StrEnum):
ANTHROPIC = "anthropic"
BEDROCK = "bedrock"
VERTEX = "vertex"
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
}
# Check OS
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
* You are utilizing a Windows system with internet access.
* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
</SYSTEM_CAPABILITY>
"""
class AnthropicActor:
def __init__(
self,
model: str,
provider: APIProvider,
system_prompt_suffix: str,
api_key: str,
api_response_callback: Callable[[APIResponse[BetaMessage]], None],
max_tokens: int = 4096,
only_n_most_recent_images: int | None = None,
selected_screen: int = 0,
print_usage: bool = True,
):
self.model = model
self.provider = provider
self.system_prompt_suffix = system_prompt_suffix
self.api_key = api_key
self.api_response_callback = api_response_callback
self.max_tokens = max_tokens
self.only_n_most_recent_images = only_n_most_recent_images
self.selected_screen = selected_screen
self.tool_collection = ToolCollection(
ComputerTool(selected_screen=selected_screen),
BashTool(),
EditTool(),
)
self.system = (
f"{SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}"
)
self.total_token_usage = 0
self.total_cost = 0
self.print_usage = print_usage
# Instantiate the appropriate API client based on the provider
if provider == APIProvider.ANTHROPIC:
self.client = Anthropic(api_key=api_key)
elif provider == APIProvider.VERTEX:
self.client = AnthropicVertex()
elif provider == APIProvider.BEDROCK:
self.client = AnthropicBedrock()
def __call__(
self,
*,
messages: list[BetaMessageParam]
):
"""
Generate a response given history messages.
"""
if self.only_n_most_recent_images:
_maybe_filter_to_n_most_recent_images(messages, self.only_n_most_recent_images)
# Call the API synchronously
raw_response = self.client.beta.messages.with_raw_response.create(
max_tokens=self.max_tokens,
messages=messages,
model=self.model,
system=self.system,
tools=self.tool_collection.to_params(),
betas=["computer-use-2024-10-22"],
)
self.api_response_callback(cast(APIResponse[BetaMessage], raw_response))
response = raw_response.parse()
print(f"AnthropicActor response: {response}")
self.total_token_usage += response.usage.input_tokens + response.usage.output_tokens
self.total_cost += (response.usage.input_tokens * 3 / 1000000 + response.usage.output_tokens * 15 / 1000000)
if self.print_usage:
print(f"Claude total token usage so far: {self.total_token_usage}, total cost so far: $USD{self.total_cost}")
return response
def _maybe_filter_to_n_most_recent_images(
messages: list[BetaMessageParam],
images_to_keep: int,
min_removal_threshold: int = 10,
):
"""
With the assumption that images are screenshots that are of diminishing value as
the conversation progresses, remove all but the final `images_to_keep` tool_result
images in place, with a chunk of min_removal_threshold to reduce the amount we
break the implicit prompt cache.
"""
if images_to_keep is None:
return messages
tool_result_blocks = cast(
list[ToolResultBlockParam],
[
item
for message in messages
for item in (
message["content"] if isinstance(message["content"], list) else []
)
if isinstance(item, dict) and item.get("type") == "tool_result"
],
)
total_images = sum(
1
for tool_result in tool_result_blocks
for content in tool_result.get("content", [])
if isinstance(content, dict) and content.get("type") == "image"
)
images_to_remove = total_images - images_to_keep
# for better cache behavior, we want to remove in chunks
images_to_remove -= images_to_remove % min_removal_threshold
for tool_result in tool_result_blocks:
if isinstance(tool_result.get("content"), list):
new_content = []
for content in tool_result.get("content", []):
if isinstance(content, dict) and content.get("type") == "image":
if images_to_remove > 0:
images_to_remove -= 1
continue
new_content.append(content)
tool_result["content"] = new_content
if __name__ == "__main__":
pass
# client = Anthropic(api_key="")
# response = client.beta.messages.with_raw_response.create(
# max_tokens=4096,
# model="claude-3-5-sonnet-20241022",
# system=SYSTEM_PROMPT,
# # tools=ToolCollection(
# # ComputerTool(selected_screen=0),
# # BashTool(),
# # EditTool(),
# # ).to_params(),
# betas=["computer-use-2024-10-22"],
# messages=[
# {"role": "user", "content": "click on (199, 199)."}
# ],
# )
# print(f"AnthropicActor response: {response.parse().usage.input_tokens+response.parse().usage.output_tokens}")

View File

@@ -0,0 +1,109 @@
import os
import re
import ast
import base64
def is_image_path(text):
# Checking if the input text ends with typical image file extensions
image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
if text.endswith(image_extensions):
return True
else:
return False
def encode_image(image_path):
"""Encode image file to base64."""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def is_url_or_filepath(input_string):
# Check if input_string is a URL
url_pattern = re.compile(
r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
if url_pattern.match(input_string):
return "URL"
# Check if input_string is a file path
file_path = os.path.abspath(input_string)
if os.path.exists(file_path):
return "File path"
return "Invalid"
def extract_data(input_string, data_type):
# Regular expression to extract content starting from '```python' until the end if there are no closing backticks
pattern = f"```{data_type}" + r"(.*?)(```|$)"
# Extract content
# re.DOTALL allows '.' to match newlines as well
matches = re.findall(pattern, input_string, re.DOTALL)
# Return the first match if exists, trimming whitespace and ignoring potential closing backticks
return matches[0][0].strip() if matches else input_string
def parse_input(code):
"""Use AST to parse the input string and extract the function name, arguments, and keyword arguments."""
def get_target_names(target):
"""Recursively get all variable names from the assignment target."""
if isinstance(target, ast.Name):
return [target.id]
elif isinstance(target, ast.Tuple):
names = []
for elt in target.elts:
names.extend(get_target_names(elt))
return names
return []
def extract_value(node):
"""提取 AST 节点的实际值"""
if isinstance(node, ast.Constant):
return node.value
elif isinstance(node, ast.Name):
# TODO: a better way to handle variables
raise ValueError(
f"Arguments should be a Constant, got a variable {node.id} instead."
)
# 添加其他需要处理的 AST 节点类型
return None
try:
tree = ast.parse(code)
for node in ast.walk(tree):
if isinstance(node, ast.Assign):
targets = []
for t in node.targets:
targets.extend(get_target_names(t))
if isinstance(node.value, ast.Call):
func_name = node.value.func.id
args = [ast.dump(arg) for arg in node.value.args]
kwargs = {
kw.arg: extract_value(kw.value) for kw in node.value.keywords
}
print(f"Input: {code.strip()}")
print(f"Output Variables: {targets}")
print(f"Function Name: {func_name}")
print(f"Arguments: {args}")
print(f"Keyword Arguments: {kwargs}")
elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
targets = []
func_name = extract_value(node.value.func)
args = [extract_value(arg) for arg in node.value.args]
kwargs = {kw.arg: extract_value(kw.value) for kw in node.value.keywords}
except SyntaxError:
print(f"Input: {code.strip()}")
print("No match found")
return targets, func_name, args, kwargs
if __name__ == "__main__":
import json
s='{"Reasoning": "The Docker icon has been successfully clicked, and the Docker application should now be opening. No further actions are required.", "Next Action": None}'
json_str = json.loads(s)
print(json_str)

View File

@@ -0,0 +1,117 @@
import os
import logging
import base64
import requests
# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image
def is_image_path(text):
# Checking if the input text ends with typical image file extensions
image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
if text.endswith(image_extensions):
return True
else:
return False
def encode_image(image_path):
"""Encode image file to base64."""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
# from openai import OpenAI
# client = OpenAI(
# api_key=os.environ.get("OPENAI_API_KEY")
# )
def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
api_key = api_key or os.environ.get("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY is not set")
headers = {"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"}
final_messages = [{"role": "system", "content": system}]
# image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
if type(messages) == list:
for item in messages:
contents = []
if isinstance(item, dict):
for cnt in item["content"]:
if isinstance(cnt, str):
if is_image_path(cnt):
base64_image = encode_image(cnt)
content = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
else:
content = {"type": "text", "text": cnt}
else:
# in this case it is a text block from anthropic
content = {"type": "text", "text": str(cnt)}
contents.append(content)
message = {"role": 'user', "content": contents}
else: # str
contents.append({"type": "text", "text": item})
message = {"role": "user", "content": contents}
final_messages.append(message)
elif isinstance(messages, str):
final_messages = [{"role": "user", "content": messages}]
# import pdb; pdb.set_trace()
print("[oai] sending messages:", {"role": "user", "content": messages})
payload = {
"model": llm,
"messages": final_messages,
"max_tokens": max_tokens,
"temperature": temperature,
# "stop": stop,
}
# from IPython.core.debugger import Pdb; Pdb().set_trace()
response = requests.post(
"https://api.openai.com/v1/chat/completions", headers=headers, json=payload
)
try:
text = response.json()['choices'][0]['message']['content']
token_usage = int(response.json()['usage']['total_tokens'])
return text, token_usage
# return error message if the response is not successful
except Exception as e:
print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
return response.json()
if __name__ == "__main__":
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY is not set")
text, token_usage = run_oai_interleaved(
messages= [{"content": [
"What is in the screenshot?",
"./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"],
"role": "user"
}],
llm="gpt-4o-mini",
system="You are a helpful assistant",
api_key=api_key,
max_tokens=256,
temperature=0)
print(text, token_usage)
# There is an introduction describing the Calyx... 36986

View File

@@ -0,0 +1,107 @@
import os
import logging
import base64
import requests
import dashscope
# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image
def is_image_path(text):
return False
def encode_image(image_path):
return ""
def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
api_key = api_key or os.environ.get("QWEN_API_KEY")
if not api_key:
raise ValueError("QWEN_API_KEY is not set")
dashscope.api_key = api_key
# from IPython.core.debugger import Pdb; Pdb().set_trace()
final_messages = [{"role": "system", "content": [{"text": system}]}]
# image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
if type(messages) == list:
for item in messages:
contents = []
if isinstance(item, dict):
for cnt in item["content"]:
if isinstance(cnt, str):
if is_image_path(cnt):
# base64_image = encode_image(cnt)
content = [{"image": cnt}]
# content = {"type": "image_url", "image_url": {"url": image_url}}
else:
content = {"text": cnt}
contents.append(content)
message = {"role": item["role"], "content": contents}
else: # str
contents.append({"text": item})
message = {"role": "user", "content": contents}
final_messages.append(message)
print("[qwen-vl] sending messages:", final_messages)
response = dashscope.MultiModalConversation.call(
model='qwen-vl-max-0809',
messages=final_messages
)
# from IPython.core.debugger import Pdb; Pdb().set_trace()
try:
text = response.output.choices[0].message.content[0]['text']
usage = response.usage
if "total_tokens" not in usage:
token_usage = int(usage["input_tokens"] + usage["output_tokens"])
else:
token_usage = int(usage["total_tokens"])
return text, token_usage
# return response.json()['choices'][0]['message']['content']
# return error message if the response is not successful
except Exception as e:
print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
return response.json()
if __name__ == "__main__":
api_key = os.environ.get("QWEN_API_KEY")
if not api_key:
raise ValueError("QWEN_API_KEY is not set")
dashscope.api_key = api_key
final_messages = [{"role": "user",
"content": [
{"text": "What is in the screenshot?"},
{"image": "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"}
]
}
]
response = dashscope.MultiModalConversation.call(model='qwen-vl-max-0809', messages=final_messages)
print(response)
text = response.output.choices[0].message.content[0]['text']
usage = response.usage
if "total_tokens" not in usage:
if "image_tokens" in usage:
token_usage = usage["input_tokens"] + usage["output_tokens"] + usage["image_tokens"]
else:
token_usage = usage["input_tokens"] + usage["output_tokens"]
else:
token_usage = usage["total_tokens"]
print(text, token_usage)
# The screenshot is from a video game... 1387

View File

@@ -0,0 +1,44 @@
import base64
import logging
from .oai import run_oai_interleaved
from .gemini import run_gemini_interleaved
def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None):
log_prompt(prompt)
# turn string prompt into list
if isinstance(prompt, str):
prompt = [prompt]
elif isinstance(prompt, list):
pass
else:
raise ValueError(f"Invalid prompt type: {type(prompt)}")
if llm.startswith("gpt"): # gpt series
out = run_oai_interleaved(
prompt,
llm,
max_tokens,
temperature,
stop
)
elif llm.startswith("gemini"): # gemini series
out = run_gemini_interleaved(
prompt,
llm,
max_tokens,
temperature,
stop
)
else:
raise ValueError(f"Invalid llm: {llm}")
logging.info(
f"========Output for {llm}=======\n{out}\n============================")
return out
def log_prompt(prompt):
prompt_display = [prompt] if isinstance(prompt, str) else prompt
prompt_display = "\n\n".join(prompt_display)
logging.info(
f"========Prompt=======\n{prompt_display}\n============================")