Files
OmniParser/demo/gradio/computer_use_demo/loop.py
Thomas Dhome Casanova (from Dev Box) 9db016b52f unused file cleanup
2025-01-22 21:14:21 -08:00

139 lines
5.3 KiB
Python

"""
Agentic sampling loop that calls the Anthropic API and local implenmentation of anthropic-defined computer use tools.
"""
from collections.abc import Callable
from enum import StrEnum
from anthropic import APIResponse
from anthropic.types import (
TextBlock,
)
from anthropic.types.beta import (
BetaContentBlock,
BetaMessage,
BetaMessageParam
)
from computer_use_demo.tools import ToolResult
from computer_use_demo.gui_agent.anthropic_agent import AnthropicActor
from computer_use_demo.executor.anthropic_executor import AnthropicExecutor
from computer_use_demo.omniparser_agent.vlm_agent import OmniParser, VLMAgent
BETA_FLAG = "computer-use-2024-10-22"
class APIProvider(StrEnum):
ANTHROPIC = "anthropic"
BEDROCK = "bedrock"
VERTEX = "vertex"
OPENAI = "openai"
PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
APIProvider.OPENAI: "gpt-4o",
}
def sampling_loop_sync(
*,
model: str,
provider: APIProvider | None,
messages: list[BetaMessageParam],
output_callback: Callable[[BetaContentBlock], None],
tool_output_callback: Callable[[ToolResult, str], None],
api_response_callback: Callable[[APIResponse[BetaMessage]], None],
api_key: str,
only_n_most_recent_images: int | None = 2,
max_tokens: int = 4096,
selected_screen: int = 0
):
"""
Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
"""
print('in sampling_loop_sync, model:', model)
if model == "claude-3-5-sonnet-20241022":
omniparser = OmniParser(url="http://localhost:8000/send_text/",
selected_screen=selected_screen,)
# Register Actor and Executor
actor = AnthropicActor(
model=model,
provider=provider,
api_key=api_key,
api_response_callback=api_response_callback,
max_tokens=max_tokens,
only_n_most_recent_images=only_n_most_recent_images,
selected_screen=selected_screen
)
# from IPython.core.debugger import Pdb; Pdb().set_trace()
executor = AnthropicExecutor(
output_callback=output_callback,
tool_output_callback=tool_output_callback,
selected_screen=selected_screen
)
elif model == "omniparser + gpt-4o" or model == "omniparser + phi35v":
# omniparser = OmniParser(url="http://localhost:8000/send_text/",
# selected_screen=selected_screen,)
omniparser = OmniParser(url=None,
selected_screen=selected_screen,)
actor = VLMAgent(
model=model,
provider=provider,
api_key=api_key,
api_response_callback=api_response_callback,
selected_screen=selected_screen,
output_callback=output_callback,
)
executor = AnthropicExecutor(
output_callback=output_callback,
tool_output_callback=tool_output_callback,
selected_screen=selected_screen
)
else:
raise ValueError(f"Model {model} not supported")
print(f"Model Inited: {model}, Provider: {provider}")
tool_result_content = None
print(f"Start the message loop. User messages: {messages}")
if model == "claude-3-5-sonnet-20241022": # Anthropic loop
while True:
parsed_screen = omniparser() # parsed_screen: {"som_image_base64": dino_labled_img, "parsed_content_list": parsed_content_list, "screen_info"}
import pdb; pdb.set_trace()
screen_info_block = TextBlock(text='Below is the structured accessibility information of the current UI screen, which includes text and icons you can operate on, take these information into account when you are making the prediction for the next action. Note you will still need to take screenshot to get the image: \n' + parsed_screen['screen_info'], type='text')
# # messages[-1]['content'].append(screen_info_block)
screen_info_dict = {"role": "user", "content": [screen_info_block]}
messages.append(screen_info_dict)
tools_use_needed = actor(messages=messages)
for message, tool_result_content in executor(tools_use_needed, messages):
yield message
if not tool_result_content:
return messages
messages.append({"content": tool_result_content, "role": "user"})
elif model == "omniparser + gpt-4o" or model == "omniparser + phi35v":
while True:
parsed_screen = omniparser()
tools_use_needed, vlm_response_json = actor(messages=messages, parsed_screen=parsed_screen)
for message, tool_result_content in executor(tools_use_needed, messages):
yield message
if not tool_result_content:
return messages
# import pdb; pdb.set_trace()
# messages.append({"role": "user",
# "content": ["History plan:\n" + str(vlm_response_json['Reasoning'])]})
# messages.append({"content": tool_result_content, "role": "user"})