Remove the selected_screen parameter, as there is only 1 screen
This commit is contained in:
@@ -209,7 +209,6 @@ def process_input(user_input, state):
|
|||||||
api_response_callback=partial(_api_response_callback, response_state=state["responses"]),
|
api_response_callback=partial(_api_response_callback, response_state=state["responses"]),
|
||||||
api_key=state["api_key"],
|
api_key=state["api_key"],
|
||||||
only_n_most_recent_images=state["only_n_most_recent_images"],
|
only_n_most_recent_images=state["only_n_most_recent_images"],
|
||||||
selected_screen=0,
|
|
||||||
omniparser_url=state["omniparser_url"]
|
omniparser_url=state["omniparser_url"]
|
||||||
):
|
):
|
||||||
if loop_msg is None:
|
if loop_msg is None:
|
||||||
|
|||||||
@@ -20,10 +20,9 @@ class AnthropicExecutor:
|
|||||||
self,
|
self,
|
||||||
output_callback: Callable[[BetaContentBlockParam], None],
|
output_callback: Callable[[BetaContentBlockParam], None],
|
||||||
tool_output_callback: Callable[[Any, str], None],
|
tool_output_callback: Callable[[Any, str], None],
|
||||||
selected_screen: int = 0
|
|
||||||
):
|
):
|
||||||
self.tool_collection = ToolCollection(
|
self.tool_collection = ToolCollection(
|
||||||
ComputerTool(selected_screen=selected_screen)
|
ComputerTool()
|
||||||
)
|
)
|
||||||
self.output_callback = output_callback
|
self.output_callback = output_callback
|
||||||
self.tool_output_callback = tool_output_callback
|
self.tool_output_callback = tool_output_callback
|
||||||
|
|||||||
@@ -53,7 +53,6 @@ class AnthropicActor:
|
|||||||
api_response_callback: Callable[[APIResponse[BetaMessage]], None],
|
api_response_callback: Callable[[APIResponse[BetaMessage]], None],
|
||||||
max_tokens: int = 4096,
|
max_tokens: int = 4096,
|
||||||
only_n_most_recent_images: int | None = None,
|
only_n_most_recent_images: int | None = None,
|
||||||
selected_screen: int = 0,
|
|
||||||
print_usage: bool = True,
|
print_usage: bool = True,
|
||||||
):
|
):
|
||||||
self.model = model
|
self.model = model
|
||||||
@@ -62,11 +61,8 @@ class AnthropicActor:
|
|||||||
self.api_response_callback = api_response_callback
|
self.api_response_callback = api_response_callback
|
||||||
self.max_tokens = max_tokens
|
self.max_tokens = max_tokens
|
||||||
self.only_n_most_recent_images = only_n_most_recent_images
|
self.only_n_most_recent_images = only_n_most_recent_images
|
||||||
self.selected_screen = selected_screen
|
|
||||||
|
|
||||||
self.tool_collection = ToolCollection(
|
self.tool_collection = ToolCollection(ComputerTool())
|
||||||
ComputerTool(selected_screen=selected_screen),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.system = SYSTEM_PROMPT
|
self.system = SYSTEM_PROMPT
|
||||||
|
|
||||||
@@ -175,7 +171,7 @@ if __name__ == "__main__":
|
|||||||
# model="claude-3-5-sonnet-20241022",
|
# model="claude-3-5-sonnet-20241022",
|
||||||
# system=SYSTEM_PROMPT,
|
# system=SYSTEM_PROMPT,
|
||||||
# # tools=ToolCollection(
|
# # tools=ToolCollection(
|
||||||
# # ComputerTool(selected_screen=0),
|
# # ComputerTool(),
|
||||||
# # ).to_params(),
|
# # ).to_params(),
|
||||||
# betas=["computer-use-2024-10-22"],
|
# betas=["computer-use-2024-10-22"],
|
||||||
# messages=[
|
# messages=[
|
||||||
|
|||||||
@@ -46,15 +46,13 @@ def sampling_loop_sync(
|
|||||||
api_key: str,
|
api_key: str,
|
||||||
only_n_most_recent_images: int | None = 2,
|
only_n_most_recent_images: int | None = 2,
|
||||||
max_tokens: int = 4096,
|
max_tokens: int = 4096,
|
||||||
selected_screen: int = 0,
|
|
||||||
omniparser_url: str
|
omniparser_url: str
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
|
Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
|
||||||
"""
|
"""
|
||||||
print('in sampling_loop_sync, model:', model)
|
print('in sampling_loop_sync, model:', model)
|
||||||
omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None,
|
omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None)
|
||||||
selected_screen=selected_screen,)
|
|
||||||
if model == "claude-3-5-sonnet-20241022":
|
if model == "claude-3-5-sonnet-20241022":
|
||||||
# Register Actor and Executor
|
# Register Actor and Executor
|
||||||
actor = AnthropicActor(
|
actor = AnthropicActor(
|
||||||
@@ -63,15 +61,13 @@ def sampling_loop_sync(
|
|||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
api_response_callback=api_response_callback,
|
api_response_callback=api_response_callback,
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
only_n_most_recent_images=only_n_most_recent_images,
|
only_n_most_recent_images=only_n_most_recent_images
|
||||||
selected_screen=selected_screen
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# from IPython.core.debugger import Pdb; Pdb().set_trace()
|
# from IPython.core.debugger import Pdb; Pdb().set_trace()
|
||||||
executor = AnthropicExecutor(
|
executor = AnthropicExecutor(
|
||||||
output_callback=output_callback,
|
output_callback=output_callback,
|
||||||
tool_output_callback=tool_output_callback,
|
tool_output_callback=tool_output_callback
|
||||||
selected_screen=selected_screen
|
|
||||||
)
|
)
|
||||||
|
|
||||||
elif model == "omniparser + gpt-4o" or model == "omniparser + phi35v":
|
elif model == "omniparser + gpt-4o" or model == "omniparser + phi35v":
|
||||||
@@ -80,14 +76,12 @@ def sampling_loop_sync(
|
|||||||
provider=provider,
|
provider=provider,
|
||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
api_response_callback=api_response_callback,
|
api_response_callback=api_response_callback,
|
||||||
selected_screen=selected_screen,
|
|
||||||
output_callback=output_callback,
|
output_callback=output_callback,
|
||||||
)
|
)
|
||||||
|
|
||||||
executor = AnthropicExecutor(
|
executor = AnthropicExecutor(
|
||||||
output_callback=output_callback,
|
output_callback=output_callback,
|
||||||
tool_output_callback=tool_output_callback,
|
tool_output_callback=tool_output_callback,
|
||||||
selected_screen=selected_screen
|
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -35,10 +35,8 @@ def extract_data(input_string, data_type):
|
|||||||
|
|
||||||
class OmniParser:
|
class OmniParser:
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
url: str,
|
url: str) -> None:
|
||||||
selected_screen: int = 0) -> None:
|
|
||||||
self.url = url
|
self.url = url
|
||||||
self.selected_screen = selected_screen
|
|
||||||
if not self.url:
|
if not self.url:
|
||||||
config = {
|
config = {
|
||||||
'som_model_path': '../weights/icon_detect_v1_5/model_v1_5.pt',
|
'som_model_path': '../weights/icon_detect_v1_5/model_v1_5.pt',
|
||||||
@@ -51,7 +49,7 @@ class OmniParser:
|
|||||||
self.omniparser = Omniparser_class(config=config)
|
self.omniparser = Omniparser_class(config=config)
|
||||||
|
|
||||||
def __call__(self,):
|
def __call__(self,):
|
||||||
screenshot, screenshot_path = get_screenshot(selected_screen=self.selected_screen)
|
screenshot, screenshot_path = get_screenshot()
|
||||||
screenshot_path = str(screenshot_path)
|
screenshot_path = str(screenshot_path)
|
||||||
image_base64 = encode_image(screenshot_path)
|
image_base64 = encode_image(screenshot_path)
|
||||||
if self.url:
|
if self.url:
|
||||||
@@ -106,7 +104,6 @@ class VLMAgent:
|
|||||||
api_response_callback: Callable,
|
api_response_callback: Callable,
|
||||||
max_tokens: int = 4096,
|
max_tokens: int = 4096,
|
||||||
only_n_most_recent_images: int | None = None,
|
only_n_most_recent_images: int | None = None,
|
||||||
selected_screen: int = 0,
|
|
||||||
print_usage: bool = True,
|
print_usage: bool = True,
|
||||||
):
|
):
|
||||||
if model == "omniparser + gpt-4o":
|
if model == "omniparser + gpt-4o":
|
||||||
@@ -119,7 +116,6 @@ class VLMAgent:
|
|||||||
self.api_response_callback = api_response_callback
|
self.api_response_callback = api_response_callback
|
||||||
self.max_tokens = max_tokens
|
self.max_tokens = max_tokens
|
||||||
self.only_n_most_recent_images = only_n_most_recent_images
|
self.only_n_most_recent_images = only_n_most_recent_images
|
||||||
self.selected_screen = selected_screen
|
|
||||||
self.output_callback = output_callback
|
self.output_callback = output_callback
|
||||||
|
|
||||||
self.print_usage = print_usage
|
self.print_usage = print_usage
|
||||||
|
|||||||
@@ -86,14 +86,13 @@ class ComputerTool(BaseAnthropicTool):
|
|||||||
def to_params(self) -> BetaToolComputerUse20241022Param:
|
def to_params(self) -> BetaToolComputerUse20241022Param:
|
||||||
return {"name": self.name, "type": self.api_type, **self.options}
|
return {"name": self.name, "type": self.api_type, **self.options}
|
||||||
|
|
||||||
def __init__(self, selected_screen: int = 0, is_scaling: bool = False):
|
def __init__(self, is_scaling: bool = False):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
# Get screen width and height using Windows command
|
# Get screen width and height using Windows command
|
||||||
self.display_num = None
|
self.display_num = None
|
||||||
self.offset_x = 0
|
self.offset_x = 0
|
||||||
self.offset_y = 0
|
self.offset_y = 0
|
||||||
self.selected_screen = selected_screen
|
|
||||||
self.is_scaling = is_scaling
|
self.is_scaling = is_scaling
|
||||||
self.width, self.height = self.get_screen_size()
|
self.width, self.height = self.get_screen_size()
|
||||||
print(f"screen size: {self.width}, {self.height}")
|
print(f"screen size: {self.width}, {self.height}")
|
||||||
@@ -253,7 +252,7 @@ class ComputerTool(BaseAnthropicTool):
|
|||||||
screenshot = self.padding_image(screenshot)
|
screenshot = self.padding_image(screenshot)
|
||||||
self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
|
self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
|
||||||
width, height = self.target_dimension["width"], self.target_dimension["height"]
|
width, height = self.target_dimension["width"], self.target_dimension["height"]
|
||||||
screenshot, path = get_screenshot(selected_screen=0, resize=True, target_width=width, target_height=height)
|
screenshot, path = get_screenshot(resize=True, target_width=width, target_height=height)
|
||||||
|
|
||||||
# return ToolResult()
|
# return ToolResult()
|
||||||
return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
|
return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from io import BytesIO
|
|||||||
|
|
||||||
OUTPUT_DIR = "./tmp/outputs"
|
OUTPUT_DIR = "./tmp/outputs"
|
||||||
|
|
||||||
def get_screenshot(selected_screen: int = 0, resize: bool = False, target_width: int = 1920, target_height: int = 1080):
|
def get_screenshot(resize: bool = False, target_width: int = 1920, target_height: int = 1080):
|
||||||
"""Capture screenshot by requesting from HTTP endpoint - returns native resolution unless resized"""
|
"""Capture screenshot by requesting from HTTP endpoint - returns native resolution unless resized"""
|
||||||
output_dir = Path(OUTPUT_DIR)
|
output_dir = Path(OUTPUT_DIR)
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user