diff --git a/demo/gradio/app.py b/demo/gradio/app.py index 94b478f..0004e3c 100644 --- a/demo/gradio/app.py +++ b/demo/gradio/app.py @@ -209,7 +209,6 @@ def process_input(user_input, state): api_response_callback=partial(_api_response_callback, response_state=state["responses"]), api_key=state["api_key"], only_n_most_recent_images=state["only_n_most_recent_images"], - selected_screen=0, omniparser_url=state["omniparser_url"] ): if loop_msg is None: diff --git a/demo/gradio/computer_use_demo/executor/anthropic_executor.py b/demo/gradio/computer_use_demo/executor/anthropic_executor.py index 2970be1..a5d5837 100644 --- a/demo/gradio/computer_use_demo/executor/anthropic_executor.py +++ b/demo/gradio/computer_use_demo/executor/anthropic_executor.py @@ -20,10 +20,9 @@ class AnthropicExecutor: self, output_callback: Callable[[BetaContentBlockParam], None], tool_output_callback: Callable[[Any, str], None], - selected_screen: int = 0 ): self.tool_collection = ToolCollection( - ComputerTool(selected_screen=selected_screen) + ComputerTool() ) self.output_callback = output_callback self.tool_output_callback = tool_output_callback diff --git a/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py b/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py index 0e49850..b1a40c0 100644 --- a/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py +++ b/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py @@ -53,7 +53,6 @@ class AnthropicActor: api_response_callback: Callable[[APIResponse[BetaMessage]], None], max_tokens: int = 4096, only_n_most_recent_images: int | None = None, - selected_screen: int = 0, print_usage: bool = True, ): self.model = model @@ -62,11 +61,8 @@ class AnthropicActor: self.api_response_callback = api_response_callback self.max_tokens = max_tokens self.only_n_most_recent_images = only_n_most_recent_images - self.selected_screen = selected_screen - self.tool_collection = ToolCollection( - ComputerTool(selected_screen=selected_screen), - ) + self.tool_collection = ToolCollection(ComputerTool()) self.system = SYSTEM_PROMPT @@ -175,7 +171,7 @@ if __name__ == "__main__": # model="claude-3-5-sonnet-20241022", # system=SYSTEM_PROMPT, # # tools=ToolCollection( - # # ComputerTool(selected_screen=0), + # # ComputerTool(), # # ).to_params(), # betas=["computer-use-2024-10-22"], # messages=[ diff --git a/demo/gradio/computer_use_demo/loop.py b/demo/gradio/computer_use_demo/loop.py index 3e4e3bf..c400ec4 100644 --- a/demo/gradio/computer_use_demo/loop.py +++ b/demo/gradio/computer_use_demo/loop.py @@ -46,15 +46,13 @@ def sampling_loop_sync( api_key: str, only_n_most_recent_images: int | None = 2, max_tokens: int = 4096, - selected_screen: int = 0, omniparser_url: str ): """ Synchronous agentic sampling loop for the assistant/tool interaction of computer use. """ print('in sampling_loop_sync, model:', model) - omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None, - selected_screen=selected_screen,) + omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None) if model == "claude-3-5-sonnet-20241022": # Register Actor and Executor actor = AnthropicActor( @@ -63,15 +61,13 @@ def sampling_loop_sync( api_key=api_key, api_response_callback=api_response_callback, max_tokens=max_tokens, - only_n_most_recent_images=only_n_most_recent_images, - selected_screen=selected_screen + only_n_most_recent_images=only_n_most_recent_images ) # from IPython.core.debugger import Pdb; Pdb().set_trace() executor = AnthropicExecutor( output_callback=output_callback, - tool_output_callback=tool_output_callback, - selected_screen=selected_screen + tool_output_callback=tool_output_callback ) elif model == "omniparser + gpt-4o" or model == "omniparser + phi35v": @@ -80,14 +76,12 @@ def sampling_loop_sync( provider=provider, api_key=api_key, api_response_callback=api_response_callback, - selected_screen=selected_screen, output_callback=output_callback, ) executor = AnthropicExecutor( output_callback=output_callback, tool_output_callback=tool_output_callback, - selected_screen=selected_screen ) else: diff --git a/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py b/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py index 421e196..92a7901 100644 --- a/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py +++ b/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py @@ -35,10 +35,8 @@ def extract_data(input_string, data_type): class OmniParser: def __init__(self, - url: str, - selected_screen: int = 0) -> None: + url: str) -> None: self.url = url - self.selected_screen = selected_screen if not self.url: config = { 'som_model_path': '../weights/icon_detect_v1_5/model_v1_5.pt', @@ -51,7 +49,7 @@ class OmniParser: self.omniparser = Omniparser_class(config=config) def __call__(self,): - screenshot, screenshot_path = get_screenshot(selected_screen=self.selected_screen) + screenshot, screenshot_path = get_screenshot() screenshot_path = str(screenshot_path) image_base64 = encode_image(screenshot_path) if self.url: @@ -106,7 +104,6 @@ class VLMAgent: api_response_callback: Callable, max_tokens: int = 4096, only_n_most_recent_images: int | None = None, - selected_screen: int = 0, print_usage: bool = True, ): if model == "omniparser + gpt-4o": @@ -119,7 +116,6 @@ class VLMAgent: self.api_response_callback = api_response_callback self.max_tokens = max_tokens self.only_n_most_recent_images = only_n_most_recent_images - self.selected_screen = selected_screen self.output_callback = output_callback self.print_usage = print_usage diff --git a/demo/gradio/computer_use_demo/tools/computer.py b/demo/gradio/computer_use_demo/tools/computer.py index 7086a62..6f9fbe6 100644 --- a/demo/gradio/computer_use_demo/tools/computer.py +++ b/demo/gradio/computer_use_demo/tools/computer.py @@ -86,14 +86,13 @@ class ComputerTool(BaseAnthropicTool): def to_params(self) -> BetaToolComputerUse20241022Param: return {"name": self.name, "type": self.api_type, **self.options} - def __init__(self, selected_screen: int = 0, is_scaling: bool = False): + def __init__(self, is_scaling: bool = False): super().__init__() # Get screen width and height using Windows command self.display_num = None self.offset_x = 0 self.offset_y = 0 - self.selected_screen = selected_screen self.is_scaling = is_scaling self.width, self.height = self.get_screen_size() print(f"screen size: {self.width}, {self.height}") @@ -253,7 +252,7 @@ class ComputerTool(BaseAnthropicTool): screenshot = self.padding_image(screenshot) self.target_dimension = MAX_SCALING_TARGETS["WXGA"] width, height = self.target_dimension["width"], self.target_dimension["height"] - screenshot, path = get_screenshot(selected_screen=0, resize=True, target_width=width, target_height=height) + screenshot, path = get_screenshot(resize=True, target_width=width, target_height=height) # return ToolResult() return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode()) diff --git a/demo/gradio/computer_use_demo/tools/screen_capture.py b/demo/gradio/computer_use_demo/tools/screen_capture.py index 40529d3..1c1ad04 100644 --- a/demo/gradio/computer_use_demo/tools/screen_capture.py +++ b/demo/gradio/computer_use_demo/tools/screen_capture.py @@ -7,7 +7,7 @@ from io import BytesIO OUTPUT_DIR = "./tmp/outputs" -def get_screenshot(selected_screen: int = 0, resize: bool = False, target_width: int = 1920, target_height: int = 1080): +def get_screenshot(resize: bool = False, target_width: int = 1920, target_height: int = 1080): """Capture screenshot by requesting from HTTP endpoint - returns native resolution unless resized""" output_dir = Path(OUTPUT_DIR) output_dir.mkdir(parents=True, exist_ok=True)