Remove the selected_screen parameter, since only one screen is supported

This commit is contained in:
Thomas Dhome Casanova (from Dev Box)
2025-01-22 21:45:32 -08:00
parent b1cd705f1b
commit 8778970aff
7 changed files with 11 additions and 28 deletions

View File

@@ -209,7 +209,6 @@ def process_input(user_input, state):
api_response_callback=partial(_api_response_callback, response_state=state["responses"]), api_response_callback=partial(_api_response_callback, response_state=state["responses"]),
api_key=state["api_key"], api_key=state["api_key"],
only_n_most_recent_images=state["only_n_most_recent_images"], only_n_most_recent_images=state["only_n_most_recent_images"],
selected_screen=0,
omniparser_url=state["omniparser_url"] omniparser_url=state["omniparser_url"]
): ):
if loop_msg is None: if loop_msg is None:

View File

@@ -20,10 +20,9 @@ class AnthropicExecutor:
self, self,
output_callback: Callable[[BetaContentBlockParam], None], output_callback: Callable[[BetaContentBlockParam], None],
tool_output_callback: Callable[[Any, str], None], tool_output_callback: Callable[[Any, str], None],
selected_screen: int = 0
): ):
self.tool_collection = ToolCollection( self.tool_collection = ToolCollection(
ComputerTool(selected_screen=selected_screen) ComputerTool()
) )
self.output_callback = output_callback self.output_callback = output_callback
self.tool_output_callback = tool_output_callback self.tool_output_callback = tool_output_callback

View File

@@ -53,7 +53,6 @@ class AnthropicActor:
api_response_callback: Callable[[APIResponse[BetaMessage]], None], api_response_callback: Callable[[APIResponse[BetaMessage]], None],
max_tokens: int = 4096, max_tokens: int = 4096,
only_n_most_recent_images: int | None = None, only_n_most_recent_images: int | None = None,
selected_screen: int = 0,
print_usage: bool = True, print_usage: bool = True,
): ):
self.model = model self.model = model
@@ -62,11 +61,8 @@ class AnthropicActor:
self.api_response_callback = api_response_callback self.api_response_callback = api_response_callback
self.max_tokens = max_tokens self.max_tokens = max_tokens
self.only_n_most_recent_images = only_n_most_recent_images self.only_n_most_recent_images = only_n_most_recent_images
self.selected_screen = selected_screen
self.tool_collection = ToolCollection( self.tool_collection = ToolCollection(ComputerTool())
ComputerTool(selected_screen=selected_screen),
)
self.system = SYSTEM_PROMPT self.system = SYSTEM_PROMPT
@@ -175,7 +171,7 @@ if __name__ == "__main__":
# model="claude-3-5-sonnet-20241022", # model="claude-3-5-sonnet-20241022",
# system=SYSTEM_PROMPT, # system=SYSTEM_PROMPT,
# # tools=ToolCollection( # # tools=ToolCollection(
# # ComputerTool(selected_screen=0), # # ComputerTool(),
# # ).to_params(), # # ).to_params(),
# betas=["computer-use-2024-10-22"], # betas=["computer-use-2024-10-22"],
# messages=[ # messages=[

View File

@@ -46,15 +46,13 @@ def sampling_loop_sync(
api_key: str, api_key: str,
only_n_most_recent_images: int | None = 2, only_n_most_recent_images: int | None = 2,
max_tokens: int = 4096, max_tokens: int = 4096,
selected_screen: int = 0,
omniparser_url: str omniparser_url: str
): ):
""" """
Synchronous agentic sampling loop for the assistant/tool interaction of computer use. Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
""" """
print('in sampling_loop_sync, model:', model) print('in sampling_loop_sync, model:', model)
omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None, omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None)
selected_screen=selected_screen,)
if model == "claude-3-5-sonnet-20241022": if model == "claude-3-5-sonnet-20241022":
# Register Actor and Executor # Register Actor and Executor
actor = AnthropicActor( actor = AnthropicActor(
@@ -63,15 +61,13 @@ def sampling_loop_sync(
api_key=api_key, api_key=api_key,
api_response_callback=api_response_callback, api_response_callback=api_response_callback,
max_tokens=max_tokens, max_tokens=max_tokens,
only_n_most_recent_images=only_n_most_recent_images, only_n_most_recent_images=only_n_most_recent_images
selected_screen=selected_screen
) )
# from IPython.core.debugger import Pdb; Pdb().set_trace() # from IPython.core.debugger import Pdb; Pdb().set_trace()
executor = AnthropicExecutor( executor = AnthropicExecutor(
output_callback=output_callback, output_callback=output_callback,
tool_output_callback=tool_output_callback, tool_output_callback=tool_output_callback
selected_screen=selected_screen
) )
elif model == "omniparser + gpt-4o" or model == "omniparser + phi35v": elif model == "omniparser + gpt-4o" or model == "omniparser + phi35v":
@@ -80,14 +76,12 @@ def sampling_loop_sync(
provider=provider, provider=provider,
api_key=api_key, api_key=api_key,
api_response_callback=api_response_callback, api_response_callback=api_response_callback,
selected_screen=selected_screen,
output_callback=output_callback, output_callback=output_callback,
) )
executor = AnthropicExecutor( executor = AnthropicExecutor(
output_callback=output_callback, output_callback=output_callback,
tool_output_callback=tool_output_callback, tool_output_callback=tool_output_callback,
selected_screen=selected_screen
) )
else: else:

View File

@@ -35,10 +35,8 @@ def extract_data(input_string, data_type):
class OmniParser: class OmniParser:
def __init__(self, def __init__(self,
url: str, url: str) -> None:
selected_screen: int = 0) -> None:
self.url = url self.url = url
self.selected_screen = selected_screen
if not self.url: if not self.url:
config = { config = {
'som_model_path': '../weights/icon_detect_v1_5/model_v1_5.pt', 'som_model_path': '../weights/icon_detect_v1_5/model_v1_5.pt',
@@ -51,7 +49,7 @@ class OmniParser:
self.omniparser = Omniparser_class(config=config) self.omniparser = Omniparser_class(config=config)
def __call__(self,): def __call__(self,):
screenshot, screenshot_path = get_screenshot(selected_screen=self.selected_screen) screenshot, screenshot_path = get_screenshot()
screenshot_path = str(screenshot_path) screenshot_path = str(screenshot_path)
image_base64 = encode_image(screenshot_path) image_base64 = encode_image(screenshot_path)
if self.url: if self.url:
@@ -106,7 +104,6 @@ class VLMAgent:
api_response_callback: Callable, api_response_callback: Callable,
max_tokens: int = 4096, max_tokens: int = 4096,
only_n_most_recent_images: int | None = None, only_n_most_recent_images: int | None = None,
selected_screen: int = 0,
print_usage: bool = True, print_usage: bool = True,
): ):
if model == "omniparser + gpt-4o": if model == "omniparser + gpt-4o":
@@ -119,7 +116,6 @@ class VLMAgent:
self.api_response_callback = api_response_callback self.api_response_callback = api_response_callback
self.max_tokens = max_tokens self.max_tokens = max_tokens
self.only_n_most_recent_images = only_n_most_recent_images self.only_n_most_recent_images = only_n_most_recent_images
self.selected_screen = selected_screen
self.output_callback = output_callback self.output_callback = output_callback
self.print_usage = print_usage self.print_usage = print_usage

View File

@@ -86,14 +86,13 @@ class ComputerTool(BaseAnthropicTool):
def to_params(self) -> BetaToolComputerUse20241022Param: def to_params(self) -> BetaToolComputerUse20241022Param:
return {"name": self.name, "type": self.api_type, **self.options} return {"name": self.name, "type": self.api_type, **self.options}
def __init__(self, selected_screen: int = 0, is_scaling: bool = False): def __init__(self, is_scaling: bool = False):
super().__init__() super().__init__()
# Get screen width and height using Windows command # Get screen width and height using Windows command
self.display_num = None self.display_num = None
self.offset_x = 0 self.offset_x = 0
self.offset_y = 0 self.offset_y = 0
self.selected_screen = selected_screen
self.is_scaling = is_scaling self.is_scaling = is_scaling
self.width, self.height = self.get_screen_size() self.width, self.height = self.get_screen_size()
print(f"screen size: {self.width}, {self.height}") print(f"screen size: {self.width}, {self.height}")
@@ -253,7 +252,7 @@ class ComputerTool(BaseAnthropicTool):
screenshot = self.padding_image(screenshot) screenshot = self.padding_image(screenshot)
self.target_dimension = MAX_SCALING_TARGETS["WXGA"] self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
width, height = self.target_dimension["width"], self.target_dimension["height"] width, height = self.target_dimension["width"], self.target_dimension["height"]
screenshot, path = get_screenshot(selected_screen=0, resize=True, target_width=width, target_height=height) screenshot, path = get_screenshot(resize=True, target_width=width, target_height=height)
# return ToolResult() # return ToolResult()
return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode()) return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())

View File

@@ -7,7 +7,7 @@ from io import BytesIO
OUTPUT_DIR = "./tmp/outputs" OUTPUT_DIR = "./tmp/outputs"
def get_screenshot(selected_screen: int = 0, resize: bool = False, target_width: int = 1920, target_height: int = 1080): def get_screenshot(resize: bool = False, target_width: int = 1920, target_height: int = 1080):
"""Capture screenshot by requesting from HTTP endpoint - returns native resolution unless resized""" """Capture screenshot by requesting from HTTP endpoint - returns native resolution unless resized"""
output_dir = Path(OUTPUT_DIR) output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)