Remove the selected_screen parameter, as only one screen is supported

This commit is contained in:
Thomas Dhome Casanova (from Dev Box)
2025-01-22 21:45:32 -08:00
parent b1cd705f1b
commit 8778970aff
7 changed files with 11 additions and 28 deletions

View File

@@ -209,7 +209,6 @@ def process_input(user_input, state):
api_response_callback=partial(_api_response_callback, response_state=state["responses"]),
api_key=state["api_key"],
only_n_most_recent_images=state["only_n_most_recent_images"],
selected_screen=0,
omniparser_url=state["omniparser_url"]
):
if loop_msg is None:

View File

@@ -20,10 +20,9 @@ class AnthropicExecutor:
self,
output_callback: Callable[[BetaContentBlockParam], None],
tool_output_callback: Callable[[Any, str], None],
selected_screen: int = 0
):
self.tool_collection = ToolCollection(
ComputerTool(selected_screen=selected_screen)
ComputerTool()
)
self.output_callback = output_callback
self.tool_output_callback = tool_output_callback

View File

@@ -53,7 +53,6 @@ class AnthropicActor:
api_response_callback: Callable[[APIResponse[BetaMessage]], None],
max_tokens: int = 4096,
only_n_most_recent_images: int | None = None,
selected_screen: int = 0,
print_usage: bool = True,
):
self.model = model
@@ -62,11 +61,8 @@ class AnthropicActor:
self.api_response_callback = api_response_callback
self.max_tokens = max_tokens
self.only_n_most_recent_images = only_n_most_recent_images
self.selected_screen = selected_screen
self.tool_collection = ToolCollection(
ComputerTool(selected_screen=selected_screen),
)
self.tool_collection = ToolCollection(ComputerTool())
self.system = SYSTEM_PROMPT
@@ -175,7 +171,7 @@ if __name__ == "__main__":
# model="claude-3-5-sonnet-20241022",
# system=SYSTEM_PROMPT,
# # tools=ToolCollection(
# # ComputerTool(selected_screen=0),
# # ComputerTool(),
# # ).to_params(),
# betas=["computer-use-2024-10-22"],
# messages=[

View File

@@ -46,15 +46,13 @@ def sampling_loop_sync(
api_key: str,
only_n_most_recent_images: int | None = 2,
max_tokens: int = 4096,
selected_screen: int = 0,
omniparser_url: str
):
"""
Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
"""
print('in sampling_loop_sync, model:', model)
omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None,
selected_screen=selected_screen,)
omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None)
if model == "claude-3-5-sonnet-20241022":
# Register Actor and Executor
actor = AnthropicActor(
@@ -63,15 +61,13 @@ def sampling_loop_sync(
api_key=api_key,
api_response_callback=api_response_callback,
max_tokens=max_tokens,
only_n_most_recent_images=only_n_most_recent_images,
selected_screen=selected_screen
only_n_most_recent_images=only_n_most_recent_images
)
# from IPython.core.debugger import Pdb; Pdb().set_trace()
executor = AnthropicExecutor(
output_callback=output_callback,
tool_output_callback=tool_output_callback,
selected_screen=selected_screen
tool_output_callback=tool_output_callback
)
elif model == "omniparser + gpt-4o" or model == "omniparser + phi35v":
@@ -80,14 +76,12 @@ def sampling_loop_sync(
provider=provider,
api_key=api_key,
api_response_callback=api_response_callback,
selected_screen=selected_screen,
output_callback=output_callback,
)
executor = AnthropicExecutor(
output_callback=output_callback,
tool_output_callback=tool_output_callback,
selected_screen=selected_screen
)
else:

View File

@@ -35,10 +35,8 @@ def extract_data(input_string, data_type):
class OmniParser:
def __init__(self,
url: str,
selected_screen: int = 0) -> None:
url: str) -> None:
self.url = url
self.selected_screen = selected_screen
if not self.url:
config = {
'som_model_path': '../weights/icon_detect_v1_5/model_v1_5.pt',
@@ -51,7 +49,7 @@ class OmniParser:
self.omniparser = Omniparser_class(config=config)
def __call__(self,):
screenshot, screenshot_path = get_screenshot(selected_screen=self.selected_screen)
screenshot, screenshot_path = get_screenshot()
screenshot_path = str(screenshot_path)
image_base64 = encode_image(screenshot_path)
if self.url:
@@ -106,7 +104,6 @@ class VLMAgent:
api_response_callback: Callable,
max_tokens: int = 4096,
only_n_most_recent_images: int | None = None,
selected_screen: int = 0,
print_usage: bool = True,
):
if model == "omniparser + gpt-4o":
@@ -119,7 +116,6 @@ class VLMAgent:
self.api_response_callback = api_response_callback
self.max_tokens = max_tokens
self.only_n_most_recent_images = only_n_most_recent_images
self.selected_screen = selected_screen
self.output_callback = output_callback
self.print_usage = print_usage

View File

@@ -86,14 +86,13 @@ class ComputerTool(BaseAnthropicTool):
def to_params(self) -> BetaToolComputerUse20241022Param:
return {"name": self.name, "type": self.api_type, **self.options}
def __init__(self, selected_screen: int = 0, is_scaling: bool = False):
def __init__(self, is_scaling: bool = False):
super().__init__()
# Get screen width and height using Windows command
self.display_num = None
self.offset_x = 0
self.offset_y = 0
self.selected_screen = selected_screen
self.is_scaling = is_scaling
self.width, self.height = self.get_screen_size()
print(f"screen size: {self.width}, {self.height}")
@@ -253,7 +252,7 @@ class ComputerTool(BaseAnthropicTool):
screenshot = self.padding_image(screenshot)
self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
width, height = self.target_dimension["width"], self.target_dimension["height"]
screenshot, path = get_screenshot(selected_screen=0, resize=True, target_width=width, target_height=height)
screenshot, path = get_screenshot(resize=True, target_width=width, target_height=height)
# return ToolResult()
return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())

View File

@@ -7,7 +7,7 @@ from io import BytesIO
OUTPUT_DIR = "./tmp/outputs"
def get_screenshot(selected_screen: int = 0, resize: bool = False, target_width: int = 1920, target_height: int = 1080):
def get_screenshot(resize: bool = False, target_width: int = 1920, target_height: int = 1080):
"""Capture screenshot by requesting from HTTP endpoint - returns native resolution unless resized"""
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(parents=True, exist_ok=True)