Remove the `selected_screen` parameter, as there is only one screen
This commit is contained in:
@@ -209,7 +209,6 @@ def process_input(user_input, state):
|
||||
api_response_callback=partial(_api_response_callback, response_state=state["responses"]),
|
||||
api_key=state["api_key"],
|
||||
only_n_most_recent_images=state["only_n_most_recent_images"],
|
||||
selected_screen=0,
|
||||
omniparser_url=state["omniparser_url"]
|
||||
):
|
||||
if loop_msg is None:
|
||||
|
||||
@@ -20,10 +20,9 @@ class AnthropicExecutor:
|
||||
self,
|
||||
output_callback: Callable[[BetaContentBlockParam], None],
|
||||
tool_output_callback: Callable[[Any, str], None],
|
||||
selected_screen: int = 0
|
||||
):
|
||||
self.tool_collection = ToolCollection(
|
||||
ComputerTool(selected_screen=selected_screen)
|
||||
ComputerTool()
|
||||
)
|
||||
self.output_callback = output_callback
|
||||
self.tool_output_callback = tool_output_callback
|
||||
|
||||
@@ -53,7 +53,6 @@ class AnthropicActor:
|
||||
api_response_callback: Callable[[APIResponse[BetaMessage]], None],
|
||||
max_tokens: int = 4096,
|
||||
only_n_most_recent_images: int | None = None,
|
||||
selected_screen: int = 0,
|
||||
print_usage: bool = True,
|
||||
):
|
||||
self.model = model
|
||||
@@ -62,11 +61,8 @@ class AnthropicActor:
|
||||
self.api_response_callback = api_response_callback
|
||||
self.max_tokens = max_tokens
|
||||
self.only_n_most_recent_images = only_n_most_recent_images
|
||||
self.selected_screen = selected_screen
|
||||
|
||||
self.tool_collection = ToolCollection(
|
||||
ComputerTool(selected_screen=selected_screen),
|
||||
)
|
||||
self.tool_collection = ToolCollection(ComputerTool())
|
||||
|
||||
self.system = SYSTEM_PROMPT
|
||||
|
||||
@@ -175,7 +171,7 @@ if __name__ == "__main__":
|
||||
# model="claude-3-5-sonnet-20241022",
|
||||
# system=SYSTEM_PROMPT,
|
||||
# # tools=ToolCollection(
|
||||
# # ComputerTool(selected_screen=0),
|
||||
# # ComputerTool(),
|
||||
# # ).to_params(),
|
||||
# betas=["computer-use-2024-10-22"],
|
||||
# messages=[
|
||||
|
||||
@@ -46,15 +46,13 @@ def sampling_loop_sync(
|
||||
api_key: str,
|
||||
only_n_most_recent_images: int | None = 2,
|
||||
max_tokens: int = 4096,
|
||||
selected_screen: int = 0,
|
||||
omniparser_url: str
|
||||
):
|
||||
"""
|
||||
Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
|
||||
"""
|
||||
print('in sampling_loop_sync, model:', model)
|
||||
omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None,
|
||||
selected_screen=selected_screen,)
|
||||
omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None)
|
||||
if model == "claude-3-5-sonnet-20241022":
|
||||
# Register Actor and Executor
|
||||
actor = AnthropicActor(
|
||||
@@ -63,15 +61,13 @@ def sampling_loop_sync(
|
||||
api_key=api_key,
|
||||
api_response_callback=api_response_callback,
|
||||
max_tokens=max_tokens,
|
||||
only_n_most_recent_images=only_n_most_recent_images,
|
||||
selected_screen=selected_screen
|
||||
only_n_most_recent_images=only_n_most_recent_images
|
||||
)
|
||||
|
||||
# from IPython.core.debugger import Pdb; Pdb().set_trace()
|
||||
executor = AnthropicExecutor(
|
||||
output_callback=output_callback,
|
||||
tool_output_callback=tool_output_callback,
|
||||
selected_screen=selected_screen
|
||||
tool_output_callback=tool_output_callback
|
||||
)
|
||||
|
||||
elif model == "omniparser + gpt-4o" or model == "omniparser + phi35v":
|
||||
@@ -80,14 +76,12 @@ def sampling_loop_sync(
|
||||
provider=provider,
|
||||
api_key=api_key,
|
||||
api_response_callback=api_response_callback,
|
||||
selected_screen=selected_screen,
|
||||
output_callback=output_callback,
|
||||
)
|
||||
|
||||
executor = AnthropicExecutor(
|
||||
output_callback=output_callback,
|
||||
tool_output_callback=tool_output_callback,
|
||||
selected_screen=selected_screen
|
||||
)
|
||||
|
||||
else:
|
||||
|
||||
@@ -35,10 +35,8 @@ def extract_data(input_string, data_type):
|
||||
|
||||
class OmniParser:
|
||||
def __init__(self,
|
||||
url: str,
|
||||
selected_screen: int = 0) -> None:
|
||||
url: str) -> None:
|
||||
self.url = url
|
||||
self.selected_screen = selected_screen
|
||||
if not self.url:
|
||||
config = {
|
||||
'som_model_path': '../weights/icon_detect_v1_5/model_v1_5.pt',
|
||||
@@ -51,7 +49,7 @@ class OmniParser:
|
||||
self.omniparser = Omniparser_class(config=config)
|
||||
|
||||
def __call__(self,):
|
||||
screenshot, screenshot_path = get_screenshot(selected_screen=self.selected_screen)
|
||||
screenshot, screenshot_path = get_screenshot()
|
||||
screenshot_path = str(screenshot_path)
|
||||
image_base64 = encode_image(screenshot_path)
|
||||
if self.url:
|
||||
@@ -106,7 +104,6 @@ class VLMAgent:
|
||||
api_response_callback: Callable,
|
||||
max_tokens: int = 4096,
|
||||
only_n_most_recent_images: int | None = None,
|
||||
selected_screen: int = 0,
|
||||
print_usage: bool = True,
|
||||
):
|
||||
if model == "omniparser + gpt-4o":
|
||||
@@ -119,7 +116,6 @@ class VLMAgent:
|
||||
self.api_response_callback = api_response_callback
|
||||
self.max_tokens = max_tokens
|
||||
self.only_n_most_recent_images = only_n_most_recent_images
|
||||
self.selected_screen = selected_screen
|
||||
self.output_callback = output_callback
|
||||
|
||||
self.print_usage = print_usage
|
||||
|
||||
@@ -86,14 +86,13 @@ class ComputerTool(BaseAnthropicTool):
|
||||
def to_params(self) -> BetaToolComputerUse20241022Param:
|
||||
return {"name": self.name, "type": self.api_type, **self.options}
|
||||
|
||||
def __init__(self, selected_screen: int = 0, is_scaling: bool = False):
|
||||
def __init__(self, is_scaling: bool = False):
|
||||
super().__init__()
|
||||
|
||||
# Get screen width and height using Windows command
|
||||
self.display_num = None
|
||||
self.offset_x = 0
|
||||
self.offset_y = 0
|
||||
self.selected_screen = selected_screen
|
||||
self.is_scaling = is_scaling
|
||||
self.width, self.height = self.get_screen_size()
|
||||
print(f"screen size: {self.width}, {self.height}")
|
||||
@@ -253,7 +252,7 @@ class ComputerTool(BaseAnthropicTool):
|
||||
screenshot = self.padding_image(screenshot)
|
||||
self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
|
||||
width, height = self.target_dimension["width"], self.target_dimension["height"]
|
||||
screenshot, path = get_screenshot(selected_screen=0, resize=True, target_width=width, target_height=height)
|
||||
screenshot, path = get_screenshot(resize=True, target_width=width, target_height=height)
|
||||
|
||||
# return ToolResult()
|
||||
return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
|
||||
|
||||
@@ -7,7 +7,7 @@ from io import BytesIO
|
||||
|
||||
OUTPUT_DIR = "./tmp/outputs"
|
||||
|
||||
def get_screenshot(selected_screen: int = 0, resize: bool = False, target_width: int = 1920, target_height: int = 1080):
|
||||
def get_screenshot(resize: bool = False, target_width: int = 1920, target_height: int = 1080):
|
||||
"""Capture screenshot by requesting from HTTP endpoint - returns native resolution unless resized"""
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
Reference in New Issue
Block a user