fixes
This commit is contained in:
@@ -12,6 +12,7 @@ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, B
|
|||||||
|
|
||||||
from agent.llm_utils.oai import run_oai_interleaved
|
from agent.llm_utils.oai import run_oai_interleaved
|
||||||
from agent.llm_utils.groqclient import run_groq_interleaved
|
from agent.llm_utils.groqclient import run_groq_interleaved
|
||||||
|
from agent.llm_utils.utils import is_image_path
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
|
|
||||||
@@ -79,10 +80,8 @@ class VLMAgent:
|
|||||||
|
|
||||||
# drop looping actions msg, byte image etc
|
# drop looping actions msg, byte image etc
|
||||||
planner_messages = messages
|
planner_messages = messages
|
||||||
planner_messages = _keep_latest_images(planner_messages)
|
_remove_som_images(planner_messages)
|
||||||
# if self.only_n_most_recent_images:
|
_maybe_filter_to_n_most_recent_images(planner_messages, self.only_n_most_recent_images)
|
||||||
# _maybe_filter_to_n_most_recent_images(planner_messages, self.only_n_most_recent_images)
|
|
||||||
# print(f"filtered_messages: {planner_messages}\n\n", "full messages:", messages)
|
|
||||||
|
|
||||||
if isinstance(planner_messages[-1], dict):
|
if isinstance(planner_messages[-1], dict):
|
||||||
if not isinstance(planner_messages[-1]["content"], list):
|
if not isinstance(planner_messages[-1]["content"], list):
|
||||||
@@ -103,9 +102,8 @@ class VLMAgent:
|
|||||||
)
|
)
|
||||||
print(f"oai token usage: {token_usage}")
|
print(f"oai token usage: {token_usage}")
|
||||||
self.total_token_usage += token_usage
|
self.total_token_usage += token_usage
|
||||||
self.total_cost += (token_usage * 0.15 / 1000000) # https://openai.com/api/pricing/
|
self.total_cost += (token_usage * 2.5 / 1000000) # https://openai.com/api/pricing/
|
||||||
elif "r1" in self.model:
|
elif "r1" in self.model:
|
||||||
print(f"Sending messages to Groq: {planner_messages}")
|
|
||||||
vlm_response, token_usage = run_groq_interleaved(
|
vlm_response, token_usage = run_groq_interleaved(
|
||||||
messages=planner_messages,
|
messages=planner_messages,
|
||||||
system=system,
|
system=system,
|
||||||
@@ -184,11 +182,10 @@ class VLMAgent:
|
|||||||
if vlm_response_json["Next Action"] == "None":
|
if vlm_response_json["Next Action"] == "None":
|
||||||
print("Task paused/completed.")
|
print("Task paused/completed.")
|
||||||
elif vlm_response_json["Next Action"] == "type":
|
elif vlm_response_json["Next Action"] == "type":
|
||||||
click_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}', input={'action': 'left_click'}, name='computer', type='tool_use')
|
|
||||||
sim_content_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}',
|
sim_content_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}',
|
||||||
input={'action': vlm_response_json["Next Action"], 'text': vlm_response_json["value"]},
|
input={'action': vlm_response_json["Next Action"], 'text': vlm_response_json["value"]},
|
||||||
name='computer', type='tool_use')
|
name='computer', type='tool_use')
|
||||||
response_content.extend([click_block, sim_content_block])
|
response_content.append(sim_content_block)
|
||||||
else:
|
else:
|
||||||
sim_content_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}',
|
sim_content_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}',
|
||||||
input={'action': vlm_response_json["Next Action"]},
|
input={'action': vlm_response_json["Next Action"]},
|
||||||
@@ -212,16 +209,16 @@ You should carefully consider your plan base on the task, screenshot, and histor
|
|||||||
Here is the list of all detected bounding boxes by IDs on the screen and their description:{screen_info}
|
Here is the list of all detected bounding boxes by IDs on the screen and their description:{screen_info}
|
||||||
|
|
||||||
Your available "Next Action" only include:
|
Your available "Next Action" only include:
|
||||||
- type: move mouse to box id, left clicks and types a string of text.
|
- type: types a string of text.
|
||||||
- left_click: move mouse to box id and left clicks
|
- left_click: move mouse to box id and left clicks.
|
||||||
- right_click: move mouse to box id and right clicks
|
- right_click: move mouse to box id and right clicks.
|
||||||
- double_click: move mouse to box id and double clicks
|
- double_click: move mouse to box id and double clicks.
|
||||||
- hover: move mouse to box id
|
- hover: move mouse to box id.
|
||||||
- scroll_up: scrolls the screen up.
|
- scroll_up: scrolls the screen up.
|
||||||
- scroll_down: scrolls the screen down.
|
- scroll_down: scrolls the screen down.
|
||||||
- wait: waits for 1 second for the device to load or respond.
|
- wait: waits for 1 second for the device to load or respond.
|
||||||
|
|
||||||
Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on, and the value (if the action is 'type') in order to complete the task.
|
Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on (if action is not 'type', 'hover', 'scroll_up', 'scroll_down', 'wait'), and the value (if the action is 'type') in order to complete the task.
|
||||||
|
|
||||||
Output format:
|
Output format:
|
||||||
```json
|
```json
|
||||||
@@ -275,14 +272,14 @@ IMPORTANT NOTES:
|
|||||||
|
|
||||||
return main_section
|
return main_section
|
||||||
|
|
||||||
def _keep_latest_images(messages):
|
def _remove_som_images(messages):
|
||||||
for i in range(len(messages)-1):
|
for msg in messages:
|
||||||
if isinstance(messages[i]["content"], list):
|
msg_content = msg["content"]
|
||||||
for cnt in messages[i]["content"]:
|
if isinstance(msg_content, list):
|
||||||
if isinstance(cnt, str):
|
msg["content"] = [
|
||||||
if cnt.endswith((".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")):
|
cnt for cnt in msg_content
|
||||||
messages[i]["content"].remove(cnt)
|
if not (isinstance(cnt, str) and 'som' in cnt and is_image_path(cnt))
|
||||||
return messages
|
]
|
||||||
|
|
||||||
|
|
||||||
def _maybe_filter_to_n_most_recent_images(
|
def _maybe_filter_to_n_most_recent_images(
|
||||||
@@ -293,42 +290,43 @@ def _maybe_filter_to_n_most_recent_images(
|
|||||||
"""
|
"""
|
||||||
With the assumption that images are screenshots that are of diminishing value as
|
With the assumption that images are screenshots that are of diminishing value as
|
||||||
the conversation progresses, remove all but the final `images_to_keep` tool_result
|
the conversation progresses, remove all but the final `images_to_keep` tool_result
|
||||||
images in place, with a chunk of min_removal_threshold to reduce the amount we
|
images in place
|
||||||
break the implicit prompt cache.
|
|
||||||
"""
|
"""
|
||||||
if images_to_keep is None:
|
if images_to_keep is None:
|
||||||
return messages
|
return messages
|
||||||
|
|
||||||
tool_result_blocks = cast(
|
total_images = 0
|
||||||
list[ToolResultBlockParam],
|
for msg in messages:
|
||||||
[
|
for cnt in msg.get("content", []):
|
||||||
item
|
if isinstance(cnt, str) and is_image_path(cnt):
|
||||||
for message in messages
|
total_images += 1
|
||||||
for item in (
|
elif isinstance(cnt, dict) and cnt.get("type") == "tool_result":
|
||||||
message["content"] if isinstance(message["content"], list) else []
|
for content in cnt.get("content", []):
|
||||||
)
|
if isinstance(content, dict) and content.get("type") == "image":
|
||||||
if isinstance(item, dict) and item.get("type") == "tool_result"
|
total_images += 1
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
total_images = sum(
|
|
||||||
1
|
|
||||||
for tool_result in tool_result_blocks
|
|
||||||
for content in tool_result.get("content", [])
|
|
||||||
if isinstance(content, dict) and content.get("type") == "image"
|
|
||||||
)
|
|
||||||
|
|
||||||
images_to_remove = total_images - images_to_keep
|
images_to_remove = total_images - images_to_keep
|
||||||
# for better cache behavior, we want to remove in chunks
|
|
||||||
images_to_remove -= images_to_remove % min_removal_threshold
|
|
||||||
|
|
||||||
for tool_result in tool_result_blocks:
|
for msg in messages:
|
||||||
if isinstance(tool_result.get("content"), list):
|
msg_content = msg["content"]
|
||||||
|
if isinstance(msg_content, list):
|
||||||
new_content = []
|
new_content = []
|
||||||
for content in tool_result.get("content", []):
|
for cnt in msg_content:
|
||||||
if isinstance(content, dict) and content.get("type") == "image":
|
# Remove images from SOM or screenshot as needed
|
||||||
|
if isinstance(cnt, str) and is_image_path(cnt):
|
||||||
if images_to_remove > 0:
|
if images_to_remove > 0:
|
||||||
images_to_remove -= 1
|
images_to_remove -= 1
|
||||||
continue
|
continue
|
||||||
new_content.append(content)
|
# VLM shouldn't use anthropic screenshot tool so shouldn't have these but in case it does, remove as needed
|
||||||
tool_result["content"] = new_content
|
elif isinstance(cnt, dict) and cnt.get("type") == "tool_result":
|
||||||
|
new_tool_result_content = []
|
||||||
|
for tool_result_entry in cnt.get("content", []):
|
||||||
|
if isinstance(tool_result_entry, dict) and tool_result_entry.get("type") == "image":
|
||||||
|
if images_to_remove > 0:
|
||||||
|
images_to_remove -= 1
|
||||||
|
continue
|
||||||
|
new_tool_result_content.append(tool_result_entry)
|
||||||
|
cnt["content"] = new_tool_result_content
|
||||||
|
# Append fixed content to current message's content list
|
||||||
|
new_content.append(cnt)
|
||||||
|
msg["content"] = new_content
|
||||||
@@ -170,8 +170,7 @@ class ComputerTool(BaseAnthropicTool):
|
|||||||
return ToolResult(output=f"Pressed keys: {text}")
|
return ToolResult(output=f"Pressed keys: {text}")
|
||||||
|
|
||||||
elif action == "type":
|
elif action == "type":
|
||||||
self.send_to_vm(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})") # Convert ms to seconds
|
self.send_to_vm(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})")
|
||||||
self.send_to_vm("pyautogui.press('enter')")
|
|
||||||
screenshot_base64 = (await self.screenshot()).base64_image
|
screenshot_base64 = (await self.screenshot()).base64_image
|
||||||
return ToolResult(output=text, base64_image=screenshot_base64)
|
return ToolResult(output=text, base64_image=screenshot_base64)
|
||||||
|
|
||||||
@@ -261,11 +260,8 @@ class ComputerTool(BaseAnthropicTool):
|
|||||||
width, height = self.target_dimension["width"], self.target_dimension["height"]
|
width, height = self.target_dimension["width"], self.target_dimension["height"]
|
||||||
screenshot, path = get_screenshot(resize=True, target_width=width, target_height=height)
|
screenshot, path = get_screenshot(resize=True, target_width=width, target_height=height)
|
||||||
time.sleep(0.7) # avoid async error as actions take time to complete
|
time.sleep(0.7) # avoid async error as actions take time to complete
|
||||||
# return ToolResult()
|
|
||||||
return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
|
return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
|
||||||
|
|
||||||
raise ToolError(f"Failed to take screenshot: {path} does not exist.")
|
|
||||||
|
|
||||||
def padding_image(self, screenshot):
|
def padding_image(self, screenshot):
|
||||||
"""Pad the screenshot to 16:10 aspect ratio, when the aspect ratio is not 16:10."""
|
"""Pad the screenshot to 16:10 aspect ratio, when the aspect ratio is not 16:10."""
|
||||||
_, height = screenshot.size
|
_, height = screenshot.size
|
||||||
|
|||||||
Reference in New Issue
Block a user