docker demo, migration, speedup inference using cv2

2025-01-04 20:06:33 -08:00
parent d0c163cd02
commit b9d3cb715b
36 changed files with 5842 additions and 2456 deletions
--- a/demo/gui_agent/anthropic_agent.py
+++ b/demo/gui_agent/anthropic_agent.py
@@ -0,0 +1,207 @@
+"""
+Agentic sampling loop that calls the Anthropic API and local implenmentation of anthropic-defined computer use tools.
+"""
+import asyncio
+import platform
+from collections.abc import Callable
+from datetime import datetime
+from enum import StrEnum
+from typing import Any, cast
+
+from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
+from anthropic.types import (
+    ToolResultBlockParam,
+)
+from anthropic.types.beta import (
+    BetaContentBlock,
+    BetaContentBlockParam,
+    BetaImageBlockParam,
+    BetaMessage,
+    BetaMessageParam,
+    BetaTextBlockParam,
+    BetaToolResultBlockParam,
+)
+from anthropic.types import TextBlock
+from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
+
+from tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
+
+from PIL import Image
+from io import BytesIO
+import gradio as gr
+from typing import Dict
+
+
+BETA_FLAG = "computer-use-2024-10-22"
+
+
+class APIProvider(StrEnum):
+    ANTHROPIC = "anthropic"
+    BEDROCK = "bedrock"
+    VERTEX = "vertex"
+
+
+PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
+    APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
+    APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
+    APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
+}
+
+
+# Check OS
+
+SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
+* You are utilizing a Windows system with internet access.
+* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
+</SYSTEM_CAPABILITY>
+"""
+
+
+class AnthropicActor:
+    def __init__(
+        self, 
+        model: str, 
+        provider: APIProvider, 
+        system_prompt_suffix: str, 
+        api_key: str,
+        api_response_callback: Callable[[APIResponse[BetaMessage]], None],
+        max_tokens: int = 4096,
+        only_n_most_recent_images: int | None = None,
+        selected_screen: int = 0,
+        print_usage: bool = True,
+    ):
+        self.model = model
+        self.provider = provider
+        self.system_prompt_suffix = system_prompt_suffix
+        self.api_key = api_key
+        self.api_response_callback = api_response_callback
+        self.max_tokens = max_tokens
+        self.only_n_most_recent_images = only_n_most_recent_images
+        self.selected_screen = selected_screen
+        
+        self.tool_collection = ToolCollection(
+            ComputerTool(selected_screen=selected_screen),
+            BashTool(),
+            EditTool(),
+        )
+
+        self.system = (
+            f"{SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}"
+        )
+        
+        self.total_token_usage = 0
+        self.total_cost = 0
+        self.print_usage = print_usage
+
+        # Instantiate the appropriate API client based on the provider
+        if provider == APIProvider.ANTHROPIC:
+            self.client = Anthropic(api_key=api_key)
+        elif provider == APIProvider.VERTEX:
+            self.client = AnthropicVertex()
+        elif provider == APIProvider.BEDROCK:
+            self.client = AnthropicBedrock()
+
+    def __call__(
+        self, 
+        *,
+        messages: list[BetaMessageParam]
+    ):
+        """
+        Generate a response given history messages.
+        """
+        if self.only_n_most_recent_images:
+            _maybe_filter_to_n_most_recent_images(messages, self.only_n_most_recent_images)
+
+        # Call the API synchronously
+        raw_response = self.client.beta.messages.with_raw_response.create(
+            max_tokens=self.max_tokens,
+            messages=messages,
+            model=self.model,
+            system=self.system,
+            tools=self.tool_collection.to_params(),
+            betas=["computer-use-2024-10-22"],
+        )
+
+        self.api_response_callback(cast(APIResponse[BetaMessage], raw_response))
+
+        response = raw_response.parse()
+        print(f"AnthropicActor response: {response}")
+
+        self.total_token_usage += response.usage.input_tokens + response.usage.output_tokens
+        self.total_cost += (response.usage.input_tokens * 3 / 1000000 + response.usage.output_tokens * 15 / 1000000)
+        
+        if self.print_usage:
+            print(f"Claude total token usage so far: {self.total_token_usage}, total cost so far: $USD{self.total_cost}")
+        
+        return response
+
+
+def _maybe_filter_to_n_most_recent_images(
+    messages: list[BetaMessageParam],
+    images_to_keep: int,
+    min_removal_threshold: int = 10,
+):
+    """
+    With the assumption that images are screenshots that are of diminishing value as
+    the conversation progresses, remove all but the final `images_to_keep` tool_result
+    images in place, with a chunk of min_removal_threshold to reduce the amount we
+    break the implicit prompt cache.
+    """
+    if images_to_keep is None:
+        return messages
+
+    tool_result_blocks = cast(
+        list[ToolResultBlockParam],
+        [
+            item
+            for message in messages
+            for item in (
+                message["content"] if isinstance(message["content"], list) else []
+            )
+            if isinstance(item, dict) and item.get("type") == "tool_result"
+        ],
+    )
+
+    total_images = sum(
+        1
+        for tool_result in tool_result_blocks
+        for content in tool_result.get("content", [])
+        if isinstance(content, dict) and content.get("type") == "image"
+    )
+
+    images_to_remove = total_images - images_to_keep
+    # for better cache behavior, we want to remove in chunks
+    images_to_remove -= images_to_remove % min_removal_threshold
+
+    for tool_result in tool_result_blocks:
+        if isinstance(tool_result.get("content"), list):
+            new_content = []
+            for content in tool_result.get("content", []):
+                if isinstance(content, dict) and content.get("type") == "image":
+                    if images_to_remove > 0:
+                        images_to_remove -= 1
+                        continue
+                new_content.append(content)
+            tool_result["content"] = new_content
+            
+            
+
+if __name__ == "__main__":
+    pass
+    # client = Anthropic(api_key="")
+    # response = client.beta.messages.with_raw_response.create(
+    #     max_tokens=4096,
+    #     model="claude-3-5-sonnet-20241022",
+    #     system=SYSTEM_PROMPT,
+    #     # tools=ToolCollection(
+    #     #     ComputerTool(selected_screen=0),
+    #     #     BashTool(),
+    #     #     EditTool(),
+    #     # ).to_params(),
+    #     betas=["computer-use-2024-10-22"],
+    #     messages=[
+    #         {"role": "user", "content": "click on (199, 199)."}
+    #     ],
+    # )
+    
+    # print(f"AnthropicActor response: {response.parse().usage.input_tokens+response.parse().usage.output_tokens}")
--- a/demo/gui_agent/llm_utils/llm_utils.py
+++ b/demo/gui_agent/llm_utils/llm_utils.py
@@ -0,0 +1,109 @@
+import os
+import re
+import ast
+import base64
+
+
+def is_image_path(text):
+    # Checking if the input text ends with typical image file extensions
+    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
+    if text.endswith(image_extensions):
+        return True
+    else:
+        return False
+
+
+def encode_image(image_path):
+    """Encode image file to base64."""
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+
+def is_url_or_filepath(input_string):
+    # Check if input_string is a URL
+    url_pattern = re.compile(
+        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
+    )
+    if url_pattern.match(input_string):
+        return "URL"
+
+    # Check if input_string is a file path
+    file_path = os.path.abspath(input_string)
+    if os.path.exists(file_path):
+        return "File path"
+
+    return "Invalid"
+
+
+def extract_data(input_string, data_type):
+    # Regular expression to extract content starting from '```python' until the end if there are no closing backticks
+    pattern = f"```{data_type}" + r"(.*?)(```|$)"
+    # Extract content
+    # re.DOTALL allows '.' to match newlines as well
+    matches = re.findall(pattern, input_string, re.DOTALL)
+    # Return the first match if exists, trimming whitespace and ignoring potential closing backticks
+    return matches[0][0].strip() if matches else input_string
+
+
+def parse_input(code):
+    """Use AST to parse the input string and extract the function name, arguments, and keyword arguments."""
+
+    def get_target_names(target):
+        """Recursively get all variable names from the assignment target."""
+        if isinstance(target, ast.Name):
+            return [target.id]
+        elif isinstance(target, ast.Tuple):
+            names = []
+            for elt in target.elts:
+                names.extend(get_target_names(elt))
+            return names
+        return []
+
+    def extract_value(node):
+        """提取 AST 节点的实际值"""
+        if isinstance(node, ast.Constant):
+            return node.value
+        elif isinstance(node, ast.Name):
+            # TODO: a better way to handle variables
+            raise ValueError(
+                f"Arguments should be a Constant, got a variable {node.id} instead."
+            )
+        # 添加其他需要处理的 AST 节点类型
+        return None
+
+    try:
+        tree = ast.parse(code)
+        for node in ast.walk(tree):
+            if isinstance(node, ast.Assign):
+                targets = []
+                for t in node.targets:
+                    targets.extend(get_target_names(t))
+                if isinstance(node.value, ast.Call):
+                    func_name = node.value.func.id
+                    args = [ast.dump(arg) for arg in node.value.args]
+                    kwargs = {
+                        kw.arg: extract_value(kw.value) for kw in node.value.keywords
+                    }
+                    print(f"Input: {code.strip()}")
+                    print(f"Output Variables: {targets}")
+                    print(f"Function Name: {func_name}")
+                    print(f"Arguments: {args}")
+                    print(f"Keyword Arguments: {kwargs}")
+            elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
+                targets = []
+                func_name = extract_value(node.value.func)
+                args = [extract_value(arg) for arg in node.value.args]
+                kwargs = {kw.arg: extract_value(kw.value) for kw in node.value.keywords}
+
+    except SyntaxError:
+        print(f"Input: {code.strip()}")
+        print("No match found")
+
+    return targets, func_name, args, kwargs
+
+
+if __name__ == "__main__":
+    import json
+    s='{"Reasoning": "The Docker icon has been successfully clicked, and the Docker application should now be opening. No further actions are required.", "Next Action": None}'
+    json_str = json.loads(s)
+    print(json_str)
--- a/demo/gui_agent/llm_utils/oai.py
+++ b/demo/gui_agent/llm_utils/oai.py
@@ -0,0 +1,117 @@
+
+import os
+import logging
+import base64
+import requests
+
+# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image
+
+def is_image_path(text):
+    # Checking if the input text ends with typical image file extensions
+    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
+    if text.endswith(image_extensions):
+        return True
+    else:
+        return False
+
+def encode_image(image_path):
+    """Encode image file to base64."""
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+    
+
+
+# from openai import OpenAI
+# client = OpenAI(
+#     api_key=os.environ.get("OPENAI_API_KEY")
+#     )
+
+
+def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
+
+    api_key = api_key or os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        raise ValueError("OPENAI_API_KEY is not set")
+    
+    headers = {"Content-Type": "application/json",
+               "Authorization": f"Bearer {api_key}"}
+
+    final_messages = [{"role": "system", "content": system}]
+
+    # image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    if type(messages) == list:
+        for item in messages:
+            contents = []
+            if isinstance(item, dict):
+                for cnt in item["content"]:
+                    if isinstance(cnt, str):
+                        if is_image_path(cnt):
+                            base64_image = encode_image(cnt)
+                            content = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+                        else:
+                            content = {"type": "text", "text": cnt}
+                    else:
+                        # in this case it is a text block from anthropic
+                        content = {"type": "text", "text": str(cnt)}
+                        
+                    contents.append(content)
+                    
+                message = {"role": 'user', "content": contents}
+            else:  # str
+                contents.append({"type": "text", "text": item})
+                message = {"role": "user", "content": contents}
+            
+            final_messages.append(message)
+
+    
+    elif isinstance(messages, str):
+        final_messages = [{"role": "user", "content": messages}]
+    # import pdb; pdb.set_trace()
+
+    print("[oai] sending messages:", {"role": "user", "content": messages})
+
+    payload = {
+        "model": llm,
+        "messages": final_messages,
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        # "stop": stop,
+    }
+
+    # from IPython.core.debugger import Pdb; Pdb().set_trace()
+
+    response = requests.post(
+        "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
+    )
+
+    try:
+        text = response.json()['choices'][0]['message']['content']
+        token_usage = int(response.json()['usage']['total_tokens'])
+        return text, token_usage
+        
+    # return error message if the response is not successful
+    except Exception as e:
+        print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
+        return response.json()
+
+
+if __name__ == "__main__":
+    
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        raise ValueError("OPENAI_API_KEY is not set")
+    
+    text, token_usage = run_oai_interleaved(
+        messages= [{"content": [
+                        "What is in the screenshot?",   
+                        "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"],
+                    "role": "user"
+                    }],
+        llm="gpt-4o-mini",
+        system="You are a helpful assistant",
+        api_key=api_key,
+        max_tokens=256,
+        temperature=0)
+    
+    print(text, token_usage)
+    # There is an introduction describing the Calyx... 36986
--- a/demo/gui_agent/llm_utils/qwen.py
+++ b/demo/gui_agent/llm_utils/qwen.py
@@ -0,0 +1,107 @@
+
+import os
+import logging
+import base64
+import requests
+
+import dashscope
+# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image
+
+def is_image_path(text):
+    return False
+
+def encode_image(image_path):
+    return ""   
+
+
+def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
+    
+    api_key = api_key or os.environ.get("QWEN_API_KEY")
+    if not api_key:
+        raise ValueError("QWEN_API_KEY is not set")
+    
+    dashscope.api_key = api_key
+    
+    # from IPython.core.debugger import Pdb; Pdb().set_trace()
+
+    final_messages = [{"role": "system", "content": [{"text": system}]}]
+    # image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    if type(messages) == list:
+        for item in messages:
+            contents = []
+            if isinstance(item, dict):
+                for cnt in item["content"]:
+                    if isinstance(cnt, str):
+                        if is_image_path(cnt):
+                            # base64_image = encode_image(cnt)
+                            content = [{"image": cnt}]
+                        # content = {"type": "image_url", "image_url": {"url": image_url}}
+                    else:
+                        content = {"text": cnt}
+                    contents.append(content)
+                    
+                message = {"role": item["role"], "content": contents}
+            else:  # str
+                contents.append({"text": item})
+                message = {"role": "user", "content": contents}
+            
+            final_messages.append(message)
+
+    print("[qwen-vl] sending messages:", final_messages)
+
+    response = dashscope.MultiModalConversation.call(
+        model='qwen-vl-max-0809',
+        messages=final_messages
+        )
+
+    # from IPython.core.debugger import Pdb; Pdb().set_trace()
+    
+    try:
+        text = response.output.choices[0].message.content[0]['text']
+        usage = response.usage
+        
+        if "total_tokens" not in usage:
+            token_usage = int(usage["input_tokens"] + usage["output_tokens"])
+        else:
+            token_usage = int(usage["total_tokens"])
+        
+        return text, token_usage
+        # return response.json()['choices'][0]['message']['content']
+    # return error message if the response is not successful
+    except Exception as e:
+        print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
+        return response.json()
+
+
+
+if __name__ == "__main__":
+    api_key = os.environ.get("QWEN_API_KEY")
+    if not api_key:
+        raise ValueError("QWEN_API_KEY is not set")
+    
+    dashscope.api_key = api_key
+    
+    final_messages = [{"role": "user",
+                       "content": [
+                           {"text": "What is in the screenshot?"},
+                           {"image": "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"}
+                           ]
+                       }
+                    ]
+    response = dashscope.MultiModalConversation.call(model='qwen-vl-max-0809', messages=final_messages)
+    
+    print(response)
+    
+    text = response.output.choices[0].message.content[0]['text']
+    usage = response.usage
+    
+    if "total_tokens" not in usage:
+        if "image_tokens" in usage:
+            token_usage = usage["input_tokens"] + usage["output_tokens"] + usage["image_tokens"]
+        else:
+            token_usage = usage["input_tokens"] + usage["output_tokens"]
+    else:
+        token_usage = usage["total_tokens"]
+    
+    print(text, token_usage)
+    # The screenshot is from a video game... 1387
--- a/demo/gui_agent/llm_utils/run_llm.py
+++ b/demo/gui_agent/llm_utils/run_llm.py
@@ -0,0 +1,44 @@
+import base64
+import logging
+from .oai import run_oai_interleaved
+from .gemini import run_gemini_interleaved
+
+def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None):
+    log_prompt(prompt)
+    
+    # turn string prompt into list
+    if isinstance(prompt, str):
+        prompt = [prompt]
+    elif isinstance(prompt, list):
+        pass
+    else:
+        raise ValueError(f"Invalid prompt type: {type(prompt)}")
+    
+    if llm.startswith("gpt"): # gpt series
+        out = run_oai_interleaved(
+            prompt, 
+            llm, 
+            max_tokens, 
+            temperature, 
+            stop
+        )
+    elif llm.startswith("gemini"): # gemini series
+        out = run_gemini_interleaved(
+            prompt, 
+            llm, 
+            max_tokens,
+            temperature, 
+            stop
+        )
+    else:
+        raise ValueError(f"Invalid llm: {llm}")
+    logging.info(
+        f"========Output for {llm}=======\n{out}\n============================")
+    return out
+
+def log_prompt(prompt):
+    prompt_display = [prompt] if isinstance(prompt, str) else prompt
+    prompt_display = "\n\n".join(prompt_display)
+    logging.info(
+        f"========Prompt=======\n{prompt_display}\n============================")
+