unused file cleanup

2025-01-22 21:14:21 -08:00
parent c29ac5064a
commit 9db016b52f
6 changed files with 13 additions and 327 deletions
--- a/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py
+++ b/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py
@@ -31,32 +31,19 @@ from io import BytesIO
 import gradio as gr
 from typing import Dict
 # Identifier string of the computer-use beta (dated 2024-10-22).
 # NOTE(review): presumably sent with API requests to enable the beta tools - usage is not visible here.
 BETA_FLAG = "computer-use-2024-10-22"
 class APIProvider(StrEnum):
    """String-valued identifiers for the API providers known to this module."""
    ANTHROPIC = "anthropic"
    BEDROCK = "bedrock"
    VERTEX = "vertex"
 # Default model identifier for each provider, keyed by APIProvider member.
 PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
    APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
    APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
    APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
 }
 # System prompt prepended to the conversation.
 # NOTE(review): no OS check happens here - the prompt text hard-codes "Windows".
 # The date is formatted once, when this module is imported, so a long-running
 # process keeps reporting its start date.
 SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
 * You are utilizing a Windows system with internet access.
 * The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
 </SYSTEM_CAPABILITY>
 """
 class AnthropicActor:
    def __init__(
        self, 
--- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/llm_utils.py
+++ b/demo/gradio/computer_use_demo/gui_agent/llm_utils/llm_utils.py
@@ -1,109 +0,0 @@
 import os
 import re
 import ast
 import base64
 def is_image_path(text):
    """Report whether *text* ends with a recognised image file extension.

    Only the suffix of the string is inspected (case-sensitively); the file
    system is never consulted.
    """
    return text.endswith(
        (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
    )
 def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
 def is_url_or_filepath(input_string):
    """Classify *input_string* as ``"URL"``, ``"File path"`` or ``"Invalid"``.

    A string that starts with an http(s) URL wins; otherwise the string is
    resolved to an absolute path and counts as a file path when that path
    exists on disk.
    """
    looks_like_url = re.match(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        input_string,
    )
    if looks_like_url is not None:
        return "URL"

    if os.path.exists(os.path.abspath(input_string)):
        return "File path"

    return "Invalid"
 def extract_data(input_string, data_type):
    """Return the body of the first ```<data_type> fenced block in *input_string*.

    Args:
        input_string: Text that may contain a fenced block such as ```json ... ```.
        data_type: Literal language tag that follows the opening backticks
            (for example "json" or "python").

    Returns:
        The whitespace-stripped text between the opening fence and the closing
        backticks, or up to the end of the text when the fence is never closed.
        When no opening fence is present, *input_string* is returned unchanged.
    """
    # re.escape keeps tags such as "c++" from being interpreted as regex syntax.
    # re.DOTALL lets '.' span newlines so multi-line blocks are captured.
    fence = re.compile("```" + re.escape(data_type) + r"(.*?)(?:```|$)", re.DOTALL)
    found = fence.search(input_string)
    return found.group(1).strip() if found else input_string
 def parse_input(code):
    """Parse a snippet of Python and describe the call statement it contains.

    Two statement shapes are recognised: an assignment whose right-hand side
    is a call (``x = f(...)``) and a bare call expression (``f(...)``). When
    several such statements are present, the last one visited wins.

    Args:
        code: Source text to parse.

    Returns:
        A tuple ``(targets, func_name, args, kwargs)``:
          * targets   - names assigned to (empty for a bare call);
          * func_name - the callee as a (dotted) name, or None when it is not
            a plain name / attribute chain or no call statement was found;
          * args      - for an assignment, ``ast.dump`` strings of the
            positional arguments; for a bare call, their constant values;
          * kwargs    - keyword name -> constant value.
        ``([], None, [], {})`` is returned when the code does not parse or
        holds no call statement.

    Raises:
        ValueError: If an argument whose value is extracted is a bare variable
            name instead of a constant.
    """
    def get_target_names(target):
        """Recursively get all variable names from the assignment target."""
        if isinstance(target, ast.Name):
            return [target.id]
        elif isinstance(target, ast.Tuple):
            names = []
            for elt in target.elts:
                names.extend(get_target_names(elt))
            return names
        return []

    def extract_value(node):
        """Extract the actual value of an AST node."""
        if isinstance(node, ast.Constant):
            return node.value
        elif isinstance(node, ast.Name):
            # TODO: a better way to handle variables
            raise ValueError(
                f"Arguments should be a Constant, got a variable {node.id} instead."
            )
        # Other AST node types that need handling can be added here.
        return None

    def call_name(func):
        """Return the callee as a dotted name, or None if it is not name-like."""
        if isinstance(func, ast.Name):
            return func.id
        if isinstance(func, ast.Attribute):
            owner = call_name(func.value)
            return f"{owner}.{func.attr}" if owner is not None else None
        return None

    # Defaults keep the return value well-defined when nothing matches.
    targets, func_name, args, kwargs = [], None, [], {}

    try:
        tree = ast.parse(code)
    except SyntaxError:
        print(f"Input: {code.strip()}")
        print("No match found")
        return targets, func_name, args, kwargs

    for node in ast.walk(tree):
        if isinstance(node, ast.Assign) and isinstance(node.value, ast.Call):
            call = node.value
            targets = [
                name for target in node.targets for name in get_target_names(target)
            ]
            func_name = call_name(call.func)
            args = [ast.dump(arg) for arg in call.args]
            kwargs = {kw.arg: extract_value(kw.value) for kw in call.keywords}
            print(f"Input: {code.strip()}")
            print(f"Output Variables: {targets}")
            print(f"Function Name: {func_name}")
            print(f"Arguments: {args}")
            print(f"Keyword Arguments: {kwargs}")
        elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
            call = node.value
            targets = []
            # The callee is a name, not an argument, so it must not go through
            # extract_value (which rejects ast.Name nodes).
            func_name = call_name(call.func)
            args = [extract_value(arg) for arg in call.args]
            kwargs = {kw.arg: extract_value(kw.value) for kw in call.keywords}

    return targets, func_name, args, kwargs
 if __name__ == "__main__":
    # Manual check that a model reply of this shape decodes as JSON.
    import json
    # JSON spells the empty value `null`; the Python literal `None` is not
    # valid JSON and makes json.loads raise JSONDecodeError.
    s = '{"Reasoning": "The Docker icon has been successfully clicked, and the Docker application should now be opening. No further actions are required.", "Next Action": null}'
    parsed = json.loads(s)
    print(parsed)
--- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/qwen.py
+++ b/demo/gradio/computer_use_demo/gui_agent/llm_utils/qwen.py
@@ -1,107 +0,0 @@
 import os
 import logging
 import base64
 import requests
 import dashscope
 # from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image
 def is_image_path(text):
    """Stub that always answers False, so no input is treated as an image path."""
    return False
 def encode_image(image_path):
    """Stub that ignores *image_path* and returns an empty string."""
    return ""   
 def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
    """Send a conversation to DashScope's multimodal Qwen endpoint.

    Args:
        messages: List whose items are either plain strings (sent as user
            text) or dicts with "role" and "content" keys, where "content" is
            an iterable of items. A string item recognised by
            ``is_image_path`` is sent as ``{"image": ...}``; every other item
            is sent as ``{"text": ...}``. If *messages* is not a list, only
            the system message is sent.
        system: Text of the system message, placed first.
        llm: Accepted for interface compatibility; the request always uses the
            model 'qwen-vl-max-0809'.
        api_key: DashScope key; falls back to the QWEN_API_KEY environment
            variable when empty.
        max_tokens: Currently not forwarded to the API.
        temperature: Currently not forwarded to the API.

    Returns:
        ``(text, token_usage)`` on success. When the response cannot be read,
        the error is printed and ``response.json()`` is returned instead.
        NOTE(review): that fallback is a single value, so callers that unpack
        two values will fail on it.

    Raises:
        ValueError: If no API key is supplied and QWEN_API_KEY is unset.
    """
    api_key = api_key or os.environ.get("QWEN_API_KEY")
    if not api_key:
        raise ValueError("QWEN_API_KEY is not set")

    dashscope.api_key = api_key

    final_messages = [{"role": "system", "content": [{"text": system}]}]

    if isinstance(messages, list):
        for item in messages:
            if isinstance(item, dict):
                # Each content entry is one flat dict: image paths become
                # {"image": ...}, everything else becomes {"text": ...}.
                contents = [
                    {"image": cnt}
                    if isinstance(cnt, str) and is_image_path(cnt)
                    else {"text": cnt}
                    for cnt in item["content"]
                ]
                message = {"role": item["role"], "content": contents}
            else:  # bare string: treat it as a user turn
                message = {"role": "user", "content": [{"text": item}]}

            final_messages.append(message)

    print("[qwen-vl] sending messages:", final_messages)

    response = dashscope.MultiModalConversation.call(
        model='qwen-vl-max-0809',
        messages=final_messages
        )

    try:
        text = response.output.choices[0].message.content[0]['text']
        usage = response.usage

        # Some responses omit total_tokens; fall back to input + output.
        if "total_tokens" not in usage:
            token_usage = int(usage["input_tokens"] + usage["output_tokens"])
        else:
            token_usage = int(usage["total_tokens"])

        return text, token_usage
    # Report and hand back the raw response if it could not be read.
    except Exception as e:
        print(f"Error in Qwen request: {e}. This may be due to an invalid QWEN_API_KEY. Please check the response: {response.json()} ")
        return response.json()
 if __name__ == "__main__":
    # Manual smoke test: ask the model about one local screenshot and print
    # the reply together with the token count.
    api_key = os.environ.get("QWEN_API_KEY")
    if not api_key:
        raise ValueError("QWEN_API_KEY is not set")
    dashscope.api_key = api_key

    final_messages = [
        {
            "role": "user",
            "content": [
                {"text": "What is in the screenshot?"},
                {"image": "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"},
            ],
        }
    ]
    response = dashscope.MultiModalConversation.call(
        model='qwen-vl-max-0809', messages=final_messages
    )
    print(response)

    text = response.output.choices[0].message.content[0]['text']
    usage = response.usage

    # Prefer the reported total; otherwise add up the individual counters.
    if "total_tokens" in usage:
        token_usage = usage["total_tokens"]
    else:
        token_usage = usage["input_tokens"] + usage["output_tokens"]
        if "image_tokens" in usage:
            token_usage = token_usage + usage["image_tokens"]

    print(text, token_usage)
    # The screenshot is from a video game... 1387
--- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/run_llm.py
+++ b/demo/gradio/computer_use_demo/gui_agent/llm_utils/run_llm.py
@@ -1,44 +0,0 @@
 import base64
 import logging
 from .oai import run_oai_interleaved
 from .gemini import run_gemini_interleaved
 def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None):
    """Log the prompt, send it to the backend selected by *llm*, and return the output.

    Model names starting with "gpt" go to ``run_oai_interleaved`` and names
    starting with "gemini" go to ``run_gemini_interleaved``. A string prompt
    is wrapped in a one-element list before dispatch.

    Raises:
        ValueError: If *prompt* is neither a string nor a list, or if *llm*
            matches no known model family.
    """
    log_prompt(prompt)

    # Backends expect a list of prompt parts.
    if isinstance(prompt, str):
        prompt = [prompt]
    elif not isinstance(prompt, list):
        raise ValueError(f"Invalid prompt type: {type(prompt)}")

    # Pick the backend from the model-name prefix.
    if llm.startswith("gpt"):
        backend = run_oai_interleaved
    elif llm.startswith("gemini"):
        backend = run_gemini_interleaved
    else:
        raise ValueError(f"Invalid llm: {llm}")

    out = backend(prompt, llm, max_tokens, temperature, stop)

    logging.info(
        f"========Output for {llm}=======\n{out}\n============================")
    return out
 def log_prompt(prompt):
    """Write the prompt to the root logger at INFO level.

    A string is logged as-is; any other iterable of strings is joined with
    blank lines between the parts.
    """
    pieces = prompt if not isinstance(prompt, str) else [prompt]
    body = "\n\n".join(pieces)
    logging.info(
        f"========Prompt=======\n{body}\n============================")
--- a/demo/gradio/computer_use_demo/loop.py
+++ b/demo/gradio/computer_use_demo/loop.py
@@ -1,76 +1,40 @@
 """
 Agentic sampling loop that calls the Anthropic API and a local implementation of the Anthropic-defined computer use tools.
 """
 import time
 import json
 import asyncio
 import platform
 from collections.abc import Callable
 from datetime import datetime
 from enum import StrEnum
 from typing import Any, cast, Dict
-from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
+from anthropic import APIResponse
 from anthropic.types import (
    ToolResultBlockParam,
    TextBlock,
 )
 from anthropic.types.beta import (
    BetaContentBlock,
    BetaContentBlockParam,
    BetaImageBlockParam,
    BetaMessage,
    BetaMessageParam,
    BetaTextBlockParam,
    BetaToolResultBlockParam,
 )
 from computer_use_demo.tools import ToolResult
 import torch
 from computer_use_demo.gui_agent.anthropic_agent import AnthropicActor
 from computer_use_demo.executor.anthropic_executor import AnthropicExecutor
 from computer_use_demo.omniparser_agent.vlm_agent import OmniParser, VLMAgent
 from computer_use_demo.colorful_text import colorful_text_vlm
 from computer_use_demo.tools.screen_capture import get_screenshot
 from computer_use_demo.gui_agent.llm_utils.oai import encode_image
 # Identifier string of the computer-use beta (dated 2024-10-22).
 # NOTE(review): presumably sent with API requests to enable the beta tools - usage is not visible here.
 BETA_FLAG = "computer-use-2024-10-22"
 class APIProvider(StrEnum):
    """String-valued identifiers for the API providers known to the sampling loop."""
    ANTHROPIC = "anthropic"
    BEDROCK = "bedrock"
    VERTEX = "vertex"
    OPENAI = "openai"
    QWEN = "qwen"
 # Default model identifier for each provider, keyed by APIProvider member.
 # QWEN has no default: its entry is commented out.
 PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
    APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
    APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
    APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
-    # APIProvider.OPENAI: "gpt-4o",
+    APIProvider.OPENAI: "gpt-4o",
    # APIProvider.QWEN: "qwen2vl",
 }
 # This system prompt is optimized for the Docker environment in this repository and
 # specific tool combinations enabled.
 # We encourage modifying this system prompt to ensure the model has context for the
 # environment it is running in, and to provide any additional information that may be
 # helpful for the task at hand.
 # NOTE(review): the comment above mentions Docker, but the prompt text itself
 # describes a Windows system - confirm which environment is intended.
 # The date is formatted once, when this module is imported.
 SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
 * You are utilizing a Windows system with internet access.
 * The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
 </SYSTEM_CAPABILITY>
 """
 import base64
 from PIL import Image
 from io import BytesIO
 def sampling_loop_sync(
    *,
    model: str,
--- a/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py
+++ b/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py
@@ -18,13 +18,21 @@ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, B
 from computer_use_demo.tools.screen_capture import get_screenshot
 from computer_use_demo.gui_agent.llm_utils.oai import run_oai_interleaved, encode_image
 from computer_use_demo.gui_agent.llm_utils.qwen import run_qwen
 from computer_use_demo.gui_agent.llm_utils.llm_utils import extract_data
 from computer_use_demo.colorful_text import colorful_text_vlm
 import time
 import re
 OUTPUT_DIR = "./tmp/outputs"
 def extract_data(input_string, data_type):
    """Return the body of the first ```<data_type> fenced block in *input_string*.

    Args:
        input_string: Text that may contain a fenced block such as ```json ... ```.
        data_type: Literal language tag that follows the opening backticks
            (for example "json" or "python").

    Returns:
        The whitespace-stripped text between the opening fence and the closing
        backticks, or up to the end of the text when the fence is never closed.
        When no opening fence is present, *input_string* is returned unchanged.
    """
    # re.escape keeps tags such as "c++" from being interpreted as regex syntax.
    # re.DOTALL lets '.' span newlines so multi-line blocks are captured.
    fence = re.compile("```" + re.escape(data_type) + r"(.*?)(?:```|$)", re.DOTALL)
    found = fence.search(input_string)
    return found.group(1).strip() if found else input_string
 class OmniParser:
    def __init__(self, 
                 url: str,
@@ -165,19 +173,6 @@ class VLMAgent:
            print(f"oai token usage: {token_usage}")
            self.total_token_usage += token_usage
            self.total_cost += (token_usage * 0.15 / 1000000)  # https://openai.com/api/pricing/
        elif "qwen" in self.model:
            vlm_response, token_usage = run_qwen(
                messages=planner_messages,
                system=system,
                llm=self.model,
                api_key=self.api_key,
                max_tokens=self.max_tokens,
                temperature=0,
            )
            print(f"qwen token usage: {token_usage}")
            self.total_token_usage += token_usage
            self.total_cost += (token_usage * 0.02 / 7.25 / 1000)  # 1USD=7.25CNY, https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api
        elif "phi" in self.model:
            pass # TODO
        else: