unused file cleanup

2025-01-22 21:14:21 -08:00
parent c29ac5064a
commit 9db016b52f
6 changed files with 13 additions and 327 deletions
--- a/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py
+++ b/demo/gradio/computer_use_demo/gui_agent/anthropic_agent.py
@@ -31,32 +31,19 @@ from io import BytesIO
 import gradio as gr
 from typing import Dict

-
 BETA_FLAG = "computer-use-2024-10-22"

-
 class APIProvider(StrEnum):
    ANTHROPIC = "anthropic"
    BEDROCK = "bedrock"
    VERTEX = "vertex"

-
-PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
-    APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
-    APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
-    APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
-}
-
-
-# Check OS
-
 SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
 * You are utilizing a Windows system with internet access.
 * The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
 </SYSTEM_CAPABILITY>
 """

-
 class AnthropicActor:
    def __init__(
        self, 
--- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/llm_utils.py
+++ b/demo/gradio/computer_use_demo/gui_agent/llm_utils/llm_utils.py
@@ -1,109 +0,0 @@
-import os
-import re
-import ast
-import base64
-
-
-def is_image_path(text):
-    # Checking if the input text ends with typical image file extensions
-    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
-    if text.endswith(image_extensions):
-        return True
-    else:
-        return False
-
-
-def encode_image(image_path):
-    """Encode image file to base64."""
-    with open(image_path, "rb") as image_file:
-        return base64.b64encode(image_file.read()).decode("utf-8")
-
-
-def is_url_or_filepath(input_string):
-    # Check if input_string is a URL
-    url_pattern = re.compile(
-        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
-    )
-    if url_pattern.match(input_string):
-        return "URL"
-
-    # Check if input_string is a file path
-    file_path = os.path.abspath(input_string)
-    if os.path.exists(file_path):
-        return "File path"
-
-    return "Invalid"
-
-
-def extract_data(input_string, data_type):
-    # Regular expression to extract content starting from '```python' until the end if there are no closing backticks
-    pattern = f"```{data_type}" + r"(.*?)(```|$)"
-    # Extract content
-    # re.DOTALL allows '.' to match newlines as well
-    matches = re.findall(pattern, input_string, re.DOTALL)
-    # Return the first match if exists, trimming whitespace and ignoring potential closing backticks
-    return matches[0][0].strip() if matches else input_string
-
-
-def parse_input(code):
-    """Use AST to parse the input string and extract the function name, arguments, and keyword arguments."""
-
-    def get_target_names(target):
-        """Recursively get all variable names from the assignment target."""
-        if isinstance(target, ast.Name):
-            return [target.id]
-        elif isinstance(target, ast.Tuple):
-            names = []
-            for elt in target.elts:
-                names.extend(get_target_names(elt))
-            return names
-        return []
-
-    def extract_value(node):
-        """提取 AST 节点的实际值"""
-        if isinstance(node, ast.Constant):
-            return node.value
-        elif isinstance(node, ast.Name):
-            # TODO: a better way to handle variables
-            raise ValueError(
-                f"Arguments should be a Constant, got a variable {node.id} instead."
-            )
-        # 添加其他需要处理的 AST 节点类型
-        return None
-
-    try:
-        tree = ast.parse(code)
-        for node in ast.walk(tree):
-            if isinstance(node, ast.Assign):
-                targets = []
-                for t in node.targets:
-                    targets.extend(get_target_names(t))
-                if isinstance(node.value, ast.Call):
-                    func_name = node.value.func.id
-                    args = [ast.dump(arg) for arg in node.value.args]
-                    kwargs = {
-                        kw.arg: extract_value(kw.value) for kw in node.value.keywords
-                    }
-                    print(f"Input: {code.strip()}")
-                    print(f"Output Variables: {targets}")
-                    print(f"Function Name: {func_name}")
-                    print(f"Arguments: {args}")
-                    print(f"Keyword Arguments: {kwargs}")
-            elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
-                targets = []
-                func_name = extract_value(node.value.func)
-                args = [extract_value(arg) for arg in node.value.args]
-                kwargs = {kw.arg: extract_value(kw.value) for kw in node.value.keywords}
-
-    except SyntaxError:
-        print(f"Input: {code.strip()}")
-        print("No match found")
-
-    return targets, func_name, args, kwargs
-
-
-if __name__ == "__main__":
-    import json
-    s='{"Reasoning": "The Docker icon has been successfully clicked, and the Docker application should now be opening. No further actions are required.", "Next Action": None}'
-    json_str = json.loads(s)
-    print(json_str)
--- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/qwen.py
+++ b/demo/gradio/computer_use_demo/gui_agent/llm_utils/qwen.py
@@ -1,107 +0,0 @@
-
-import os
-import logging
-import base64
-import requests
-
-import dashscope
-# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image
-
-def is_image_path(text):
-    return False
-
-def encode_image(image_path):
-    return ""   
-
-
-def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
-    
-    api_key = api_key or os.environ.get("QWEN_API_KEY")
-    if not api_key:
-        raise ValueError("QWEN_API_KEY is not set")
-    
-    dashscope.api_key = api_key
-    
-    # from IPython.core.debugger import Pdb; Pdb().set_trace()
-
-    final_messages = [{"role": "system", "content": [{"text": system}]}]
-    # image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-    if type(messages) == list:
-        for item in messages:
-            contents = []
-            if isinstance(item, dict):
-                for cnt in item["content"]:
-                    if isinstance(cnt, str):
-                        if is_image_path(cnt):
-                            # base64_image = encode_image(cnt)
-                            content = [{"image": cnt}]
-                        # content = {"type": "image_url", "image_url": {"url": image_url}}
-                    else:
-                        content = {"text": cnt}
-                    contents.append(content)
-                    
-                message = {"role": item["role"], "content": contents}
-            else:  # str
-                contents.append({"text": item})
-                message = {"role": "user", "content": contents}
-            
-            final_messages.append(message)
-
-    print("[qwen-vl] sending messages:", final_messages)
-
-    response = dashscope.MultiModalConversation.call(
-        model='qwen-vl-max-0809',
-        messages=final_messages
-        )
-
-    # from IPython.core.debugger import Pdb; Pdb().set_trace()
-    
-    try:
-        text = response.output.choices[0].message.content[0]['text']
-        usage = response.usage
-        
-        if "total_tokens" not in usage:
-            token_usage = int(usage["input_tokens"] + usage["output_tokens"])
-        else:
-            token_usage = int(usage["total_tokens"])
-        
-        return text, token_usage
-        # return response.json()['choices'][0]['message']['content']
-    # return error message if the response is not successful
-    except Exception as e:
-        print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
-        return response.json()
-
-
-
-if __name__ == "__main__":
-    api_key = os.environ.get("QWEN_API_KEY")
-    if not api_key:
-        raise ValueError("QWEN_API_KEY is not set")
-    
-    dashscope.api_key = api_key
-    
-    final_messages = [{"role": "user",
-                       "content": [
-                           {"text": "What is in the screenshot?"},
-                           {"image": "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"}
-                           ]
-                       }
-                    ]
-    response = dashscope.MultiModalConversation.call(model='qwen-vl-max-0809', messages=final_messages)
-    
-    print(response)
-    
-    text = response.output.choices[0].message.content[0]['text']
-    usage = response.usage
-    
-    if "total_tokens" not in usage:
-        if "image_tokens" in usage:
-            token_usage = usage["input_tokens"] + usage["output_tokens"] + usage["image_tokens"]
-        else:
-            token_usage = usage["input_tokens"] + usage["output_tokens"]
-    else:
-        token_usage = usage["total_tokens"]
-    
-    print(text, token_usage)
-    # The screenshot is from a video game... 1387
--- a/demo/gradio/computer_use_demo/gui_agent/llm_utils/run_llm.py
+++ b/demo/gradio/computer_use_demo/gui_agent/llm_utils/run_llm.py
@@ -1,44 +0,0 @@
-import base64
-import logging
-from .oai import run_oai_interleaved
-from .gemini import run_gemini_interleaved
-
-def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None):
-    log_prompt(prompt)
-    
-    # turn string prompt into list
-    if isinstance(prompt, str):
-        prompt = [prompt]
-    elif isinstance(prompt, list):
-        pass
-    else:
-        raise ValueError(f"Invalid prompt type: {type(prompt)}")
-    
-    if llm.startswith("gpt"): # gpt series
-        out = run_oai_interleaved(
-            prompt, 
-            llm, 
-            max_tokens, 
-            temperature, 
-            stop
-        )
-    elif llm.startswith("gemini"): # gemini series
-        out = run_gemini_interleaved(
-            prompt, 
-            llm, 
-            max_tokens,
-            temperature, 
-            stop
-        )
-    else:
-        raise ValueError(f"Invalid llm: {llm}")
-    logging.info(
-        f"========Output for {llm}=======\n{out}\n============================")
-    return out
-
-def log_prompt(prompt):
-    prompt_display = [prompt] if isinstance(prompt, str) else prompt
-    prompt_display = "\n\n".join(prompt_display)
-    logging.info(
-        f"========Prompt=======\n{prompt_display}\n============================")
-    
--- a/demo/gradio/computer_use_demo/loop.py
+++ b/demo/gradio/computer_use_demo/loop.py
@@ -1,76 +1,40 @@
 """
 Agentic sampling loop that calls the Anthropic API and local implenmentation of anthropic-defined computer use tools.
 """
-import time
-import json
-import asyncio
-import platform
 from collections.abc import Callable
-from datetime import datetime
 from enum import StrEnum
-from typing import Any, cast, Dict

-from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
+from anthropic import APIResponse
 from anthropic.types import (
-    ToolResultBlockParam,
    TextBlock,
 )
 from anthropic.types.beta import (
    BetaContentBlock,
-    BetaContentBlockParam,
-    BetaImageBlockParam,
    BetaMessage,
-    BetaMessageParam,
-    BetaTextBlockParam,
-    BetaToolResultBlockParam,
+    BetaMessageParam
 )
 from computer_use_demo.tools import ToolResult

-import torch
-
 from computer_use_demo.gui_agent.anthropic_agent import AnthropicActor
 from computer_use_demo.executor.anthropic_executor import AnthropicExecutor
 from computer_use_demo.omniparser_agent.vlm_agent import OmniParser, VLMAgent
-from computer_use_demo.colorful_text import colorful_text_vlm
-from computer_use_demo.tools.screen_capture import get_screenshot
-from computer_use_demo.gui_agent.llm_utils.oai import encode_image
-

 BETA_FLAG = "computer-use-2024-10-22"

-
 class APIProvider(StrEnum):
    ANTHROPIC = "anthropic"
    BEDROCK = "bedrock"
    VERTEX = "vertex"
    OPENAI = "openai"
-    QWEN = "qwen"


 PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
    APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
    APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
    APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
-    # APIProvider.OPENAI: "gpt-4o",
-    # APIProvider.QWEN: "qwen2vl",
+    APIProvider.OPENAI: "gpt-4o",
 }

-
-# This system prompt is optimized for the Docker environment in this repository and
-# specific tool combinations enabled.
-# We encourage modifying this system prompt to ensure the model has context for the
-# environment it is running in, and to provide any additional information that may be
-# helpful for the task at hand.
-SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
-* You are utilizing a Windows system with internet access.
-* The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
-</SYSTEM_CAPABILITY>
-"""
-
-import base64
-from PIL import Image
-from io import BytesIO
-
 def sampling_loop_sync(
    *,
    model: str,
--- a/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py
+++ b/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py
@@ -18,13 +18,21 @@ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, B

 from computer_use_demo.tools.screen_capture import get_screenshot
 from computer_use_demo.gui_agent.llm_utils.oai import run_oai_interleaved, encode_image
-from computer_use_demo.gui_agent.llm_utils.qwen import run_qwen
-from computer_use_demo.gui_agent.llm_utils.llm_utils import extract_data
 from computer_use_demo.colorful_text import colorful_text_vlm
 import time
+import re

 OUTPUT_DIR = "./tmp/outputs"

+def extract_data(input_string, data_type):
+    # Match a fenced block opened by '```<data_type>' (e.g. '```python'); the lazy body stops at the closing '```' or, if the fence is never closed, at the end of the input. data_type is interpolated unescaped.
+    pattern = f"```{data_type}" + r"(.*?)(```|$)"
+    # Collect every fenced block; each match is a (body, terminator) tuple because the pattern has two groups
+    # re.DOTALL lets '.' span newlines so multi-line blocks are captured
+    matches = re.findall(pattern, input_string, re.DOTALL)
+    # Return the stripped body of the first block; when no fence is found, return input_string unchanged
+    return matches[0][0].strip() if matches else input_string
+
 class OmniParser:
    def __init__(self, 
                 url: str,
@@ -165,19 +173,6 @@ class VLMAgent:
            print(f"oai token usage: {token_usage}")
            self.total_token_usage += token_usage
            self.total_cost += (token_usage * 0.15 / 1000000)  # https://openai.com/api/pricing/
-            
-        elif "qwen" in self.model:
-            vlm_response, token_usage = run_qwen(
-                messages=planner_messages,
-                system=system,
-                llm=self.model,
-                api_key=self.api_key,
-                max_tokens=self.max_tokens,
-                temperature=0,
-            )
-            print(f"qwen token usage: {token_usage}")
-            self.total_token_usage += token_usage
-            self.total_cost += (token_usage * 0.02 / 7.25 / 1000)  # 1USD=7.25CNY, https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api
        elif "phi" in self.model:
            pass # TODO
        else: