merge

2025-01-04 20:14:49 -08:00
parent b9d3cb715b 36b0cbea71
commit ebc3912727
7 changed files with 1290 additions and 1199 deletions
--- a/404
+++ b/404
@@ -1,203 +1,201 @@
-# Dockerfile for OmniParser with GPU support and OpenGL libraries
+# Dockerfile for OmniParser with GPU support and OpenGL libraries
-#
+#
-# This Dockerfile is intended to create an environment with NVIDIA CUDA
+# This Dockerfile is intended to create an environment with NVIDIA CUDA
-# support and the necessary dependencies to run the OmniParser project.
+# support and the necessary dependencies to run the OmniParser project.
-# The configuration is designed to support applications that rely on
+# The configuration is designed to support applications that rely on
-# Python 3.12, OpenCV, Hugging Face transformers, and Gradio. Additionally,
+# Python 3.12, OpenCV, Hugging Face transformers, and Gradio. Additionally,
-# it includes steps to pull large files from Git LFS and a script to
+# it includes steps to pull large files from Git LFS and a script to
-# convert model weights from .safetensor to .pt format. The container
+# convert model weights from .safetensor to .pt format. The container
-# runs a Gradio server by default, exposed on port 7861.
+# is intended to run a Gradio server on port 7861 (the ENTRYPOINT below is currently commented out).
-#
+#
-# Base image: nvidia/cuda:12.3.1-devel-ubuntu22.04
+# Base image: nvidia/cuda:12.3.1-devel-ubuntu22.04
-#
+#
-# Key features:
+# Key features:
-# - System dependencies for OpenGL to support graphical libraries.
+# - System dependencies for OpenGL to support graphical libraries.
-# - Miniconda for Python 3.12, allowing for environment management.
+# - Miniconda for Python 3.12, allowing for environment management.
-# - Git Large File Storage (LFS) setup for handling large model files.
+# - Git Large File Storage (LFS) setup for handling large model files.
-# - Requirement file installation, including specific versions of
+# - Requirement file installation, including specific versions of
-#   OpenCV and Hugging Face Hub.
+#   OpenCV and Hugging Face Hub.
-# - Entrypoint script execution with Gradio server configuration for
+# - Entrypoint script execution with Gradio server configuration for
-#   external access.
+#   external access.
-
+
-# If it is gpu enviroment, use nvidia/cuda:12.3.1-devel-ubuntu22.04, otherwise use ubuntu:22.04
+FROM nvidia/cuda:12.3.1-devel-ubuntu22.04
-# FROM nvidia/cuda:12.3.1-devel-ubuntu22.04
+
-FROM docker.io/ubuntu:22.04
+# Install system dependencies with explicit OpenGL libraries
-
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
-# Install system dependencies with explicit OpenGL libraries
+    git \
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    git-lfs \
-    git \
+    wget \
-    git-lfs \
+    libgl1 \
-    wget \
+    libglib2.0-0 \
-    libgl1 \
+    libsm6 \
-    libglib2.0-0 \
+    libxext6 \
-    libsm6 \
+    libxrender1 \
-    libxext6 \
+    libglu1-mesa \
-    libxrender1 \
+    libglib2.0-0 \
-    libglu1-mesa \
+    libsm6 \
-    libglib2.0-0 \
+    libxrender1 \
-    libsm6 \
+    libxext6 \
-    libxrender1 \
+    python3-opencv \
-    libxext6 \
+    && apt-get clean \
-    python3-opencv \
+    && rm -rf /var/lib/apt/lists/* \
-    && apt-get clean \
+    && git lfs install
-    && rm -rf /var/lib/apt/lists/* \
+
-    && git lfs install
+# Install Miniconda for Python 3.12
-
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
-# Install Miniconda for Python 3.12
+    bash miniconda.sh -b -p /opt/conda && \
-RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
+    rm miniconda.sh
-    bash miniconda.sh -b -p /opt/conda && \
+ENV PATH="/opt/conda/bin:$PATH"
-    rm miniconda.sh
+
-ENV PATH="/opt/conda/bin:$PATH"
+# Create and activate Conda environment with Python 3.12, and set it as the default
-
+RUN conda create -n omni python=3.12 && \
-# Create and activate Conda environment with Python 3.12, and set it as the default
+    echo "source activate omni" > ~/.bashrc
-RUN conda create -n omni python=3.12 && \
+ENV CONDA_DEFAULT_ENV=omni
-    echo "source activate omni" > ~/.bashrc
+ENV PATH="/opt/conda/envs/omni/bin:$PATH"
-ENV CONDA_DEFAULT_ENV=omni
+
-ENV PATH="/opt/conda/envs/omni/bin:$PATH"
+# Set the working directory in the container
-
+WORKDIR /usr/src/app
-# Set the working directory in the container
+
-WORKDIR /usr/src/app
+# Copy project files and requirements
-
+COPY . .
-# Copy project files and requirements
+COPY requirements.txt /usr/src/app/requirements.txt
-COPY . .
+
-COPY requirements.txt /usr/src/app/requirements.txt
+# Initialize Git LFS and pull LFS files
-
+RUN git lfs install && \
-# Initialize Git LFS and pull LFS files
+    git lfs pull
-RUN git lfs install && \
+
-    git lfs pull
+# Install dependencies from requirements.txt with specific opencv-python-headless version
-
+RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
-# Install dependencies from requirements.txt with specific opencv-python-headless version
+    pip uninstall -y opencv-python opencv-python-headless && \
-RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
+    pip install --no-cache-dir opencv-python-headless==4.8.1.78 && \
-    # pip uninstall -y opencv-python opencv-python-headless && \
+    pip install -r requirements.txt && \
-    # pip install --no-cache-dir opencv-python-headless==4.8.1.78 && \
+    pip install huggingface_hub
-    pip install -r requirements.txt && \
+
-    pip install huggingface_hub
+# Run download.py to fetch model weights and convert safetensors to .pt format
-
+# RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
-# Run download.py to fetch model weights and convert safetensors to .pt format
+#     python download.py && \
-# RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
+#     echo "Contents of weights directory:" && \
-#     python download.py && \
+#     ls -lR weights && \
-#     echo "Contents of weights directory:" && \
+#     python weights/convert_safetensor_to_pt.py
-#     ls -lR weights && \
+
-#     python weights/convert_safetensor_to_pt.py
+# Expose the default Gradio port
-
+EXPOSE 7861
-# Expose the default Gradio port
+
-EXPOSE 7861
+# Configure Gradio to be accessible externally
-
+ENV GRADIO_SERVER_NAME="0.0.0.0"
-# Configure Gradio to be accessible externally
+
-ENV GRADIO_SERVER_NAME="0.0.0.0"
+# Copy and set permissions for entrypoint script
-
+# COPY entrypoint.sh /usr/src/app/entrypoint.sh
-# Copy and set permissions for entrypoint script
+# RUN chmod +x /usr/src/app/entrypoint.sh
-# COPY entrypoint.sh /usr/src/app/entrypoint.sh
+
-# RUN chmod +x /usr/src/app/entrypoint.sh
+# To debug, keep the container running
-
+# CMD ["tail", "-f", "/dev/null"]
-# To debug, keep the container running
+
-# CMD ["tail", "-f", "/dev/null"]
+################################################################################################
-
+# virtual display related setup --> from anthropic-quickstarts/computer-use-demo/Dockerfile
-################################################################################################
+
-# virtual display related setup --> from anthropic-quickstarts/computer-use-demo/Dockerfile
+ENV DEBIAN_FRONTEND=noninteractive
-
+ENV DEBIAN_PRIORITY=high
-ENV DEBIAN_FRONTEND=noninteractive
+
-ENV DEBIAN_PRIORITY=high
+RUN apt-get update && \
-
+    apt-get -y upgrade && \
-RUN apt-get update && \
+    apt-get -y install \
-    apt-get -y upgrade && \
+    # UI Requirements
-    apt-get -y install \
+    xvfb \
-    # UI Requirements
+    xterm \
-    xvfb \
+    xdotool \
-    xterm \
+    scrot \
-    xdotool \
+    imagemagick \
-    scrot \
+    sudo \
-    imagemagick \
+    mutter \
-    sudo \
+    x11vnc \
-    mutter \
+    # Python/pyenv reqs
-    x11vnc \
+    build-essential \
-    # Python/pyenv reqs
+    libssl-dev  \
-    build-essential \
+    zlib1g-dev \
-    libssl-dev  \
+    libbz2-dev \
-    zlib1g-dev \
+    libreadline-dev \
-    libbz2-dev \
+    libsqlite3-dev \
-    libreadline-dev \
+    curl \
-    libsqlite3-dev \
+    git \
-    curl \
+    libncursesw5-dev \
-    git \
+    xz-utils \
-    libncursesw5-dev \
+    tk-dev \
-    xz-utils \
+    libxml2-dev \
-    tk-dev \
+    libxmlsec1-dev \
-    libxml2-dev \
+    libffi-dev \
-    libxmlsec1-dev \
+    liblzma-dev \
-    libffi-dev \
+    # Network tools
-    liblzma-dev \
+    net-tools \
-    # Network tools
+    netcat \
-    net-tools \
+    # PPA req
-    netcat \
+    software-properties-common && \
-    # PPA req
+    # Userland apps
-    software-properties-common && \
+    sudo add-apt-repository ppa:mozillateam/ppa && \
-    # Userland apps
+    sudo apt-get install -y --no-install-recommends \
-    sudo add-apt-repository ppa:mozillateam/ppa && \
+    libreoffice \
-    sudo apt-get install -y --no-install-recommends \
+    firefox-esr \
-    libreoffice \
+    x11-apps \
-    firefox-esr \
+    xpdf \
-    x11-apps \
+    gedit \
-    xpdf \
+    xpaint \
-    gedit \
+    tint2 \
-    xpaint \
+    galculator \
-    tint2 \
+    pcmanfm \
-    galculator \
+    unzip && \
-    pcmanfm \
+    apt-get clean
-    unzip && \
+
-    apt-get clean
+# Install noVNC
-
+RUN git clone --branch v1.5.0 https://github.com/novnc/noVNC.git /opt/noVNC && \
-# Install noVNC
+    git clone --branch v0.12.0 https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
-RUN git clone --branch v1.5.0 https://github.com/novnc/noVNC.git /opt/noVNC && \
+    ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html
-    git clone --branch v0.12.0 https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
+
-    ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html
+# setup user
-
+ENV USERNAME=computeruse
-# setup user
+ENV HOME=/home/$USERNAME
-ENV USERNAME=computeruse
+RUN useradd -m -s /bin/bash -d $HOME $USERNAME
-ENV HOME=/home/$USERNAME
+RUN echo "${USERNAME} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
-RUN useradd -m -s /bin/bash -d $HOME $USERNAME
+USER computeruse
-RUN echo "${USERNAME} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+WORKDIR $HOME
-USER computeruse
+
-WORKDIR $HOME
+# setup python
-
+RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \
-# setup python
+    cd ~/.pyenv && src/configure && make -C src && cd .. && \
-RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \
+    echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \
-    cd ~/.pyenv && src/configure && make -C src && cd .. && \
+    echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc && \
-    echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \
+    echo 'eval "$(pyenv init -)"' >> ~/.bashrc
-    echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc && \
+ENV PYENV_ROOT="$HOME/.pyenv"
-    echo 'eval "$(pyenv init -)"' >> ~/.bashrc
+ENV PATH="$PYENV_ROOT/bin:$PATH"
-ENV PYENV_ROOT="$HOME/.pyenv"
+ENV PYENV_VERSION_MAJOR=3
-ENV PATH="$PYENV_ROOT/bin:$PATH"
+ENV PYENV_VERSION_MINOR=11
-ENV PYENV_VERSION_MAJOR=3
+ENV PYENV_VERSION_PATCH=6
-ENV PYENV_VERSION_MINOR=11
+ENV PYENV_VERSION=$PYENV_VERSION_MAJOR.$PYENV_VERSION_MINOR.$PYENV_VERSION_PATCH
-ENV PYENV_VERSION_PATCH=6
+RUN eval "$(pyenv init -)" && \
-ENV PYENV_VERSION=$PYENV_VERSION_MAJOR.$PYENV_VERSION_MINOR.$PYENV_VERSION_PATCH
+    pyenv install $PYENV_VERSION && \
-RUN eval "$(pyenv init -)" && \
+    pyenv global $PYENV_VERSION && \
-    pyenv install $PYENV_VERSION && \
+    pyenv rehash
-    pyenv global $PYENV_VERSION && \
+
-    pyenv rehash
+ENV PATH="$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH"
-
+
-ENV PATH="$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH"
+RUN python -m pip install --upgrade pip==23.1.2 setuptools==58.0.4 wheel==0.40.0 && \
-
+    python -m pip config set global.disable-pip-version-check true
-RUN python -m pip install --upgrade pip==23.1.2 setuptools==58.0.4 wheel==0.40.0 && \
+
-    python -m pip config set global.disable-pip-version-check true
+# only reinstall if requirements.txt changes
-
+# COPY --chown=$USERNAME:$USERNAME computer_use_demo/requirements.txt $HOME/computer_use_demo/requirements.txt
-# only reinstall if requirements.txt changes
+# RUN python -m pip install -r $HOME/computer_use_demo/requirements.txt
-# COPY --chown=$USERNAME:$USERNAME computer_use_demo/requirements.txt $HOME/computer_use_demo/requirements.txt
+
-# RUN python -m pip install -r $HOME/computer_use_demo/requirements.txt
+# setup desktop env & app
-
+# COPY --chown=$USERNAME:$USERNAME image/ $HOME
-# setup desktop env & app
+# COPY --chown=$USERNAME:$USERNAME computer_use_demo/ $HOME/computer_use_demo/
-# COPY --chown=$USERNAME:$USERNAME image/ $HOME
+
-# COPY --chown=$USERNAME:$USERNAME computer_use_demo/ $HOME/computer_use_demo/
+ARG DISPLAY_NUM=1
-
+ARG HEIGHT=768
-ARG DISPLAY_NUM=1
+ARG WIDTH=1024
-ARG HEIGHT=768
+ENV DISPLAY_NUM=$DISPLAY_NUM
-ARG WIDTH=1024
+ENV HEIGHT=$HEIGHT
-ENV DISPLAY_NUM=$DISPLAY_NUM
+ENV WIDTH=$WIDTH
-ENV HEIGHT=$HEIGHT
+
-ENV WIDTH=$WIDTH
+# Set the entrypoint
-
+# ENTRYPOINT ["/usr/src/app/entrypoint.sh"]
-# Set the entrypoint
+
-# ENTRYPOINT ["/usr/src/app/entrypoint.sh"]
+#  docker build . -t omniparser-x-demo:local  # manually build the docker image (optional)
 #  sudo docker build . -t omniparser-x-demo:local  # manually build the docker image (optional)
--- a/pycache/utils.cpython-312.pyc
+++ b/pycache/utils.cpython-312.pyc
--- a/demo.ipynb
+++ b/demo.ipynb
--- a/imgs/demo_image.jpg
+++ b/imgs/demo_image.jpg
--- a/imgs/demo_image_som.jpg
+++ b/imgs/demo_image_som.jpg
--- a/omniparser.py
+++ b/omniparser.py
@@ -1,60 +1,60 @@
-from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor,  get_dino_model, get_yolo_model
+from utils import get_som_labeled_img, check_ocr_box, get_yolo_model
-import torch
+import torch
-from ultralytics import YOLO
+from ultralytics import YOLO
-from PIL import Image
+from PIL import Image
-from typing import Dict, Tuple, List
+from typing import Dict, Tuple, List
-import io
+import io
-import base64
+import base64
-
+
-
+
-config = {
+config = {
-    'som_model_path': 'finetuned_icon_detect.pt',
+    'som_model_path': 'finetuned_icon_detect.pt',
-    'device': 'cpu',
+    'device': 'cpu',
-    'caption_model_path': 'Salesforce/blip2-opt-2.7b',
+    'caption_model_path': 'Salesforce/blip2-opt-2.7b',
-    'draw_bbox_config': {
+    'draw_bbox_config': {
-        'text_scale': 0.8,
+        'text_scale': 0.8,
-        'text_thickness': 2,
+        'text_thickness': 2,
-        'text_padding': 3,
+        'text_padding': 3,
-        'thickness': 3,
+        'thickness': 3,
-    },
+    },
-    'BOX_TRESHOLD': 0.05
+    'BOX_TRESHOLD': 0.05
-}
+}
-
+
-
+
-class Omniparser(object):
+class Omniparser(object):
-    def __init__(self, config: Dict):
+    def __init__(self, config: Dict):
-        self.config = config
+        self.config = config
-        
+        
-        self.som_model = get_yolo_model(model_path=config['som_model_path'])
+        self.som_model = get_yolo_model(model_path=config['som_model_path'])
-        # self.caption_model_processor = get_caption_model_processor(config['caption_model_path'], device=cofig['device'])
+        # self.caption_model_processor = get_caption_model_processor(config['caption_model_path'], device=config['device'])
-        # self.caption_model_processor['model'].to(torch.float32)
+        # self.caption_model_processor['model'].to(torch.float32)
-
+
-    def parse(self, image_path: str):
+    def parse(self, image_path: str):
-        print('Parsing image:', image_path)
+        print('Parsing image:', image_path)
-        ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9})
+        ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9})
-        text, ocr_bbox = ocr_bbox_rslt
+        text, ocr_bbox = ocr_bbox_rslt
-
+
-        draw_bbox_config = self.config['draw_bbox_config']
+        draw_bbox_config = self.config['draw_bbox_config']
-        BOX_TRESHOLD = self.config['BOX_TRESHOLD']
+        BOX_TRESHOLD = self.config['BOX_TRESHOLD']
-        dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, self.som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=None, ocr_text=text,use_local_semantics=False)
+        dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, self.som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=None, ocr_text=text,use_local_semantics=False)
-        
+        
-        image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
+        image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
-        # formating output
+        # formatting output
-        return_list = [{'from': 'omniparser', 'shape': {'x':coord[0], 'y':coord[1], 'width':coord[2], 'height':coord[3]},
+        return_list = [{'from': 'omniparser', 'shape': {'x':coord[0], 'y':coord[1], 'width':coord[2], 'height':coord[3]},
-                        'text': parsed_content_list[i].split(': ')[1], 'type':'text'} for i, (k, coord) in enumerate(label_coordinates.items()) if i < len(parsed_content_list)]
+                        'text': parsed_content_list[i].split(': ')[1], 'type':'text'} for i, (k, coord) in enumerate(label_coordinates.items()) if i < len(parsed_content_list)]
-        return_list.extend(
+        return_list.extend(
-            [{'from': 'omniparser', 'shape': {'x':coord[0], 'y':coord[1], 'width':coord[2], 'height':coord[3]},
+            [{'from': 'omniparser', 'shape': {'x':coord[0], 'y':coord[1], 'width':coord[2], 'height':coord[3]},
-                        'text': 'None', 'type':'icon'} for i, (k, coord) in enumerate(label_coordinates.items()) if i >= len(parsed_content_list)]
+                        'text': 'None', 'type':'icon'} for i, (k, coord) in enumerate(label_coordinates.items()) if i >= len(parsed_content_list)]
-              )
+              )
-
+
-        return [image, return_list]
+        return [image, return_list]
-    
+    
-parser = Omniparser(config)
+parser = Omniparser(config)
-image_path = 'examples/pc_1.png'
+image_path = 'examples/pc_1.png'
-
+
-#  time the parser
+#  time the parser
-import time
+import time
-s = time.time()
+s = time.time()
-image, parsed_content_list = parser.parse(image_path)
+image, parsed_content_list = parser.parse(image_path)
-device = config['device']
+device = config['device']
-print(f'Time taken for Omniparser on {device}:', time.time() - s)
+print(f'Time taken for Omniparser on {device}:', time.time() - s)
--- a/utils.py
+++ b/utils.py