This commit is contained in:
yadonglu
2025-01-04 20:14:49 -08:00
7 changed files with 1290 additions and 1199 deletions

View File

@@ -1,203 +1,201 @@
# Dockerfile for OmniParser with GPU support and OpenGL libraries # Dockerfile for OmniParser with GPU support and OpenGL libraries
# #
# This Dockerfile is intended to create an environment with NVIDIA CUDA # This Dockerfile is intended to create an environment with NVIDIA CUDA
# support and the necessary dependencies to run the OmniParser project. # support and the necessary dependencies to run the OmniParser project.
# The configuration is designed to support applications that rely on # The configuration is designed to support applications that rely on
# Python 3.12, OpenCV, Hugging Face transformers, and Gradio. Additionally, # Python 3.12, OpenCV, Hugging Face transformers, and Gradio. Additionally,
# it includes steps to pull large files from Git LFS and a script to # it includes steps to pull large files from Git LFS and a script to
# convert model weights from .safetensor to .pt format. The container # convert model weights from .safetensor to .pt format. The container
# runs a Gradio server by default, exposed on port 7861. # runs a Gradio server by default, exposed on port 7861.
# #
# Base image: nvidia/cuda:12.3.1-devel-ubuntu22.04 # Base image: nvidia/cuda:12.3.1-devel-ubuntu22.04
# #
# Key features: # Key features:
# - System dependencies for OpenGL to support graphical libraries. # - System dependencies for OpenGL to support graphical libraries.
# - Miniconda for Python 3.12, allowing for environment management. # - Miniconda for Python 3.12, allowing for environment management.
# - Git Large File Storage (LFS) setup for handling large model files. # - Git Large File Storage (LFS) setup for handling large model files.
# - Requirement file installation, including specific versions of # - Requirement file installation, including specific versions of
# OpenCV and Hugging Face Hub. # OpenCV and Hugging Face Hub.
# - Entrypoint script execution with Gradio server configuration for # - Entrypoint script execution with Gradio server configuration for
# external access. # external access.
# Base image selection. The default targets GPU hosts; on a CPU-only host
# build with:
#   docker build --build-arg BASE_IMAGE=docker.io/ubuntu:22.04 .
ARG BASE_IMAGE=nvidia/cuda:12.3.1-devel-ubuntu22.04
FROM ${BASE_IMAGE}

# Build-time only: stop apt/debconf from prompting. Declared as ARG so the
# setting is not baked into the runtime environment of the container.
ARG DEBIAN_FRONTEND=noninteractive

# Install system dependencies with explicit OpenGL libraries.
# Recommended packages are left enabled on purpose: wget relies on
# ca-certificates (a recommended package) for the HTTPS download below.
RUN apt-get update && apt-get install -y \
        git \
        git-lfs \
        libgl1 \
        libglib2.0-0 \
        libglu1-mesa \
        libsm6 \
        libxext6 \
        libxrender1 \
        python3-opencv \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && git lfs install

# Install Miniconda for Python 3.12
# NOTE(review): "latest" is unpinned, so the installer (and conda's behaviour)
# can change between builds -- consider pinning a dated installer + checksum.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
    bash miniconda.sh -b -p /opt/conda && \
    rm miniconda.sh
ENV PATH="/opt/conda/bin:$PATH"

# Create the Conda environment with Python 3.12 and make it the default.
# -y makes the non-interactive confirmation explicit; the activation line is
# appended so the stock ~/.bashrc is not overwritten.
RUN conda create -y -n omni python=3.12 && \
    echo "source activate omni" >> ~/.bashrc
ENV CONDA_DEFAULT_ENV=omni
ENV PATH="/opt/conda/envs/omni/bin:$PATH"

# Set the working directory in the container
WORKDIR /usr/src/app

# Copy project files (requirements.txt is part of the build context, so no
# separate COPY is needed for it).
COPY . .

# Initialize Git LFS and pull LFS files
# (requires the .git directory to be present in the build context)
RUN git lfs install && \
    git lfs pull

# Install dependencies from requirements.txt with a specific
# opencv-python-headless version
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
    pip uninstall -y opencv-python opencv-python-headless && \
    pip install --no-cache-dir opencv-python-headless==4.8.1.78 && \
    pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir huggingface_hub

# Run download.py to fetch model weights and convert safetensors to .pt format
# RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
#     python download.py && \
#     echo "Contents of weights directory:" && \
#     ls -lR weights && \
#     python weights/convert_safetensor_to_pt.py

# Expose the default Gradio port
EXPOSE 7861

# Configure Gradio to be accessible externally
ENV GRADIO_SERVER_NAME="0.0.0.0"

# Copy and set permissions for entrypoint script
# COPY entrypoint.sh /usr/src/app/entrypoint.sh
# RUN chmod +x /usr/src/app/entrypoint.sh

# To debug, keep the container running
# CMD ["tail", "-f", "/dev/null"]

################################################################################################
# virtual display related setup --> from anthropic-quickstarts/computer-use-demo/Dockerfile

# Build-time only, like DEBIAN_FRONTEND above.
ARG DEBIAN_PRIORITY=high

# These steps already run as root, so no sudo is needed. add-apt-repository
# gets -y so it never waits for confirmation, and the apt lists are removed in
# the same layer that created them.
RUN apt-get update && \
    apt-get -y upgrade && \
    apt-get -y install \
    # UI Requirements
    xvfb \
    xterm \
    xdotool \
    scrot \
    imagemagick \
    sudo \
    mutter \
    x11vnc \
    # Python/pyenv reqs
    build-essential \
    libssl-dev  \
    zlib1g-dev \
    libbz2-dev \
    libreadline-dev \
    libsqlite3-dev \
    curl \
    git \
    libncursesw5-dev \
    xz-utils \
    tk-dev \
    libxml2-dev \
    libxmlsec1-dev \
    libffi-dev \
    liblzma-dev \
    # Network tools
    net-tools \
    netcat \
    # PPA req
    software-properties-common && \
    # Userland apps
    add-apt-repository -y ppa:mozillateam/ppa && \
    apt-get install -y --no-install-recommends \
    libreoffice \
    firefox-esr \
    x11-apps \
    xpdf \
    gedit \
    xpaint \
    tint2 \
    galculator \
    pcmanfm \
    unzip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install noVNC
RUN git clone --branch v1.5.0 https://github.com/novnc/noVNC.git /opt/noVNC && \
    git clone --branch v0.12.0 https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
    ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html

# setup user (passwordless sudo, as in the upstream computer-use demo)
ENV USERNAME=computeruse
ENV HOME=/home/$USERNAME
RUN useradd -m -s /bin/bash -d $HOME $USERNAME && \
    echo "${USERNAME} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
USER $USERNAME
WORKDIR $HOME

# setup python
# NOTE(review): from here on, the pyenv shims precede /opt/conda/envs/omni/bin
# on PATH, so `python` resolves to the pyenv interpreter (3.11.6) rather than
# the conda env in which requirements.txt was installed -- confirm intended.
RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \
    cd ~/.pyenv && src/configure && make -C src && cd .. && \
    echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \
    echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc && \
    echo 'eval "$(pyenv init -)"' >> ~/.bashrc
ENV PYENV_ROOT="$HOME/.pyenv"
ENV PATH="$PYENV_ROOT/bin:$PATH"
ENV PYENV_VERSION_MAJOR=3
ENV PYENV_VERSION_MINOR=11
ENV PYENV_VERSION_PATCH=6
ENV PYENV_VERSION=$PYENV_VERSION_MAJOR.$PYENV_VERSION_MINOR.$PYENV_VERSION_PATCH
RUN eval "$(pyenv init -)" && \
    pyenv install $PYENV_VERSION && \
    pyenv global $PYENV_VERSION && \
    pyenv rehash

# $PYENV_ROOT/bin is already on PATH; only the shims directory is added here.
ENV PATH="$PYENV_ROOT/shims:$PATH"

RUN python -m pip install --upgrade pip==23.1.2 setuptools==58.0.4 wheel==0.40.0 && \
    python -m pip config set global.disable-pip-version-check true

# only reinstall if requirements.txt changes
# COPY --chown=$USERNAME:$USERNAME computer_use_demo/requirements.txt $HOME/computer_use_demo/requirements.txt
# RUN python -m pip install -r $HOME/computer_use_demo/requirements.txt

# setup desktop env & app
# COPY --chown=$USERNAME:$USERNAME image/ $HOME
# COPY --chown=$USERNAME:$USERNAME computer_use_demo/ $HOME/computer_use_demo/

# Virtual display geometry; overridable at build time with --build-arg and
# exported so runtime scripts can read the same values.
ARG DISPLAY_NUM=1
ARG HEIGHT=768
ARG WIDTH=1024
ENV DISPLAY_NUM=$DISPLAY_NUM
ENV HEIGHT=$HEIGHT
ENV WIDTH=$WIDTH

# Set the entrypoint
# ENTRYPOINT ["/usr/src/app/entrypoint.sh"]

# docker build . -t omniparser-x-demo:local  # manually build the docker image (optional)

Binary file not shown.

File diff suppressed because one or more lines are too long

Binary file not shown.

Before

Width:  |  Height:  |  Size: 328 KiB

After

Width:  |  Height:  |  Size: 560 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 404 KiB

After

Width:  |  Height:  |  Size: 720 KiB

View File

@@ -1,60 +1,60 @@
from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_dino_model, get_yolo_model from utils import get_som_labeled_img, check_ocr_box, get_yolo_model
import torch import torch
from ultralytics import YOLO from ultralytics import YOLO
from PIL import Image from PIL import Image
from typing import Dict, Tuple, List from typing import Dict, Tuple, List
import io import io
import base64 import base64
# Default settings handed to Omniparser below.
config = {
    # Weights file passed to get_yolo_model().
    'som_model_path': 'finetuned_icon_detect.pt',
    'device': 'cpu',
    # Only referenced by the commented-out caption-model setup in Omniparser.
    'caption_model_path': 'Salesforce/blip2-opt-2.7b',
    # Forwarded unchanged to get_som_labeled_img() as draw_bbox_config.
    'draw_bbox_config': {
        'text_scale': 0.8,
        'text_thickness': 2,
        'text_padding': 3,
        'thickness': 3,
    },
    # Key spelling (sic) is kept: it is read back under this exact name and
    # forwarded as the BOX_TRESHOLD keyword of get_som_labeled_img().
    'BOX_TRESHOLD': 0.05
}
class Omniparser(object):
    """Runs OCR plus icon detection on a screenshot and returns labelled elements.

    The heavy lifting is delegated to the project helpers ``get_yolo_model``,
    ``check_ocr_box`` and ``get_som_labeled_img``; this class wires them
    together and reshapes their output.
    """

    def __init__(self, config: Dict):
        """Store ``config`` and load the detection model named by ``config['som_model_path']``."""
        self.config = config
        self.som_model = get_yolo_model(model_path=config['som_model_path'])
        # self.caption_model_processor = get_caption_model_processor(config['caption_model_path'], device=config['device'])
        # self.caption_model_processor['model'].to(torch.float32)

    @staticmethod
    def _format_elements(label_coordinates, parsed_content_list):
        """Build the list of element dicts returned by ``parse``.

        Entries of ``label_coordinates`` are taken in iteration order. The
        first ``len(parsed_content_list)`` of them become ``'text'`` elements
        paired with the content string at the same position; the remaining
        ones become ``'icon'`` elements whose text is the literal ``'None'``.

        For text elements everything after the FIRST ``': '`` is kept, so
        content that itself contains ``': '`` is not truncated. A content
        string without the separator is used whole instead of raising
        ``IndexError``.
        """
        text_count = len(parsed_content_list)
        elements = []
        for idx, coord in enumerate(label_coordinates.values()):
            if idx < text_count:
                raw = parsed_content_list[idx]
                _, sep, content = raw.partition(': ')
                text, kind = (content if sep else raw), 'text'
            else:
                text, kind = 'None', 'icon'
            elements.append({
                'from': 'omniparser',
                # coord[2]/coord[3] are reported as width/height as-is --
                # assumes the helper returns xywh boxes; TODO confirm.
                'shape': {'x': coord[0], 'y': coord[1], 'width': coord[2], 'height': coord[3]},
                'text': text,
                'type': kind,
            })
        return elements

    def parse(self, image_path: str):
        """Parse the image at ``image_path``.

        Returns a two-item list ``[image, elements]``: the labelled image
        decoded into a PIL ``Image`` and the element dicts produced by
        ``_format_elements``.
        """
        print('Parsing image:', image_path)
        ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9})
        text, ocr_bbox = ocr_bbox_rslt

        draw_bbox_config = self.config['draw_bbox_config']
        BOX_TRESHOLD = self.config['BOX_TRESHOLD']
        dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, self.som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=None, ocr_text=text,use_local_semantics=False)

        # The helper hands back the labelled image as a base64 string.
        image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))

        # formatting output
        return_list = self._format_elements(label_coordinates, parsed_content_list)
        return [image, return_list]
# Demo run: build a parser from the default config and parse one example image.
# NOTE(review): this executes at import time (including model loading); it is
# left at module level because other code may import `parser` -- confirm before
# moving it under an `if __name__ == '__main__':` guard.
parser = Omniparser(config)
image_path = 'examples/pc_1.png'

# time the parser
# perf_counter() is a monotonic clock, so the measured interval is not
# affected by system clock adjustments the way time.time() is.
import time
s = time.perf_counter()
image, parsed_content_list = parser.parse(image_path)
device = config['device']
print(f'Time taken for Omniparser on {device}:', time.perf_counter() - s)

1087
utils.py

File diff suppressed because it is too large Load Diff