add docker file, accelerate inference using cv2
This commit is contained in:
201
Dockerfile
Normal file
201
Dockerfile
Normal file
@@ -0,0 +1,201 @@
|
|||||||
|
# Dockerfile for OmniParser with GPU support and OpenGL libraries
|
||||||
|
#
|
||||||
|
# This Dockerfile is intended to create an environment with NVIDIA CUDA
|
||||||
|
# support and the necessary dependencies to run the OmniParser project.
|
||||||
|
# The configuration is designed to support applications that rely on
|
||||||
|
# Python 3.12, OpenCV, Hugging Face transformers, and Gradio. Additionally,
|
||||||
|
# it includes steps to pull large files from Git LFS and a script to
|
||||||
|
# convert model weights from .safetensor to .pt format. The container
|
||||||
|
# is meant to run a Gradio server on port 7861 (the entrypoint is currently commented out).
|
||||||
|
#
|
||||||
|
# Base image: nvidia/cuda:12.3.1-devel-ubuntu22.04
|
||||||
|
#
|
||||||
|
# Key features:
|
||||||
|
# - System dependencies for OpenGL to support graphical libraries.
|
||||||
|
# - Miniconda for Python 3.12, allowing for environment management.
|
||||||
|
# - Git Large File Storage (LFS) setup for handling large model files.
|
||||||
|
# - Requirement file installation, including specific versions of
|
||||||
|
# OpenCV and Hugging Face Hub.
|
||||||
|
# - Entrypoint script execution with Gradio server configuration for
|
||||||
|
# external access.
|
||||||
|
|
||||||
|
FROM nvidia/cuda:12.3.1-devel-ubuntu22.04
|
||||||
|
|
||||||
|
# Install system dependencies with explicit OpenGL libraries
|
||||||
|
# Base tooling (git, git-lfs, wget) plus the OpenGL/X11 runtime libraries that
# OpenCV needs. Each package is listed exactly once, in alphabetical order, so
# future diffs stay readable. The apt cache and package lists are removed in the
# same layer, and Git LFS hooks are registered right after git-lfs is installed.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        git \
        git-lfs \
        libgl1 \
        libglib2.0-0 \
        libglu1-mesa \
        libsm6 \
        libxext6 \
        libxrender1 \
        python3-opencv \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && git lfs install
|
||||||
|
|
||||||
|
# Install Miniconda for Python 3.12
|
||||||
|
# Download the Miniconda installer, run it in batch mode (-b) with /opt/conda as
# the install prefix (-p), and delete the installer within the same layer.
# NOTE(review): "Miniconda3-latest" is unpinned and the download is not
# checksum-verified, so rebuilds may pick up a different conda release.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
|
||||||
|
bash miniconda.sh -b -p /opt/conda && \
|
||||||
|
rm miniconda.sh
|
||||||
|
# Put conda's bin directory at the front of PATH for all later instructions.
ENV PATH="/opt/conda/bin:$PATH"
|
||||||
|
|
||||||
|
# Create and activate Conda environment with Python 3.12, and set it as the default
|
||||||
|
# Create the `omni` environment with Python 3.12.
#   -y                 answers conda's confirmation prompt so the build never waits on stdin
#   conda clean -a -y  drops downloaded package archives/index caches from this layer
#   >>                 appends the activation line instead of overwriting the existing ~/.bashrc
RUN conda create -y -n omni python=3.12 && \
    conda clean -a -y && \
    echo "source activate omni" >> ~/.bashrc
|
||||||
|
ENV CONDA_DEFAULT_ENV=omni
|
||||||
|
ENV PATH="/opt/conda/envs/omni/bin:$PATH"
|
||||||
|
|
||||||
|
# Set the working directory in the container
|
||||||
|
WORKDIR /usr/src/app
|
||||||
|
|
||||||
|
# Copy project files and requirements
|
||||||
|
# Copy the whole build context into the working directory (/usr/src/app).
# This already places requirements.txt at /usr/src/app/requirements.txt, so a
# separate COPY of that file would only add a redundant layer.
COPY . .
|
||||||
|
|
||||||
|
# Initialize Git LFS and pull LFS files
|
||||||
|
# Register the Git LFS hooks and fetch the LFS-tracked files for the copied
# project in the current working directory.
# NOTE(review): `git lfs pull` presumably needs the repository's .git directory
# to have been copied by the preceding COPY (i.e. not excluded from the build
# context) and network access to the LFS remote — confirm.
RUN git lfs install && \
|
||||||
|
git lfs pull
|
||||||
|
|
||||||
|
# Install dependencies from requirements.txt with specific opencv-python-headless version
|
||||||
|
# Install Python dependencies inside the `omni` conda environment.
# Any pre-existing OpenCV wheels are removed first so the pinned headless build
# is the one that gets installed. `--no-cache-dir` is passed to every install
# (not just the first) so pip's download cache is never stored in the layer.
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
    pip uninstall -y opencv-python opencv-python-headless && \
    pip install --no-cache-dir opencv-python-headless==4.8.1.78 && \
    pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir huggingface_hub
|
||||||
|
|
||||||
|
# Run download.py to fetch model weights and convert safetensors to .pt format
|
||||||
|
# RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
|
||||||
|
# python download.py && \
|
||||||
|
# echo "Contents of weights directory:" && \
|
||||||
|
# ls -lR weights && \
|
||||||
|
# python weights/convert_safetensor_to_pt.py
|
||||||
|
|
||||||
|
# Expose the default Gradio port
|
||||||
|
EXPOSE 7861
|
||||||
|
|
||||||
|
# Configure Gradio to be accessible externally
|
||||||
|
ENV GRADIO_SERVER_NAME="0.0.0.0"
|
||||||
|
|
||||||
|
# Copy and set permissions for entrypoint script
|
||||||
|
# COPY entrypoint.sh /usr/src/app/entrypoint.sh
|
||||||
|
# RUN chmod +x /usr/src/app/entrypoint.sh
|
||||||
|
|
||||||
|
# To debug, keep the container running
|
||||||
|
# CMD ["tail", "-f", "/dev/null"]
|
||||||
|
|
||||||
|
################################################################################################
|
||||||
|
# virtual display related setup --> from anthropic-quickstarts/computer-use-demo/Dockerfile
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
ENV DEBIAN_PRIORITY=high
|
||||||
|
|
||||||
|
# Install the virtual-desktop stack, the build dependencies pyenv needs to
# compile Python, basic network tools, and the userland GUI applications.
# This instruction runs as root (no USER has been set yet), so the commands are
# invoked directly rather than through `sudo`; that also lets DEBIAN_FRONTEND
# reach the second apt-get call, which `sudo` would normally strip from the
# environment. `add-apt-repository -y` skips its confirmation prompt.
# (`sudo` stays in the package list: it is needed later by the non-root user.)
RUN apt-get update && \
    apt-get -y upgrade && \
    apt-get -y install \
    # UI Requirements
    xvfb \
    xterm \
    xdotool \
    scrot \
    imagemagick \
    sudo \
    mutter \
    x11vnc \
    # Python/pyenv reqs
    build-essential \
    libssl-dev  \
    zlib1g-dev \
    libbz2-dev \
    libreadline-dev \
    libsqlite3-dev \
    curl \
    git \
    libncursesw5-dev \
    xz-utils \
    tk-dev \
    libxml2-dev \
    libxmlsec1-dev \
    libffi-dev \
    liblzma-dev \
    # Network tools
    net-tools \
    netcat \
    # PPA req
    software-properties-common && \
    # Userland apps
    add-apt-repository -y ppa:mozillateam/ppa && \
    apt-get install -y --no-install-recommends \
    libreoffice \
    firefox-esr \
    x11-apps \
    xpdf \
    gedit \
    xpaint \
    tint2 \
    galculator \
    pcmanfm \
    unzip && \
    apt-get clean
|
||||||
|
|
||||||
|
# Install noVNC
|
||||||
|
# Clone noVNC at ref v1.5.0 into /opt/noVNC and websockify at ref v0.12.0 into
# noVNC's utils/websockify directory, then symlink vnc.html as index.html so the
# noVNC directory has an index page.
RUN git clone --branch v1.5.0 https://github.com/novnc/noVNC.git /opt/noVNC && \
|
||||||
|
git clone --branch v0.12.0 https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
|
||||||
|
ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html
|
||||||
|
|
||||||
|
# setup user
|
||||||
|
ENV USERNAME=computeruse
|
||||||
|
ENV HOME=/home/$USERNAME
|
||||||
|
RUN useradd -m -s /bin/bash -d $HOME $USERNAME
|
||||||
|
RUN echo "${USERNAME} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
|
||||||
|
USER computeruse
|
||||||
|
WORKDIR $HOME
|
||||||
|
|
||||||
|
# setup python
|
||||||
|
RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \
|
||||||
|
cd ~/.pyenv && src/configure && make -C src && cd .. && \
|
||||||
|
echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \
|
||||||
|
echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc && \
|
||||||
|
echo 'eval "$(pyenv init -)"' >> ~/.bashrc
|
||||||
|
ENV PYENV_ROOT="$HOME/.pyenv"
|
||||||
|
ENV PATH="$PYENV_ROOT/bin:$PATH"
|
||||||
|
ENV PYENV_VERSION_MAJOR=3
|
||||||
|
ENV PYENV_VERSION_MINOR=11
|
||||||
|
ENV PYENV_VERSION_PATCH=6
|
||||||
|
ENV PYENV_VERSION=$PYENV_VERSION_MAJOR.$PYENV_VERSION_MINOR.$PYENV_VERSION_PATCH
|
||||||
|
RUN eval "$(pyenv init -)" && \
|
||||||
|
pyenv install $PYENV_VERSION && \
|
||||||
|
pyenv global $PYENV_VERSION && \
|
||||||
|
pyenv rehash
|
||||||
|
|
||||||
|
# Prepend pyenv's shims and bin directories to PATH. They now come before the
# /opt/conda/envs/omni/bin and /opt/conda/bin entries added earlier in this file.
# NOTE(review): `python`/`pip` will presumably resolve to the pyenv-installed
# interpreter rather than the conda `omni` environment where requirements.txt
# was installed — confirm this is intended.
ENV PATH="$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH"
|
||||||
|
|
||||||
|
RUN python -m pip install --upgrade pip==23.1.2 setuptools==58.0.4 wheel==0.40.0 && \
|
||||||
|
python -m pip config set global.disable-pip-version-check true
|
||||||
|
|
||||||
|
# only reinstall if requirements.txt changes
|
||||||
|
# COPY --chown=$USERNAME:$USERNAME computer_use_demo/requirements.txt $HOME/computer_use_demo/requirements.txt
|
||||||
|
# RUN python -m pip install -r $HOME/computer_use_demo/requirements.txt
|
||||||
|
|
||||||
|
# setup desktop env & app
|
||||||
|
# COPY --chown=$USERNAME:$USERNAME image/ $HOME
|
||||||
|
# COPY --chown=$USERNAME:$USERNAME computer_use_demo/ $HOME/computer_use_demo/
|
||||||
|
|
||||||
|
ARG DISPLAY_NUM=1
|
||||||
|
ARG HEIGHT=768
|
||||||
|
ARG WIDTH=1024
|
||||||
|
ENV DISPLAY_NUM=$DISPLAY_NUM
|
||||||
|
ENV HEIGHT=$HEIGHT
|
||||||
|
ENV WIDTH=$WIDTH
|
||||||
|
|
||||||
|
# Set the entrypoint
|
||||||
|
# ENTRYPOINT ["/usr/src/app/entrypoint.sh"]
|
||||||
|
|
||||||
|
# docker build . -t omniparser-x-demo:local # manually build the docker image (optional)
|
||||||
Binary file not shown.
128
demo.ipynb
128
demo.ipynb
@@ -2,14 +2,14 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 35,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"model to cuda\n"
|
"model to cpu\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -18,7 +18,8 @@
|
|||||||
"import torch\n",
|
"import torch\n",
|
||||||
"from ultralytics import YOLO\n",
|
"from ultralytics import YOLO\n",
|
||||||
"from PIL import Image\n",
|
"from PIL import Image\n",
|
||||||
"device = 'cuda'\n",
|
"device = 'cpu'\n",
|
||||||
|
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
|
||||||
"model_path='weights/icon_detect/best.pt'\n",
|
"model_path='weights/icon_detect/best.pt'\n",
|
||||||
"model_path='weights/icon_detect_v1_5/model_v1_5.pt'\n",
|
"model_path='weights/icon_detect_v1_5/model_v1_5.pt'\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -30,7 +31,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 2,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@@ -57,7 +58,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@@ -66,7 +67,7 @@
|
|||||||
"(device(type='cuda', index=0), ultralytics.models.yolo.model.YOLO)"
|
"(device(type='cuda', index=0), ultralytics.models.yolo.model.YOLO)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 9,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@@ -77,7 +78,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 36,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@@ -86,8 +87,15 @@
|
|||||||
"text": [
|
"text": [
|
||||||
"image size: (1919, 1079)\n",
|
"image size: (1919, 1079)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"image 1/1 /home/yadonglu/OmniParser/imgs/word.png: 736x1280 115 icons, 51.7ms\n",
|
"image 1/1 /home/yadonglu/OmniParser/imgs/word.png: 736x1280 115 icons, 13.7ms\n",
|
||||||
"Speed: 5.0ms preprocess, 51.7ms inference, 1.6ms postprocess per image at shape (1, 3, 736, 1280)\n"
|
"Speed: 5.5ms preprocess, 13.7ms inference, 1.6ms postprocess per image at shape (1, 3, 736, 1280)\n",
|
||||||
|
"len(filtered_boxes): 151 65\n",
|
||||||
|
"time to prepare bbox: 0.01561737060546875\n",
|
||||||
|
"time to process image + tokenize text inputs: 0.09026336669921875\n",
|
||||||
|
"time to generate: 0.7382848262786865\n",
|
||||||
|
"time to get parsed content: 0.8477945327758789\n",
|
||||||
|
"ocr time: 0.6952385902404785\n",
|
||||||
|
"caption time: 1.245499849319458\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -127,9 +135,83 @@
|
|||||||
"cur_time_ocr = time.time() \n",
|
"cur_time_ocr = time.time() \n",
|
||||||
"\n",
|
"\n",
|
||||||
"dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=True, iou_threshold=0.7, scale_img=False, batch_size=128)\n",
|
"dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=True, iou_threshold=0.7, scale_img=False, batch_size=128)\n",
|
||||||
"cur_time_caption = time.time()\n"
|
"cur_time_caption = time.time()\n",
|
||||||
|
"print('ocr time:', cur_time_ocr - start)\n",
|
||||||
|
"print('caption time:', cur_time_caption - cur_time_ocr)\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"image size: (1919, 1079)\n",
|
||||||
|
"\n",
|
||||||
|
"image 1/1 /home/yadonglu/OmniParser/imgs/word.png: 736x1280 115 icons, 299.2ms\n",
|
||||||
|
"Speed: 5.7ms preprocess, 299.2ms inference, 3.7ms postprocess per image at shape (1, 3, 736, 1280)\n",
|
||||||
|
"len(filtered_boxes): 151 65\n",
|
||||||
|
"time to prepare bbox: 0.016057729721069336\n",
|
||||||
|
"time to process image + tokenize text inputs: 1.802201509475708\n",
|
||||||
|
"time to generate: 61.352588415145874\n",
|
||||||
|
"time to get parsed content: 63.17377543449402\n",
|
||||||
|
"ocr time: 0.8477699756622314\n",
|
||||||
|
"caption time: 64.17442154884338\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# run on cpu!!!\n",
|
||||||
|
"# reload utils\n",
|
||||||
|
"import importlib\n",
|
||||||
|
"import utils\n",
|
||||||
|
"importlib.reload(utils)\n",
|
||||||
|
"from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_yolo_model\n",
|
||||||
|
"\n",
|
||||||
|
"image_path = 'imgs/google_page.png'\n",
|
||||||
|
"image_path = 'imgs/windows_home.png'\n",
|
||||||
|
"# image_path = 'imgs/windows_multitab.png'\n",
|
||||||
|
"# image_path = 'imgs/omni3.jpg'\n",
|
||||||
|
"# image_path = 'imgs/ios.png'\n",
|
||||||
|
"image_path = 'imgs/word.png'\n",
|
||||||
|
"# image_path = 'imgs/excel2.png'\n",
|
||||||
|
"# image_path = 'imgs/mobile.png'\n",
|
||||||
|
"\n",
|
||||||
|
"image = Image.open(image_path)\n",
|
||||||
|
"image_rgb = image.convert('RGB')\n",
|
||||||
|
"print('image size:', image.size)\n",
|
||||||
|
"\n",
|
||||||
|
"box_overlay_ratio = max(image.size) / 3200\n",
|
||||||
|
"draw_bbox_config = {\n",
|
||||||
|
" 'text_scale': 0.8 * box_overlay_ratio,\n",
|
||||||
|
" 'text_thickness': max(int(2 * box_overlay_ratio), 1),\n",
|
||||||
|
" 'text_padding': max(int(3 * box_overlay_ratio), 1),\n",
|
||||||
|
" 'thickness': max(int(3 * box_overlay_ratio), 1),\n",
|
||||||
|
"}\n",
|
||||||
|
"BOX_TRESHOLD = 0.05\n",
|
||||||
|
"\n",
|
||||||
|
"import time\n",
|
||||||
|
"start = time.time()\n",
|
||||||
|
"ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.5}, use_paddleocr=True)\n",
|
||||||
|
"text, ocr_bbox = ocr_bbox_rslt\n",
|
||||||
|
"cur_time_ocr = time.time() \n",
|
||||||
|
"\n",
|
||||||
|
"dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=True, iou_threshold=0.7, scale_img=False, batch_size=128)\n",
|
||||||
|
"cur_time_caption = time.time()\n",
|
||||||
|
"print('ocr time:', cur_time_ocr - start)\n",
|
||||||
|
"print('caption time:', cur_time_caption - cur_time_ocr)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 37,
|
"execution_count": 37,
|
||||||
@@ -172,7 +254,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 38,
|
"execution_count": 16,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@@ -257,7 +339,7 @@
|
|||||||
" <td>icon</td>\n",
|
" <td>icon</td>\n",
|
||||||
" <td>[0.27768608927726746, 0.1485075205564499, 0.28...</td>\n",
|
" <td>[0.27768608927726746, 0.1485075205564499, 0.28...</td>\n",
|
||||||
" <td>True</td>\n",
|
" <td>True</td>\n",
|
||||||
" <td>Redo</td>\n",
|
" <td>Six</td>\n",
|
||||||
" <td>146</td>\n",
|
" <td>146</td>\n",
|
||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
" <tr>\n",
|
" <tr>\n",
|
||||||
@@ -265,7 +347,7 @@
|
|||||||
" <td>icon</td>\n",
|
" <td>icon</td>\n",
|
||||||
" <td>[0.9438582062721252, 0.9580937027931213, 0.995...</td>\n",
|
" <td>[0.9438582062721252, 0.9580937027931213, 0.995...</td>\n",
|
||||||
" <td>True</td>\n",
|
" <td>True</td>\n",
|
||||||
" <td>Notifications.</td>\n",
|
" <td>battery charge indicator</td>\n",
|
||||||
" <td>147</td>\n",
|
" <td>147</td>\n",
|
||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
" <tr>\n",
|
" <tr>\n",
|
||||||
@@ -273,7 +355,7 @@
|
|||||||
" <td>icon</td>\n",
|
" <td>icon</td>\n",
|
||||||
" <td>[0.31950756907463074, 0.3229200839996338, 0.33...</td>\n",
|
" <td>[0.31950756907463074, 0.3229200839996338, 0.33...</td>\n",
|
||||||
" <td>True</td>\n",
|
" <td>True</td>\n",
|
||||||
" <td>minimizing a window.</td>\n",
|
" <td>A menu or list of options.</td>\n",
|
||||||
" <td>148</td>\n",
|
" <td>148</td>\n",
|
||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
" <tr>\n",
|
" <tr>\n",
|
||||||
@@ -281,7 +363,7 @@
|
|||||||
" <td>icon</td>\n",
|
" <td>icon</td>\n",
|
||||||
" <td>[0.08737719058990479, 0.148496612906456, 0.095...</td>\n",
|
" <td>[0.08737719058990479, 0.148496612906456, 0.095...</td>\n",
|
||||||
" <td>True</td>\n",
|
" <td>True</td>\n",
|
||||||
" <td>Redo</td>\n",
|
" <td>5,5L9,5 4.5z</td>\n",
|
||||||
" <td>149</td>\n",
|
" <td>149</td>\n",
|
||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
" <tr>\n",
|
" <tr>\n",
|
||||||
@@ -289,7 +371,7 @@
|
|||||||
" <td>icon</td>\n",
|
" <td>icon</td>\n",
|
||||||
" <td>[0.7414734959602356, 0.000822930654976517, 0.7...</td>\n",
|
" <td>[0.7414734959602356, 0.000822930654976517, 0.7...</td>\n",
|
||||||
" <td>True</td>\n",
|
" <td>True</td>\n",
|
||||||
" <td>M0,0L9,0 4.5,5z</td>\n",
|
" <td>Unordered List</td>\n",
|
||||||
" <td>150</td>\n",
|
" <td>150</td>\n",
|
||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
" </tbody>\n",
|
" </tbody>\n",
|
||||||
@@ -318,16 +400,16 @@
|
|||||||
"3 O Search 3 \n",
|
"3 O Search 3 \n",
|
||||||
"4 File 4 \n",
|
"4 File 4 \n",
|
||||||
".. ... ... \n",
|
".. ... ... \n",
|
||||||
"146 Redo 146 \n",
|
"146 Six 146 \n",
|
||||||
"147 Notifications. 147 \n",
|
"147 battery charge indicator 147 \n",
|
||||||
"148 minimizing a window. 148 \n",
|
"148 A menu or list of options. 148 \n",
|
||||||
"149 Redo 149 \n",
|
"149 5,5L9,5 4.5z 149 \n",
|
||||||
"150 M0,0L9,0 4.5,5z 150 \n",
|
"150 Unordered List 150 \n",
|
||||||
"\n",
|
"\n",
|
||||||
"[151 rows x 5 columns]"
|
"[151 rows x 5 columns]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 38,
|
"execution_count": 16,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@@ -376,7 +458,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "pilot",
|
"display_name": "omni",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 328 KiB After Width: | Height: | Size: 560 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 404 KiB After Width: | Height: | Size: 720 KiB |
@@ -1,4 +1,4 @@
|
|||||||
from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_dino_model, get_yolo_model
|
from utils import get_som_labeled_img, check_ocr_box, get_yolo_model
|
||||||
import torch
|
import torch
|
||||||
from ultralytics import YOLO
|
from ultralytics import YOLO
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|||||||
15
utils.py
15
utils.py
@@ -84,11 +84,15 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
|
|||||||
else:
|
else:
|
||||||
non_ocr_boxes = filtered_boxes
|
non_ocr_boxes = filtered_boxes
|
||||||
croped_pil_image = []
|
croped_pil_image = []
|
||||||
|
t0 = time.time()
|
||||||
for i, coord in enumerate(non_ocr_boxes):
|
for i, coord in enumerate(non_ocr_boxes):
|
||||||
xmin, xmax = int(coord[0]*image_source.shape[1]), int(coord[2]*image_source.shape[1])
|
xmin, xmax = int(coord[0]*image_source.shape[1]), int(coord[2]*image_source.shape[1])
|
||||||
ymin, ymax = int(coord[1]*image_source.shape[0]), int(coord[3]*image_source.shape[0])
|
ymin, ymax = int(coord[1]*image_source.shape[0]), int(coord[3]*image_source.shape[0])
|
||||||
cropped_image = image_source[ymin:ymax, xmin:xmax, :]
|
cropped_image = image_source[ymin:ymax, xmin:xmax, :]
|
||||||
|
# resize the image to 224x224 to avoid long overhead in clipimageprocessor # TODO
|
||||||
|
cropped_image = cv2.resize(cropped_image, (224, 224))
|
||||||
croped_pil_image.append(to_pil(cropped_image))
|
croped_pil_image.append(to_pil(cropped_image))
|
||||||
|
print('time to prepare bbox:', time.time()-t0)
|
||||||
|
|
||||||
model, processor = caption_model_processor['model'], caption_model_processor['processor']
|
model, processor = caption_model_processor['model'], caption_model_processor['processor']
|
||||||
if not prompt:
|
if not prompt:
|
||||||
@@ -103,14 +107,19 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
|
|||||||
for i in range(0, len(croped_pil_image), batch_size):
|
for i in range(0, len(croped_pil_image), batch_size):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
batch = croped_pil_image[i:i+batch_size]
|
batch = croped_pil_image[i:i+batch_size]
|
||||||
|
t1 = time.time()
|
||||||
if model.device.type == 'cuda':
|
if model.device.type == 'cuda':
|
||||||
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device, dtype=torch.float16)
|
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt", do_resize=False).to(device=device, dtype=torch.float16)
|
||||||
else:
|
else:
|
||||||
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device)
|
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device)
|
||||||
|
t2 = time.time()
|
||||||
|
print('time to process image + tokenize text inputs:', t2-t1)
|
||||||
if 'florence' in model.config.name_or_path:
|
if 'florence' in model.config.name_or_path:
|
||||||
generated_ids = model.generate(input_ids=inputs["input_ids"],pixel_values=inputs["pixel_values"],max_new_tokens=100,num_beams=3, do_sample=False)
|
generated_ids = model.generate(input_ids=inputs["input_ids"],pixel_values=inputs["pixel_values"],max_new_tokens=20,num_beams=1, do_sample=False)
|
||||||
else:
|
else:
|
||||||
generated_ids = model.generate(**inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, num_return_sequences=1) # temperature=0.01, do_sample=True,
|
generated_ids = model.generate(**inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, num_return_sequences=1) # temperature=0.01, do_sample=True,
|
||||||
|
t3 = time.time()
|
||||||
|
print('time to generate:', t3-t2)
|
||||||
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
||||||
generated_text = [gen.strip() for gen in generated_text]
|
generated_text = [gen.strip() for gen in generated_text]
|
||||||
generated_texts.extend(generated_text)
|
generated_texts.extend(generated_text)
|
||||||
@@ -437,6 +446,7 @@ def get_som_labeled_img(img_path, model=None, BOX_TRESHOLD = 0.01, output_coord_
|
|||||||
|
|
||||||
|
|
||||||
# get parsed icon local semantics
|
# get parsed icon local semantics
|
||||||
|
time1 = time.time()
|
||||||
if use_local_semantics:
|
if use_local_semantics:
|
||||||
caption_model = caption_model_processor['model']
|
caption_model = caption_model_processor['model']
|
||||||
if 'phi3_v' in caption_model.config.model_type:
|
if 'phi3_v' in caption_model.config.model_type:
|
||||||
@@ -456,6 +466,7 @@ def get_som_labeled_img(img_path, model=None, BOX_TRESHOLD = 0.01, output_coord_
|
|||||||
else:
|
else:
|
||||||
ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)]
|
ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)]
|
||||||
parsed_content_merged = ocr_text
|
parsed_content_merged = ocr_text
|
||||||
|
print('time to get parsed content:', time.time()-time1)
|
||||||
|
|
||||||
filtered_boxes = box_convert(boxes=filtered_boxes, in_fmt="xyxy", out_fmt="cxcywh")
|
filtered_boxes = box_convert(boxes=filtered_boxes, in_fmt="xyxy", out_fmt="cxcywh")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user