diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5737680
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,197 @@
+# Dockerfile for OmniParser with GPU support and OpenGL libraries
+#
+# This Dockerfile creates an environment with NVIDIA CUDA support and the
+# dependencies needed to run the OmniParser project. The configuration
+# supports applications that rely on Python 3.12, OpenCV, Hugging Face
+# transformers, and Gradio. It also includes steps to pull large files
+# from Git LFS and a script to convert model weights from .safetensors
+# to .pt format. The container runs a Gradio server by default, exposed
+# on port 7861.
+#
+# Base image: nvidia/cuda:12.3.1-devel-ubuntu22.04
+#
+# Key features:
+# - System dependencies for OpenGL to support graphical libraries.
+# - Miniconda for Python 3.12, allowing for environment management.
+# - Git Large File Storage (LFS) setup for handling large model files.
+# - Requirements file installation, including pinned versions of
+#   OpenCV and Hugging Face Hub.
+# - Entrypoint script execution with Gradio server configuration for
+#   external access.
+
+FROM nvidia/cuda:12.3.1-devel-ubuntu22.04
+
+# Install system dependencies with explicit OpenGL libraries
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    git \
+    git-lfs \
+    wget \
+    libgl1 \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender1 \
+    libglu1-mesa \
+    python3-opencv \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && git lfs install
+
+# Install Miniconda for Python 3.12
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
+    bash miniconda.sh -b -p /opt/conda && \
+    rm miniconda.sh
+ENV PATH="/opt/conda/bin:$PATH"
+
+# Create and activate Conda environment with Python 3.12, and set it as the default
+RUN conda create -n omni python=3.12 && \
+    echo "source activate omni" > ~/.bashrc
+ENV CONDA_DEFAULT_ENV=omni
+ENV PATH="/opt/conda/envs/omni/bin:$PATH"
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Copy project files and requirements
+COPY . .
+COPY requirements.txt /usr/src/app/requirements.txt
+
+# Initialize Git LFS and pull LFS files
+RUN git lfs install && \
+    git lfs pull
+
+# Install dependencies from requirements.txt with a pinned opencv-python-headless version
+RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
+    pip uninstall -y opencv-python opencv-python-headless && \
+    pip install --no-cache-dir opencv-python-headless==4.8.1.78 && \
+    pip install -r requirements.txt && \
+    pip install huggingface_hub
+
+# Run download.py to fetch model weights and convert safetensors to .pt format
+# RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
+#     python download.py && \
+#     echo "Contents of weights directory:" && \
+#     ls -lR weights && \
+#     python weights/convert_safetensor_to_pt.py
+
+# Expose the default Gradio port
+EXPOSE 7861
+
+# Configure Gradio to be accessible externally
+ENV GRADIO_SERVER_NAME="0.0.0.0"
+
+# Copy and set permissions for entrypoint script
+# COPY entrypoint.sh /usr/src/app/entrypoint.sh
+# RUN chmod +x /usr/src/app/entrypoint.sh
+
+# To debug, keep the container running
+# CMD ["tail", "-f", "/dev/null"]
+
+################################################################################################
+# virtual display related setup --> from anthropic-quickstarts/computer-use-demo/Dockerfile
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBIAN_PRIORITY=high
+
+RUN apt-get update && \
+    apt-get -y upgrade && \
+    apt-get -y install \
+    # UI Requirements
+    xvfb \
+    xterm \
+    xdotool \
+    scrot \
+    imagemagick \
+    sudo \
+    mutter \
+    x11vnc \
+    # Python/pyenv reqs
+    build-essential \
+    libssl-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    libreadline-dev \
+    libsqlite3-dev \
+    curl \
+    git \
+    libncursesw5-dev \
+    xz-utils \
+    tk-dev \
+    libxml2-dev \
+    libxmlsec1-dev \
+    libffi-dev \
+    liblzma-dev \
+    # Network tools
+    net-tools \
+    netcat \
+    # PPA req
+    software-properties-common && \
+    # Userland apps
+    sudo add-apt-repository ppa:mozillateam/ppa && \
+    sudo apt-get install -y --no-install-recommends \
+    libreoffice \
+    firefox-esr \
+    x11-apps \
+    xpdf \
+    gedit \
+    xpaint \
+    tint2 \
+    galculator \
+    pcmanfm \
+    unzip && \
+    apt-get clean
+
+# Install noVNC
+RUN git clone --branch v1.5.0 https://github.com/novnc/noVNC.git /opt/noVNC && \
+    git clone --branch v0.12.0 https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
+    ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html
+
+# setup user
+ENV USERNAME=computeruse
+ENV HOME=/home/$USERNAME
+RUN useradd -m -s /bin/bash -d $HOME $USERNAME
+RUN echo "${USERNAME} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+USER computeruse
+WORKDIR $HOME
+
+# setup python
+RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \
+    cd ~/.pyenv && src/configure && make -C src && cd .. && \
+    echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \
+    echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc && \
+    echo 'eval "$(pyenv init -)"' >> ~/.bashrc
+ENV PYENV_ROOT="$HOME/.pyenv"
+ENV PATH="$PYENV_ROOT/bin:$PATH"
+ENV PYENV_VERSION_MAJOR=3
+ENV PYENV_VERSION_MINOR=11
+ENV PYENV_VERSION_PATCH=6
+ENV PYENV_VERSION=$PYENV_VERSION_MAJOR.$PYENV_VERSION_MINOR.$PYENV_VERSION_PATCH
+RUN eval "$(pyenv init -)" && \
+    pyenv install $PYENV_VERSION && \
+    pyenv global $PYENV_VERSION && \
+    pyenv rehash
+
+ENV PATH="$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH"
+
+RUN python -m pip install --upgrade pip==23.1.2 setuptools==58.0.4 wheel==0.40.0 && \
+    python -m pip config set global.disable-pip-version-check true
+
+# only reinstall if requirements.txt changes
+# COPY --chown=$USERNAME:$USERNAME computer_use_demo/requirements.txt $HOME/computer_use_demo/requirements.txt
+# RUN python -m pip install -r $HOME/computer_use_demo/requirements.txt
+
+# setup desktop env & app
+# COPY --chown=$USERNAME:$USERNAME image/ $HOME
+# COPY --chown=$USERNAME:$USERNAME computer_use_demo/ $HOME/computer_use_demo/
+
+ARG DISPLAY_NUM=1
+ARG HEIGHT=768
+ARG WIDTH=1024
+ENV DISPLAY_NUM=$DISPLAY_NUM
+ENV HEIGHT=$HEIGHT
+ENV WIDTH=$WIDTH
+
+# Set the entrypoint
+# ENTRYPOINT ["/usr/src/app/entrypoint.sh"]
+
+# docker build . -t omniparser-x-demo:local  # manually build the docker image (optional)
diff --git a/__pycache__/utils.cpython-312.pyc b/__pycache__/utils.cpython-312.pyc
index a1cfe48..22ae984 100644
Binary files a/__pycache__/utils.cpython-312.pyc and b/__pycache__/utils.cpython-312.pyc differ
diff --git a/demo.ipynb b/demo.ipynb
index 835c69a..7489fbe 100644
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -2,14 +2,14 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "model to cuda\n"
+      "model to cpu\n"
      ]
     }
    ],
@@ -18,7 +18,8 @@
     "import torch\n",
     "from ultralytics import YOLO\n",
     "from PIL import Image\n",
-    "device = 'cuda'\n",
+    "device = 'cpu'\n",
+    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
     "model_path='weights/icon_detect/best.pt'\n",
     "model_path='weights/icon_detect_v1_5/model_v1_5.pt'\n",
@@ -30,7 +31,7 @@
    },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -57,7 +58,7 @@
    },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -66,7 +67,7 @@
       "(device(type='cuda', index=0), ultralytics.models.yolo.model.YOLO)"
      ]
     },
-     "execution_count": 9,
+     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -77,7 +78,7 @@
    },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -86,8 +87,15 @@
      "text": [
       "image size: (1919, 1079)\n",
       "\n",
-      "image 1/1 /home/yadonglu/OmniParser/imgs/word.png: 736x1280 115 icons, 51.7ms\n",
-      "Speed: 5.0ms preprocess, 51.7ms inference, 1.6ms postprocess per image at shape (1, 3, 736, 1280)\n"
+      "image 1/1 /home/yadonglu/OmniParser/imgs/word.png: 736x1280 115 icons, 13.7ms\n",
+      "Speed: 5.5ms preprocess, 13.7ms inference, 1.6ms postprocess per image at shape (1, 3, 736, 1280)\n",
+      "len(filtered_boxes): 151 65\n",
+      "time to prepare bbox: 0.01561737060546875\n",
+      "time to process image + tokenize text inputs: 0.09026336669921875\n",
+      "time to generate: 0.7382848262786865\n",
+      "time to get parsed content: 0.8477945327758789\n",
"ocr time: 0.6952385902404785\n", + "caption time: 1.245499849319458\n" ] } ], @@ -127,9 +135,83 @@ "cur_time_ocr = time.time() \n", "\n", "dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=True, iou_threshold=0.7, scale_img=False, batch_size=128)\n", - "cur_time_caption = time.time()\n" + "cur_time_caption = time.time()\n", + "print('ocr time:', cur_time_ocr - start)\n", + "print('caption time:', cur_time_caption - cur_time_ocr)\n" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "image size: (1919, 1079)\n", + "\n", + "image 1/1 /home/yadonglu/OmniParser/imgs/word.png: 736x1280 115 icons, 299.2ms\n", + "Speed: 5.7ms preprocess, 299.2ms inference, 3.7ms postprocess per image at shape (1, 3, 736, 1280)\n", + "len(filtered_boxes): 151 65\n", + "time to prepare bbox: 0.016057729721069336\n", + "time to process image + tokenize text inputs: 1.802201509475708\n", + "time to generate: 61.352588415145874\n", + "time to get parsed content: 63.17377543449402\n", + "ocr time: 0.8477699756622314\n", + "caption time: 64.17442154884338\n" + ] + } + ], + "source": [ + "# run on cpu!!!\n", + "# reload utils\n", + "import importlib\n", + "import utils\n", + "importlib.reload(utils)\n", + "from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_yolo_model\n", + "\n", + "image_path = 'imgs/google_page.png'\n", + "image_path = 'imgs/windows_home.png'\n", + "# image_path = 'imgs/windows_multitab.png'\n", + "# image_path = 'imgs/omni3.jpg'\n", + "# image_path = 'imgs/ios.png'\n", + "image_path = 'imgs/word.png'\n", + "# image_path = 'imgs/excel2.png'\n", + "# image_path = 'imgs/mobile.png'\n", + "\n", + "image = Image.open(image_path)\n", + "image_rgb = image.convert('RGB')\n", + "print('image size:', image.size)\n", + "\n", + "box_overlay_ratio = max(image.size) / 3200\n", + "draw_bbox_config = {\n", + " 'text_scale': 0.8 * box_overlay_ratio,\n", + " 'text_thickness': max(int(2 * box_overlay_ratio), 1),\n", + " 'text_padding': max(int(3 * box_overlay_ratio), 1),\n", + " 'thickness': max(int(3 * box_overlay_ratio), 1),\n", + "}\n", + "BOX_TRESHOLD = 0.05\n", + "\n", + "import time\n", + "start = time.time()\n", + "ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.5}, use_paddleocr=True)\n", + "text, ocr_bbox = ocr_bbox_rslt\n", + "cur_time_ocr = time.time() \n", + "\n", + "dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=True, iou_threshold=0.7, scale_img=False, batch_size=128)\n", + "cur_time_caption = time.time()\n", + "print('ocr time:', cur_time_ocr - start)\n", + "print('caption time:', cur_time_caption - cur_time_ocr)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 37, @@ -172,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 38, + 
"execution_count": 16, "metadata": {}, "outputs": [ { @@ -257,7 +339,7 @@ " icon\n", " [0.27768608927726746, 0.1485075205564499, 0.28...\n", " True\n", - " Redo\n", + " Six\n", " 146\n", " \n", " \n", @@ -265,7 +347,7 @@ " icon\n", " [0.9438582062721252, 0.9580937027931213, 0.995...\n", " True\n", - " Notifications.\n", + " battery charge indicator\n", " 147\n", " \n", " \n", @@ -273,7 +355,7 @@ " icon\n", " [0.31950756907463074, 0.3229200839996338, 0.33...\n", " True\n", - " minimizing a window.\n", + " A menu or list of options.\n", " 148\n", " \n", " \n", @@ -281,7 +363,7 @@ " icon\n", " [0.08737719058990479, 0.148496612906456, 0.095...\n", " True\n", - " Redo\n", + " 5,5L9,5 4.5z\n", " 149\n", " \n", " \n", @@ -289,7 +371,7 @@ " icon\n", " [0.7414734959602356, 0.000822930654976517, 0.7...\n", " True\n", - " M0,0L9,0 4.5,5z\n", + " Unordered List\n", " 150\n", " \n", " \n", @@ -318,16 +400,16 @@ "3 O Search 3 \n", "4 File 4 \n", ".. ... ... \n", - "146 Redo 146 \n", - "147 Notifications. 147 \n", - "148 minimizing a window. 148 \n", - "149 Redo 149 \n", - "150 M0,0L9,0 4.5,5z 150 \n", + "146 Six 146 \n", + "147 battery charge indicator 147 \n", + "148 A menu or list of options. 148 \n", + "149 5,5L9,5 4.5z 149 \n", + "150 Unordered List 150 \n", "\n", "[151 rows x 5 columns]" ] }, - "execution_count": 38, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -376,7 +458,7 @@ ], "metadata": { "kernelspec": { - "display_name": "pilot", + "display_name": "omni", "language": "python", "name": "python3" }, diff --git a/imgs/demo_image.jpg b/imgs/demo_image.jpg index ab00b9e..eee74c1 100644 Binary files a/imgs/demo_image.jpg and b/imgs/demo_image.jpg differ diff --git a/imgs/demo_image_som.jpg b/imgs/demo_image_som.jpg index 0156c81..7f99769 100644 Binary files a/imgs/demo_image_som.jpg and b/imgs/demo_image_som.jpg differ diff --git a/omniparser.py b/omniparser.py index 37b7780..634ae9f 100644 --- a/omniparser.py +++ b/omniparser.py @@ -1,4 +1,4 @@ -from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_dino_model, get_yolo_model +from utils import get_som_labeled_img, check_ocr_box, get_yolo_model import torch from ultralytics import YOLO from PIL import Image diff --git a/utils.py b/utils.py index c4865b0..c4b6f88 100755 --- a/utils.py +++ b/utils.py @@ -84,11 +84,15 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_ else: non_ocr_boxes = filtered_boxes croped_pil_image = [] + t0 = time.time() for i, coord in enumerate(non_ocr_boxes): xmin, xmax = int(coord[0]*image_source.shape[1]), int(coord[2]*image_source.shape[1]) ymin, ymax = int(coord[1]*image_source.shape[0]), int(coord[3]*image_source.shape[0]) cropped_image = image_source[ymin:ymax, xmin:xmax, :] + # resize the image to 224x224 to avoid long overhead in clipimageprocessor # TODO + cropped_image = cv2.resize(cropped_image, (224, 224)) croped_pil_image.append(to_pil(cropped_image)) + print('time to prepare bbox:', time.time()-t0) model, processor = caption_model_processor['model'], caption_model_processor['processor'] if not prompt: @@ -103,14 +107,19 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_ for i in range(0, len(croped_pil_image), batch_size): start = time.time() batch = croped_pil_image[i:i+batch_size] + t1 = time.time() if model.device.type == 'cuda': - inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device, dtype=torch.float16) + inputs = 
+            inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt", do_resize=False).to(device=device, dtype=torch.float16)
         else:
             inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device)
+        t2 = time.time()
+        print('time to process image + tokenize text inputs:', t2-t1)
         if 'florence' in model.config.name_or_path:
-            generated_ids = model.generate(input_ids=inputs["input_ids"],pixel_values=inputs["pixel_values"],max_new_tokens=100,num_beams=3, do_sample=False)
+            generated_ids = model.generate(input_ids=inputs["input_ids"],pixel_values=inputs["pixel_values"],max_new_tokens=20,num_beams=1, do_sample=False)
         else:
             generated_ids = model.generate(**inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, num_return_sequences=1) # temperature=0.01, do_sample=True,
+        t3 = time.time()
+        print('time to generate:', t3-t2)
         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
         generated_text = [gen.strip() for gen in generated_text]
         generated_texts.extend(generated_text)
@@ -437,6 +446,7 @@ def get_som_labeled_img(img_path, model=None, BOX_TRESHOLD = 0.01, output_coord_
 
     # get parsed icon local semantics
+    time1 = time.time()
     if use_local_semantics:
         caption_model = caption_model_processor['model']
         if 'phi3_v' in caption_model.config.model_type:
@@ -456,6 +466,7 @@ def get_som_labeled_img(img_path, model=None, BOX_TRESHOLD = 0.01, output_coord_
         else:
             ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)]
             parsed_content_merged = ocr_text
+    print('time to get parsed content:', time.time()-time1)
 
     filtered_boxes = box_convert(boxes=filtered_boxes, in_fmt="xyxy", out_fmt="cxcywh")
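
Note on the utils.py hunk above: it resizes each icon crop to 224x224 with cv2 once at crop time, then passes do_resize=False on the CUDA path so the processor (CLIPImageProcessor under the Florence caption model) does not resize a second time. A self-contained sketch of that crop-and-resize pattern, using a random stand-in screenshot and made-up boxes since no real image or box data is part of this diff:

import time
import cv2
import numpy as np
from PIL import Image

def crop_and_resize(image_source, boxes, size=(224, 224)):
    # Crop normalized xyxy boxes out of an HxWx3 uint8 array and resize each
    # crop once here, so a downstream image processor can be called with
    # do_resize=False and skip its own per-image resize.
    h, w = image_source.shape[:2]
    crops = []
    for x0, y0, x1, y1 in boxes:
        xmin, xmax = int(x0 * w), int(x1 * w)
        ymin, ymax = int(y0 * h), int(y1 * h)
        crops.append(Image.fromarray(cv2.resize(image_source[ymin:ymax, xmin:xmax, :], size)))
    return crops

img = np.random.randint(0, 256, (1080, 1920, 3), dtype=np.uint8)  # stand-in screenshot
boxes = [(0.10, 0.10, 0.20, 0.15), (0.50, 0.50, 0.60, 0.60)]      # fake icon boxes
t0 = time.time()
crops = crop_and_resize(img, boxes)
print('prepared', len(crops), 'crops in', round(time.time() - t0, 4), 's')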
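
Note on the commented-out download step in the Dockerfile: it invokes weights/convert_safetensor_to_pt.py, which is not included in this diff. For reference, a minimal sketch of what such a safetensors-to-.pt conversion typically does; the input and output paths below are placeholders, not the repository's actual layout:

import torch
from safetensors.torch import load_file

# Hypothetical paths -- the real script decides where the weights live.
state_dict = load_file('weights/icon_caption_florence/model.safetensors')
torch.save(state_dict, 'weights/icon_caption_florence/model.pt')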