This commit is contained in:
yadonglu
2025-01-04 20:14:49 -08:00
7 changed files with 1290 additions and 1199 deletions

View File

@@ -19,9 +19,7 @@
# - Entrypoint script execution with Gradio server configuration for # - Entrypoint script execution with Gradio server configuration for
# external access. # external access.
# If it is a GPU environment, use nvidia/cuda:12.3.1-devel-ubuntu22.04, otherwise use ubuntu:22.04 FROM nvidia/cuda:12.3.1-devel-ubuntu22.04
# FROM nvidia/cuda:12.3.1-devel-ubuntu22.04
FROM docker.io/ubuntu:22.04
# Install system dependencies with explicit OpenGL libraries # Install system dependencies with explicit OpenGL libraries
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
@@ -68,8 +66,8 @@ RUN git lfs install && \
# Install dependencies from requirements.txt with specific opencv-python-headless version # Install dependencies from requirements.txt with specific opencv-python-headless version
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \ RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
# pip uninstall -y opencv-python opencv-python-headless && \ pip uninstall -y opencv-python opencv-python-headless && \
# pip install --no-cache-dir opencv-python-headless==4.8.1.78 && \ pip install --no-cache-dir opencv-python-headless==4.8.1.78 && \
pip install -r requirements.txt && \ pip install -r requirements.txt && \
pip install huggingface_hub pip install huggingface_hub
@@ -200,4 +198,4 @@ ENV WIDTH=$WIDTH
# Set the entrypoint # Set the entrypoint
# ENTRYPOINT ["/usr/src/app/entrypoint.sh"] # ENTRYPOINT ["/usr/src/app/entrypoint.sh"]
# sudo docker build . -t omniparser-x-demo:local # manually build the docker image (optional) # docker build . -t omniparser-x-demo:local # manually build the docker image (optional)

Binary file not shown.

View File

@@ -2,14 +2,14 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 35, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"model to cuda\n" "model to cpu\n"
] ]
} }
], ],
@@ -18,7 +18,8 @@
"import torch\n", "import torch\n",
"from ultralytics import YOLO\n", "from ultralytics import YOLO\n",
"from PIL import Image\n", "from PIL import Image\n",
"device = 'cuda'\n", "device = 'cpu'\n",
"device = 'gpu' if torch.cuda.is_available() else 'cpu'\n",
"model_path='weights/icon_detect/best.pt'\n", "model_path='weights/icon_detect/best.pt'\n",
"model_path='weights/icon_detect_v1_5/model_v1_5.pt'\n", "model_path='weights/icon_detect_v1_5/model_v1_5.pt'\n",
"\n", "\n",
@@ -30,7 +31,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -57,7 +58,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -66,7 +67,7 @@
"(device(type='cuda', index=0), ultralytics.models.yolo.model.YOLO)" "(device(type='cuda', index=0), ultralytics.models.yolo.model.YOLO)"
] ]
}, },
"execution_count": 9, "execution_count": 3,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@@ -77,7 +78,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 36, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -86,8 +87,15 @@
"text": [ "text": [
"image size: (1919, 1079)\n", "image size: (1919, 1079)\n",
"\n", "\n",
"image 1/1 /home/yadonglu/OmniParser/imgs/word.png: 736x1280 115 icons, 51.7ms\n", "image 1/1 /home/yadonglu/OmniParser/imgs/word.png: 736x1280 115 icons, 13.7ms\n",
"Speed: 5.0ms preprocess, 51.7ms inference, 1.6ms postprocess per image at shape (1, 3, 736, 1280)\n" "Speed: 5.5ms preprocess, 13.7ms inference, 1.6ms postprocess per image at shape (1, 3, 736, 1280)\n",
"len(filtered_boxes): 151 65\n",
"time to prepare bbox: 0.01561737060546875\n",
"time to process image + tokenize text inputs: 0.09026336669921875\n",
"time to generate: 0.7382848262786865\n",
"time to get parsed content: 0.8477945327758789\n",
"ocr time: 0.6952385902404785\n",
"caption time: 1.245499849319458\n"
] ]
} }
], ],
@@ -127,9 +135,83 @@
"cur_time_ocr = time.time() \n", "cur_time_ocr = time.time() \n",
"\n", "\n",
"dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=True, iou_threshold=0.7, scale_img=False, batch_size=128)\n", "dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=True, iou_threshold=0.7, scale_img=False, batch_size=128)\n",
"cur_time_caption = time.time()\n" "cur_time_caption = time.time()\n",
"print('ocr time:', cur_time_ocr - start)\n",
"print('caption time:', cur_time_caption - cur_time_ocr)\n"
] ]
}, },
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"image size: (1919, 1079)\n",
"\n",
"image 1/1 /home/yadonglu/OmniParser/imgs/word.png: 736x1280 115 icons, 299.2ms\n",
"Speed: 5.7ms preprocess, 299.2ms inference, 3.7ms postprocess per image at shape (1, 3, 736, 1280)\n",
"len(filtered_boxes): 151 65\n",
"time to prepare bbox: 0.016057729721069336\n",
"time to process image + tokenize text inputs: 1.802201509475708\n",
"time to generate: 61.352588415145874\n",
"time to get parsed content: 63.17377543449402\n",
"ocr time: 0.8477699756622314\n",
"caption time: 64.17442154884338\n"
]
}
],
"source": [
"# run on cpu!!!\n",
"# reload utils\n",
"import importlib\n",
"import utils\n",
"importlib.reload(utils)\n",
"from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_yolo_model\n",
"\n",
"image_path = 'imgs/google_page.png'\n",
"image_path = 'imgs/windows_home.png'\n",
"# image_path = 'imgs/windows_multitab.png'\n",
"# image_path = 'imgs/omni3.jpg'\n",
"# image_path = 'imgs/ios.png'\n",
"image_path = 'imgs/word.png'\n",
"# image_path = 'imgs/excel2.png'\n",
"# image_path = 'imgs/mobile.png'\n",
"\n",
"image = Image.open(image_path)\n",
"image_rgb = image.convert('RGB')\n",
"print('image size:', image.size)\n",
"\n",
"box_overlay_ratio = max(image.size) / 3200\n",
"draw_bbox_config = {\n",
" 'text_scale': 0.8 * box_overlay_ratio,\n",
" 'text_thickness': max(int(2 * box_overlay_ratio), 1),\n",
" 'text_padding': max(int(3 * box_overlay_ratio), 1),\n",
" 'thickness': max(int(3 * box_overlay_ratio), 1),\n",
"}\n",
"BOX_TRESHOLD = 0.05\n",
"\n",
"import time\n",
"start = time.time()\n",
"ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.5}, use_paddleocr=True)\n",
"text, ocr_bbox = ocr_bbox_rslt\n",
"cur_time_ocr = time.time() \n",
"\n",
"dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=True, iou_threshold=0.7, scale_img=False, batch_size=128)\n",
"cur_time_caption = time.time()\n",
"print('ocr time:', cur_time_ocr - start)\n",
"print('caption time:', cur_time_caption - cur_time_ocr)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 37, "execution_count": 37,
@@ -172,7 +254,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 38, "execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -257,7 +339,7 @@
" <td>icon</td>\n", " <td>icon</td>\n",
" <td>[0.27768608927726746, 0.1485075205564499, 0.28...</td>\n", " <td>[0.27768608927726746, 0.1485075205564499, 0.28...</td>\n",
" <td>True</td>\n", " <td>True</td>\n",
" <td>Redo</td>\n", " <td>Six</td>\n",
" <td>146</td>\n", " <td>146</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
@@ -265,7 +347,7 @@
" <td>icon</td>\n", " <td>icon</td>\n",
" <td>[0.9438582062721252, 0.9580937027931213, 0.995...</td>\n", " <td>[0.9438582062721252, 0.9580937027931213, 0.995...</td>\n",
" <td>True</td>\n", " <td>True</td>\n",
" <td>Notifications.</td>\n", " <td>battery charge indicator</td>\n",
" <td>147</td>\n", " <td>147</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
@@ -273,7 +355,7 @@
" <td>icon</td>\n", " <td>icon</td>\n",
" <td>[0.31950756907463074, 0.3229200839996338, 0.33...</td>\n", " <td>[0.31950756907463074, 0.3229200839996338, 0.33...</td>\n",
" <td>True</td>\n", " <td>True</td>\n",
" <td>minimizing a window.</td>\n", " <td>A menu or list of options.</td>\n",
" <td>148</td>\n", " <td>148</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
@@ -281,7 +363,7 @@
" <td>icon</td>\n", " <td>icon</td>\n",
" <td>[0.08737719058990479, 0.148496612906456, 0.095...</td>\n", " <td>[0.08737719058990479, 0.148496612906456, 0.095...</td>\n",
" <td>True</td>\n", " <td>True</td>\n",
" <td>Redo</td>\n", " <td>5,5L9,5 4.5z</td>\n",
" <td>149</td>\n", " <td>149</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
@@ -289,7 +371,7 @@
" <td>icon</td>\n", " <td>icon</td>\n",
" <td>[0.7414734959602356, 0.000822930654976517, 0.7...</td>\n", " <td>[0.7414734959602356, 0.000822930654976517, 0.7...</td>\n",
" <td>True</td>\n", " <td>True</td>\n",
" <td>M0,0L9,0 4.5,5z</td>\n", " <td>Unordered List</td>\n",
" <td>150</td>\n", " <td>150</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
@@ -318,16 +400,16 @@
"3 O Search 3 \n", "3 O Search 3 \n",
"4 File 4 \n", "4 File 4 \n",
".. ... ... \n", ".. ... ... \n",
"146 Redo 146 \n", "146 Six 146 \n",
"147 Notifications. 147 \n", "147 battery charge indicator 147 \n",
"148 minimizing a window. 148 \n", "148 A menu or list of options. 148 \n",
"149 Redo 149 \n", "149 5,5L9,5 4.5z 149 \n",
"150 M0,0L9,0 4.5,5z 150 \n", "150 Unordered List 150 \n",
"\n", "\n",
"[151 rows x 5 columns]" "[151 rows x 5 columns]"
] ]
}, },
"execution_count": 38, "execution_count": 16,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@@ -376,7 +458,7 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "pilot", "display_name": "omni",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },

Binary file not shown.

Before

Width:  |  Height:  |  Size: 328 KiB

After

Width:  |  Height:  |  Size: 560 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 404 KiB

After

Width:  |  Height:  |  Size: 720 KiB

View File

@@ -1,4 +1,4 @@
from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_dino_model, get_yolo_model from utils import get_som_labeled_img, check_ocr_box, get_yolo_model
import torch import torch
from ultralytics import YOLO from ultralytics import YOLO
from PIL import Image from PIL import Image

View File

@@ -84,11 +84,15 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
else: else:
non_ocr_boxes = filtered_boxes non_ocr_boxes = filtered_boxes
croped_pil_image = [] croped_pil_image = []
t0 = time.time()
for i, coord in enumerate(non_ocr_boxes): for i, coord in enumerate(non_ocr_boxes):
xmin, xmax = int(coord[0]*image_source.shape[1]), int(coord[2]*image_source.shape[1]) xmin, xmax = int(coord[0]*image_source.shape[1]), int(coord[2]*image_source.shape[1])
ymin, ymax = int(coord[1]*image_source.shape[0]), int(coord[3]*image_source.shape[0]) ymin, ymax = int(coord[1]*image_source.shape[0]), int(coord[3]*image_source.shape[0])
cropped_image = image_source[ymin:ymax, xmin:xmax, :] cropped_image = image_source[ymin:ymax, xmin:xmax, :]
# resize the image to 224x224 to avoid long overhead in clipimageprocessor # TODO
cropped_image = cv2.resize(cropped_image, (224, 224))
croped_pil_image.append(to_pil(cropped_image)) croped_pil_image.append(to_pil(cropped_image))
print('time to prepare bbox:', time.time()-t0)
model, processor = caption_model_processor['model'], caption_model_processor['processor'] model, processor = caption_model_processor['model'], caption_model_processor['processor']
if not prompt: if not prompt:
@@ -103,14 +107,19 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
for i in range(0, len(croped_pil_image), batch_size): for i in range(0, len(croped_pil_image), batch_size):
start = time.time() start = time.time()
batch = croped_pil_image[i:i+batch_size] batch = croped_pil_image[i:i+batch_size]
t1 = time.time()
if model.device.type == 'cuda': if model.device.type == 'cuda':
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device, dtype=torch.float16) inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt", do_resize=False).to(device=device, dtype=torch.float16)
else: else:
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device) inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device)
t2 = time.time()
print('time to process image + tokenize text inputs:', t2-t1)
if 'florence' in model.config.name_or_path: if 'florence' in model.config.name_or_path:
generated_ids = model.generate(input_ids=inputs["input_ids"],pixel_values=inputs["pixel_values"],max_new_tokens=100,num_beams=3, do_sample=False) generated_ids = model.generate(input_ids=inputs["input_ids"],pixel_values=inputs["pixel_values"],max_new_tokens=20,num_beams=1, do_sample=False)
else: else:
generated_ids = model.generate(**inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, num_return_sequences=1) # temperature=0.01, do_sample=True, generated_ids = model.generate(**inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, num_return_sequences=1) # temperature=0.01, do_sample=True,
t3 = time.time()
print('time to generate:', t3-t2)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = [gen.strip() for gen in generated_text] generated_text = [gen.strip() for gen in generated_text]
generated_texts.extend(generated_text) generated_texts.extend(generated_text)
@@ -437,6 +446,7 @@ def get_som_labeled_img(img_path, model=None, BOX_TRESHOLD = 0.01, output_coord_
# get parsed icon local semantics # get parsed icon local semantics
time1 = time.time()
if use_local_semantics: if use_local_semantics:
caption_model = caption_model_processor['model'] caption_model = caption_model_processor['model']
if 'phi3_v' in caption_model.config.model_type: if 'phi3_v' in caption_model.config.model_type:
@@ -456,6 +466,7 @@ def get_som_labeled_img(img_path, model=None, BOX_TRESHOLD = 0.01, output_coord_
else: else:
ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)] ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)]
parsed_content_merged = ocr_text parsed_content_merged = ocr_text
print('time to get parsed content:', time.time()-time1)
filtered_boxes = box_convert(boxes=filtered_boxes, in_fmt="xyxy", out_fmt="cxcywh") filtered_boxes = box_convert(boxes=filtered_boxes, in_fmt="xyxy", out_fmt="cxcywh")