demo remote request

This commit is contained in:
yadonglu
2024-12-06 16:10:58 -08:00
parent a3215fd4b6
commit b1996356ca
6 changed files with 144 additions and 108 deletions

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -1,44 +1,21 @@
# Run the server with:
#   uvicorn remote_request:app --host 0.0.0.0 --port 8000 --reload
from fastapi import FastAPI
from pydantic import BaseModel

import sys
import os

# Make the repository root importable so `utils` resolves when this file is
# launched from inside its own subdirectory.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_yolo_model

import torch
from PIL import Image
from typing import Dict, Tuple, List
import base64
# Runtime configuration for the Omniparser service.
# NOTE(review): the key 'BOX_TRESHOLD' is misspelled ("THRESHOLD") but is kept
# verbatim because the rest of the module reads config['BOX_TRESHOLD'].
config = {
    # Fine-tuned YOLO weights used for icon detection (SOM model).
    'som_model_path': '../weights/icon_detect_v1_5/model_v1_5.pt',
    # Default device hint; note the model-loading code may still pick CUDA
    # when available.
    'device': 'cpu',
    # Caption model family and its local weight directory.
    'caption_model_name': 'florence2',
    'caption_model_path': '../weights/icon_caption_florence',
    # Confidence threshold for keeping detected boxes.
    'BOX_TRESHOLD': 0.05,
}
class Omniparser(object):
    """Bundle the icon-detection (SOM) model and the caption model behind a
    single `parse` entry point used by the HTTP endpoint below."""

    def __init__(self, config: Dict):
        """Load both models described by *config*.

        config keys read: 'som_model_path', 'caption_model_name',
        'caption_model_path', and later (in parse) 'BOX_TRESHOLD'.
        """
        self.config = config
        # Prefer GPU when available regardless of config['device'].
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.som_model = get_yolo_model(model_path=config['som_model_path'])
        self.caption_model_processor = get_caption_model_processor(
            model_name=config['caption_model_name'],
            model_name_or_path=config['caption_model_path'],
            device=device,
        )
        print('Omniparser initialized!!!')

    def parse(self, image_base64: str):
        """Decode a base64-encoded screenshot, run OCR + icon detection, and
        return (labeled_image_base64, parsed_content_list)."""
        # The helpers below expect a file path, so the incoming image is
        # round-tripped through disk. NOTE(review): a fixed path is not safe
        # under concurrent requests — consider tempfile per request.
        image_path = '../imgs/demo_image.jpg'
        with open(image_path, "wb") as fh:
            fh.write(base64.b64decode(image_base64))
        print('Parsing image:', image_path)

        image = Image.open(image_path)
        print('image size:', image.size)
        # Scale annotation-drawing parameters with the image size so labels
        # stay legible on both small and large screenshots.
        box_overlay_ratio = max(image.size) / 3200
        draw_bbox_config = {
            'text_scale': 0.8 * box_overlay_ratio,
            'text_thickness': max(int(2 * box_overlay_ratio), 1),
            'text_padding': max(int(3 * box_overlay_ratio), 1),
            'thickness': max(int(3 * box_overlay_ratio), 1),
        }
        # Fix: read the threshold from this instance's config rather than the
        # module-level `config` global (the original only worked by accident).
        BOX_TRESHOLD = self.config['BOX_TRESHOLD']

        ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
            image_path,
            display_img=False,
            output_bb_format='xyxy',
            goal_filtering=None,
            easyocr_args={'paragraph': False, 'text_threshold': 0.8},
            use_paddleocr=True,
        )
        text, ocr_bbox = ocr_bbox_rslt
        dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
            image_path,
            self.som_model,
            BOX_TRESHOLD=BOX_TRESHOLD,
            output_coord_in_ratio=True,
            ocr_bbox=ocr_bbox,
            draw_bbox_config=draw_bbox_config,
            caption_model_processor=self.caption_model_processor,
            ocr_text=text,
            use_local_semantics=True,
            iou_threshold=0.7,
            scale_img=False,
            batch_size=128,
        )
        # Persist the annotated image so it can be inspected after a request.
        with open('../imgs/demo_image_som.jpg', "wb") as fh:
            fh.write(base64.b64decode(dino_labled_img))
        return dino_labled_img, parsed_content_list
import time

app = FastAPI()


class Item(BaseModel):
    """Request body for /send_text/."""
    # Screenshot encoded as a plain base64 string (no data-URI prefix).
    base64_image: str
    prompt: str


# Fix: the original bound the instance to the name `Omniparser`, shadowing the
# class itself; a distinct lowercase name keeps both accessible.
omniparser = Omniparser(config)


@app.post("/send_text/")
async def send_text(item: Item):
    """Parse the posted screenshot and return the labeled image plus the
    parsed content list."""
    print('start parsing...')
    start = time.time()
    dino_labled_img, parsed_content_list = omniparser.parse(item.base64_image)
    # Log wall-clock parse latency for this request.
    print('time:', time.time() - start)
    return {
        "som_image_base64": dino_labled_img,
        "parsed_content_list": parsed_content_list,
    }

BIN
imgs/demo_image.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 189 KiB

BIN
imgs/demo_image_som.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 214 KiB

BIN
imgs/mobile.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.1 MiB