diff --git a/__pycache__/utils.cpython-312.pyc b/__pycache__/utils.cpython-312.pyc index b04e151..8a168fd 100644 Binary files a/__pycache__/utils.cpython-312.pyc and b/__pycache__/utils.cpython-312.pyc differ diff --git a/demo.ipynb b/demo.ipynb index 73ba6cc..81a75c3 100644 --- a/demo.ipynb +++ b/demo.ipynb @@ -90,16 +90,18 @@ "image_path = 'imgs/google_page.png'\n", "# image_path = 'imgs/windows_home.png'\n", "image_path = 'imgs/windows_multitab.png'\n", - "draw_bbox_config = {\n", - " 'text_scale': 0.8,\n", - " 'text_thickness': 2,\n", - " 'text_padding': 3,\n", - " 'thickness': 3,\n", - "}\n", + "\n", "BOX_TRESHOLD = 0.03\n", "\n", "image = Image.open(image_path)\n", "image_rgb = image.convert('RGB')\n", + "box_overlay_ratio = image.size[0] / 3200\n", + "draw_bbox_config = {\n", + " 'text_scale': 0.8 * box_overlay_ratio,\n", + " 'text_thickness': max(int(2 * box_overlay_ratio), 1),\n", + " 'text_padding': max(int(3 * box_overlay_ratio), 1),\n", + " 'thickness': max(int(3 * box_overlay_ratio), 1),\n", + "}\n", "\n", "ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9}, use_paddleocr=True)\n", "text, ocr_bbox = ocr_bbox_rslt\n", diff --git a/gradio_demo.py b/gradio_demo.py index 825f7fd..14dd6d0 100644 --- a/gradio_demo.py +++ b/gradio_demo.py @@ -16,29 +16,6 @@ yolo_model = get_yolo_model(model_path='weights/icon_detect/best.pt') caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence") # caption_model_processor = get_caption_model_processor(model_name="blip2", model_name_or_path="weights/icon_caption_blip2") -platform = 'pc' -if platform == 'pc': - draw_bbox_config = { - 'text_scale': 0.8, - 'text_thickness': 2, - 'text_padding': 2, - 'thickness': 2, - } -elif platform == 'web': - draw_bbox_config = { - 'text_scale': 0.8, - 'text_thickness': 2, - 'text_padding': 3, - 'thickness': 3, - } -elif platform == 'mobile': - draw_bbox_config = { - 'text_scale': 0.8, - 'text_thickness': 2, - 'text_padding': 3, - 'thickness': 3, - } - MARKDOWN = """ @@ -67,12 +44,20 @@ def process( image_save_path = 'imgs/saved_image_demo.png' image_input.save(image_save_path) + image = Image.open(image_save_path) + box_overlay_ratio = image.size[0] / 3200 + draw_bbox_config = { + 'text_scale': 0.8 * box_overlay_ratio, + 'text_thickness': max(int(2 * box_overlay_ratio), 1), + 'text_padding': max(int(3 * box_overlay_ratio), 1), + 'thickness': max(int(3 * box_overlay_ratio), 1), + } # import pdb; pdb.set_trace() ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_save_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9}, use_paddleocr=use_paddleocr) text, ocr_bbox = ocr_bbox_rslt # print('prompt:', prompt) - dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_save_path, yolo_model, BOX_TRESHOLD = box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,iou_threshold=iou_threshold, imgsz=imgsz) + dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_save_path, yolo_model, BOX_TRESHOLD = box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,iou_threshold=iou_threshold, imgsz=imgsz) image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img))) print('finish processing') parsed_content_list = '\n'.join(parsed_content_list)