From 6b4828ba666697919a7111b10fb292802fe72879 Mon Sep 17 00:00:00 2001 From: yadonglu Date: Mon, 4 Nov 2024 21:42:18 -0800 Subject: [PATCH] adaptive bbox width for asthetics --- __pycache__/utils.cpython-312.pyc | Bin 23376 -> 23424 bytes demo.ipynb | 14 +++++++------ gradio_demo.py | 33 ++++++++---------------------- 3 files changed, 17 insertions(+), 30 deletions(-) diff --git a/__pycache__/utils.cpython-312.pyc b/__pycache__/utils.cpython-312.pyc index b04e151c2d0b6b634b9e8cb391b080dd8ad9b5f8..8a168fd2840b5f52b34e92ae3c08fdaabfb71d23 100644 GIT binary patch delta 1706 zcmb7EO-x%y5Z?9sG0)(?XY=q+j1Aa;0b@mk5YiAs_z4j|NtMzN8Y&?_aG?U!hg+0r zS8~)`QlZdvB&v|))T&Koa;Pjdryi(UwGvImp;9Tm^-xu6BSEV~J$4uT2$gc_$ZtG5 zGrKeM&FpS|fpd?reZy|IAx66ooI0*(`y+&q1Q3qt~crR%yRN7jwcG6mR|#SyHE(d zSBXV5B9tvP zB_hH48IRMNwq#J+GHyqRTC;X=zv6E#p;jUhnUqWk7nu#Sk6IT|Trha>xUelR%)f1< zC%47){M^*GG(DSHxCRp)jUL5_F%RP_&i>&{#<1EEmU#RKeA#g%sbGYH>|40TR5`R` zLk`dK_|5UH(()}u+p;-#tjH1k8%Yj1*SAv!(&^W0LTcA>(a5n4nbkX-eokc`f6wX9 z)O5~*P}%|4Pk0?GGKWUT)S+9b6MW5;uzor&xQGj$oc1T&R8XpKLU!6y0ZTL%?#V1DUouH zZ9+SQt(FZPShYL@fYWWul2b&qoN6q^h~I15%Rr@WS^;gH0|)VMc3!ISkWwP9Sq~cR zBR1m!DbY)KeCpQ~;s>Ftt;I(~;vywxO*Am)$|m=sF-D43QnqHxOZ!?HfZ48!S#3F( z7R*t&!a$`O4eF&tyNmQP61zbcgO-b3MoLg$?xa z2F{udWb?sVq_O(tV!Nz{H``{OsRtP1KH??iy8E6;tRLfBc(2wljm#bLGrjobT2u3E zS~;1X(A6C-4V$&oDtNQIPO2tVGbw)2aW&+-6GHV&FPp*uJngPiYKUM=lE}~Cy!JzNJr^oMM5c9|0VMY#`8`}^h1x8C_NvRLRmH!6gh)WM|q zO}jdHO#U{e4xW_1izq`#-0;PRBpGfEYlV%$S}Y~m?)k#Ap-EiP$V8)wvAATcFwBe( zIBN#ud`xO#>TKcO`2j3*4ugcOuSm~zFaBdB(#iF;T9Mfxr9sL;Yb`e!%rhE z(lB$$6`ZgBg=Lj*EC=goC+>j$(J(#(Z;Zy|m$6rS08_wEnu-(@e$?!qpx%g^%HA6lUeKTG**Q?O|aEl_Ak_QHnMfL_+1 z%}gPYm}rQkou)>pCPp6Ao>d z&dmAFnKS3ioYhb8?#I}1*Ws`upojlNlFBuQA8&uyFphB}Tg9!unC@A_8LkGJY|_;= zG})}j*k|}GZeniJX8l~L9$CYaPv}ulb?9}#JN0_tC-i2$0W>FH#Ics__oiP0`0wmT zRj-Dy#Nu6E{5u=#3VU{N#+36;B_io2KdWi;vA4Q3MJ`NtyR;Q^+Lps1lr9kxkux|a zi*W#LC<#*%B}~%}gs74Ro+F|N%r(=xOGeZ}6(SQevCK(N0UpCom1Pu@j6c94X1cE| zEk17~X79_1#RWXap6PCOT8%o;8J}|Y7h>;shh;0s?Az|j7Au4jgnwxnR#wrT9XZQ3 zreB%9SEAjsmG7C6Gx!L}PGxmy&(8ckm*bUc-*MSk1$1)r4NgDxCcpWb)5G4xLENC` z^^xFzbVEVtb);kJ&`s3IZubmJF5+U(ws{XvK^m!>dThn%!2}W0j-27fKvOVMm6lO2 zQQ6gac*RXT1peef8dIJevX~P!r54X~vPk9?C*%E#aiu~+Ul{j~>w%<{6WNFtZbGU$k74uiW0rgl*mnG4 zL?tCe-m!>$Q6I4z?-HwSvj~Ujs&lOp|B|IDpoLBZ;}zQkxGuERqXO9|H}W2n%HPn z`uIV7KHaIicHD)W7T8#w-c5o`G;8vJp*4^StLo^Zp_HGy{W;+7ttDYnRn*6{)6cjOYFh$uQG3e?7P!(dn0ta7T)nT zS@6th>4)l6cK*z`RN1VKw8?im)sc4Pi>NwsLisXm8|jtr`eLJU_UDmqY;FUqg>3lA zS?p^9(P#!1GaEO-vvHlXRb}i`c?`9E;)B>lfP1qkve+c3$`mD;{p=WrxS(xQjhE9>#s_+IXFEk}r3%+vD%R z@n$9t1$g&1_<;-_o56QB_(sMkdvBu3u7e04T6#OXJ28mOPF%fL%L12r+03Qte*s=U Bc8&l5 diff --git a/demo.ipynb b/demo.ipynb index 73ba6cc..81a75c3 100644 --- a/demo.ipynb +++ b/demo.ipynb @@ -90,16 +90,18 @@ "image_path = 'imgs/google_page.png'\n", "# image_path = 'imgs/windows_home.png'\n", "image_path = 'imgs/windows_multitab.png'\n", - "draw_bbox_config = {\n", - " 'text_scale': 0.8,\n", - " 'text_thickness': 2,\n", - " 'text_padding': 3,\n", - " 'thickness': 3,\n", - "}\n", + "\n", "BOX_TRESHOLD = 0.03\n", "\n", "image = Image.open(image_path)\n", "image_rgb = image.convert('RGB')\n", + "box_overlay_ratio = image.size[0] / 3200\n", + "draw_bbox_config = {\n", + " 'text_scale': 0.8 * box_overlay_ratio,\n", + " 'text_thickness': max(int(2 * box_overlay_ratio), 1),\n", + " 'text_padding': max(int(3 * box_overlay_ratio), 1),\n", + " 'thickness': max(int(3 * box_overlay_ratio), 1),\n", + "}\n", "\n", "ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9}, use_paddleocr=True)\n", "text, ocr_bbox = ocr_bbox_rslt\n", diff --git a/gradio_demo.py b/gradio_demo.py index 825f7fd..14dd6d0 100644 --- a/gradio_demo.py +++ b/gradio_demo.py @@ -16,29 +16,6 @@ yolo_model = get_yolo_model(model_path='weights/icon_detect/best.pt') caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence") # caption_model_processor = get_caption_model_processor(model_name="blip2", model_name_or_path="weights/icon_caption_blip2") -platform = 'pc' -if platform == 'pc': - draw_bbox_config = { - 'text_scale': 0.8, - 'text_thickness': 2, - 'text_padding': 2, - 'thickness': 2, - } -elif platform == 'web': - draw_bbox_config = { - 'text_scale': 0.8, - 'text_thickness': 2, - 'text_padding': 3, - 'thickness': 3, - } -elif platform == 'mobile': - draw_bbox_config = { - 'text_scale': 0.8, - 'text_thickness': 2, - 'text_padding': 3, - 'thickness': 3, - } - MARKDOWN = """ @@ -67,12 +44,20 @@ def process( image_save_path = 'imgs/saved_image_demo.png' image_input.save(image_save_path) + image = Image.open(image_save_path) + box_overlay_ratio = image.size[0] / 3200 + draw_bbox_config = { + 'text_scale': 0.8 * box_overlay_ratio, + 'text_thickness': max(int(2 * box_overlay_ratio), 1), + 'text_padding': max(int(3 * box_overlay_ratio), 1), + 'thickness': max(int(3 * box_overlay_ratio), 1), + } # import pdb; pdb.set_trace() ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_save_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9}, use_paddleocr=use_paddleocr) text, ocr_bbox = ocr_bbox_rslt # print('prompt:', prompt) - dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_save_path, yolo_model, BOX_TRESHOLD = box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,iou_threshold=iou_threshold, imgsz=imgsz) + dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_save_path, yolo_model, BOX_TRESHOLD = box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,iou_threshold=iou_threshold, imgsz=imgsz) image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img))) print('finish processing') parsed_content_list = '\n'.join(parsed_content_list)