Add Florence-2 icon caption model; update demo.ipynb
3
.gitignore
vendored
@@ -1 +1,2 @@
|
||||
weights/
|
||||
weights/icon_caption_blip2
|
||||
weights/icon_caption_florence
|
||||
385
demo.ipynb
BIN
imgs/google_page.png
Normal file
|
After Width: | Height: | Size: 324 KiB |
|
Before Width: | Height: | Size: 1.8 MiB |
BIN
imgs/pc_1.png
|
Before Width: | Height: | Size: 197 KiB |
|
Before Width: | Height: | Size: 185 KiB |
|
Before Width: | Height: | Size: 805 B |
BIN
imgs/windows_home.png
Normal file
|
After Width: | Height: | Size: 5.8 MiB |
BIN
imgs/windows_multitab.png
Normal file
|
After Width: | Height: | Size: 459 KiB |
@@ -12,4 +12,4 @@ opencv-python-headless
|
||||
gradio
|
||||
dill
|
||||
accelerate
|
||||
|
||||
timm
|
||||
|
||||
12
utils.py
@@ -33,9 +33,10 @@ import supervision as sv
|
||||
import torchvision.transforms as T
|
||||
|
||||
|
||||
def get_caption_model_processor(model_name_or_path="Salesforce/blip2-opt-2.7b", device=None):
|
||||
def get_caption_model_processor(model_name, model_name_or_path="Salesforce/blip2-opt-2.7b", device=None):
|
||||
if not device:
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
if model_name == "blip2":
|
||||
from transformers import Blip2Processor, Blip2ForConditionalGeneration
|
||||
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
|
||||
if device == 'cpu':
|
||||
@@ -45,7 +46,14 @@ def get_caption_model_processor(model_name_or_path="Salesforce/blip2-opt-2.7b",
|
||||
else:
|
||||
model = Blip2ForConditionalGeneration.from_pretrained(
|
||||
model_name_or_path, device_map=None, torch_dtype=torch.float16
|
||||
)
|
||||
).to(device)
|
||||
elif model_name == "florence2":
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
|
||||
if device == 'cpu':
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float32, trust_remote_code=True)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True).to(device)
|
||||
return {'model': model.to(device), 'processor': processor}
|
||||
|
||||
|
||||
|
||||