From 7579a1fdc778ef17fc70e6bf0b150cb7c49bc903 Mon Sep 17 00:00:00 2001 From: Dom Date: Thu, 5 Mar 2026 00:37:31 +0100 Subject: [PATCH] chore: add .gitignore --- .gitignore | 89 ++++++- util/utils.py | 16 +- weights/icon_caption_florence/config.json | 239 ++++++++++++++++++ .../generation_config.json | 13 + weights/icon_detect/model.yaml | 129 ++++++++++ weights/icon_detect/train_args.yaml | 107 ++++++++ 6 files changed, 568 insertions(+), 25 deletions(-) create mode 100644 weights/icon_caption_florence/config.json create mode 100644 weights/icon_caption_florence/generation_config.json create mode 100644 weights/icon_detect/model.yaml create mode 100644 weights/icon_detect/train_args.yaml diff --git a/.gitignore b/.gitignore index 8b8235e..7cc8b7e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,76 @@ -weights/icon_caption_blip2 -weights/icon_caption_florence -weights/icon_detect/ -weights/icon_detect_v1_5/ -weights/icon_detect_v1_5_2/ -.gradio -__pycache__/ -debug.ipynb -util/__pycache__/ -index.html?linkid=2289031 -wget-log -weights/icon_caption_florence_v2/ -omnitool/gradio/uploads/ \ No newline at end of file +# === Python === +__pycache__/ +*.py[cod] +*.pyo +*.egg-info/ +*.egg +dist/ +build/ +*.whl + +# === Virtual environments === +.venv/ +venv/ +venv_*/ +env/ + +# === ML Models & Data === +*.pt +*.pth +*.onnx +*.bin +*.safetensors +*.h5 +*.hdf5 +*.pkl +*.pickle +*.npy +*.npz +*.faiss +models/ +*.tar.gz +*.zip + +# === Documents & Media === +*.pdf +*.docx +*.xlsx +*.csv +*.png +*.jpg +*.jpeg +*.gif +*.mp3 +*.wav +*.mp4 + +# === IDE === +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# === OS === +.DS_Store +Thumbs.db +.~lock.* + +# === Secrets === +.env +*.env +credentials.json +token.pickle + +# === Logs & Cache === +*.log +logs/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +htmlcov/ +.coverage + +# === Backups === +*_backup_* +backups/ diff --git a/util/utils.py b/util/utils.py index eb7c8b2..db97afc 100644 --- a/util/utils.py +++ b/util/utils.py @@ -20,15 +20,7 @@ from matplotlib import pyplot as plt import easyocr from paddleocr import PaddleOCR reader = easyocr.Reader(['en']) -paddle_ocr = PaddleOCR( - lang='en', # other lang also available - use_angle_cls=False, - use_gpu=False, # using cuda will conflict with pytorch in the same process - show_log=False, - max_batch_size=1024, - use_dilation=True, # improves accuracy - det_db_score_mode='slow', # improves accuracy - rec_batch_num=1024) +paddle_ocr = PaddleOCR(lang='en') import time import base64 @@ -59,12 +51,12 @@ def get_caption_model_processor(model_name, model_name_or_path="Salesforce/blip2 model_name_or_path, device_map=None, torch_dtype=torch.float16 ).to(device) elif model_name == "florence2": - from transformers import AutoProcessor, AutoModelForCausalLM + from transformers import AutoProcessor, AutoModelForCausalLM processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True) if device == 'cpu': - model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float32, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float32, trust_remote_code=True, attn_implementation="eager") else: - model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True).to(device) + model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager").to(device) return {'model': model.to(device), 'processor': processor} diff --git a/weights/icon_caption_florence/config.json b/weights/icon_caption_florence/config.json new file mode 100644 index 0000000..8a03d9e --- /dev/null +++ b/weights/icon_caption_florence/config.json @@ -0,0 +1,239 @@ +{ + "_name_or_path": "microsoft/Florence-2-base-ft", + "architectures": [ + "Florence2ForConditionalGeneration" + ], + "auto_map": { + "AutoConfig": "microsoft/Florence-2-base-ft--configuration_florence2.Florence2Config", + "AutoModelForCausalLM": "microsoft/Florence-2-base-ft--modeling_florence2.Florence2ForConditionalGeneration" + }, + "bos_token_id": 2, + "eos_token_id": 1, + "ignore_index": -100, + "is_encoder_decoder": true, + "model_type": "florence2", + "pad_token_id": 0, + "projection_dim": 768, + "text_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "", + "activation_dropout": 0.1, + "activation_function": "gelu", + "add_bias_logits": false, + "add_cross_attention": false, + "add_final_layer_norm": false, + "architectures": null, + "attention_dropout": 0.1, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 0, + "chunk_size_feed_forward": 0, + "classif_dropout": 0.1, + "classifier_dropout": 0.0, + "cross_attention_hidden_size": null, + "d_model": 768, + "decoder_attention_heads": 12, + "decoder_ffn_dim": 3072, + "decoder_layerdrop": 0.0, + "decoder_layers": 6, + "decoder_start_token_id": 2, + "diversity_penalty": 0.0, + "do_sample": false, + "dropout": 0.1, + "early_stopping": true, + "encoder_attention_heads": 12, + "encoder_ffn_dim": 3072, + "encoder_layerdrop": 0.0, + "encoder_layers": 6, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": 0, + "forced_eos_token_id": 2, + "gradient_checkpointing": false, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1", + "2": "LABEL_2" + }, + "init_std": 0.02, + "is_decoder": false, + "is_encoder_decoder": true, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 1024, + "min_length": 0, + "model_type": "florence2_language", + "no_repeat_ngram_size": 3, + "normalize_before": false, + "num_beam_groups": 1, + "num_beams": 3, + "num_hidden_layers": 6, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 1, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "scale_embedding": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 51289 + }, + "torch_dtype": "float32", + "transformers_version": "4.46.1", + "vision_config": { + "_attn_implementation_autoset": false, + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "depths": [ + 1, + 1, + 9, + 1 + ], + "dim_embed": [ + 128, + 256, + 512, + 1024 + ], + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.1, + "early_stopping": false, + "enable_checkpoint": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_feature_source": [ + "spatial_avg_pool", + "temporal_avg_pool" + ], + "image_pos_embed": { + "max_pos_embeddings": 50, + "type": "learned_abs_2d" + }, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "davit", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_groups": [ + 4, + 8, + 16, + 32 + ], + "num_heads": [ + 4, + 8, + 16, + 32 + ], + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_padding": [ + 3, + 1, + 1, + 1 + ], + "patch_prenorm": [ + false, + true, + true, + true + ], + "patch_size": [ + 7, + 3, + 3, + 3 + ], + "patch_stride": [ + 4, + 2, + 2, + 2 + ], + "prefix": null, + "problem_type": null, + "projection_dim": 768, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "visual_temporal_embedding": { + "max_temporal_embeddings": 100, + "type": "COSINE" + }, + "window_size": 12 + }, + "vocab_size": 51289 +} diff --git a/weights/icon_caption_florence/generation_config.json b/weights/icon_caption_florence/generation_config.json new file mode 100644 index 0000000..ef46845 --- /dev/null +++ b/weights/icon_caption_florence/generation_config.json @@ -0,0 +1,13 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "decoder_start_token_id": 2, + "early_stopping": true, + "eos_token_id": 1, + "forced_bos_token_id": 0, + "forced_eos_token_id": 2, + "no_repeat_ngram_size": 3, + "num_beams": 3, + "pad_token_id": 0, + "transformers_version": "4.46.1" +} diff --git a/weights/icon_detect/model.yaml b/weights/icon_detect/model.yaml new file mode 100644 index 0000000..8372bb8 --- /dev/null +++ b/weights/icon_detect/model.yaml @@ -0,0 +1,129 @@ +backbone: +- - -1 + - 1 + - Conv + - - 64 + - 3 + - 2 +- - -1 + - 1 + - Conv + - - 128 + - 3 + - 2 +- - -1 + - 3 + - C2f + - - 128 + - true +- - -1 + - 1 + - Conv + - - 256 + - 3 + - 2 +- - -1 + - 6 + - C2f + - - 256 + - true +- - -1 + - 1 + - Conv + - - 512 + - 3 + - 2 +- - -1 + - 6 + - C2f + - - 512 + - true +- - -1 + - 1 + - Conv + - - 1024 + - 3 + - 2 +- - -1 + - 3 + - C2f + - - 1024 + - true +- - -1 + - 1 + - SPPF + - - 1024 + - 5 +ch: 3 +depth_multiple: 0.33 +head: +- - -1 + - 1 + - nn.Upsample + - - None + - 2 + - nearest +- - - -1 + - 6 + - 1 + - Concat + - - 1 +- - -1 + - 3 + - C2f + - - 512 +- - -1 + - 1 + - nn.Upsample + - - None + - 2 + - nearest +- - - -1 + - 4 + - 1 + - Concat + - - 1 +- - -1 + - 3 + - C2f + - - 256 +- - -1 + - 1 + - Conv + - - 256 + - 3 + - 2 +- - - -1 + - 12 + - 1 + - Concat + - - 1 +- - -1 + - 3 + - C2f + - - 512 +- - -1 + - 1 + - Conv + - - 512 + - 3 + - 2 +- - - -1 + - 9 + - 1 + - Concat + - - 1 +- - -1 + - 3 + - C2f + - - 1024 +- - - 15 + - 18 + - 21 + - 1 + - Detect + - - nc +nc: 1 +scale: '' +width_multiple: 0.25 +yaml_file: weights/icon_detect_v1_5/model.yaml diff --git a/weights/icon_detect/train_args.yaml b/weights/icon_detect/train_args.yaml new file mode 100644 index 0000000..1324d38 --- /dev/null +++ b/weights/icon_detect/train_args.yaml @@ -0,0 +1,107 @@ +train_args: + agnostic_nms: false + amp: true + augment: false + auto_augment: randaugment + batch: 64 + box: 7.5 + cache: false + cfg: null + classes: null + close_mosaic: 10 + cls: 0.5 + conf: null + copy_paste: 0.0 + cos_lr: false + crop_fraction: 1.0 + degrees: 0.0 + deterministic: true + device: + - 0 + - 1 + - 2 + - 3 + dfl: 1.5 + dnn: false + dropout: 0.0 + dynamic: false + embed: null + epochs: 20 + erasing: 0.4 + exist_ok: false + fliplr: 0.5 + flipud: 0.0 + format: torchscript + fraction: 1.0 + freeze: null + half: false + hsv_h: 0.015 + hsv_s: 0.7 + hsv_v: 0.4 + imgsz: 1280 + int8: false + iou: 0.7 + keras: false + kobj: 1.0 + label_smoothing: 0.0 + line_width: null + lr0: 0.01 + lrf: 0.01 + mask_ratio: 4 + max_det: 300 + mixup: 0.0 + mode: train + model: yolov8n.pt + momentum: 0.937 + mosaic: 0.0 + multi_scale: false + nbs: 64 + nms: false + opset: null + optimize: false + optimizer: auto + overlap_mask: true + patience: 100 + perspective: 0.0 + plots: true + pose: 12.0 + pretrained: true + profile: false + project: null + rect: false + resume: false + retina_masks: false + save: true + save_conf: false + save_crop: false + save_frames: false + save_hybrid: false + save_json: false + save_period: -1 + save_txt: false + scale: 0.5 + seed: 0 + shear: 0.0 + show: false + show_boxes: true + show_conf: true + show_labels: true + simplify: false + single_cls: false + source: null + split: val + stream_buffer: false + task: detect + time: null + tracker: botsort.yaml + translate: 0.1 + val: true + verbose: true + vid_stride: 1 + visualize: false + warmup_bias_lr: 0.0 + warmup_epochs: 3.0 + warmup_momentum: 0.8 + weight_decay: 0.0005 + workers: 8 + workspace: 4