From 7579a1fdc778ef17fc70e6bf0b150cb7c49bc903 Mon Sep 17 00:00:00 2001
From: Dom <dom@localhost>
Date: Thu, 5 Mar 2026 00:37:31 +0100
Subject: [PATCH] chore: expand .gitignore, simplify PaddleOCR init, add model configs

---
 .gitignore                                    |  89 ++++++-
 util/utils.py                                 |  16 +-
 weights/icon_caption_florence/config.json     | 239 ++++++++++++++++++
 .../generation_config.json                    |  13 +
 weights/icon_detect/model.yaml                | 129 ++++++++++
 weights/icon_detect/train_args.yaml           | 107 ++++++++
 6 files changed, 568 insertions(+), 25 deletions(-)
 create mode 100644 weights/icon_caption_florence/config.json
 create mode 100644 weights/icon_caption_florence/generation_config.json
 create mode 100644 weights/icon_detect/model.yaml
 create mode 100644 weights/icon_detect/train_args.yaml

diff --git a/.gitignore b/.gitignore
index 8b8235e..7cc8b7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,13 +1,76 @@
-weights/icon_caption_blip2
-weights/icon_caption_florence
-weights/icon_detect/
-weights/icon_detect_v1_5/
-weights/icon_detect_v1_5_2/
-.gradio
-__pycache__/
-debug.ipynb
-util/__pycache__/
-index.html?linkid=2289031
-wget-log
-weights/icon_caption_florence_v2/
-omnitool/gradio/uploads/
\ No newline at end of file
+# === Python ===
+__pycache__/
+*.py[cod]
+*.pyo
+*.egg-info/
+*.egg
+dist/
+build/
+*.whl
+
+# === Virtual environments ===
+.venv/
+venv/
+venv_*/
+env/
+
+# === ML Models & Data ===
+*.pt
+*.pth
+*.onnx
+*.bin
+*.safetensors
+*.h5
+*.hdf5
+*.pkl
+*.pickle
+*.npy
+*.npz
+*.faiss
+models/
+*.tar.gz
+*.zip
+
+# === Documents & Media ===
+*.pdf
+*.docx
+*.xlsx
+*.csv
+*.png
+*.jpg
+*.jpeg
+*.gif
+*.mp3
+*.wav
+*.mp4
+
+# === IDE ===
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+
+# === OS ===
+.DS_Store
+Thumbs.db
+.~lock.*
+
+# === Secrets ===
+.env
+*.env
+credentials.json
+token.pickle
+
+# === Logs & Cache ===
+*.log
+logs/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+htmlcov/
+.coverage
+
+# === Backups ===
+*_backup_*
+backups/
diff --git a/util/utils.py b/util/utils.py
index eb7c8b2..db97afc 100644
--- a/util/utils.py
+++ b/util/utils.py
@@ -20,15 +20,7 @@ from matplotlib import pyplot as plt
 import easyocr
 from paddleocr import PaddleOCR
 reader = easyocr.Reader(['en'])
-paddle_ocr = PaddleOCR(
-    lang='en',  # other lang also available
-    use_angle_cls=False,
-    use_gpu=False,  # using cuda will conflict with pytorch in the same process
-    show_log=False,
-    max_batch_size=1024,
-    use_dilation=True,  # improves accuracy
-    det_db_score_mode='slow',  # improves accuracy
-    rec_batch_num=1024)
+paddle_ocr = PaddleOCR(lang='en')
 import time
 import base64
 
@@ -59,12 +51,12 @@ def get_caption_model_processor(model_name, model_name_or_path="Salesforce/blip2
             model_name_or_path, device_map=None, torch_dtype=torch.float16
         ).to(device)
     elif model_name == "florence2":
-        from transformers import AutoProcessor, AutoModelForCausalLM 
+        from transformers import AutoProcessor, AutoModelForCausalLM
         processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
         if device == 'cpu':
-            model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float32, trust_remote_code=True)
+            model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float32, trust_remote_code=True, attn_implementation="eager")
         else:
-            model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True).to(device)
+            model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager").to(device)
     return {'model': model.to(device), 'processor': processor}
 
 
diff --git a/weights/icon_caption_florence/config.json b/weights/icon_caption_florence/config.json
new file mode 100644
index 0000000..8a03d9e
--- /dev/null
+++ b/weights/icon_caption_florence/config.json
@@ -0,0 +1,239 @@
+{
+  "_name_or_path": "microsoft/Florence-2-base-ft",
+  "architectures": [
+    "Florence2ForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "microsoft/Florence-2-base-ft--configuration_florence2.Florence2Config",
+    "AutoModelForCausalLM": "microsoft/Florence-2-base-ft--modeling_florence2.Florence2ForConditionalGeneration"
+  },
+  "bos_token_id": 2,
+  "eos_token_id": 1,
+  "ignore_index": -100,
+  "is_encoder_decoder": true,
+  "model_type": "florence2",
+  "pad_token_id": 0,
+  "projection_dim": 768,
+  "text_config": {
+    "_attn_implementation_autoset": true,
+    "_name_or_path": "",
+    "activation_dropout": 0.1,
+    "activation_function": "gelu",
+    "add_bias_logits": false,
+    "add_cross_attention": false,
+    "add_final_layer_norm": false,
+    "architectures": null,
+    "attention_dropout": 0.1,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "classif_dropout": 0.1,
+    "classifier_dropout": 0.0,
+    "cross_attention_hidden_size": null,
+    "d_model": 768,
+    "decoder_attention_heads": 12,
+    "decoder_ffn_dim": 3072,
+    "decoder_layerdrop": 0.0,
+    "decoder_layers": 6,
+    "decoder_start_token_id": 2,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.1,
+    "early_stopping": true,
+    "encoder_attention_heads": 12,
+    "encoder_ffn_dim": 3072,
+    "encoder_layerdrop": 0.0,
+    "encoder_layers": 6,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": 0,
+    "forced_eos_token_id": 2,
+    "gradient_checkpointing": false,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1",
+      "2": "LABEL_2"
+    },
+    "init_std": 0.02,
+    "is_decoder": false,
+    "is_encoder_decoder": true,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1,
+      "LABEL_2": 2
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 1024,
+    "min_length": 0,
+    "model_type": "florence2_language",
+    "no_repeat_ngram_size": 3,
+    "normalize_before": false,
+    "num_beam_groups": 1,
+    "num_beams": 3,
+    "num_hidden_layers": 6,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "scale_embedding": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 51289
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.1",
+  "vision_config": {
+    "_attn_implementation_autoset": false,
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "depths": [
+      1,
+      1,
+      9,
+      1
+    ],
+    "dim_embed": [
+      128,
+      256,
+      512,
+      1024
+    ],
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "drop_path_rate": 0.1,
+    "early_stopping": false,
+    "enable_checkpoint": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_feature_source": [
+      "spatial_avg_pool",
+      "temporal_avg_pool"
+    ],
+    "image_pos_embed": {
+      "max_pos_embeddings": 50,
+      "type": "learned_abs_2d"
+    },
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "davit",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_groups": [
+      4,
+      8,
+      16,
+      32
+    ],
+    "num_heads": [
+      4,
+      8,
+      16,
+      32
+    ],
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_padding": [
+      3,
+      1,
+      1,
+      1
+    ],
+    "patch_prenorm": [
+      false,
+      true,
+      true,
+      true
+    ],
+    "patch_size": [
+      7,
+      3,
+      3,
+      3
+    ],
+    "patch_stride": [
+      4,
+      2,
+      2,
+      2
+    ],
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 768,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "visual_temporal_embedding": {
+      "max_temporal_embeddings": 100,
+      "type": "COSINE"
+    },
+    "window_size": 12
+  },
+  "vocab_size": 51289
+}
diff --git a/weights/icon_caption_florence/generation_config.json b/weights/icon_caption_florence/generation_config.json
new file mode 100644
index 0000000..ef46845
--- /dev/null
+++ b/weights/icon_caption_florence/generation_config.json
@@ -0,0 +1,13 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 2,
+  "decoder_start_token_id": 2,
+  "early_stopping": true,
+  "eos_token_id": 1,
+  "forced_bos_token_id": 0,
+  "forced_eos_token_id": 2,
+  "no_repeat_ngram_size": 3,
+  "num_beams": 3,
+  "pad_token_id": 0,
+  "transformers_version": "4.46.1"
+}
diff --git a/weights/icon_detect/model.yaml b/weights/icon_detect/model.yaml
new file mode 100644
index 0000000..8372bb8
--- /dev/null
+++ b/weights/icon_detect/model.yaml
@@ -0,0 +1,129 @@
+backbone:
+- - -1
+  - 1
+  - Conv
+  - - 64
+    - 3
+    - 2
+- - -1
+  - 1
+  - Conv
+  - - 128
+    - 3
+    - 2
+- - -1
+  - 3
+  - C2f
+  - - 128
+    - true
+- - -1
+  - 1
+  - Conv
+  - - 256
+    - 3
+    - 2
+- - -1
+  - 6
+  - C2f
+  - - 256
+    - true
+- - -1
+  - 1
+  - Conv
+  - - 512
+    - 3
+    - 2
+- - -1
+  - 6
+  - C2f
+  - - 512
+    - true
+- - -1
+  - 1
+  - Conv
+  - - 1024
+    - 3
+    - 2
+- - -1
+  - 3
+  - C2f
+  - - 1024
+    - true
+- - -1
+  - 1
+  - SPPF
+  - - 1024
+    - 5
+ch: 3
+depth_multiple: 0.33
+head:
+- - -1
+  - 1
+  - nn.Upsample
+  - - None
+    - 2
+    - nearest
+- - - -1
+    - 6
+  - 1
+  - Concat
+  - - 1
+- - -1
+  - 3
+  - C2f
+  - - 512
+- - -1
+  - 1
+  - nn.Upsample
+  - - None
+    - 2
+    - nearest
+- - - -1
+    - 4
+  - 1
+  - Concat
+  - - 1
+- - -1
+  - 3
+  - C2f
+  - - 256
+- - -1
+  - 1
+  - Conv
+  - - 256
+    - 3
+    - 2
+- - - -1
+    - 12
+  - 1
+  - Concat
+  - - 1
+- - -1
+  - 3
+  - C2f
+  - - 512
+- - -1
+  - 1
+  - Conv
+  - - 512
+    - 3
+    - 2
+- - - -1
+    - 9
+  - 1
+  - Concat
+  - - 1
+- - -1
+  - 3
+  - C2f
+  - - 1024
+- - - 15
+    - 18
+    - 21
+  - 1
+  - Detect
+  - - nc
+nc: 1
+scale: ''
+width_multiple: 0.25
+yaml_file: weights/icon_detect_v1_5/model.yaml
diff --git a/weights/icon_detect/train_args.yaml b/weights/icon_detect/train_args.yaml
new file mode 100644
index 0000000..1324d38
--- /dev/null
+++ b/weights/icon_detect/train_args.yaml
@@ -0,0 +1,107 @@
+train_args:
+  agnostic_nms: false
+  amp: true
+  augment: false
+  auto_augment: randaugment
+  batch: 64
+  box: 7.5
+  cache: false
+  cfg: null
+  classes: null
+  close_mosaic: 10
+  cls: 0.5
+  conf: null
+  copy_paste: 0.0
+  cos_lr: false
+  crop_fraction: 1.0
+  degrees: 0.0
+  deterministic: true
+  device:
+  - 0
+  - 1
+  - 2
+  - 3
+  dfl: 1.5
+  dnn: false
+  dropout: 0.0
+  dynamic: false
+  embed: null
+  epochs: 20
+  erasing: 0.4
+  exist_ok: false
+  fliplr: 0.5
+  flipud: 0.0
+  format: torchscript
+  fraction: 1.0
+  freeze: null
+  half: false
+  hsv_h: 0.015
+  hsv_s: 0.7
+  hsv_v: 0.4
+  imgsz: 1280
+  int8: false
+  iou: 0.7
+  keras: false
+  kobj: 1.0
+  label_smoothing: 0.0
+  line_width: null
+  lr0: 0.01
+  lrf: 0.01
+  mask_ratio: 4
+  max_det: 300
+  mixup: 0.0
+  mode: train
+  model: yolov8n.pt
+  momentum: 0.937
+  mosaic: 0.0
+  multi_scale: false
+  nbs: 64
+  nms: false
+  opset: null
+  optimize: false
+  optimizer: auto
+  overlap_mask: true
+  patience: 100
+  perspective: 0.0
+  plots: true
+  pose: 12.0
+  pretrained: true
+  profile: false
+  project: null
+  rect: false
+  resume: false
+  retina_masks: false
+  save: true
+  save_conf: false
+  save_crop: false
+  save_frames: false
+  save_hybrid: false
+  save_json: false
+  save_period: -1
+  save_txt: false
+  scale: 0.5
+  seed: 0
+  shear: 0.0
+  show: false
+  show_boxes: true
+  show_conf: true
+  show_labels: true
+  simplify: false
+  single_cls: false
+  source: null
+  split: val
+  stream_buffer: false
+  task: detect
+  time: null
+  tracker: botsort.yaml
+  translate: 0.1
+  val: true
+  verbose: true
+  vid_stride: 1
+  visualize: false
+  warmup_bias_lr: 0.0
+  warmup_epochs: 3.0
+  warmup_momentum: 0.8
+  weight_decay: 0.0005
+  workers: 8
+  workspace: 4