chore: add .gitignore

2026-03-05 00:37:31 +01:00
parent b0d5c9f570
commit 7579a1fdc7
6 changed files with 568 additions and 25 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,13 +1,76 @@
-weights/icon_caption_blip2
+# === Python ===
 weights/icon_caption_florence
 weights/icon_detect/
 weights/icon_detect_v1_5/
 weights/icon_detect_v1_5_2/
 .gradio
 __pycache__/
-debug.ipynb
+*.py[cod]
-util/__pycache__/
+*.pyo
-index.html?linkid=2289031
+*.egg-info/
-wget-log
+*.egg
-weights/icon_caption_florence_v2/
+dist/
-omnitool/gradio/uploads/
+build/
 *.whl
 # === Virtual environments ===
 .venv/
 venv/
 venv_*/
 env/
 # === ML Models & Data ===
 *.pt
 *.pth
 *.onnx
 *.bin
 *.safetensors
 *.h5
 *.hdf5
 *.pkl
 *.pickle
 *.npy
 *.npz
 *.faiss
 models/
 *.tar.gz
 *.zip
 # === Documents & Media ===
 *.pdf
 *.docx
 *.xlsx
 *.csv
 *.png
 *.jpg
 *.jpeg
 *.gif
 *.mp3
 *.wav
 *.mp4
 # === IDE ===
 .idea/
 .vscode/
 *.swp
 *.swo
 *~
 # === OS ===
 .DS_Store
 Thumbs.db
 .~lock.*
 # === Secrets ===
 .env
 *.env
 credentials.json
 token.pickle
 # === Logs & Cache ===
 *.log
 logs/
 .pytest_cache/
 .mypy_cache/
 .ruff_cache/
 htmlcov/
 .coverage
 # === Backups ===
 *_backup_*
 backups/
--- a/util/utils.py
+++ b/util/utils.py
@@ -20,15 +20,7 @@ from matplotlib import pyplot as plt
 import easyocr
 from paddleocr import PaddleOCR
 reader = easyocr.Reader(['en'])
-paddle_ocr = PaddleOCR(
+paddle_ocr = PaddleOCR(lang='en')
-    lang='en',  # other lang also available
-    use_angle_cls=False,
-    use_gpu=False,  # using cuda will conflict with pytorch in the same process
-    show_log=False,
-    max_batch_size=1024,
-    use_dilation=True,  # improves accuracy
-    det_db_score_mode='slow',  # improves accuracy
-    rec_batch_num=1024)
 import time
 import base64
@@ -62,9 +54,9 @@ def get_caption_model_processor(model_name, model_name_or_path="Salesforce/blip2
        from transformers import AutoProcessor, AutoModelForCausalLM
        processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
        if device == 'cpu':
-            model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float32, trust_remote_code=True)
+            model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float32, trust_remote_code=True, attn_implementation="eager")
        else:
-            model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True).to(device)
+            model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager").to(device)
    return {'model': model.to(device), 'processor': processor}
--- a/weights/icon_caption_florence/config.json
+++ b/weights/icon_caption_florence/config.json
@@ -0,0 +1,239 @@
 {
  "_name_or_path": "microsoft/Florence-2-base-ft",
  "architectures": [
    "Florence2ForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "microsoft/Florence-2-base-ft--configuration_florence2.Florence2Config",
    "AutoModelForCausalLM": "microsoft/Florence-2-base-ft--modeling_florence2.Florence2ForConditionalGeneration"
  },
  "bos_token_id": 2,
  "eos_token_id": 1,
  "ignore_index": -100,
  "is_encoder_decoder": true,
  "model_type": "florence2",
  "pad_token_id": 0,
  "projection_dim": 768,
  "text_config": {
    "_attn_implementation_autoset": true,
    "_name_or_path": "",
    "activation_dropout": 0.1,
    "activation_function": "gelu",
    "add_bias_logits": false,
    "add_cross_attention": false,
    "add_final_layer_norm": false,
    "architectures": null,
    "attention_dropout": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "classif_dropout": 0.1,
    "classifier_dropout": 0.0,
    "cross_attention_hidden_size": null,
    "d_model": 768,
    "decoder_attention_heads": 12,
    "decoder_ffn_dim": 3072,
    "decoder_layerdrop": 0.0,
    "decoder_layers": 6,
    "decoder_start_token_id": 2,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.1,
    "early_stopping": true,
    "encoder_attention_heads": 12,
    "encoder_ffn_dim": 3072,
    "encoder_layerdrop": 0.0,
    "encoder_layers": 6,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": 0,
    "forced_eos_token_id": 2,
    "gradient_checkpointing": false,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1",
      "2": "LABEL_2"
    },
    "init_std": 0.02,
    "is_decoder": false,
    "is_encoder_decoder": true,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1,
      "LABEL_2": 2
    },
    "length_penalty": 1.0,
    "max_length": 20,
    "max_position_embeddings": 1024,
    "min_length": 0,
    "model_type": "florence2_language",
    "no_repeat_ngram_size": 3,
    "normalize_before": false,
    "num_beam_groups": 1,
    "num_beams": 3,
    "num_hidden_layers": 6,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 1,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "scale_embedding": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false,
    "use_cache": true,
    "vocab_size": 51289
  },
  "torch_dtype": "float32",
  "transformers_version": "4.46.1",
  "vision_config": {
    "_attn_implementation_autoset": false,
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "depths": [
      1,
      1,
      9,
      1
    ],
    "dim_embed": [
      128,
      256,
      512,
      1024
    ],
    "diversity_penalty": 0.0,
    "do_sample": false,
    "drop_path_rate": 0.1,
    "early_stopping": false,
    "enable_checkpoint": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "image_feature_source": [
      "spatial_avg_pool",
      "temporal_avg_pool"
    ],
    "image_pos_embed": {
      "max_pos_embeddings": 50,
      "type": "learned_abs_2d"
    },
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "davit",
    "no_repeat_ngram_size": 0,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_groups": [
      4,
      8,
      16,
      32
    ],
    "num_heads": [
      4,
      8,
      16,
      32
    ],
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_padding": [
      3,
      1,
      1,
      1
    ],
    "patch_prenorm": [
      false,
      true,
      true,
      true
    ],
    "patch_size": [
      7,
      3,
      3,
      3
    ],
    "patch_stride": [
      4,
      2,
      2,
      2
    ],
    "prefix": null,
    "problem_type": null,
    "projection_dim": 768,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false,
    "visual_temporal_embedding": {
      "max_temporal_embeddings": 100,
      "type": "COSINE"
    },
    "window_size": 12
  },
  "vocab_size": 51289
 }
--- a/weights/icon_caption_florence/generation_config.json
+++ b/weights/icon_caption_florence/generation_config.json
@@ -0,0 +1,13 @@
 {
  "_from_model_config": true,
  "bos_token_id": 2,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 1,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 3,
  "pad_token_id": 0,
  "transformers_version": "4.46.1"
 }
--- a/weights/icon_detect/model.yaml
+++ b/weights/icon_detect/model.yaml
@@ -0,0 +1,129 @@
 backbone:
 - - -1
  - 1
  - Conv
  - - 64
    - 3
    - 2
 - - -1
  - 1
  - Conv
  - - 128
    - 3
    - 2
 - - -1
  - 3
  - C2f
  - - 128
    - true
 - - -1
  - 1
  - Conv
  - - 256
    - 3
    - 2
 - - -1
  - 6
  - C2f
  - - 256
    - true
 - - -1
  - 1
  - Conv
  - - 512
    - 3
    - 2
 - - -1
  - 6
  - C2f
  - - 512
    - true
 - - -1
  - 1
  - Conv
  - - 1024
    - 3
    - 2
 - - -1
  - 3
  - C2f
  - - 1024
    - true
 - - -1
  - 1
  - SPPF
  - - 1024
    - 5
 ch: 3
 depth_multiple: 0.33
 head:
 - - -1
  - 1
  - nn.Upsample
  - - None
    - 2
    - nearest
 - - - -1
    - 6
  - 1
  - Concat
  - - 1
 - - -1
  - 3
  - C2f
  - - 512
 - - -1
  - 1
  - nn.Upsample
  - - None
    - 2
    - nearest
 - - - -1
    - 4
  - 1
  - Concat
  - - 1
 - - -1
  - 3
  - C2f
  - - 256
 - - -1
  - 1
  - Conv
  - - 256
    - 3
    - 2
 - - - -1
    - 12
  - 1
  - Concat
  - - 1
 - - -1
  - 3
  - C2f
  - - 512
 - - -1
  - 1
  - Conv
  - - 512
    - 3
    - 2
 - - - -1
    - 9
  - 1
  - Concat
  - - 1
 - - -1
  - 3
  - C2f
  - - 1024
 - - - 15
    - 18
    - 21
  - 1
  - Detect
  - - nc
 nc: 1
 scale: ''
 width_multiple: 0.25
 yaml_file: weights/icon_detect_v1_5/model.yaml
--- a/weights/icon_detect/train_args.yaml
+++ b/weights/icon_detect/train_args.yaml
@@ -0,0 +1,107 @@
 train_args:
  agnostic_nms: false
  amp: true
  augment: false
  auto_augment: randaugment
  batch: 64
  box: 7.5
  cache: false
  cfg: null
  classes: null
  close_mosaic: 10
  cls: 0.5
  conf: null
  copy_paste: 0.0
  cos_lr: false
  crop_fraction: 1.0
  degrees: 0.0
  deterministic: true
  device:
  - 0
  - 1
  - 2
  - 3
  dfl: 1.5
  dnn: false
  dropout: 0.0
  dynamic: false
  embed: null
  epochs: 20
  erasing: 0.4
  exist_ok: false
  fliplr: 0.5
  flipud: 0.0
  format: torchscript
  fraction: 1.0
  freeze: null
  half: false
  hsv_h: 0.015
  hsv_s: 0.7
  hsv_v: 0.4
  imgsz: 1280
  int8: false
  iou: 0.7
  keras: false
  kobj: 1.0
  label_smoothing: 0.0
  line_width: null
  lr0: 0.01
  lrf: 0.01
  mask_ratio: 4
  max_det: 300
  mixup: 0.0
  mode: train
  model: yolov8n.pt
  momentum: 0.937
  mosaic: 0.0
  multi_scale: false
  nbs: 64
  nms: false
  opset: null
  optimize: false
  optimizer: auto
  overlap_mask: true
  patience: 100
  perspective: 0.0
  plots: true
  pose: 12.0
  pretrained: true
  profile: false
  project: null
  rect: false
  resume: false
  retina_masks: false
  save: true
  save_conf: false
  save_crop: false
  save_frames: false
  save_hybrid: false
  save_json: false
  save_period: -1
  save_txt: false
  scale: 0.5
  seed: 0
  shear: 0.0
  show: false
  show_boxes: true
  show_conf: true
  show_labels: true
  simplify: false
  single_cls: false
  source: null
  split: val
  stream_buffer: false
  task: detect
  time: null
  tracker: botsort.yaml
  translate: 0.1
  val: true
  verbose: true
  vid_stride: 1
  visualize: false
  warmup_bias_lr: 0.0
  warmup_epochs: 3.0
  warmup_momentum: 0.8
  weight_decay: 0.0005
  workers: 8
  workspace: 4