feat(grounding): module smart_resize officiel Qwen3-VL
Module pur core/grounding/smart_resize.py implémentant la formule
smart_resize officielle (transformers.qwen2_vl.image_processing_qwen2_vl,
utilisée par Qwen3VLProcessor pour les images via wrap Qwen2VLImageProcessor).
Helpers exposés : _round_by_factor, _floor_by_factor, _ceil_by_factor.
Constantes : FACTOR_DEFAULT=28, MIN_PIXELS_DEFAULT=3136,
MAX_PIXELS_DEFAULT=1_003_520, MAX_RATIO_DEFAULT=200.
Tests : tests/unit/test_smart_resize.py — 32 cas, 100% coverage sur le
module (mesure via coverage API directe, pytest-cov bloqué par bug cv2
préexistant tracé dans DETTE-011).
refs DETTE-006 (étape 1/5 du fix smart_resize)
refs DETTE-007 (création de la 3ème implémentation, à unifier post-démo)
refs DETTE-010 (vérif preprocessor_config.json checkpoint Qwen3-VL-8B
bloquante avant Étape 2)
refs DETTE-011 (bug cv2 contourné pour mesure coverage)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
77
core/grounding/smart_resize.py
Normal file
77
core/grounding/smart_resize.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""
|
||||
Smart resize officiel Qwen3-VL (algorithme commun Qwen2-VL/Qwen3-VL pour images).
|
||||
|
||||
Source de référence : transformers.models.qwen2_vl.image_processing_qwen2_vl.smart_resize
|
||||
(transformers 4.57.3). Qwen3-VL utilise Qwen2VLImageProcessor pour les images via
|
||||
Qwen3VLProcessor.image_processor_class — la formule est donc commune Qwen2-VL/Qwen3-VL
|
||||
sur le pipeline image.
|
||||
|
||||
Conditions garanties par smart_resize :
|
||||
1. height et width retournés divisibles par `factor` (par défaut 28).
|
||||
2. Total pixels dans l'intervalle [min_pixels, max_pixels].
|
||||
3. Aspect ratio conservé au plus près.
|
||||
|
||||
Module image-only. Pour traitement vidéo Qwen3-VL (factor=32, autres bornes),
|
||||
module dédié à créer si besoin futur.
|
||||
"""
|
||||
|
||||
# DETTE-007 — Trois implémentations de smart_resize coexistent dans le repo
# (core/grounding/server.py:15, core/grounding/infigui_worker.py:99, ce module).
# Unification prévue post-démo Kerella.
|
||||
|
||||
import math
|
||||
|
||||
|
||||
# Default parameters used by smart_resize for still images.
FACTOR_DEFAULT = 28  # returned height and width are multiples of this value
MIN_PIXELS_DEFAULT = 3_136  # 56 * 56
MAX_PIXELS_DEFAULT = 1_003_520  # 14 * 14 * 4 * 1280
MAX_RATIO_DEFAULT = 200  # largest accepted long-side / short-side ratio
|
||||
|
||||
|
||||
def _round_by_factor(number: int, factor: int) -> int:
    """Return the multiple of `factor` nearest to `number`.

    The quotient is rounded with the built-in ``round``, so an exact
    half-way quotient goes to the even multiple (e.g. 1.5 -> 2, 2.5 -> 2).
    """
    steps = round(number / factor)
    return steps * factor
|
||||
|
||||
|
||||
def _floor_by_factor(number: int, factor: int) -> int:
    """Return the greatest multiple of `factor` that does not exceed `number`."""
    whole_steps = math.floor(number / factor)
    return whole_steps * factor
|
||||
|
||||
|
||||
def _ceil_by_factor(number: int, factor: int) -> int:
    """Return the least multiple of `factor` that is not below `number`."""
    whole_steps = math.ceil(number / factor)
    return whole_steps * factor
|
||||
|
||||
|
||||
def smart_resize(
    height: int,
    width: int,
    factor: int = FACTOR_DEFAULT,
    min_pixels: int = MIN_PIXELS_DEFAULT,
    max_pixels: int = MAX_PIXELS_DEFAULT,
) -> tuple[int, int]:
    """Compute the resized (height, width) for an image.

    Both sides are first rounded to the nearest multiple of `factor`. If the
    rounded area exceeds `max_pixels`, the original size is scaled down and
    floored to multiples of `factor` (never below `factor` itself); if it is
    under `min_pixels`, the original size is scaled up and ceiled to
    multiples of `factor`. Scaling uses one common ratio for both sides, so
    the aspect ratio is kept as close as the `factor` grid allows.

    Args:
        height: Original image height in pixels; must be > 0.
        width: Original image width in pixels; must be > 0.
        factor: Value both returned sides are multiples of.
        min_pixels: Lower bound targeted for the resized area.
        max_pixels: Upper bound targeted for the resized area.

    Raises:
        ValueError: if `height` or `width` is not strictly positive, or if
            max(height, width) / min(height, width) > MAX_RATIO_DEFAULT
            (aspect ratio out of supported domain).

    Returns:
        (resized_height, resized_width).
    """
    # Reject degenerate sizes up front: a zero side would otherwise surface as
    # a bare ZeroDivisionError, and negative sides as a math domain error or
    # as silently non-positive output sizes.
    if height <= 0 or width <= 0:
        raise ValueError(
            f"height and width must be positive, got height={height}, width={width}"
        )

    aspect_ratio = max(height, width) / min(height, width)
    if aspect_ratio > MAX_RATIO_DEFAULT:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {MAX_RATIO_DEFAULT}, "
            f"got {aspect_ratio}"
        )

    # First guess: snap each side to the closest multiple of `factor`.
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor

    if h_bar * w_bar > max_pixels:
        # Too large: shrink both sides by the same ratio, rounding down so the
        # area stays under max_pixels; clamp so that no side collapses to 0.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = max(factor, math.floor(height / beta / factor) * factor)
        w_bar = max(factor, math.floor(width / beta / factor) * factor)
    elif h_bar * w_bar < min_pixels:
        # Too small: enlarge both sides by the same ratio, rounding up so the
        # area reaches at least min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor

    return h_bar, w_bar
|
||||
Reference in New Issue
Block a user