# rpa_vision_v3/tools/probe_qwen3vl_processor.py

"""
Dump runtime des attributs et comportement effectif du processor
Qwen3-VL-8B-Instruct. Script jetable, à supprimer après usage.

Usage : python tools/probe_qwen3vl_processor.py
"""
from transformers import AutoProcessor
from PIL import Image
import torch

# Hugging Face identifier of the model whose processor is inspected, and the
# screenshot used as a realistic input for the image processor.
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
FIXTURE = "data/training/live_sessions/bg_DESKTOP-58D5CAC_windows/shots/heartbeat_1773792436.png"

# Opening banner.
banner_rule = "=" * 70
print(banner_rule)
print(f"DUMP PROCESSOR : {MODEL_ID}")
print(banner_rule)

# Only the image-processing half of the processor is probed below.
proc = AutoProcessor.from_pretrained(MODEL_ID)
ip = proc.image_processor

# Section 1 — Raw attributes
print("\n--- ATTRIBUTS BRUTS ---")
print("class:", type(ip).__name__)

# These attributes are read without a fallback: a missing one raises
# AttributeError instead of being reported as absent.
for required_attr in ("size", "patch_size", "merge_size"):
    print(f"{required_attr}:", getattr(ip, required_attr))

# These attributes are optional: a placeholder is printed when the image
# processor does not define them.
optional_attrs = (
    'min_pixels', 'max_pixels', 'temporal_patch_size',
    'image_mean', 'image_std', 'do_resize', 'do_rescale',
    'rescale_factor', 'do_normalize', 'do_convert_rgb',
)
for optional_attr in optional_attrs:
    print(f"{optional_attr}:", getattr(ip, optional_attr, '<absent>'))

# Section 2 — Effective behaviour on the fixture
print("\n--- COMPORTEMENT EFFECTIF SUR FIXTURE ---")
img = Image.open(FIXTURE)
print(f"Image source : {img.size} (W×H)")

# Run the image processor alone (no text) and report what it returns.
out = ip(images=img, return_tensors='pt')
returned_keys = list(out.keys())
print(f"Keys retournées : {returned_keys}")
pixel_values_shape = out['pixel_values'].shape
print(f"pixel_values shape : {pixel_values_shape}")
# .get() is used because the key is not assumed to be present.
grid_thw_or_none = out.get('image_grid_thw')
print(f"image_grid_thw : {grid_thw_or_none}")

# Sections 3-5 — Reconstructing the resized dimensions from image_grid_thw.
#
# image_grid_thw is expressed in vision patches: in the transformers Qwen2-VL
# image processor, grid_h = resized_height // patch_size (same for the width).
# The resized size in pixels is therefore grid × patch_size.
# patch_size × merge_size is only the alignment factor the resize rounds to;
# multiplying the grid by it overstates each dimension by merge_size.
# NOTE(review): a quick sanity check against the installed version is that
# pixel_values.shape[0] equals t × h × w.


def _resized_dims(grid_thw, patch_size):
    """Return the resized image size in pixels as ``(width, height)``.

    Args:
        grid_thw: ``[t, h, w]`` patch counts for a single image.
        patch_size: Side length, in pixels, of one vision patch.

    Returns:
        A ``(width, height)`` tuple in pixels.
    """
    _, grid_h, grid_w = grid_thw
    return grid_w * patch_size, grid_h * patch_size


# Section 3 — Reconstruct the resized dimensions of the fixture
print("\n--- RECONSTRUCTION DIMS RESIZE ---")
grid = out.get('image_grid_thw')
if grid is not None:
    grid = grid[0].tolist()  # [t, h, w]
    # Still reported for information: this is the alignment factor, not the
    # size of one grid cell.
    factor = ip.patch_size * ip.merge_size
    W_resized, H_resized = _resized_dims(grid, ip.patch_size)
    print(f"grid_thw : t={grid[0]}, h={grid[1]}, w={grid[2]}")
    print(f"factor calculé (patch_size × merge_size) : {factor}")
    print(f"Dims resize reconstruites : {W_resized}×{H_resized} (W×H)")
    print(f"Dims source : {img.size}")
    print(f"Ratio resize : {W_resized / img.size[0]:.4f} (W), "
          f"{H_resized / img.size[1]:.4f} (H)")

# Section 4 — Upper-bound test, to see how a large image is resized
print("\n--- TEST BORNE HAUTE (image grande) ---")
big_img = Image.new('RGB', (4096, 2560), color='white')
big_out = ip(images=big_img, return_tensors='pt')
big_grid = big_out['image_grid_thw'][0].tolist()
big_w, big_h = _resized_dims(big_grid, ip.patch_size)
print(f"Image source : {big_img.size}")
print(f"grid_thw : {big_grid}")
print(f"Dims resize : {big_w}×{big_h}")
print(f"Pixels totaux après resize : "
      f"{big_w * big_h}")

# Section 5 — Lower-bound test, to see how a small image is resized
print("\n--- TEST BORNE BASSE (image petite) ---")
small_img = Image.new('RGB', (128, 64), color='white')
small_out = ip(images=small_img, return_tensors='pt')
small_grid = small_out['image_grid_thw'][0].tolist()
small_w, small_h = _resized_dims(small_grid, ip.patch_size)
print(f"Image source : {small_img.size}")
print(f"grid_thw : {small_grid}")
print(f"Dims resize : {small_w}×{small_h}")

# Closing banner.
closing_rule = "=" * 70
print(f"\n{closing_rule}")
print("FIN DUMP")
print(closing_rule)