"""Test Qwen2.5-VL-3B — plus léger et rapide."""
import time
import torch
from pathlib import Path
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
from pipeline.ingest import pdf_to_images

# Hugging Face identifier of the checkpoint passed to from_pretrained() in main().
MODEL = "Qwen/Qwen2.5-VL-3B-Instruct"

# Instruction sent to the model together with each page image. The text is
# French and is runtime data: it describes the expected code formats and the
# JSON skeleton the model is asked to fill in.
PROMPT = """Lis cette fiche médicale OGC et renvoie STRICTEMENT le JSON suivant, sans commentaire ni markdown.
Codes CIM-10 : lettre + 2 à 4 chiffres (ex: K650). Codes CCAM : 4 lettres + 3 chiffres (ex: EBFA012).
Codes GHM : 2 chiffres + lettre + 3 chiffres (ex: 11M122). Codes GHS : 3-5 chiffres.

{
  "n_ogc": "",
  "codage_etab": {"dp": "", "dp_libelle": "", "dr": ""},
  "codage_reco": {"dp": "", "dr": ""},
  "ghm_etab": "", "ghs_etab": "",
  "ghm_reco": "", "ghs_reco": "",
  "praticien_conseil": ""
}"""

# Relative paths of the sample PDFs to run; only the document number varies.
CASES = [f"2018 CARC/OGC {number}.pdf" for number in (7, 27, 55, 86)]


def main():
    """Run the model on the first page of every PDF in CASES and report timings.

    Loads the processor and model named by MODEL, then for each entry of
    CASES: converts the PDF with ``pdf_to_images``, sends the first returned
    image together with PROMPT to the model, prints the first 600 characters
    of the decoded answer and writes the full answer to
    ``test_qwen3b_out/<pdf stem>.txt`` (UTF-8). Finally prints the total and
    mean generation time.
    """
    print(f"--- Chargement {MODEL} ---")
    # perf_counter() is monotonic, so the measured intervals cannot be skewed
    # by system clock adjustments.
    load_start = time.perf_counter()
    processor = AutoProcessor.from_pretrained(
        MODEL,
        # Lower/upper bound on image size given to the processor, written as
        # multiples of 28*28 pixels (the convention used in the Qwen2.5-VL
        # usage examples).
        min_pixels=256 * 28 * 28,
        max_pixels=1280 * 28 * 28,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL, torch_dtype=torch.bfloat16, device_map="auto",
    )
    print(f"Chargé en {time.perf_counter()-load_start:.1f}s, VRAM={torch.cuda.memory_allocated()/1e9:.2f} Go")

    out_dir = Path("test_qwen3b_out")
    out_dir.mkdir(exist_ok=True)
    total_time = 0.0
    for pdf in CASES:
        name = Path(pdf).stem
        # Only the first element returned by pdf_to_images is evaluated.
        img = str(pdf_to_images(pdf)[0])
        messages = [{"role": "user", "content": [
            {"type": "image", "image": img}, {"type": "text", "text": PROMPT}]}]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # The second return value (video inputs) is unused: the message holds
        # a single image and no video.
        image_inputs, _ = process_vision_info(messages)
        inputs = processor(
            text=[text], images=image_inputs, videos=None,
            padding=True, return_tensors="pt",
        ).to(model.device)
        # Time generation + decoding only; preprocessing above is excluded.
        gen_start = time.perf_counter()
        with torch.no_grad():
            gen = model.generate(**inputs, max_new_tokens=1024)
        # Skip the first input_ids.shape[1] positions so that only the tokens
        # following the prompt are decoded.
        out_ids = gen[:, inputs.input_ids.shape[1]:]
        output = processor.batch_decode(out_ids, skip_special_tokens=True)[0]
        elapsed = time.perf_counter() - gen_start
        total_time += elapsed
        print(f"\n=== {name} ({elapsed:.1f}s) ===")
        print(output[:600])
        # Explicit UTF-8: the answer contains accented French text and the
        # platform default encoding is not guaranteed to represent it.
        (out_dir / f"{name}.txt").write_text(output, encoding="utf-8")
    # Average over the actual number of cases rather than a hard-coded count,
    # and avoid dividing by zero if CASES is emptied.
    n_cases = len(CASES)
    mean_time = total_time / n_cases if n_cases else 0.0
    print(f"\nTotal inférence : {total_time:.1f}s ({mean_time:.1f}s moy/page)")


# Run the test only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()