# Aivanov_scan_ogc/pipeline/ocr_glm.py

"""Wrapper singleton pour GLM-OCR 0.9B."""
import time
from pathlib import Path
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# Model identifier handed to `from_pretrained` for both the processor and the model.
MODEL_PATH = "zai-org/GLM-OCR"


class GLMOCR:
    """Charge GLM-OCR une fois, réutilise le modèle pour toutes les pages."""
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._init_model()
        return cls._instance

    def _init_model(self):
        t0 = time.time()
        self.processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
        self.model = AutoModelForImageTextToText.from_pretrained(
            MODEL_PATH,
            torch_dtype="auto",
            device_map="auto",
            trust_remote_code=True,
        )
        self.load_time = time.time() - t0
        self.vram_gb = torch.cuda.memory_allocated() / 1e9 if torch.cuda.is_available() else 0.0

    def run(self, image_path: str | Path, prompt: str, max_new_tokens: int = 4096) -> dict:
        """Exécute GLM-OCR sur une image avec un prompt, retourne {text, elapsed_s}."""
        image_path = str(image_path)
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "url": image_path},
                {"type": "text", "text": prompt},
            ],
        }]
        t0 = time.time()
        inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(self.model.device)
        inputs.pop("token_type_ids", None)

        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        output = self.processor.decode(
            generated_ids[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=False,
        )
        # Nettoyer le marqueur de fin utilisateur
        output = output.replace("<|user|>", "").strip()
        return {"text": output, "elapsed_s": time.time() - t0}