"""Wrapper singleton pour GLM-OCR 0.9B.""" import time from pathlib import Path import torch from transformers import AutoProcessor, AutoModelForImageTextToText MODEL_PATH = "zai-org/GLM-OCR" class GLMOCR: """Charge GLM-OCR une fois, réutilise le modèle pour toutes les pages.""" _instance = None def __new__(cls): if cls._instance is None: cls._instance = super().__new__(cls) cls._instance._init_model() return cls._instance def _init_model(self): t0 = time.time() self.processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True) self.model = AutoModelForImageTextToText.from_pretrained( MODEL_PATH, torch_dtype="auto", device_map="auto", trust_remote_code=True, ) self.load_time = time.time() - t0 self.vram_gb = torch.cuda.memory_allocated() / 1e9 if torch.cuda.is_available() else 0.0 def run(self, image_path: str | Path, prompt: str, max_new_tokens: int = 4096) -> dict: """Exécute GLM-OCR sur une image avec un prompt, retourne {text, elapsed_s}.""" image_path = str(image_path) messages = [{ "role": "user", "content": [ {"type": "image", "url": image_path}, {"type": "text", "text": prompt}, ], }] t0 = time.time() inputs = self.processor.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", ).to(self.model.device) inputs.pop("token_type_ids", None) with torch.no_grad(): generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens) output = self.processor.decode( generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False, ) # Nettoyer le marqueur de fin utilisateur output = output.replace("<|user|>", "").strip() return {"text": output, "elapsed_s": time.time() - t0}