fix: SentenceTransformer meta tensor avec accelerate + torch 2.10

Passer low_cpu_mem_usage=False via model_kwargs évite les meta tensors
lors du chargement du modèle d'embedding (sentence-transformers 5.x + accelerate 1.12).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
dom
2026-02-18 01:16:01 +01:00
parent 8c1b5a243e
commit 44118f69aa

View File

@@ -35,21 +35,27 @@ def _get_embed_model():
"""Charge le modèle d'embedding (singleton). """Charge le modèle d'embedding (singleton).
Tente CUDA d'abord, fallback CPU si OOM (Ollama peut occuper la VRAM). Tente CUDA d'abord, fallback CPU si OOM (Ollama peut occuper la VRAM).
low_cpu_mem_usage=False évite les meta tensors (accelerate + sentence-transformers 5.x).
""" """
global _embed_model global _embed_model
if _embed_model is None: if _embed_model is None:
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
import torch import torch
_device = "cuda" if torch.cuda.is_available() else "cpu" _device = "cuda" if torch.cuda.is_available() else "cpu"
_model_kwargs = {"low_cpu_mem_usage": False}
try: try:
logger.info("Chargement du modèle d'embedding (%s)...", _device) logger.info("Chargement du modèle d'embedding (%s)...", _device)
_embed_model = SentenceTransformer(EMBEDDING_MODEL, device=_device) _embed_model = SentenceTransformer(
EMBEDDING_MODEL, device=_device, model_kwargs=_model_kwargs,
)
except (torch.OutOfMemoryError, torch.cuda.CudaError, torch.AcceleratorError, RuntimeError) as exc: except (torch.OutOfMemoryError, torch.cuda.CudaError, torch.AcceleratorError, RuntimeError) as exc:
exc_msg = str(exc).lower() exc_msg = str(exc).lower()
if _device == "cuda" and ("memory" in exc_msg or "meta tensor" in exc_msg): if _device == "cuda" and ("memory" in exc_msg or "meta tensor" in exc_msg):
logger.warning("CUDA erreur pour l'embedding — fallback CPU : %s", exc) logger.warning("CUDA erreur pour l'embedding — fallback CPU : %s", exc)
torch.cuda.empty_cache() torch.cuda.empty_cache()
_embed_model = SentenceTransformer(EMBEDDING_MODEL, device="cpu") _embed_model = SentenceTransformer(
EMBEDDING_MODEL, device="cpu", model_kwargs=_model_kwargs,
)
else: else:
raise raise
_embed_model.max_seq_length = 512 _embed_model.max_seq_length = 512