fix: SentenceTransformer meta tensor avec accelerate + torch 2.10
low_cpu_mem_usage=False évite les meta tensors lors du chargement de l'embedding (sentence-transformers 5.x + accelerate 1.12). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
def _get_embed_model():
    """Load the embedding model (singleton).

    Tries CUDA first and falls back to CPU on OOM-style failures
    (Ollama may already occupy the VRAM).

    ``low_cpu_mem_usage=False`` avoids meta tensors when accelerate is
    installed alongside sentence-transformers 5.x: with accelerate present,
    lazy loading can leave weights as meta tensors that cannot be moved to a
    real device.
    """
    global _embed_model
    if _embed_model is None:
        # Imports are deferred so importing this module stays cheap when the
        # embedding model is never used.
        from sentence_transformers import SentenceTransformer
        import torch

        _device = "cuda" if torch.cuda.is_available() else "cpu"
        # Forwarded to the underlying transformers loader: force fully
        # materialized weights instead of meta tensors.
        _model_kwargs = {"low_cpu_mem_usage": False}
        try:
            logger.info("Chargement du modèle d'embedding (%s)...", _device)
            _embed_model = SentenceTransformer(
                EMBEDDING_MODEL, device=_device, model_kwargs=_model_kwargs,
            )
        # NOTE(review): torch.AcceleratorError / torch.OutOfMemoryError only
        # exist on recent torch (the commit targets 2.10); RuntimeError is
        # their common base and the broad net for "meta tensor" failures.
        except (torch.OutOfMemoryError, torch.cuda.CudaError, torch.AcceleratorError, RuntimeError) as exc:
            exc_msg = str(exc).lower()
            # Only fall back to CPU for memory/meta-tensor failures on CUDA;
            # anything else (bad model name, corrupt cache, ...) re-raises.
            if _device == "cuda" and ("memory" in exc_msg or "meta tensor" in exc_msg):
                logger.warning("CUDA erreur pour l'embedding — fallback CPU : %s", exc)
                # Release whatever the failed load already allocated on the GPU.
                torch.cuda.empty_cache()
                _embed_model = SentenceTransformer(
                    EMBEDDING_MODEL, device="cpu", model_kwargs=_model_kwargs,
                )
            else:
                raise
        # Cap the tokenizer's sequence length for embedding throughput.
        _embed_model.max_seq_length = 512
|
|||||||
Reference in New Issue
Block a user