fix: thread-safety embedding singleton + QC alertes string

- Ajout d'un threading.Lock sur _get_embed_model() pour empêcher les chargements concurrents depuis ThreadPool(2) — élimine les erreurs "meta tensor" et les doubles CUDA OOM
- Sentinelle _embed_failed : évite les retries infinis après un échec
- NotImplementedError ajouté aux exceptions capturées (meta tensor)
- Fallback CPU protégé par try/except avec _embed_failed
- QC : alertes_globales de type string encapsulée dans une liste (évite l'itération caractère par caractère quand le LLM retourne une string au lieu d'une liste)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 11:42:14 +01:00
parent 44118f69aa
commit e74064a2e1
2 changed files with 29 additions and 7 deletions
--- a/src/medical/cim10_extractor.py
+++ b/src/medical/cim10_extractor.py
@@ -1124,6 +1124,8 @@ Réponds avec un JSON :
            )
    alertes_globales = result.get("alertes_globales", [])
    if isinstance(alertes_globales, str):
        alertes_globales = [alertes_globales]
    for a in alertes_globales:
        if isinstance(a, str) and a.strip():
            dossier.alertes_codage.append(f"QC: {a}")
--- a/src/medical/rag_search.py
+++ b/src/medical/rag_search.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 import logging
 import threading
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from ..config import (
@@ -21,6 +22,8 @@ logger = logging.getLogger(__name__)
 # Singleton pour le modèle d'embedding (chargé une seule fois)
 _embed_model = None
 _embed_lock = threading.Lock()
 _embed_failed = False  # Sentinelle pour éviter les retries infinis
 # Singleton pour le cross-encoder de re-ranking (CPU uniquement)
 _reranker_model = None
@@ -32,13 +35,23 @@ _MIN_SCORE_CPAM = 0.40
 def _get_embed_model():
-    """Charge le modèle d'embedding (singleton).
+    """Charge le modèle d'embedding (singleton thread-safe).
    Tente CUDA d'abord, fallback CPU si OOM (Ollama peut occuper la VRAM).
    low_cpu_mem_usage=False évite les meta tensors (accelerate + sentence-transformers 5.x).
    Un Lock empêche les chargements concurrents depuis le ThreadPool.
    """
-    global _embed_model
+    global _embed_model, _embed_failed
-    if _embed_model is None:
+    if _embed_model is not None:
        return _embed_model
    if _embed_failed:
        raise RuntimeError("Modèle d'embedding indisponible (échec précédent)")
    with _embed_lock:
        # Double-check après acquisition du lock
        if _embed_model is not None:
            return _embed_model
        if _embed_failed:
            raise RuntimeError("Modèle d'embedding indisponible (échec précédent)")
        from sentence_transformers import SentenceTransformer
        import torch
        _device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -48,15 +61,22 @@ def _get_embed_model():
            _embed_model = SentenceTransformer(
                EMBEDDING_MODEL, device=_device, model_kwargs=_model_kwargs,
            )
-        except (torch.OutOfMemoryError, torch.cuda.CudaError, torch.AcceleratorError, RuntimeError) as exc:
+        except (torch.OutOfMemoryError, torch.cuda.CudaError, torch.AcceleratorError,
                RuntimeError, NotImplementedError) as exc:
            exc_msg = str(exc).lower()
            if _device == "cuda" and ("memory" in exc_msg or "meta tensor" in exc_msg):
                logger.warning("CUDA erreur pour l'embedding — fallback CPU : %s", exc)
                torch.cuda.empty_cache()
                try:
                    _embed_model = SentenceTransformer(
                        EMBEDDING_MODEL, device="cpu", model_kwargs=_model_kwargs,
                    )
                except Exception as exc2:
                    logger.error("Fallback CPU aussi en échec : %s", exc2)
                    _embed_failed = True
                    raise
            else:
                _embed_failed = True
                raise
        _embed_model.max_seq_length = 512
    return _embed_model