From e74064a2e120a3bab35e4f3608525162d840e24b Mon Sep 17 00:00:00 2001
From: dom <dom@local>
Date: Wed, 18 Feb 2026 11:42:14 +0100
Subject: [PATCH] fix: thread-safety embedding singleton + QC alertes string
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Ajout threading.Lock sur _get_embed_model() pour empêcher les
  chargements concurrents depuis ThreadPool(2) — élimine les erreurs
  "meta tensor" et les doubles CUDA OOM
- Sentinelle _embed_failed : une fois positionnée, les appels suivants
  lèvent immédiatement RuntimeError au lieu de retenter le chargement
- NotImplementedError ajouté aux exceptions capturées (meta tensor)
- Fallback CPU protégé par try/except : s'il échoue aussi, l'erreur est
  journalisée, _embed_failed est positionné et l'exception est relancée
- QC: alertes_globales string wrappée en liste (évite itération par
  caractère quand le LLM retourne une string au lieu d'une liste)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/medical/cim10_extractor.py |  2 ++
 src/medical/rag_search.py      | 34 +++++++++++++++++++++++++++-------
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/src/medical/cim10_extractor.py b/src/medical/cim10_extractor.py
index 6cc5a27..16894ea 100644
--- a/src/medical/cim10_extractor.py
+++ b/src/medical/cim10_extractor.py
@@ -1124,6 +1124,8 @@ Réponds avec un JSON :
             )
 
     alertes_globales = result.get("alertes_globales", [])
+    if isinstance(alertes_globales, str):
+        alertes_globales = [alertes_globales]
     for a in alertes_globales:
         if isinstance(a, str) and a.strip():
             dossier.alertes_codage.append(f"QC: {a}")
diff --git a/src/medical/rag_search.py b/src/medical/rag_search.py
index 6ccd9c0..a0c69df 100644
--- a/src/medical/rag_search.py
+++ b/src/medical/rag_search.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import logging
+import threading
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from ..config import (
@@ -21,6 +22,8 @@ logger = logging.getLogger(__name__)
 
 # Singleton pour le modèle d'embedding (chargé une seule fois)
 _embed_model = None
+_embed_lock = threading.Lock()
+_embed_failed = False  # Sentinelle pour éviter les retries infinis
 
 # Singleton pour le cross-encoder de re-ranking (CPU uniquement)
 _reranker_model = None
@@ -32,13 +35,23 @@ _MIN_SCORE_CPAM = 0.40
 
 
 def _get_embed_model():
-    """Charge le modèle d'embedding (singleton).
+    """Charge le modèle d'embedding (singleton thread-safe).
 
     Tente CUDA d'abord, fallback CPU si OOM (Ollama peut occuper la VRAM).
     low_cpu_mem_usage=False évite les meta tensors (accelerate + sentence-transformers 5.x).
+    Un Lock empêche les chargements concurrents depuis le ThreadPool.
     """
-    global _embed_model
-    if _embed_model is None:
+    global _embed_model, _embed_failed
+    if _embed_model is not None:
+        return _embed_model
+    if _embed_failed:
+        raise RuntimeError("Modèle d'embedding indisponible (échec précédent)")
+    with _embed_lock:
+        # Double-check après acquisition du lock
+        if _embed_model is not None:
+            return _embed_model
+        if _embed_failed:
+            raise RuntimeError("Modèle d'embedding indisponible (échec précédent)")
         from sentence_transformers import SentenceTransformer
         import torch
         _device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -48,15 +61,22 @@ def _get_embed_model():
             _embed_model = SentenceTransformer(
                 EMBEDDING_MODEL, device=_device, model_kwargs=_model_kwargs,
             )
-        except (torch.OutOfMemoryError, torch.cuda.CudaError, torch.AcceleratorError, RuntimeError) as exc:
+        except (torch.OutOfMemoryError, torch.cuda.CudaError, torch.AcceleratorError,
+                RuntimeError, NotImplementedError) as exc:
             exc_msg = str(exc).lower()
             if _device == "cuda" and ("memory" in exc_msg or "meta tensor" in exc_msg):
                 logger.warning("CUDA erreur pour l'embedding — fallback CPU : %s", exc)
                 torch.cuda.empty_cache()
-                _embed_model = SentenceTransformer(
-                    EMBEDDING_MODEL, device="cpu", model_kwargs=_model_kwargs,
-                )
+                try:
+                    _embed_model = SentenceTransformer(
+                        EMBEDDING_MODEL, device="cpu", model_kwargs=_model_kwargs,
+                    )
+                except Exception as exc2:
+                    logger.error("Fallback CPU aussi en échec : %s", exc2)
+                    _embed_failed = True
+                    raise
             else:
+                _embed_failed = True
                 raise
         _embed_model.max_seq_length = 512
     return _embed_model