fix(cli): avoid duplicate ONNX native load in Windows frozen builds

This commit is contained in:
2026-06-12 16:49:11 +02:00
parent 1bced55b81
commit fff4a2d902
4 changed files with 143 additions and 43 deletions

View File

@@ -122,7 +122,13 @@ except Exception:
_HOSPITAL_FILTER_AVAILABLE = False _HOSPITAL_FILTER_AVAILABLE = False
HospitalFilter = None # type: ignore HospitalFilter = None # type: ignore
# NER manager (facultatif) # NER manager legacy/Optimum (facultatif). Le CLI production le désactive pour
# éviter un double chargement natif ONNX en build Windows frozen ; il passe
# explicitement CamemBERT-bio, EDS et GLiNER au moteur.
if os.environ.get("ANON_SKIP_LEGACY_ONNX_MANAGER") == "1":
NerModelManager = None # type: ignore
NerThresholds = None # type: ignore
else:
try: try:
from ner_manager_onnx import NerModelManager, NerThresholds from ner_manager_onnx import NerModelManager, NerThresholds
except Exception: except Exception:

View File

@@ -18,6 +18,7 @@ from __future__ import annotations
import json import json
import logging import logging
import threading
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
@@ -41,6 +42,9 @@ except ImportError:
DEFAULT_MODEL_DIR = Path(__file__).parent / "models" / "camembert-bio-deid" / "onnx" DEFAULT_MODEL_DIR = Path(__file__).parent / "models" / "camembert-bio-deid" / "onnx"
_LOAD_LOCK = threading.RLock()
_PROCESS_CACHE: Dict[Path, Dict[str, Any]] = {}
# Mapping labels BIO du modèle → clés PLACEHOLDERS (anonymizer_core) # Mapping labels BIO du modèle → clés PLACEHOLDERS (anonymizer_core)
CAMEMBERT_LABEL_MAP: Dict[str, str] = { CAMEMBERT_LABEL_MAP: Dict[str, str] = {
"PER": "NOM", "PER": "NOM",
@@ -79,6 +83,9 @@ class CamembertNerManager:
def load(self) -> None: def load(self) -> None:
"""Charge le modèle ONNX et le tokenizer.""" """Charge le modèle ONNX et le tokenizer."""
if self._loaded and self._session is not None and self._tokenizer is not None:
return
if not _ORT_AVAILABLE: if not _ORT_AVAILABLE:
raise RuntimeError("onnxruntime non disponible. Installez : pip install onnxruntime") raise RuntimeError("onnxruntime non disponible. Installez : pip install onnxruntime")
if not _TOKENIZERS_AVAILABLE: if not _TOKENIZERS_AVAILABLE:
@@ -88,6 +95,18 @@ class CamembertNerManager:
if not model_path.exists(): if not model_path.exists():
raise FileNotFoundError(f"Modèle ONNX non trouvé: {model_path}") raise FileNotFoundError(f"Modèle ONNX non trouvé: {model_path}")
cache_key = self._model_dir.resolve()
with _LOAD_LOCK:
cached = _PROCESS_CACHE.get(cache_key)
if cached is not None:
self._session = cached["session"]
self._tokenizer = cached["tokenizer"]
self._id2label = dict(cached["id2label"])
self._version = cached.get("version", "?")
self._loaded = True
log.info(f"CamemBERT-bio ONNX réutilisé: {self._model_dir} ({len(self._id2label)} labels)")
return
self.unload() self.unload()
# Charger id2label depuis config.json # Charger id2label depuis config.json
@@ -96,7 +115,9 @@ class CamembertNerManager:
cfg = json.load(f) cfg = json.load(f)
self._id2label = {int(k): v for k, v in cfg.get("id2label", {}).items()} self._id2label = {int(k): v for k, v in cfg.get("id2label", {}).items()}
# Session ONNX (CPU) # Session ONNX (CPU). Une seule session CamemBERT par process et par
# dossier modèle : certains runtimes Windows/PyInstaller refusent de
# recharger le module natif plus d'une fois dans le même process.
opts = ort.SessionOptions() opts = ort.SessionOptions()
opts.inter_op_num_threads = 2 opts.inter_op_num_threads = 2
opts.intra_op_num_threads = 4 opts.intra_op_num_threads = 4
@@ -127,6 +148,13 @@ class CamembertNerManager:
else: else:
log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)") log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)")
_PROCESS_CACHE[cache_key] = {
"session": self._session,
"tokenizer": self._tokenizer,
"id2label": dict(self._id2label),
"version": self._version,
}
def unload(self) -> None: def unload(self) -> None:
self._session = None self._session = None
self._tokenizer = None self._tokenizer = None

View File

@@ -80,6 +80,17 @@ _SUPPORTED_EXT = {
".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp",
} }
# Le CLI production n'utilise pas le manager ONNX legacy/Optimum
# (NerModelManager). Le désactiver évite un second chargement natif ONNX dans le
# même process Windows/PyInstaller avant CamemBERT-bio, qui est le modèle
# obligatoire du CLI.
os.environ.setdefault("ANON_SKIP_LEGACY_ONNX_MANAGER", "1")
_n_cpu_threads = str(os.cpu_count() or 4)
for _env in ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS",
"NUMEXPR_NUM_THREADS", "VECLIB_MAXIMUM_THREADS"):
os.environ.setdefault(_env, _n_cpu_threads)
def _resolve(p: str) -> Path: def _resolve(p: str) -> Path:
"""Résout un chemin relatif contre le cwd de lancement (pas _MEIPASS).""" """Résout un chemin relatif contre le cwd de lancement (pas _MEIPASS)."""
@@ -140,12 +151,6 @@ def main(argv: list[str] | None = None) -> int:
log.error("CLI: aucun document supporté trouvé sous %s", inp) log.error("CLI: aucun document supporté trouvé sous %s", inp)
return 2 return 2
import anonymizer_core_refactored_onnx as core
# H1 : aligne les threads torch (idempotent).
if hasattr(core, "_configure_torch_threads"):
core._configure_torch_threads()
# --- Modèles --- # --- Modèles ---
# OBLIGATOIRE (sauf --no-ner) : CamemBERT-bio ONNX. Fail-closed. # OBLIGATOIRE (sauf --no-ner) : CamemBERT-bio ONNX. Fail-closed.
eds_mgr = camembert_mgr = gliner_mgr = None eds_mgr = camembert_mgr = gliner_mgr = None
@@ -189,6 +194,12 @@ def main(argv: list[str] | None = None) -> int:
log.warning("CLI: --no-ner -> MODE REGEX SEUL assumé (aucun modèle NER). " log.warning("CLI: --no-ner -> MODE REGEX SEUL assumé (aucun modèle NER). "
"Qualité réduite : à n'utiliser qu'en connaissance de cause.") "Qualité réduite : à n'utiliser qu'en connaissance de cause.")
import anonymizer_core_refactored_onnx as core
# H1 : aligne les threads torch (idempotent).
if hasattr(core, "_configure_torch_threads"):
core._configure_torch_threads()
use_ner = bool(eds_mgr or gliner_mgr or camembert_mgr) use_ner = bool(eds_mgr or gliner_mgr or camembert_mgr)
log.info("CLI: %d document(s), ner=%s (camembert=%s eds=%s gliner=%s) -> sortie=%s", log.info("CLI: %d document(s), ner=%s (camembert=%s eds=%s gliner=%s) -> sortie=%s",
len(docs), use_ner, bool(camembert_mgr), bool(eds_mgr), bool(gliner_mgr), out_root) len(docs), use_ner, bool(camembert_mgr), bool(eds_mgr), bool(gliner_mgr), out_root)

View File

@@ -0,0 +1,55 @@
import json
def test_camembert_load_is_idempotent_and_reuses_process_session(tmp_path, monkeypatch):
    """Verify that ``CamembertNerManager.load()`` creates a single ONNX session.

    Two properties are checked against a fake model directory:

    * calling ``load()`` twice on the same manager does not create a second
      inference session;
    * a second manager pointing at the same model directory reuses the
      session object held in the module-level ``_PROCESS_CACHE``.

    ``onnxruntime`` and the tokenizer class are replaced by in-memory fakes,
    so no real model is loaded.
    """
    import camembert_ner_manager as module

    # Minimal on-disk layout: <root>/camembert-bio-deid/onnx/{model.onnx,config.json}
    # plus a VERSION.json one level above the onnx directory.
    model_dir = tmp_path / "camembert-bio-deid" / "onnx"
    model_dir.mkdir(parents=True)
    (model_dir / "model.onnx").write_bytes(b"fake")
    (model_dir / "config.json").write_text(
        json.dumps({"id2label": {"0": "O", "1": "B-PER"}}),
        encoding="utf-8",
    )
    (model_dir.parent / "VERSION.json").write_text(
        json.dumps({"current_version": "v-test", "versions": {"v-test": {"f1": 1, "recall": 1}}}),
        encoding="utf-8",
    )

    # Every fake session that gets constructed is recorded here, so the test
    # can count how many times the "native" session was actually created.
    created_sessions = []

    class FakeSessionOptions:
        inter_op_num_threads = 0
        intra_op_num_threads = 0

    class FakeOrt:
        SessionOptions = FakeSessionOptions

        @staticmethod
        def InferenceSession(path, sess_options=None, providers=None):
            session = {"path": path, "providers": providers}
            created_sessions.append(session)
            return session

    class FakeTokenizer:
        @staticmethod
        def from_pretrained(path):
            return {"tokenizer_path": path}

    monkeypatch.setattr(module, "_ORT_AVAILABLE", True)
    monkeypatch.setattr(module, "_TOKENIZERS_AVAILABLE", True)
    # raising=False: the module may not define these names at all when the
    # optional imports fail (not visible from this test), and the fakes must
    # be installable in that case too.
    monkeypatch.setattr(module, "ort", FakeOrt, raising=False)
    monkeypatch.setattr(module, "AutoTokenizer", FakeTokenizer, raising=False)
    # Swap in a fresh cache instead of clearing the real one in place:
    # monkeypatch restores the original dict on teardown, so the fake session
    # never leaks into later tests and pre-existing entries are not destroyed.
    monkeypatch.setattr(module, "_PROCESS_CACHE", {})

    first = module.CamembertNerManager(model_dir)
    first.load()
    first.load()  # second call on the same instance must be a no-op

    second = module.CamembertNerManager(model_dir)
    second.load()  # same model dir -> must be served from the process cache

    assert len(created_sessions) == 1
    assert first.is_loaded()
    assert second.is_loaded()
    assert first._session is second._session