fix(cli): avoid duplicate ONNX native load in Windows frozen builds

This commit is contained in:
2026-06-12 16:49:11 +02:00
parent 34c681b791
commit 6c6f6532fc
4 changed files with 143 additions and 43 deletions

View File

@@ -122,12 +122,18 @@ except Exception:
_HOSPITAL_FILTER_AVAILABLE = False _HOSPITAL_FILTER_AVAILABLE = False
HospitalFilter = None # type: ignore HospitalFilter = None # type: ignore
# NER manager (facultatif) # NER manager legacy/Optimum (facultatif). Le CLI production le désactive pour
try: # éviter un double chargement natif ONNX en build Windows frozen ; il passe
from ner_manager_onnx import NerModelManager, NerThresholds # explicitement CamemBERT-bio, EDS et GLiNER au moteur.
except Exception: if os.environ.get("ANON_SKIP_LEGACY_ONNX_MANAGER") == "1":
NerModelManager = None # type: ignore NerModelManager = None # type: ignore
NerThresholds = None # type: ignore NerThresholds = None # type: ignore
else:
try:
from ner_manager_onnx import NerModelManager, NerThresholds
except Exception:
NerModelManager = None # type: ignore
NerThresholds = None # type: ignore
# EDS-Pseudo manager (facultatif) # EDS-Pseudo manager (facultatif)
try: try:

View File

@@ -18,6 +18,7 @@ from __future__ import annotations
import json import json
import logging import logging
import threading
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
@@ -41,6 +42,9 @@ except ImportError:
DEFAULT_MODEL_DIR = Path(__file__).parent / "models" / "camembert-bio-deid" / "onnx" DEFAULT_MODEL_DIR = Path(__file__).parent / "models" / "camembert-bio-deid" / "onnx"
_LOAD_LOCK = threading.RLock()
_PROCESS_CACHE: Dict[Path, Dict[str, Any]] = {}
# Mapping labels BIO du modèle → clés PLACEHOLDERS (anonymizer_core) # Mapping labels BIO du modèle → clés PLACEHOLDERS (anonymizer_core)
CAMEMBERT_LABEL_MAP: Dict[str, str] = { CAMEMBERT_LABEL_MAP: Dict[str, str] = {
"PER": "NOM", "PER": "NOM",
@@ -79,6 +83,9 @@ class CamembertNerManager:
def load(self) -> None: def load(self) -> None:
"""Charge le modèle ONNX et le tokenizer.""" """Charge le modèle ONNX et le tokenizer."""
if self._loaded and self._session is not None and self._tokenizer is not None:
return
if not _ORT_AVAILABLE: if not _ORT_AVAILABLE:
raise RuntimeError("onnxruntime non disponible. Installez : pip install onnxruntime") raise RuntimeError("onnxruntime non disponible. Installez : pip install onnxruntime")
if not _TOKENIZERS_AVAILABLE: if not _TOKENIZERS_AVAILABLE:
@@ -88,44 +95,65 @@ class CamembertNerManager:
if not model_path.exists(): if not model_path.exists():
raise FileNotFoundError(f"Modèle ONNX non trouvé: {model_path}") raise FileNotFoundError(f"Modèle ONNX non trouvé: {model_path}")
self.unload() cache_key = self._model_dir.resolve()
with _LOAD_LOCK:
cached = _PROCESS_CACHE.get(cache_key)
if cached is not None:
self._session = cached["session"]
self._tokenizer = cached["tokenizer"]
self._id2label = dict(cached["id2label"])
self._version = cached.get("version", "?")
self._loaded = True
log.info(f"CamemBERT-bio ONNX réutilisé: {self._model_dir} ({len(self._id2label)} labels)")
return
# Charger id2label depuis config.json self.unload()
config_path = self._model_dir / "config.json"
with open(config_path, encoding="utf-8") as f:
cfg = json.load(f)
self._id2label = {int(k): v for k, v in cfg.get("id2label", {}).items()}
# Session ONNX (CPU) # Charger id2label depuis config.json
opts = ort.SessionOptions() config_path = self._model_dir / "config.json"
opts.inter_op_num_threads = 2 with open(config_path, encoding="utf-8") as f:
opts.intra_op_num_threads = 4 cfg = json.load(f)
self._session = ort.InferenceSession( self._id2label = {int(k): v for k, v in cfg.get("id2label", {}).items()}
str(model_path),
sess_options=opts,
providers=["CPUExecutionProvider"],
)
# Tokenizer # Session ONNX (CPU). Une seule session CamemBERT par process et par
self._tokenizer = AutoTokenizer.from_pretrained(str(self._model_dir)) # dossier modèle : certains runtimes Windows/PyInstaller refusent de
self._loaded = True # recharger le module natif plus d'une fois dans le même process.
opts = ort.SessionOptions()
opts.inter_op_num_threads = 2
opts.intra_op_num_threads = 4
self._session = ort.InferenceSession(
str(model_path),
sess_options=opts,
providers=["CPUExecutionProvider"],
)
# Lire la version depuis VERSION.json (si disponible) # Tokenizer
self._version = "?" self._tokenizer = AutoTokenizer.from_pretrained(str(self._model_dir))
version_path = self._model_dir.parent / "VERSION.json" self._loaded = True
if version_path.exists():
try: # Lire la version depuis VERSION.json (si disponible)
with open(version_path, encoding="utf-8") as vf: self._version = "?"
vinfo = json.load(vf) version_path = self._model_dir.parent / "VERSION.json"
self._version = vinfo.get("current_version", "?") if version_path.exists():
v_meta = vinfo.get("versions", {}).get(self._version, {}) try:
f1 = v_meta.get("f1", "?") with open(version_path, encoding="utf-8") as vf:
recall = v_meta.get("recall", "?") vinfo = json.load(vf)
log.info(f"CamemBERT-bio ONNX {self._version} chargé (F1={f1}, R={recall}, {len(self._id2label)} labels)") self._version = vinfo.get("current_version", "?")
except Exception: v_meta = vinfo.get("versions", {}).get(self._version, {})
f1 = v_meta.get("f1", "?")
recall = v_meta.get("recall", "?")
log.info(f"CamemBERT-bio ONNX {self._version} chargé (F1={f1}, R={recall}, {len(self._id2label)} labels)")
except Exception:
log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)")
else:
log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)") log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)")
else:
log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)") _PROCESS_CACHE[cache_key] = {
"session": self._session,
"tokenizer": self._tokenizer,
"id2label": dict(self._id2label),
"version": self._version,
}
def unload(self) -> None: def unload(self) -> None:
self._session = None self._session = None

View File

@@ -80,6 +80,17 @@ _SUPPORTED_EXT = {
".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp",
} }
# Le CLI production n'utilise pas le manager ONNX legacy/Optimum
# (NerModelManager). Le désactiver évite un second chargement natif ONNX dans le
# même process Windows/PyInstaller avant CamemBERT-bio, qui est le modèle
# obligatoire du CLI.
os.environ.setdefault("ANON_SKIP_LEGACY_ONNX_MANAGER", "1")
_n_cpu_threads = str(os.cpu_count() or 4)
for _env in ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS",
"NUMEXPR_NUM_THREADS", "VECLIB_MAXIMUM_THREADS"):
os.environ.setdefault(_env, _n_cpu_threads)
def _resolve(p: str) -> Path: def _resolve(p: str) -> Path:
"""Résout un chemin relatif contre le cwd de lancement (pas _MEIPASS).""" """Résout un chemin relatif contre le cwd de lancement (pas _MEIPASS)."""
@@ -140,12 +151,6 @@ def main(argv: list[str] | None = None) -> int:
log.error("CLI: aucun document supporté trouvé sous %s", inp) log.error("CLI: aucun document supporté trouvé sous %s", inp)
return 2 return 2
import anonymizer_core_refactored_onnx as core
# H1 : aligne les threads torch (idempotent).
if hasattr(core, "_configure_torch_threads"):
core._configure_torch_threads()
# --- Modèles --- # --- Modèles ---
# OBLIGATOIRE (sauf --no-ner) : CamemBERT-bio ONNX. Fail-closed. # OBLIGATOIRE (sauf --no-ner) : CamemBERT-bio ONNX. Fail-closed.
eds_mgr = camembert_mgr = gliner_mgr = None eds_mgr = camembert_mgr = gliner_mgr = None
@@ -189,6 +194,12 @@ def main(argv: list[str] | None = None) -> int:
log.warning("CLI: --no-ner -> MODE REGEX SEUL assumé (aucun modèle NER). " log.warning("CLI: --no-ner -> MODE REGEX SEUL assumé (aucun modèle NER). "
"Qualité réduite : à n'utiliser qu'en connaissance de cause.") "Qualité réduite : à n'utiliser qu'en connaissance de cause.")
import anonymizer_core_refactored_onnx as core
# H1 : aligne les threads torch (idempotent).
if hasattr(core, "_configure_torch_threads"):
core._configure_torch_threads()
use_ner = bool(eds_mgr or gliner_mgr or camembert_mgr) use_ner = bool(eds_mgr or gliner_mgr or camembert_mgr)
log.info("CLI: %d document(s), ner=%s (camembert=%s eds=%s gliner=%s) -> sortie=%s", log.info("CLI: %d document(s), ner=%s (camembert=%s eds=%s gliner=%s) -> sortie=%s",
len(docs), use_ner, bool(camembert_mgr), bool(eds_mgr), bool(gliner_mgr), out_root) len(docs), use_ner, bool(camembert_mgr), bool(eds_mgr), bool(gliner_mgr), out_root)

View File

@@ -0,0 +1,55 @@
import json
def test_camembert_load_is_idempotent_and_reuses_process_session(tmp_path, monkeypatch):
    """Check that ``CamembertNerManager.load`` builds one ONNX session per model dir.

    Two properties are exercised:

    * calling ``load()`` twice on the same manager does not build a second session;
    * a second manager pointing at the same model directory reuses the session
      (and tokenizer / label map / version) stored in the module-level cache.

    ``onnxruntime`` and the tokenizer are replaced with in-memory fakes, so the
    test needs neither a real model nor the native runtime.
    """
    import camembert_ner_manager as module

    # Minimal on-disk layout read by load(): model.onnx + config.json in the
    # model directory, VERSION.json in its parent.
    model_dir = tmp_path / "camembert-bio-deid" / "onnx"
    model_dir.mkdir(parents=True)
    (model_dir / "model.onnx").write_bytes(b"fake")
    (model_dir / "config.json").write_text(
        json.dumps({"id2label": {"0": "O", "1": "B-PER"}}),
        encoding="utf-8",
    )
    (model_dir.parent / "VERSION.json").write_text(
        json.dumps({"current_version": "v-test", "versions": {"v-test": {"f1": 1, "recall": 1}}}),
        encoding="utf-8",
    )

    # Every fake session ever constructed is recorded here so the test can
    # count how many times the (fake) runtime was asked to build one.
    created_sessions = []

    class FakeSessionOptions:
        inter_op_num_threads = 0
        intra_op_num_threads = 0

    class FakeOrt:
        SessionOptions = FakeSessionOptions

        @staticmethod
        def InferenceSession(path, sess_options=None, providers=None):
            session = {"path": path, "providers": providers}
            created_sessions.append(session)
            return session

    class FakeTokenizer:
        @staticmethod
        def from_pretrained(path):
            return {"tokenizer_path": path}

    monkeypatch.setattr(module, "_ORT_AVAILABLE", True)
    monkeypatch.setattr(module, "_TOKENIZERS_AVAILABLE", True)
    monkeypatch.setattr(module, "ort", FakeOrt)
    monkeypatch.setattr(module, "AutoTokenizer", FakeTokenizer)
    # Swap in a fresh cache instead of clearing the real one in place:
    # monkeypatch restores the original dict at teardown, so this test neither
    # wipes entries other tests may rely on nor leaves a fake session behind
    # in the process-wide cache.
    monkeypatch.setattr(module, "_PROCESS_CACHE", {})

    first = module.CamembertNerManager(model_dir)
    first.load()
    first.load()  # second call on the same instance must be a no-op

    second = module.CamembertNerManager(model_dir)
    second.load()  # must be served from the process cache

    assert len(created_sessions) == 1
    assert first.is_loaded()
    assert second.is_loaded()
    assert first._session is second._session
    # The cached entry also carries the tokenizer, label map and version.
    assert first._tokenizer is second._tokenizer
    assert second._id2label == {0: "O", 1: "B-PER"}
    assert second._version == "v-test"