fix(cli): avoid duplicate ONNX native load in Windows frozen builds
This commit is contained in:
@@ -122,12 +122,18 @@ except Exception:
|
||||
_HOSPITAL_FILTER_AVAILABLE = False
|
||||
HospitalFilter = None # type: ignore
|
||||
|
||||
# NER manager (facultatif)
|
||||
try:
|
||||
from ner_manager_onnx import NerModelManager, NerThresholds
|
||||
except Exception:
|
||||
# NER manager legacy/Optimum (facultatif). Le CLI production le désactive pour
|
||||
# éviter un double chargement natif ONNX en build Windows frozen ; il passe
|
||||
# explicitement CamemBERT-bio, EDS et GLiNER au moteur.
|
||||
if os.environ.get("ANON_SKIP_LEGACY_ONNX_MANAGER") == "1":
|
||||
NerModelManager = None # type: ignore
|
||||
NerThresholds = None # type: ignore
|
||||
else:
|
||||
try:
|
||||
from ner_manager_onnx import NerModelManager, NerThresholds
|
||||
except Exception:
|
||||
NerModelManager = None # type: ignore
|
||||
NerThresholds = None # type: ignore
|
||||
|
||||
# EDS-Pseudo manager (facultatif)
|
||||
try:
|
||||
|
||||
@@ -18,6 +18,7 @@ from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
@@ -41,6 +42,9 @@ except ImportError:
|
||||
|
||||
DEFAULT_MODEL_DIR = Path(__file__).parent / "models" / "camembert-bio-deid" / "onnx"
|
||||
|
||||
_LOAD_LOCK = threading.RLock()
|
||||
_PROCESS_CACHE: Dict[Path, Dict[str, Any]] = {}
|
||||
|
||||
# Mapping labels BIO du modèle → clés PLACEHOLDERS (anonymizer_core)
|
||||
CAMEMBERT_LABEL_MAP: Dict[str, str] = {
|
||||
"PER": "NOM",
|
||||
@@ -79,6 +83,9 @@ class CamembertNerManager:
|
||||
|
||||
def load(self) -> None:
|
||||
"""Charge le modèle ONNX et le tokenizer."""
|
||||
if self._loaded and self._session is not None and self._tokenizer is not None:
|
||||
return
|
||||
|
||||
if not _ORT_AVAILABLE:
|
||||
raise RuntimeError("onnxruntime non disponible. Installez : pip install onnxruntime")
|
||||
if not _TOKENIZERS_AVAILABLE:
|
||||
@@ -88,44 +95,65 @@ class CamembertNerManager:
|
||||
if not model_path.exists():
|
||||
raise FileNotFoundError(f"Modèle ONNX non trouvé: {model_path}")
|
||||
|
||||
self.unload()
|
||||
cache_key = self._model_dir.resolve()
|
||||
with _LOAD_LOCK:
|
||||
cached = _PROCESS_CACHE.get(cache_key)
|
||||
if cached is not None:
|
||||
self._session = cached["session"]
|
||||
self._tokenizer = cached["tokenizer"]
|
||||
self._id2label = dict(cached["id2label"])
|
||||
self._version = cached.get("version", "?")
|
||||
self._loaded = True
|
||||
log.info(f"CamemBERT-bio ONNX réutilisé: {self._model_dir} ({len(self._id2label)} labels)")
|
||||
return
|
||||
|
||||
# Charger id2label depuis config.json
|
||||
config_path = self._model_dir / "config.json"
|
||||
with open(config_path, encoding="utf-8") as f:
|
||||
cfg = json.load(f)
|
||||
self._id2label = {int(k): v for k, v in cfg.get("id2label", {}).items()}
|
||||
self.unload()
|
||||
|
||||
# Session ONNX (CPU)
|
||||
opts = ort.SessionOptions()
|
||||
opts.inter_op_num_threads = 2
|
||||
opts.intra_op_num_threads = 4
|
||||
self._session = ort.InferenceSession(
|
||||
str(model_path),
|
||||
sess_options=opts,
|
||||
providers=["CPUExecutionProvider"],
|
||||
)
|
||||
# Charger id2label depuis config.json
|
||||
config_path = self._model_dir / "config.json"
|
||||
with open(config_path, encoding="utf-8") as f:
|
||||
cfg = json.load(f)
|
||||
self._id2label = {int(k): v for k, v in cfg.get("id2label", {}).items()}
|
||||
|
||||
# Tokenizer
|
||||
self._tokenizer = AutoTokenizer.from_pretrained(str(self._model_dir))
|
||||
self._loaded = True
|
||||
# Session ONNX (CPU). Une seule session CamemBERT par process et par
|
||||
# dossier modèle : certains runtimes Windows/PyInstaller refusent de
|
||||
# recharger le module natif plus d'une fois dans le même process.
|
||||
opts = ort.SessionOptions()
|
||||
opts.inter_op_num_threads = 2
|
||||
opts.intra_op_num_threads = 4
|
||||
self._session = ort.InferenceSession(
|
||||
str(model_path),
|
||||
sess_options=opts,
|
||||
providers=["CPUExecutionProvider"],
|
||||
)
|
||||
|
||||
# Lire la version depuis VERSION.json (si disponible)
|
||||
self._version = "?"
|
||||
version_path = self._model_dir.parent / "VERSION.json"
|
||||
if version_path.exists():
|
||||
try:
|
||||
with open(version_path, encoding="utf-8") as vf:
|
||||
vinfo = json.load(vf)
|
||||
self._version = vinfo.get("current_version", "?")
|
||||
v_meta = vinfo.get("versions", {}).get(self._version, {})
|
||||
f1 = v_meta.get("f1", "?")
|
||||
recall = v_meta.get("recall", "?")
|
||||
log.info(f"CamemBERT-bio ONNX {self._version} chargé (F1={f1}, R={recall}, {len(self._id2label)} labels)")
|
||||
except Exception:
|
||||
# Tokenizer
|
||||
self._tokenizer = AutoTokenizer.from_pretrained(str(self._model_dir))
|
||||
self._loaded = True
|
||||
|
||||
# Lire la version depuis VERSION.json (si disponible)
|
||||
self._version = "?"
|
||||
version_path = self._model_dir.parent / "VERSION.json"
|
||||
if version_path.exists():
|
||||
try:
|
||||
with open(version_path, encoding="utf-8") as vf:
|
||||
vinfo = json.load(vf)
|
||||
self._version = vinfo.get("current_version", "?")
|
||||
v_meta = vinfo.get("versions", {}).get(self._version, {})
|
||||
f1 = v_meta.get("f1", "?")
|
||||
recall = v_meta.get("recall", "?")
|
||||
log.info(f"CamemBERT-bio ONNX {self._version} chargé (F1={f1}, R={recall}, {len(self._id2label)} labels)")
|
||||
except Exception:
|
||||
log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)")
|
||||
else:
|
||||
log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)")
|
||||
else:
|
||||
log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)")
|
||||
|
||||
_PROCESS_CACHE[cache_key] = {
|
||||
"session": self._session,
|
||||
"tokenizer": self._tokenizer,
|
||||
"id2label": dict(self._id2label),
|
||||
"version": self._version,
|
||||
}
|
||||
|
||||
def unload(self) -> None:
|
||||
self._session = None
|
||||
|
||||
@@ -80,6 +80,17 @@ _SUPPORTED_EXT = {
|
||||
".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp",
|
||||
}
|
||||
|
||||
# Le CLI production n'utilise pas le manager ONNX legacy/Optimum
|
||||
# (NerModelManager). Le désactiver évite un second chargement natif ONNX dans le
|
||||
# même process Windows/PyInstaller avant CamemBERT-bio, qui est le modèle
|
||||
# obligatoire du CLI.
|
||||
os.environ.setdefault("ANON_SKIP_LEGACY_ONNX_MANAGER", "1")
|
||||
|
||||
_n_cpu_threads = str(os.cpu_count() or 4)
|
||||
for _env in ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS",
|
||||
"NUMEXPR_NUM_THREADS", "VECLIB_MAXIMUM_THREADS"):
|
||||
os.environ.setdefault(_env, _n_cpu_threads)
|
||||
|
||||
|
||||
def _resolve(p: str) -> Path:
|
||||
"""Résout un chemin relatif contre le cwd de lancement (pas _MEIPASS)."""
|
||||
@@ -140,12 +151,6 @@ def main(argv: list[str] | None = None) -> int:
|
||||
log.error("CLI: aucun document supporté trouvé sous %s", inp)
|
||||
return 2
|
||||
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
|
||||
# H1 : aligne les threads torch (idempotent).
|
||||
if hasattr(core, "_configure_torch_threads"):
|
||||
core._configure_torch_threads()
|
||||
|
||||
# --- Modèles ---
|
||||
# OBLIGATOIRE (sauf --no-ner) : CamemBERT-bio ONNX. Fail-closed.
|
||||
eds_mgr = camembert_mgr = gliner_mgr = None
|
||||
@@ -189,6 +194,12 @@ def main(argv: list[str] | None = None) -> int:
|
||||
log.warning("CLI: --no-ner -> MODE REGEX SEUL assumé (aucun modèle NER). "
|
||||
"Qualité réduite : à n'utiliser qu'en connaissance de cause.")
|
||||
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
|
||||
# H1 : aligne les threads torch (idempotent).
|
||||
if hasattr(core, "_configure_torch_threads"):
|
||||
core._configure_torch_threads()
|
||||
|
||||
use_ner = bool(eds_mgr or gliner_mgr or camembert_mgr)
|
||||
log.info("CLI: %d document(s), ner=%s (camembert=%s eds=%s gliner=%s) -> sortie=%s",
|
||||
len(docs), use_ner, bool(camembert_mgr), bool(eds_mgr), bool(gliner_mgr), out_root)
|
||||
|
||||
55
tests/unit/test_camembert_manager_cache.py
Normal file
55
tests/unit/test_camembert_manager_cache.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import json
|
||||
|
||||
|
||||
def test_camembert_load_is_idempotent_and_reuses_process_session(tmp_path, monkeypatch):
    """Verify that ``CamembertNerManager.load()`` creates a single session.

    A fake model directory (``model.onnx``, ``config.json`` and a sibling
    ``VERSION.json``) is built under ``tmp_path``; ``ort`` and
    ``AutoTokenizer`` are replaced by in-memory fakes that record every
    session creation. The test then checks that:

    * calling ``load()`` twice on the same manager creates no second session;
    * a second manager pointing at the same directory reuses the session
      object created by the first one.
    """
    import camembert_ner_manager as module

    # Minimal on-disk layout read by load(): model file, label config, version.
    model_dir = tmp_path / "camembert-bio-deid" / "onnx"
    model_dir.mkdir(parents=True)
    (model_dir / "model.onnx").write_bytes(b"fake")
    (model_dir / "config.json").write_text(
        json.dumps({"id2label": {"0": "O", "1": "B-PER"}}),
        encoding="utf-8",
    )
    (model_dir.parent / "VERSION.json").write_text(
        json.dumps({"current_version": "v-test", "versions": {"v-test": {"f1": 1, "recall": 1}}}),
        encoding="utf-8",
    )

    # Every fake InferenceSession built during the test is recorded here.
    created_sessions = []

    class FakeSessionOptions:
        inter_op_num_threads = 0
        intra_op_num_threads = 0

    class FakeOrt:
        SessionOptions = FakeSessionOptions

        @staticmethod
        def InferenceSession(path, sess_options=None, providers=None):
            session = {"path": path, "providers": providers}
            created_sessions.append(session)
            return session

    class FakeTokenizer:
        @staticmethod
        def from_pretrained(path):
            return {"tokenizer_path": path}

    monkeypatch.setattr(module, "_ORT_AVAILABLE", True)
    monkeypatch.setattr(module, "_TOKENIZERS_AVAILABLE", True)
    monkeypatch.setattr(module, "ort", FakeOrt)
    monkeypatch.setattr(module, "AutoTokenizer", FakeTokenizer)
    # Swap in a fresh cache instead of clearing the real one in place: the
    # original dict (and anything other tests cached in it) is restored at
    # teardown, and the fake session created here never leaks out of the test.
    monkeypatch.setattr(module, "_PROCESS_CACHE", {})

    first = module.CamembertNerManager(model_dir)
    first.load()
    first.load()

    second = module.CamembertNerManager(model_dir)
    second.load()

    assert len(created_sessions) == 1
    assert first.is_loaded()
    assert second.is_loaded()
    assert first._session is second._session
|
||||
Reference in New Issue
Block a user