fix(cli): avoid duplicate ONNX native load in Windows frozen builds
This commit is contained in:
@@ -122,10 +122,16 @@ except Exception:
|
|||||||
_HOSPITAL_FILTER_AVAILABLE = False
|
_HOSPITAL_FILTER_AVAILABLE = False
|
||||||
HospitalFilter = None # type: ignore
|
HospitalFilter = None # type: ignore
|
||||||
|
|
||||||
# NER manager (facultatif)
|
# NER manager legacy/Optimum (facultatif). Le CLI production le désactive pour
|
||||||
try:
|
# éviter un double chargement natif ONNX en build Windows frozen ; il passe
|
||||||
|
# explicitement CamemBERT-bio, EDS et GLiNER au moteur.
|
||||||
|
if os.environ.get("ANON_SKIP_LEGACY_ONNX_MANAGER") == "1":
|
||||||
|
NerModelManager = None # type: ignore
|
||||||
|
NerThresholds = None # type: ignore
|
||||||
|
else:
|
||||||
|
try:
|
||||||
from ner_manager_onnx import NerModelManager, NerThresholds
|
from ner_manager_onnx import NerModelManager, NerThresholds
|
||||||
except Exception:
|
except Exception:
|
||||||
NerModelManager = None # type: ignore
|
NerModelManager = None # type: ignore
|
||||||
NerThresholds = None # type: ignore
|
NerThresholds = None # type: ignore
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import threading
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
@@ -41,6 +42,9 @@ except ImportError:
|
|||||||
|
|
||||||
DEFAULT_MODEL_DIR = Path(__file__).parent / "models" / "camembert-bio-deid" / "onnx"
|
DEFAULT_MODEL_DIR = Path(__file__).parent / "models" / "camembert-bio-deid" / "onnx"
|
||||||
|
|
||||||
|
_LOAD_LOCK = threading.RLock()
|
||||||
|
_PROCESS_CACHE: Dict[Path, Dict[str, Any]] = {}
|
||||||
|
|
||||||
# Mapping labels BIO du modèle → clés PLACEHOLDERS (anonymizer_core)
|
# Mapping labels BIO du modèle → clés PLACEHOLDERS (anonymizer_core)
|
||||||
CAMEMBERT_LABEL_MAP: Dict[str, str] = {
|
CAMEMBERT_LABEL_MAP: Dict[str, str] = {
|
||||||
"PER": "NOM",
|
"PER": "NOM",
|
||||||
@@ -79,6 +83,9 @@ class CamembertNerManager:
|
|||||||
|
|
||||||
def load(self) -> None:
|
def load(self) -> None:
|
||||||
"""Charge le modèle ONNX et le tokenizer."""
|
"""Charge le modèle ONNX et le tokenizer."""
|
||||||
|
if self._loaded and self._session is not None and self._tokenizer is not None:
|
||||||
|
return
|
||||||
|
|
||||||
if not _ORT_AVAILABLE:
|
if not _ORT_AVAILABLE:
|
||||||
raise RuntimeError("onnxruntime non disponible. Installez : pip install onnxruntime")
|
raise RuntimeError("onnxruntime non disponible. Installez : pip install onnxruntime")
|
||||||
if not _TOKENIZERS_AVAILABLE:
|
if not _TOKENIZERS_AVAILABLE:
|
||||||
@@ -88,6 +95,18 @@ class CamembertNerManager:
|
|||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
raise FileNotFoundError(f"Modèle ONNX non trouvé: {model_path}")
|
raise FileNotFoundError(f"Modèle ONNX non trouvé: {model_path}")
|
||||||
|
|
||||||
|
cache_key = self._model_dir.resolve()
|
||||||
|
with _LOAD_LOCK:
|
||||||
|
cached = _PROCESS_CACHE.get(cache_key)
|
||||||
|
if cached is not None:
|
||||||
|
self._session = cached["session"]
|
||||||
|
self._tokenizer = cached["tokenizer"]
|
||||||
|
self._id2label = dict(cached["id2label"])
|
||||||
|
self._version = cached.get("version", "?")
|
||||||
|
self._loaded = True
|
||||||
|
log.info(f"CamemBERT-bio ONNX réutilisé: {self._model_dir} ({len(self._id2label)} labels)")
|
||||||
|
return
|
||||||
|
|
||||||
self.unload()
|
self.unload()
|
||||||
|
|
||||||
# Charger id2label depuis config.json
|
# Charger id2label depuis config.json
|
||||||
@@ -96,7 +115,9 @@ class CamembertNerManager:
|
|||||||
cfg = json.load(f)
|
cfg = json.load(f)
|
||||||
self._id2label = {int(k): v for k, v in cfg.get("id2label", {}).items()}
|
self._id2label = {int(k): v for k, v in cfg.get("id2label", {}).items()}
|
||||||
|
|
||||||
# Session ONNX (CPU)
|
# Session ONNX (CPU). Une seule session CamemBERT par process et par
|
||||||
|
# dossier modèle : certains runtimes Windows/PyInstaller refusent de
|
||||||
|
# recharger le module natif plus d'une fois dans le même process.
|
||||||
opts = ort.SessionOptions()
|
opts = ort.SessionOptions()
|
||||||
opts.inter_op_num_threads = 2
|
opts.inter_op_num_threads = 2
|
||||||
opts.intra_op_num_threads = 4
|
opts.intra_op_num_threads = 4
|
||||||
@@ -127,6 +148,13 @@ class CamembertNerManager:
|
|||||||
else:
|
else:
|
||||||
log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)")
|
log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)")
|
||||||
|
|
||||||
|
_PROCESS_CACHE[cache_key] = {
|
||||||
|
"session": self._session,
|
||||||
|
"tokenizer": self._tokenizer,
|
||||||
|
"id2label": dict(self._id2label),
|
||||||
|
"version": self._version,
|
||||||
|
}
|
||||||
|
|
||||||
def unload(self) -> None:
|
def unload(self) -> None:
|
||||||
self._session = None
|
self._session = None
|
||||||
self._tokenizer = None
|
self._tokenizer = None
|
||||||
|
|||||||
@@ -80,6 +80,17 @@ _SUPPORTED_EXT = {
|
|||||||
".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp",
|
".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Le CLI production n'utilise pas le manager ONNX legacy/Optimum
|
||||||
|
# (NerModelManager). Le désactiver évite un second chargement natif ONNX dans le
|
||||||
|
# même process Windows/PyInstaller avant CamemBERT-bio, qui est le modèle
|
||||||
|
# obligatoire du CLI.
|
||||||
|
os.environ.setdefault("ANON_SKIP_LEGACY_ONNX_MANAGER", "1")
|
||||||
|
|
||||||
|
_n_cpu_threads = str(os.cpu_count() or 4)
|
||||||
|
for _env in ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS",
|
||||||
|
"NUMEXPR_NUM_THREADS", "VECLIB_MAXIMUM_THREADS"):
|
||||||
|
os.environ.setdefault(_env, _n_cpu_threads)
|
||||||
|
|
||||||
|
|
||||||
def _resolve(p: str) -> Path:
|
def _resolve(p: str) -> Path:
|
||||||
"""Résout un chemin relatif contre le cwd de lancement (pas _MEIPASS)."""
|
"""Résout un chemin relatif contre le cwd de lancement (pas _MEIPASS)."""
|
||||||
@@ -140,12 +151,6 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
log.error("CLI: aucun document supporté trouvé sous %s", inp)
|
log.error("CLI: aucun document supporté trouvé sous %s", inp)
|
||||||
return 2
|
return 2
|
||||||
|
|
||||||
import anonymizer_core_refactored_onnx as core
|
|
||||||
|
|
||||||
# H1 : aligne les threads torch (idempotent).
|
|
||||||
if hasattr(core, "_configure_torch_threads"):
|
|
||||||
core._configure_torch_threads()
|
|
||||||
|
|
||||||
# --- Modèles ---
|
# --- Modèles ---
|
||||||
# OBLIGATOIRE (sauf --no-ner) : CamemBERT-bio ONNX. Fail-closed.
|
# OBLIGATOIRE (sauf --no-ner) : CamemBERT-bio ONNX. Fail-closed.
|
||||||
eds_mgr = camembert_mgr = gliner_mgr = None
|
eds_mgr = camembert_mgr = gliner_mgr = None
|
||||||
@@ -189,6 +194,12 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
log.warning("CLI: --no-ner -> MODE REGEX SEUL assumé (aucun modèle NER). "
|
log.warning("CLI: --no-ner -> MODE REGEX SEUL assumé (aucun modèle NER). "
|
||||||
"Qualité réduite : à n'utiliser qu'en connaissance de cause.")
|
"Qualité réduite : à n'utiliser qu'en connaissance de cause.")
|
||||||
|
|
||||||
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
|
||||||
|
# H1 : aligne les threads torch (idempotent).
|
||||||
|
if hasattr(core, "_configure_torch_threads"):
|
||||||
|
core._configure_torch_threads()
|
||||||
|
|
||||||
use_ner = bool(eds_mgr or gliner_mgr or camembert_mgr)
|
use_ner = bool(eds_mgr or gliner_mgr or camembert_mgr)
|
||||||
log.info("CLI: %d document(s), ner=%s (camembert=%s eds=%s gliner=%s) -> sortie=%s",
|
log.info("CLI: %d document(s), ner=%s (camembert=%s eds=%s gliner=%s) -> sortie=%s",
|
||||||
len(docs), use_ner, bool(camembert_mgr), bool(eds_mgr), bool(gliner_mgr), out_root)
|
len(docs), use_ner, bool(camembert_mgr), bool(eds_mgr), bool(gliner_mgr), out_root)
|
||||||
|
|||||||
55
tests/unit/test_camembert_manager_cache.py
Normal file
55
tests/unit/test_camembert_manager_cache.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def test_camembert_load_is_idempotent_and_reuses_process_session(tmp_path, monkeypatch):
    """Verify that ``CamembertNerManager.load`` creates one ONNX session per model dir.

    The test builds a throwaway model directory, swaps ``onnxruntime`` and the
    tokenizer loader for in-memory fakes, then checks two things:

    * calling ``load()`` twice on the same manager does not create a second session;
    * a second manager pointing at the same directory reuses the session and
      tokenizer stored in the module-level process cache.
    """
    import camembert_ner_manager as module

    # Minimal on-disk layout read by load(): the ONNX file, its config and
    # the VERSION.json that sits next to the "onnx" directory.
    model_dir = tmp_path / "camembert-bio-deid" / "onnx"
    model_dir.mkdir(parents=True)
    (model_dir / "model.onnx").write_bytes(b"fake")
    (model_dir / "config.json").write_text(
        json.dumps({"id2label": {"0": "O", "1": "B-PER"}}),
        encoding="utf-8",
    )
    (model_dir.parent / "VERSION.json").write_text(
        json.dumps({"current_version": "v-test", "versions": {"v-test": {"f1": 1, "recall": 1}}}),
        encoding="utf-8",
    )

    # Every fake session that gets built is recorded here so the test can
    # count how many times the (normally native) session constructor ran.
    created_sessions = []

    class FakeSessionOptions:
        inter_op_num_threads = 0
        intra_op_num_threads = 0

    class FakeOrt:
        SessionOptions = FakeSessionOptions

        @staticmethod
        def InferenceSession(path, sess_options=None, providers=None):
            session = {"path": path, "providers": providers}
            created_sessions.append(session)
            return session

    class FakeTokenizer:
        @staticmethod
        def from_pretrained(path):
            return {"tokenizer_path": path}

    monkeypatch.setattr(module, "_ORT_AVAILABLE", True)
    monkeypatch.setattr(module, "_TOKENIZERS_AVAILABLE", True)
    monkeypatch.setattr(module, "ort", FakeOrt)
    monkeypatch.setattr(module, "AutoTokenizer", FakeTokenizer)
    # Swap in a fresh cache instead of clearing the real one in place: the
    # original dict stays untouched and monkeypatch restores it afterwards,
    # so no fake session leaks into (or real entry is wiped for) other tests.
    monkeypatch.setattr(module, "_PROCESS_CACHE", {})

    first = module.CamembertNerManager(model_dir)
    first.load()
    first.load()  # second call must be a no-op

    second = module.CamembertNerManager(model_dir)
    second.load()  # must be served from the process cache

    assert len(created_sessions) == 1
    assert first.is_loaded()
    assert second.is_loaded()
    assert first._session is second._session
    assert first._tokenizer is second._tokenizer
    assert second._id2label == first._id2label
|
||||||
Reference in New Issue
Block a user