From 6c6f6532fcc12fef423b5104780c1480505f8ab4 Mon Sep 17 00:00:00 2001 From: Domi31tls Date: Fri, 12 Jun 2026 16:49:11 +0200 Subject: [PATCH] fix(cli): avoid duplicate ONNX native load in Windows frozen --- anonymizer_core_refactored_onnx.py | 14 +++- camembert_ner_manager.py | 94 ++++++++++++++-------- scripts/anonymize_cli.py | 23 ++++-- tests/unit/test_camembert_manager_cache.py | 55 +++++++++++++ 4 files changed, 143 insertions(+), 43 deletions(-) create mode 100644 tests/unit/test_camembert_manager_cache.py diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index bb0b54a..67b4784 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -122,12 +122,18 @@ except Exception: _HOSPITAL_FILTER_AVAILABLE = False HospitalFilter = None # type: ignore -# NER manager (facultatif) -try: - from ner_manager_onnx import NerModelManager, NerThresholds -except Exception: +# NER manager legacy/Optimum (facultatif). Le CLI production le désactive pour +# éviter un double chargement natif ONNX en build Windows frozen ; il passe +# explicitement CamemBERT-bio, EDS et GLiNER au moteur. +if os.environ.get("ANON_SKIP_LEGACY_ONNX_MANAGER") == "1": NerModelManager = None # type: ignore NerThresholds = None # type: ignore +else: + try: + from ner_manager_onnx import NerModelManager, NerThresholds + except Exception: + NerModelManager = None # type: ignore + NerThresholds = None # type: ignore # EDS-Pseudo manager (facultatif) try: diff --git a/camembert_ner_manager.py b/camembert_ner_manager.py index 0ec5944..627c990 100644 --- a/camembert_ner_manager.py +++ b/camembert_ner_manager.py @@ -18,6 +18,7 @@ from __future__ import annotations import json import logging +import threading from pathlib import Path from typing import Any, Dict, List, Optional @@ -41,6 +42,9 @@ except ImportError: DEFAULT_MODEL_DIR = Path(__file__).parent / "models" / "camembert-bio-deid" / "onnx" +_LOAD_LOCK = threading.RLock() +_PROCESS_CACHE: Dict[Path, Dict[str, Any]] = {} + # Mapping labels BIO du modèle → clés PLACEHOLDERS (anonymizer_core) CAMEMBERT_LABEL_MAP: Dict[str, str] = { "PER": "NOM", @@ -79,6 +83,9 @@ class CamembertNerManager: def load(self) -> None: """Charge le modèle ONNX et le tokenizer.""" + if self._loaded and self._session is not None and self._tokenizer is not None: + return + if not _ORT_AVAILABLE: raise RuntimeError("onnxruntime non disponible. Installez : pip install onnxruntime") if not _TOKENIZERS_AVAILABLE: @@ -88,44 +95,65 @@ class CamembertNerManager: if not model_path.exists(): raise FileNotFoundError(f"Modèle ONNX non trouvé: {model_path}") - self.unload() + cache_key = self._model_dir.resolve() + with _LOAD_LOCK: + cached = _PROCESS_CACHE.get(cache_key) + if cached is not None: + self._session = cached["session"] + self._tokenizer = cached["tokenizer"] + self._id2label = dict(cached["id2label"]) + self._version = cached.get("version", "?") + self._loaded = True + log.info(f"CamemBERT-bio ONNX réutilisé: {self._model_dir} ({len(self._id2label)} labels)") + return - # Charger id2label depuis config.json - config_path = self._model_dir / "config.json" - with open(config_path, encoding="utf-8") as f: - cfg = json.load(f) - self._id2label = {int(k): v for k, v in cfg.get("id2label", {}).items()} + self.unload() - # Session ONNX (CPU) - opts = ort.SessionOptions() - opts.inter_op_num_threads = 2 - opts.intra_op_num_threads = 4 - self._session = ort.InferenceSession( - str(model_path), - sess_options=opts, - providers=["CPUExecutionProvider"], - ) + # Charger id2label depuis config.json + config_path = self._model_dir / "config.json" + with open(config_path, encoding="utf-8") as f: + cfg = json.load(f) + self._id2label = {int(k): v for k, v in cfg.get("id2label", {}).items()} - # Tokenizer - self._tokenizer = AutoTokenizer.from_pretrained(str(self._model_dir)) - self._loaded = True + # Session ONNX (CPU). Une seule session CamemBERT par process et par + # dossier modèle : certains runtimes Windows/PyInstaller refusent de + # recharger le module natif plus d'une fois dans le même process. + opts = ort.SessionOptions() + opts.inter_op_num_threads = 2 + opts.intra_op_num_threads = 4 + self._session = ort.InferenceSession( + str(model_path), + sess_options=opts, + providers=["CPUExecutionProvider"], + ) - # Lire la version depuis VERSION.json (si disponible) - self._version = "?" - version_path = self._model_dir.parent / "VERSION.json" - if version_path.exists(): - try: - with open(version_path, encoding="utf-8") as vf: - vinfo = json.load(vf) - self._version = vinfo.get("current_version", "?") - v_meta = vinfo.get("versions", {}).get(self._version, {}) - f1 = v_meta.get("f1", "?") - recall = v_meta.get("recall", "?") - log.info(f"CamemBERT-bio ONNX {self._version} chargé (F1={f1}, R={recall}, {len(self._id2label)} labels)") - except Exception: + # Tokenizer + self._tokenizer = AutoTokenizer.from_pretrained(str(self._model_dir)) + self._loaded = True + + # Lire la version depuis VERSION.json (si disponible) + self._version = "?" + version_path = self._model_dir.parent / "VERSION.json" + if version_path.exists(): + try: + with open(version_path, encoding="utf-8") as vf: + vinfo = json.load(vf) + self._version = vinfo.get("current_version", "?") + v_meta = vinfo.get("versions", {}).get(self._version, {}) + f1 = v_meta.get("f1", "?") + recall = v_meta.get("recall", "?") + log.info(f"CamemBERT-bio ONNX {self._version} chargé (F1={f1}, R={recall}, {len(self._id2label)} labels)") + except Exception: + log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)") + else: log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)") - else: - log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)") + + _PROCESS_CACHE[cache_key] = { + "session": self._session, + "tokenizer": self._tokenizer, + "id2label": dict(self._id2label), + "version": self._version, + } def unload(self) -> None: self._session = None diff --git a/scripts/anonymize_cli.py b/scripts/anonymize_cli.py index a0c4c3c..efd000f 100644 --- a/scripts/anonymize_cli.py +++ b/scripts/anonymize_cli.py @@ -80,6 +80,17 @@ _SUPPORTED_EXT = { ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", } +# Le CLI production n'utilise pas le manager ONNX legacy/Optimum +# (NerModelManager). Le désactiver évite un second chargement natif ONNX dans le +# même process Windows/PyInstaller avant CamemBERT-bio, qui est le modèle +# obligatoire du CLI. +os.environ.setdefault("ANON_SKIP_LEGACY_ONNX_MANAGER", "1") + +_n_cpu_threads = str(os.cpu_count() or 4) +for _env in ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS", + "NUMEXPR_NUM_THREADS", "VECLIB_MAXIMUM_THREADS"): + os.environ.setdefault(_env, _n_cpu_threads) + def _resolve(p: str) -> Path: """Résout un chemin relatif contre le cwd de lancement (pas _MEIPASS).""" @@ -140,12 +151,6 @@ def main(argv: list[str] | None = None) -> int: log.error("CLI: aucun document supporté trouvé sous %s", inp) return 2 - import anonymizer_core_refactored_onnx as core - - # H1 : aligne les threads torch (idempotent). - if hasattr(core, "_configure_torch_threads"): - core._configure_torch_threads() - # --- Modèles --- # OBLIGATOIRE (sauf --no-ner) : CamemBERT-bio ONNX. Fail-closed. eds_mgr = camembert_mgr = gliner_mgr = None @@ -189,6 +194,12 @@ def main(argv: list[str] | None = None) -> int: log.warning("CLI: --no-ner -> MODE REGEX SEUL assumé (aucun modèle NER). " "Qualité réduite : à n'utiliser qu'en connaissance de cause.") + import anonymizer_core_refactored_onnx as core + + # H1 : aligne les threads torch (idempotent). + if hasattr(core, "_configure_torch_threads"): + core._configure_torch_threads() + use_ner = bool(eds_mgr or gliner_mgr or camembert_mgr) log.info("CLI: %d document(s), ner=%s (camembert=%s eds=%s gliner=%s) -> sortie=%s", len(docs), use_ner, bool(camembert_mgr), bool(eds_mgr), bool(gliner_mgr), out_root) diff --git a/tests/unit/test_camembert_manager_cache.py b/tests/unit/test_camembert_manager_cache.py new file mode 100644 index 0000000..1035db1 --- /dev/null +++ b/tests/unit/test_camembert_manager_cache.py @@ -0,0 +1,55 @@ +import json + + +def test_camembert_load_is_idempotent_and_reuses_process_session(tmp_path, monkeypatch): + import camembert_ner_manager as module + + model_dir = tmp_path / "camembert-bio-deid" / "onnx" + model_dir.mkdir(parents=True) + (model_dir / "model.onnx").write_bytes(b"fake") + (model_dir / "config.json").write_text( + json.dumps({"id2label": {"0": "O", "1": "B-PER"}}), + encoding="utf-8", + ) + (model_dir.parent / "VERSION.json").write_text( + json.dumps({"current_version": "v-test", "versions": {"v-test": {"f1": 1, "recall": 1}}}), + encoding="utf-8", + ) + + created_sessions = [] + + class FakeSessionOptions: + inter_op_num_threads = 0 + intra_op_num_threads = 0 + + class FakeOrt: + SessionOptions = FakeSessionOptions + + @staticmethod + def InferenceSession(path, sess_options=None, providers=None): + session = {"path": path, "providers": providers} + created_sessions.append(session) + return session + + class FakeTokenizer: + @staticmethod + def from_pretrained(path): + return {"tokenizer_path": path} + + monkeypatch.setattr(module, "_ORT_AVAILABLE", True) + monkeypatch.setattr(module, "_TOKENIZERS_AVAILABLE", True) + monkeypatch.setattr(module, "ort", FakeOrt) + monkeypatch.setattr(module, "AutoTokenizer", FakeTokenizer) + module._PROCESS_CACHE.clear() + + first = module.CamembertNerManager(model_dir) + first.load() + first.load() + + second = module.CamembertNerManager(model_dir) + second.load() + + assert len(created_sessions) == 1 + assert first.is_loaded() + assert second.is_loaded() + assert first._session is second._session