From ea1752d4a790427e6272e5a47c153752052e99a3 Mon Sep 17 00:00:00 2001
From: Domi31tls
Date: Wed, 17 Jun 2026 19:52:29 +0200
Subject: [PATCH] feat(cli): charger les moteurs optionnels depuis les modeles embarques

---
 anonymisation_cli_onefile.spec         | 43 ++++++++++++++++++++++++++++++++++
 eds_pseudo_manager.py                  | 41 +++++++++++++++++++++++++++++---
 engine_capabilities.py                 | 18 ++++++++++++--
 gliner_manager.py                      | 28 ++++++++++++++++++++--
 tests/unit/test_engine_capabilities.py | 19 +++++++++++++++
 5 files changed, 140 insertions(+), 9 deletions(-)

diff --git a/anonymisation_cli_onefile.spec b/anonymisation_cli_onefile.spec
index 50af45a..8f608d5 100644
--- a/anonymisation_cli_onefile.spec
+++ b/anonymisation_cli_onefile.spec
@@ -1,6 +1,8 @@
 import os
 from pathlib import Path
 
+from PyInstaller.utils.hooks import collect_all, copy_metadata
+
 # Spec CLI frozen — EXE de PRODUCTION (anonymisation fichier unique sans GUI).
 # Même moteur / mêmes datas que anonymisation_onefile.spec, mais :
 # - entrypoint = scripts/anonymize_cli.py (CLI production, pas launcher.py)
@@ -22,6 +24,7 @@ def _data_entry(relative_path: str, target_dir: str | None = None):
     return (str(src), target_dir or relative_path)
 
 
+binaries = []
 datas = []
 for relative_path, target_dir in [
     ("config", "config"),
@@ -95,9 +98,49 @@ hiddenimports = [
 ]
 
 
+def _collect_optional_package(package_name: str) -> None:
+    """Add an optional package to the bundle, warning instead of failing.
+
+    Extends the module-level ``datas``, ``binaries`` and ``hiddenimports``
+    lists in place. Collection stays best effort, but every skipped step is
+    reported so a build lacking an optional engine does not go unnoticed.
+    """
+    try:
+        package_datas, package_binaries, package_hiddenimports = collect_all(package_name)
+    except Exception as exc:
+        print(f"WARNING: optional package {package_name!r} not collected: {exc!r}")
+        return
+    datas.extend(package_datas)
+    binaries.extend(package_binaries)
+    hiddenimports.extend(package_hiddenimports)
+    try:
+        # Also ship the distribution metadata of the package and its dependencies.
+        datas.extend(copy_metadata(package_name, recursive=True))
+    except Exception as exc:
+        print(f"WARNING: metadata of {package_name!r} not copied: {exc!r}")
+
+
+for _package_name in [
+    "edsnlp",
+    "spacy",
+    "thinc",
+    "blis",
+    "srsly",
+    "catalogue",
+    "confection",
+    "cymem",
+    "preshed",
+    "murmurhash",
+    "gliner",
+    "loguru",
+]:
+    _collect_optional_package(_package_name)
+
+
 a = Analysis(
     [str(project_dir / "scripts" / "anonymize_cli.py")],
     pathex=[str(project_dir)],
+    binaries=binaries,
     datas=datas,
     hiddenimports=hiddenimports,
     cipher=block_cipher,
diff --git a/eds_pseudo_manager.py b/eds_pseudo_manager.py
index 87cd354..48e3e0c 100644
--- a/eds_pseudo_manager.py
+++ b/eds_pseudo_manager.py
@@ -9,6 +9,7 @@ Mapping des 13 labels EDS-Pseudo vers les clés PLACEHOLDERS du core d'anonymisa
 Dépendance : pip install 'edsnlp[ml]>=0.12.0'
 """
 from __future__ import annotations
+import sys
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
@@ -41,6 +42,32 @@ EDS_MODELS_CATALOG: Dict[str, str] = {
     "EDS-Pseudo AP-HP (edsnlp)": "AP-HP/eds-pseudo-public",
 }
 
+DEFAULT_MODEL = "AP-HP/eds-pseudo-public"
+BUNDLED_MODEL_DIR = "eds-pseudo-public"
+
+
+def _app_dir() -> Path:
+    """Return the bundle root when frozen, otherwise this module's directory."""
+    if getattr(sys, "frozen", False):
+        return Path(getattr(sys, "_MEIPASS", Path(sys.executable).parent))
+    return Path(__file__).resolve().parent
+
+
+def _bundled_model_path(cache_dir: Optional[Path] = None) -> Optional[Path]:
+    """Return the first existing local copy of the default model, else None.
+
+    ``cache_dir`` (when given) is searched before the application's own
+    ``models`` directory.
+    """
+    candidates = []
+    if cache_dir is not None:
+        candidates.append(Path(cache_dir) / BUNDLED_MODEL_DIR)
+    candidates.append(_app_dir() / "models" / BUNDLED_MODEL_DIR)
+    for candidate in candidates:
+        if candidate.is_dir():
+            return candidate
+    return None
+
 
 class EdsPseudoManager:
     """Gestionnaire pour le modèle EDS-Pseudo (edsnlp). Même interface que NerModelManager."""
@@ -54,16 +81,22 @@ class EdsPseudoManager:
     def is_loaded(self) -> bool:
         return self._loaded and self._nlp is not None
 
-    def load(self, model_id_or_path: str = "AP-HP/eds-pseudo-public") -> None:
+    def load(self, model_id_or_path: str = DEFAULT_MODEL) -> None:
         if not _EDSNLP_AVAILABLE:
             raise RuntimeError("edsnlp non disponible. Installez : pip install 'edsnlp[ml]>=0.12.0'")
         self.unload()
-        self.model_id = model_id_or_path
-        path = Path(model_id_or_path)
+        # Prefer a local copy of the default model when one exists.
+        source = model_id_or_path
+        if model_id_or_path == DEFAULT_MODEL:
+            bundled = _bundled_model_path(self.cache_dir)
+            if bundled is not None:
+                source = str(bundled)
+        self.model_id = source
+        path = Path(source)
         if path.is_dir():
             self._nlp = edsnlp.load(path)
         else:
-            self._nlp = edsnlp.load(model_id_or_path)
+            self._nlp = edsnlp.load(source)
         # Activer les scores de confiance NER (edsnlp >= 0.16)
         try:
             ner_pipe = self._nlp.get_pipe('ner')
diff --git a/engine_capabilities.py b/engine_capabilities.py
index d98bd35..0b71dd0 100644
--- a/engine_capabilities.py
+++ b/engine_capabilities.py
@@ -73,6 +73,16 @@ def _camembert_model_path() -> Path:
     return _app_dir() / "models" / "camembert-bio-deid" / "onnx" / "model.onnx"
 
 
+def _eds_model_path() -> Path:
+    """Return the expected location of the bundled EDS-Pseudo model directory."""
+    return _app_dir() / "models" / "eds-pseudo-public"
+
+
+def _gliner_model_path() -> Path:
+    """Return the expected location of the bundled GLiNER model directory."""
+    return _app_dir() / "models" / "gliner_multi_pii-v1"
+
+
 def _probe_camembert() -> "tuple[bool, str]":
     if not _has_module("onnxruntime"):
         return False, "onnxruntime non embarqué dans cette version"
@@ -85,13 +95,17 @@ def _probe_eds() -> "tuple[bool, str]":
     missing = [m for m in ("edsnlp", "spacy") if not _has_module(m)]
     if missing:
         return False, "non embarqué dans cette version (manque : " + ", ".join(missing) + ")"
-    return True, "edsnlp + spacy disponibles"
+    if not _eds_model_path().is_dir():
+        return False, "dépendances disponibles, modèle AP-HP eds-pseudo-public non embarqué"
+    return True, "edsnlp + spacy + modèle AP-HP embarqués"
 
 
 def _probe_gliner() -> "tuple[bool, str]":
     if not _has_module("gliner"):
         return False, "non embarqué dans cette version (manque : gliner)"
-    return True, "gliner disponible"
+    if not _gliner_model_path().is_dir():
+        return False, "dépendance disponible, modèle GLiNER non embarqué"
+    return True, "gliner + modèle local embarqués"
 
 
 def _default_probes() -> Dict[str, Probe]:
diff --git a/gliner_manager.py b/gliner_manager.py
index bd52dae..04d4696 100644
--- a/gliner_manager.py
+++ b/gliner_manager.py
@@ -13,6 +13,8 @@ Version compatible : gliner==0.2.18 (pas plus récent, casse optimum-onnx)
 from __future__ import annotations
 
 import logging
+import sys
+from pathlib import Path
 from typing import Any, Dict, List, Optional
 
 log = logging.getLogger(__name__)
@@ -56,6 +58,20 @@ GLINER_LABEL_MAP: Dict[str, str] = {
 }
 
 DEFAULT_MODEL = "urchade/gliner_multi_pii-v1"
+BUNDLED_MODEL_DIR = "gliner_multi_pii-v1"
+
+
+def _app_dir() -> Path:
+    """Return the bundle root when frozen, otherwise this module's directory."""
+    if getattr(sys, "frozen", False):
+        return Path(getattr(sys, "_MEIPASS", Path(sys.executable).parent))
+    return Path(__file__).resolve().parent
+
+
+def _bundled_model_path() -> Optional[Path]:
+    """Return the bundled default model directory if it exists, else None."""
+    candidate = _app_dir() / "models" / BUNDLED_MODEL_DIR
+    return candidate if candidate.is_dir() else None
 
 
 class GlinerManager:
@@ -73,10 +89,16 @@ class GlinerManager:
         if not _GLINER_AVAILABLE:
             raise RuntimeError("gliner non disponible. Installez : pip install 'gliner==0.2.18'")
         self.unload()
-        self.model_id = model_id
-        self._model = GLiNER.from_pretrained(model_id)
+        # Prefer the bundled copy of the default model when one exists.
+        source = model_id
+        if model_id == DEFAULT_MODEL:
+            bundled = _bundled_model_path()
+            if bundled is not None:
+                source = str(bundled)
+        self.model_id = source
+        self._model = GLiNER.from_pretrained(source)
         self._loaded = True
-        log.info(f"GLiNER chargé: {model_id}")
+        log.info("GLiNER chargé: %s", source)
 
     def unload(self) -> None:
         self._model = None
diff --git a/tests/unit/test_engine_capabilities.py b/tests/unit/test_engine_capabilities.py
index 9cd5264..52f8068 100644
--- a/tests/unit/test_engine_capabilities.py
+++ b/tests/unit/test_engine_capabilities.py
@@ -64,3 +64,22 @@ def test_default_probes_run_without_crash_and_are_consistent():
     for cap in caps.values():
         assert isinstance(cap.available, bool)
         assert isinstance(cap.reason, str) and cap.reason
+
+
+def test_optional_engines_require_bundled_models(monkeypatch, tmp_path):
+    monkeypatch.setattr(ec, "_has_module", lambda name: name in {"onnxruntime", "edsnlp", "spacy", "gliner"})
+    monkeypatch.setattr(ec, "_app_dir", lambda: tmp_path)
+    (tmp_path / "models" / "camembert-bio-deid" / "onnx").mkdir(parents=True)
+    (tmp_path / "models" / "camembert-bio-deid" / "onnx" / "model.onnx").write_bytes(b"fake")
+
+    caps = ec.capabilities_map()
+    assert caps["eds"].available is False
+    assert "modèle" in caps["eds"].reason
+    assert caps["gliner"].available is False
+    assert "modèle" in caps["gliner"].reason
+
+    (tmp_path / "models" / "eds-pseudo-public").mkdir()
+    (tmp_path / "models" / "gliner_multi_pii-v1").mkdir()
+    caps = ec.capabilities_map()
+    assert caps["eds"].available is True
+    assert caps["gliner"].available is True