#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
EDS-Pseudo Manager — NerModelManager-compatible interface for the AP-HP eds-pseudo model.
--------------------------------------------------------------------------------------------
Uses edsnlp to load the eds-pseudo pipeline (F1=0.97 on AP-HP clinical data).
Maps the 13 EDS-Pseudo labels to the PLACEHOLDERS keys of the anonymization core.

Dependency: pip install 'edsnlp[ml]>=0.12.0'
"""
from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List, Optional

try:
    import edsnlp

    _EDSNLP_AVAILABLE = True
except ImportError:
    # Keep the module importable without edsnlp; load() reports the problem.
    edsnlp = None  # type: ignore
    _EDSNLP_AVAILABLE = False

# Mapping of EDS-Pseudo labels -> PLACEHOLDERS keys (anonymizer_core).
# Note that both NOM and PRENOM collapse onto the single "NOM" key.
EDS_LABEL_MAP: Dict[str, str] = {
    "NOM": "NOM",
    "PRENOM": "NOM",
    "MAIL": "EMAIL",
    "TEL": "TEL",
    "SECU": "NIR",
    "ADRESSE": "ADRESSE",
    "ZIP": "CODE_POSTAL",
    "VILLE": "VILLE",
    "HOPITAL": "ETAB",
    "DATE": "DATE",
    "DATE_NAISSANCE": "DATE_NAISSANCE",
    "IPP": "IPP",
    "NDA": "NDA",
}

# Catalogue displayed in the GUI: display name -> model identifier.
EDS_MODELS_CATALOG: Dict[str, str] = {
    "EDS-Pseudo AP-HP (edsnlp)": "AP-HP/eds-pseudo-public",
}


class EdsPseudoManager:
    """Manager for the EDS-Pseudo model (edsnlp).

    Intended to expose the same interface as ``NerModelManager`` (defined
    outside this module).
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        """Create an unloaded manager.

        Args:
            cache_dir: Optional directory, stored as a ``Path`` on the
                instance. No method of this class currently reads it.
        """
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.model_id: Optional[str] = None
        self._nlp: Optional[Any] = None
        self._loaded = False

    def is_loaded(self) -> bool:
        """Return True when a pipeline has been loaded and is still held."""
        return self._loaded and self._nlp is not None

    def load(self, model_id_or_path: str = "AP-HP/eds-pseudo-public") -> None:
        """Load an eds-pseudo pipeline, replacing any previously loaded one.

        Args:
            model_id_or_path: Either an existing local directory (e.g. a
                fine-tuned model) or an identifier handed to ``edsnlp.load``
                unchanged (e.g. a HuggingFace Hub repository id).

        Raises:
            RuntimeError: If edsnlp could not be imported.
            Exception: Whatever ``edsnlp.load`` raises is propagated; in that
                case the manager is left fully unloaded (``model_id`` is
                ``None``).
        """
        if not _EDSNLP_AVAILABLE:
            raise RuntimeError(
                "edsnlp non disponible. "
                "Installez : pip install 'edsnlp[ml]>=0.12.0'"
            )

        # Drop the previous pipeline first so that two models are never held
        # at the same time.
        self.unload()

        local_dir = Path(model_id_or_path)
        # A local directory is passed as a Path; anything else goes through
        # as given.
        source = local_dir if local_dir.is_dir() else model_id_or_path
        pipeline = edsnlp.load(source)

        # Commit the new state only once loading has succeeded, so a failed
        # load never leaves a stale model_id behind.
        self._nlp = pipeline
        self.model_id = model_id_or_path
        self._loaded = True

    def unload(self) -> None:
        """Release the pipeline and reset the manager to its unloaded state."""
        self._nlp = None
        self._loaded = False
        self.model_id = None

    def models_catalog(self) -> Dict[str, str]:
        """Return a copy of the GUI catalogue (display name -> model id)."""
        return dict(EDS_MODELS_CATALOG)

    def infer_paragraphs(
        self,
        paragraphs: List[str],
        thresholds: Optional[Any] = None,
        max_length: int = 384,
        stride: int = 128,
    ) -> List[List[Dict[str, Any]]]:
        """Detect entities in each paragraph.

        Args:
            paragraphs: Texts to analyse. ``None``, empty and
                whitespace-only items yield an empty entity list.
            thresholds: Unused by this implementation (presumably kept for
                interface compatibility).
            max_length: Unused by this implementation.
            stride: Unused by this implementation.

        Returns:
            One list per input paragraph, in the same order. Each entity is a
            dict with the keys ``entity_group``, ``word``, ``start``, ``end``,
            ``score`` and ``eds_mapped_key``. Entities whose upper-cased
            label is absent from ``EDS_LABEL_MAP`` are dropped. When no model
            is loaded, every paragraph gets an empty list.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]

        results: List[List[Dict[str, Any]]] = []
        for para in paragraphs:
            # Skip None/blank paragraphs but keep the output aligned with
            # the input.
            if not para or not para.strip():
                results.append([])
                continue

            doc = self._nlp(para)
            entities: List[Dict[str, Any]] = []
            for ent in doc.ents:
                label = ent.label_.upper()
                mapped = EDS_LABEL_MAP.get(label)
                if mapped is None:
                    continue
                entities.append({
                    "entity_group": label,
                    "word": ent.text,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    # Constant placeholder: edsnlp does not provide a
                    # confidence score.
                    "score": 1.0,
                    "eds_mapped_key": mapped,
                })
            results.append(entities)
        return results