Initial commit — Pseudonymisation de PDF v5
- GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles
- Core ONNX : anonymisation regex + NER optionnel
- Extraction globale des noms depuis champs structurés (Patient, Rédigé par, MME/Madame, DR)
- Génération simultanée PDF Image + PDF Anonymisé (structure préservée)
- Build Windows via Nuitka (script batch + GitHub Actions CI)
- install.sh pour setup/run Linux

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
114
eds_pseudo_manager.py
Normal file
114
eds_pseudo_manager.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
EDS-Pseudo Manager — Interface compatible NerModelManager pour le modèle AP-HP eds-pseudo.
|
||||
--------------------------------------------------------------------------------------------
|
||||
Utilise edsnlp pour charger le pipeline eds-pseudo (F1=0.97 sur données cliniques AP-HP).
|
||||
Mapping des 13 labels EDS-Pseudo vers les clés PLACEHOLDERS du core d'anonymisation.
|
||||
|
||||
Dépendance : pip install 'edsnlp[ml]>=0.12.0'
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
try:
|
||||
import edsnlp
|
||||
_EDSNLP_AVAILABLE = True
|
||||
except ImportError:
|
||||
edsnlp = None # type: ignore
|
||||
_EDSNLP_AVAILABLE = False
|
||||
|
||||
# Translation table: EDS-Pseudo entity label -> PLACEHOLDERS key (anonymizer_core).
# Labels missing from this table have no placeholder equivalent.
EDS_LABEL_MAP: Dict[str, str] = {
    # Person names: last and first names share the same placeholder key.
    "NOM": "NOM",
    "PRENOM": "NOM",
    # Contact details and national identifier.
    "MAIL": "EMAIL",
    "TEL": "TEL",
    "SECU": "NIR",
    # Locations.
    "ADRESSE": "ADRESSE",
    "ZIP": "CODE_POSTAL",
    "VILLE": "VILLE",
    "HOPITAL": "ETAB",
    # Dates.
    "DATE": "DATE",
    "DATE_NAISSANCE": "DATE_NAISSANCE",
    # Hospital identifiers.
    "IPP": "IPP",
    "NDA": "NDA",
}
|
||||
|
||||
# Catalogue shown in the GUI: display name -> model identifier passed to the loader.
EDS_MODELS_CATALOG: Dict[str, str] = {"EDS-Pseudo AP-HP (edsnlp)": "AP-HP/eds-pseudo-public"}
|
||||
|
||||
|
||||
class EdsPseudoManager:
    """Manager for the EDS-Pseudo model (edsnlp), meant to expose the same interface as NerModelManager.

    The manager owns at most one loaded pipeline at a time. While nothing is
    loaded, ``model_id`` is ``None`` and ``infer_paragraphs`` returns empty
    results.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        """Create an empty manager; no model is loaded at construction time.

        Args:
            cache_dir: Optional directory, stored as a ``Path`` on the
                instance. No method of this class reads it.
        """
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.model_id: Optional[str] = None
        self._nlp = None
        self._loaded = False

    def is_loaded(self) -> bool:
        """Return True when a pipeline has been loaded and is still held."""
        return self._loaded and self._nlp is not None

    def load(self, model_id_or_path: str = "AP-HP/eds-pseudo-public") -> None:
        """Load an EDS-Pseudo pipeline, replacing any previously loaded one.

        Args:
            model_id_or_path: Either an existing local directory (passed to
                ``edsnlp.load`` as a ``Path``) or any other identifier
                (passed through unchanged as a string).

        Raises:
            RuntimeError: If the ``edsnlp`` package could not be imported.
        """
        if not _EDSNLP_AVAILABLE:
            raise RuntimeError("edsnlp non disponible. Installez : pip install 'edsnlp[ml]>=0.12.0'")

        # Drop the previous pipeline first so a failed load never leaves the
        # old model silently active.
        self.unload()

        local_dir = Path(model_id_or_path)
        source = local_dir if local_dir.is_dir() else model_id_or_path
        pipeline = edsnlp.load(source)

        # Commit state only once loading succeeded: if edsnlp.load raises, the
        # manager stays in the clean "unloaded" state (model_id is None).
        self._nlp = pipeline
        self.model_id = model_id_or_path
        self._loaded = True

    def unload(self) -> None:
        """Release the pipeline and reset the manager to its unloaded state."""
        self._nlp = None
        self._loaded = False
        self.model_id = None

    def models_catalog(self) -> Dict[str, str]:
        """Return a copy of the model catalogue (display name -> model id)."""
        return dict(EDS_MODELS_CATALOG)

    def infer_paragraphs(
        self,
        paragraphs: List[str],
        thresholds: Optional[Any] = None,
        max_length: int = 384,
        stride: int = 128,
    ) -> List[List[Dict[str, Any]]]:
        """Detect entities in each paragraph.

        Args:
            paragraphs: Texts to analyse. Blank or ``None`` items produce an
                empty entity list without calling the pipeline.
            thresholds: Accepted but not used by this implementation.
            max_length: Accepted but not used by this implementation.
            stride: Accepted but not used by this implementation.

        Returns:
            One list per input paragraph, in the same order. Each entity is a
            dict with the keys ``entity_group``, ``word``, ``start``, ``end``,
            ``score`` and ``eds_mapped_key``. Entities whose upper-cased label
            is absent from ``EDS_LABEL_MAP`` are dropped. When no model is
            loaded, every paragraph gets an empty list.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]

        results: List[List[Dict[str, Any]]] = []
        for para in paragraphs:
            # Keep one output slot per input paragraph, even for None or
            # whitespace-only entries, so callers can zip inputs and outputs.
            if not para or not para.strip():
                results.append([])
                continue

            doc = self._nlp(para)
            found: List[Dict[str, Any]] = []
            for ent in doc.ents:
                label = ent.label_.upper()
                mapped = EDS_LABEL_MAP.get(label)
                if mapped is None:
                    continue
                found.append({
                    "entity_group": label,
                    "word": ent.text,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    # No confidence score is read from the entity; a constant
                    # 1.0 is reported instead.
                    "score": 1.0,
                    "eds_mapped_key": mapped,
                })
            results.append(found)
        return results
|
||||
Reference in New Issue
Block a user