Chantier 1: Intégration BDPM (5737 médicaments officiels) dans medication whitelist Chantier 2: Safe patterns contextuels (dosages mg/mL/cpr, formes pharma, même ligne) Chantier 3: Scores de confiance NER réels (edsnlp 0.20 ner_confidence_score) Chantier 4: GLiNER zero-shot (urchade/gliner_multi_pii-v1) en vote croisé Chantier 5: Scripts export silver annotations + fine-tuning CamemBERT-bio 0 fuite, 0 régression, -18 FP supplémentaires éliminés. Sécurité: GLiNER ne peut rejeter que si confiance NER < 0.70. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
181 lines
6.0 KiB
Python
181 lines
6.0 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
GLiNER Manager — NER zero-shot pour validation croisée des entités.
|
|
-------------------------------------------------------------------
|
|
Utilise GLiNER (< 500M params, CPU) comme 3e signal NER en vote majoritaire
|
|
avec CamemBERT ONNX + EDS-Pseudo. Réduit les faux positifs : une entité
|
|
flaggée par 1 seul modèle sur 3 est supprimée.
|
|
|
|
Modèle : urchade/gliner_multi_pii-v1 (1.1 GB, ~95ms/inférence CPU)
|
|
Version compatible : gliner==0.2.18 (pas plus récent, casse optimum-onnx)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
log = logging.getLogger(__name__)

# gliner is an optional dependency: keep the module importable without it and
# record its availability so callers can fail with a clear message later.
try:
    from gliner import GLiNER
except ImportError:
    GLiNER = None  # type: ignore
    _GLINER_AVAILABLE = False
else:
    _GLINER_AVAILABLE = True

# Mapping from a GLiNER zero-shot label to the matching PLACEHOLDERS key.
GLINER_LABEL_MAP: Dict[str, str] = {
    "person_name": "NOM",
    "date_of_birth": "DATE_NAISSANCE",
    "phone_number": "TEL",
    "email_address": "EMAIL",
    "social_security_number": "NIR",
    "postal_address": "ADRESSE",
    "hospital": "ETAB",
    "city": "VILLE",
}

# Zero-shot labels used for PII detection in French clinical text.
# These are exactly the keys of GLINER_LABEL_MAP, in declaration order.
GLINER_PII_LABELS = list(GLINER_LABEL_MAP)

# Labels identifying medical terms (anti-PII: a span classified under one of
# these is treated as not being a name).
GLINER_SAFE_LABELS = [
    "medication",
    "medical_condition",
    "medical_procedure",
]

DEFAULT_MODEL = "urchade/gliner_multi_pii-v1"
|
|
|
|
|
|
class GlinerManager:
    """Wrap a GLiNER zero-shot NER model used to cross-check detected entities.

    The model is only loaded by an explicit call to :meth:`load`. While no
    model is loaded, the prediction helpers return a neutral result (empty
    list, ``None``, or the untouched input) instead of raising.
    """

    def __init__(self):
        self._model = None
        self._loaded = False
        # Identifier of the currently loaded model, None when nothing is loaded.
        self.model_id: Optional[str] = None

    def is_loaded(self) -> bool:
        """Return True when a model has been loaded and is still held."""
        return self._loaded and self._model is not None

    def load(self, model_id: str = DEFAULT_MODEL) -> None:
        """Load ``model_id``, replacing any previously loaded model.

        Raises:
            RuntimeError: if the ``gliner`` package could not be imported.
            Exception: whatever ``GLiNER.from_pretrained`` raises; in that
                case the manager is left in the unloaded state.
        """
        if not _GLINER_AVAILABLE:
            raise RuntimeError("gliner non disponible. Installez : pip install 'gliner==0.2.18'")
        # Drop the previous model first so two models are never held at once.
        self.unload()
        # Only publish model_id once loading succeeded, so that a failed load
        # does not leave model_id naming a model that is not actually loaded.
        model = GLiNER.from_pretrained(model_id)
        self._model = model
        self.model_id = model_id
        self._loaded = True
        log.info("GLiNER chargé: %s", model_id)

    def unload(self) -> None:
        """Release the model reference and reset the manager state."""
        self._model = None
        self._loaded = False
        self.model_id = None

    def predict(
        self,
        text: str,
        labels: Optional[List[str]] = None,
        threshold: float = 0.5,
    ) -> List[Dict[str, Any]]:
        """Predict entities in ``text``.

        Args:
            text: Text to analyse.
            labels: Zero-shot labels to look for; defaults to
                ``GLINER_PII_LABELS + GLINER_SAFE_LABELS``.
            threshold: Score threshold forwarded to the model.

        Returns:
            A list of dicts with the keys ``text``, ``label``, ``score``,
            ``start`` and ``end``. The list is empty when no model is loaded,
            when ``text`` is empty, or when the model raises (the error is
            logged as a warning).
        """
        if not self.is_loaded() or not text:
            # Nothing can be found in an empty text: skip the inference.
            return []
        if labels is None:
            labels = GLINER_PII_LABELS + GLINER_SAFE_LABELS
        try:
            raw_entities = self._model.predict_entities(text, labels, threshold=threshold)
            return [
                {
                    "text": ent["text"],
                    "label": ent["label"],
                    "score": ent["score"],
                    "start": ent["start"],
                    "end": ent["end"],
                }
                for ent in raw_entities
            ]
        except Exception as exc:
            # Deliberate best-effort: GLiNER is only an auxiliary signal, so a
            # failure here must not break the calling pipeline.
            log.warning("GLiNER predict error: %s", exc)
            return []

    def is_pii(self, text: str, entity_text: str, threshold: float = 0.5) -> Optional[str]:
        """Check whether GLiNER classifies ``entity_text`` as PII within ``text``.

        Predictions are compared to ``entity_text`` on their stripped,
        lower-cased text. The first prediction with the same text and a
        label that is either mapped or safe decides the result.

        Returns:
            The mapped PLACEHOLDERS key if the span is PII, ``None`` otherwise
            (classified as a medical term, not found, or no model loaded).
        """
        if not self.is_loaded():
            return None
        wanted = entity_text.strip().lower()
        for pred in self.predict(text, threshold=threshold):
            if pred["text"].strip().lower() != wanted:
                continue
            label = pred["label"]
            if label in GLINER_LABEL_MAP:
                return GLINER_LABEL_MAP[label]
            if label in GLINER_SAFE_LABELS:
                return None  # Explicitly classified as a medical term
        return None  # Not found -> no vote

    def validate_entities(
        self,
        text: str,
        eds_entities: List[Dict[str, Any]],
        threshold: float = 0.4,
    ) -> List[Dict[str, Any]]:
        """Cross-validate EDS-Pseudo entities against GLiNER predictions.

        Each entity dict is updated in place with a ``gliner_confirmed`` key:

        - ``True``: GLiNER also detects this span as PII
        - ``False``: GLiNER classifies this span as a medical term
          (medication / condition / procedure); this wins over ``True``
        - ``None``: GLiNER detects nothing for this span (neutral)

        Args:
            text: Full text the entities were extracted from.
            eds_entities: Entity dicts; the keys ``start``, ``end`` and
                ``word`` are read when present.
            threshold: Score threshold forwarded to :meth:`predict`.

        Returns:
            The same ``eds_entities`` list. It is returned untouched (no key
            added) when no model is loaded or the list is empty.
        """
        if not self.is_loaded() or not eds_entities:
            return eds_entities

        # Single GLiNER pass over the whole text, shared by all entities.
        gliner_preds = self.predict(
            text,
            labels=GLINER_PII_LABELS + GLINER_SAFE_LABELS,
            threshold=threshold,
        )

        for entity in eds_entities:
            ent_start = entity.get("start", -1)
            ent_end = entity.get("end", -1)
            # Treat an explicit None offset like a missing one (-1 never overlaps).
            if ent_start is None:
                ent_start = -1
            if ent_end is None:
                ent_end = -1
            ent_word = (entity.get("word") or "").lower()

            confirmed = None  # default: neutral
            for pred in gliner_preds:
                pred_text = pred["text"].lower()
                # Match either by character-span overlap or by text containment.
                overlap = (
                    (pred["start"] <= ent_start < pred["end"])
                    or (pred["start"] < ent_end <= pred["end"])
                    or (ent_start <= pred["start"] and ent_end >= pred["end"])
                )
                # An empty string is a substring of every string, so an entity
                # without "word" (or an empty prediction) would otherwise match
                # everything and receive a spurious vote.
                # NOTE(review): containment is position-independent, so a short
                # word can match an unrelated prediction elsewhere in the text.
                text_match = bool(ent_word and pred_text) and (
                    ent_word in pred_text or pred_text in ent_word
                )
                if not (overlap or text_match):
                    continue

                if pred["label"] in GLINER_SAFE_LABELS:
                    confirmed = False  # GLiNER says: medical term, not PII
                    break
                if pred["label"] in GLINER_LABEL_MAP:
                    confirmed = True  # GLiNER confirms: this is PII

            entity["gliner_confirmed"] = confirmed

        return eds_entities
|