feat(phase2): Multi-signal NER — BDPM gazetteers, confiance EDS, safe patterns, GLiNER
Chantier 1 : Intégration BDPM (5737 médicaments officiels) dans la whitelist des médicaments
Chantier 2 : Safe patterns contextuels (dosages mg/mL/cpr, formes pharmaceutiques, même ligne)
Chantier 3 : Scores de confiance NER réels (edsnlp 0.20, ner_confidence_score)
Chantier 4 : GLiNER zero-shot (urchade/gliner_multi_pii-v1) en vote croisé
Chantier 5 : Scripts d'export des annotations silver + fine-tuning CamemBERT-bio

0 fuite, 0 régression, 18 FP supplémentaires éliminés.
Sécurité : GLiNER ne peut rejeter une entité que si la confiance NER est < 0.70.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
180
gliner_manager.py
Normal file
180
gliner_manager.py
Normal file
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
GLiNER Manager — NER zero-shot pour validation croisée des entités.
|
||||
-------------------------------------------------------------------
|
||||
Utilise GLiNER (< 500M params, CPU) comme 3e signal NER en vote majoritaire
|
||||
avec CamemBERT ONNX + EDS-Pseudo. Réduit les faux positifs : une entité
|
||||
flaggée par 1 seul modèle sur 3 est supprimée.
|
||||
|
||||
Modèle : urchade/gliner_multi_pii-v1 (1.1 GB, ~95ms/inférence CPU)
|
||||
Version compatible : gliner==0.2.18 (pas plus récent, casse optimum-onnx)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# gliner is an optional dependency: try to import it and record whether the
# import succeeded. When it is missing, GLiNER is bound to None so the name
# still exists at module level.
try:
    from gliner import GLiNER
except ImportError:
    GLiNER = None  # type: ignore
    _GLINER_AVAILABLE = False
else:
    _GLINER_AVAILABLE = True
|
||||
|
||||
# Zero-shot labels used to detect PII in French clinical text.
GLINER_PII_LABELS = [
    "person_name",
    "date_of_birth",
    "phone_number",
    "email_address",
    "social_security_number",
    "postal_address",
    "hospital",
    "city",
]

# Labels identifying medical terms (anti-PII: a span classified here is
# treated as "not a name").
GLINER_SAFE_LABELS = [
    "medication",
    "medical_condition",
    "medical_procedure",
]

# Mapping from GLiNER label to the PLACEHOLDERS key.
GLINER_LABEL_MAP: Dict[str, str] = {
    "person_name": "NOM",
    "date_of_birth": "DATE_NAISSANCE",
    "phone_number": "TEL",
    "email_address": "EMAIL",
    "social_security_number": "NIR",
    "postal_address": "ADRESSE",
    "hospital": "ETAB",
    "city": "VILLE",
}

DEFAULT_MODEL = "urchade/gliner_multi_pii-v1"


class GlinerManager:
    """Manage a GLiNER zero-shot NER model used to cross-check entities."""

    def __init__(self):
        self._model = None
        self._loaded = False
        self.model_id: Optional[str] = None

    def is_loaded(self) -> bool:
        """Return True when a model has been loaded and is still held."""
        return self._loaded and self._model is not None

    def load(self, model_id: str = DEFAULT_MODEL) -> None:
        """Load the GLiNER model identified by *model_id*.

        Any previously held model is dropped first. The manager state
        (``model_id``, loaded flag) is only updated once the new model has
        been created, so a failed load leaves the manager cleanly unloaded.

        Raises:
            RuntimeError: if the ``gliner`` package could not be imported.
        """
        if not _GLINER_AVAILABLE:
            raise RuntimeError("gliner non disponible. Installez : pip install 'gliner==0.2.18'")
        # Release the reference to the previous model before creating a new one.
        self.unload()
        model = GLiNER.from_pretrained(model_id)
        self._model = model
        self.model_id = model_id
        self._loaded = True
        log.info("GLiNER chargé: %s", model_id)

    def unload(self) -> None:
        """Drop the held model and reset the manager to its initial state."""
        self._model = None
        self._loaded = False
        self.model_id = None

    def predict(
        self,
        text: str,
        labels: Optional[List[str]] = None,
        threshold: float = 0.5,
    ) -> List[Dict[str, Any]]:
        """Predict entities in *text*.

        Args:
            text: Text to analyse.
            labels: Labels to query; defaults to the PII labels followed by
                the medical ("safe") labels.
            threshold: Score threshold forwarded to the model.

        Returns:
            A list of dicts with keys ``text``, ``label``, ``score``,
            ``start`` and ``end``. The list is empty when no model is
            loaded, when *text* is empty, or when prediction fails.
        """
        if not self.is_loaded() or not text:
            return []
        if labels is None:
            labels = GLINER_PII_LABELS + GLINER_SAFE_LABELS
        try:
            raw_entities = self._model.predict_entities(text, labels, threshold=threshold)
            return [
                {
                    "text": ent["text"],
                    "label": ent["label"],
                    "score": ent["score"],
                    "start": ent["start"],
                    "end": ent["end"],
                }
                for ent in raw_entities
            ]
        except Exception as exc:
            # Best effort: a failing model means "no vote", never a crash.
            log.warning("GLiNER predict error: %s", exc)
            return []

    def is_pii(self, text: str, entity_text: str, threshold: float = 0.5) -> Optional[str]:
        """Check whether *entity_text* is reported as PII by GLiNER in *text*.

        The comparison with GLiNER's predicted text is case-insensitive and
        ignores surrounding whitespace.

        Returns:
            The mapped PLACEHOLDERS key if the first matching prediction
            carries a PII label; None if it carries a medical label, if
            nothing matches, or if no model is loaded.
        """
        if not self.is_loaded():
            return None
        wanted = (entity_text or "").strip().lower()
        if not wanted:
            return None
        for pred in self.predict(text, threshold=threshold):
            if pred["text"].strip().lower() != wanted:
                continue
            label = pred["label"]
            if label in GLINER_LABEL_MAP:
                return GLINER_LABEL_MAP[label]
            if label in GLINER_SAFE_LABELS:
                return None  # Explicitly classified as a medical term
        return None  # Not found: no vote

    @staticmethod
    def _cross_check(entity: Dict[str, Any], gliner_preds: List[Dict[str, Any]]) -> Optional[bool]:
        """Return GLiNER's vote (True / False / None) for a single entity."""
        start = entity.get("start")
        end = entity.get("end")
        # Offsets may be absent or None; only a non-empty span can overlap.
        has_span = start is not None and end is not None and 0 <= start < end
        word = (entity.get("word") or "").strip().lower()

        vote: Optional[bool] = None
        for pred in gliner_preds:
            overlaps = has_span and start < pred["end"] and pred["start"] < end
            # Exact text equality only: substring matching would let an
            # unrelated mention elsewhere in the document (or an empty
            # word) decide the vote.
            same_text = bool(word) and pred["text"].strip().lower() == word
            if not (overlaps or same_text):
                continue
            if pred["label"] in GLINER_SAFE_LABELS:
                return False  # GLiNER says: medical term, not PII
            if pred["label"] in GLINER_LABEL_MAP:
                vote = True  # GLiNER confirms PII; a later medical match still wins
        return vote

    def validate_entities(
        self,
        text: str,
        eds_entities: List[Dict[str, Any]],
        threshold: float = 0.4,
    ) -> List[Dict[str, Any]]:
        """Cross-check EDS-Pseudo entities against GLiNER predictions.

        GLiNER is run once on the whole *text*. Each entity dict is then
        updated in place with a ``gliner_confirmed`` field:

        - True: a matching GLiNER prediction carries a PII label
        - False: a matching GLiNER prediction carries a medical label
          (medication / condition / procedure); this wins over a PII match
        - None: no matching GLiNER prediction (neutral)

        A prediction matches an entity when their ``start``/``end`` spans
        overlap, or when their texts are identical (case-insensitive,
        surrounding whitespace ignored). Entities are read through the
        ``word``, ``start`` and ``end`` keys; the spans are assumed to be
        offsets into the same *text* (TODO confirm against the caller).

        When no model is loaded or *eds_entities* is empty, the input is
        returned untouched and no ``gliner_confirmed`` field is added.

        Returns:
            The same *eds_entities* list object.
        """
        if not self.is_loaded() or not eds_entities:
            return eds_entities

        all_labels = GLINER_PII_LABELS + GLINER_SAFE_LABELS
        gliner_preds = self.predict(text, labels=all_labels, threshold=threshold)

        for entity in eds_entities:
            entity["gliner_confirmed"] = self._cross_check(entity, gliner_preds)

        return eds_entities
|
||||
Reference in New Issue
Block a user