Files
t2a_v2/src/medical/edsnlp_pipeline.py
dom 4a12cd2676 feat: pipeline T2A - anonymisation, extraction CIM-10 et intégration edsnlp
Pipeline complet de traitement de documents médicaux PDF :
- Extraction texte (pdfplumber) et classification (Trackare/CRH)
- Anonymisation multi-couche (regex + NER CamemBERT + sweep)
- Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les
  diagnostics, médicaments (codes ATC via Romedi) et négation,
  avec fallback regex pour les patterns spécifiques
- Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 15:24:12 +01:00

141 lines
3.4 KiB
Python

"""Pipeline edsnlp pour l'extraction médicale (CIM-10, médicaments, négation)."""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Optional
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Cached edsnlp pipeline; stays None until get_pipeline() builds it.
_nlp = None
# Cached outcome of the edsnlp import check: None means "not checked yet",
# otherwise the True/False value stored by is_available().
_available: Optional[bool] = None
@dataclass
class CIM10Entity:
    """CIM-10 (ICD-10) entity detected in a text.

    Attributes:
        texte: Text of the matched span.
        code: Code attached to the matched span.
        negation: True when the entity is flagged as negated.
        hypothese: True when the entity is flagged as hypothetical.
    """

    texte: str
    code: str
    negation: bool = False
    hypothese: bool = False
@dataclass
class DrugEntity:
    """Drug mention detected in a text.

    Attributes:
        texte: Text of the matched span.
        code_atc: ATC code attached to the span, or None when there is none.
        negation: True when the mention is flagged as negated.
    """

    texte: str
    code_atc: Optional[str] = None
    negation: bool = False
@dataclass
class DateEntity:
    """Date mention detected in a text.

    Attributes:
        texte: Text of the matched span.
        value: String form of the parsed date, or None when no parsed
            value is available.
    """

    texte: str
    value: Optional[str] = None
@dataclass
class EdsnlpResult:
    """Container for everything extracted from one text.

    Each list starts empty and is created per instance, so results never
    share state with one another.

    Attributes:
        cim10_entities: CIM-10 (ICD-10) entities found.
        drug_entities: Drug mentions found.
        date_entities: Date mentions found.
    """

    cim10_entities: list[CIM10Entity] = field(default_factory=list)
    drug_entities: list[DrugEntity] = field(default_factory=list)
    date_entities: list[DateEntity] = field(default_factory=list)
def is_available() -> bool:
    """Report whether edsnlp can be imported.

    The import is attempted only while the module-level ``_available`` flag
    is still None; its outcome is stored there and reused by later calls.

    Returns:
        True when ``import edsnlp`` succeeded, False when it raised
        ImportError.
    """
    global _available
    if _available is None:
        try:
            import edsnlp  # noqa: F401
        except ImportError:
            _available = False
        else:
            _available = True
    return _available
def get_pipeline():
    """Return the shared edsnlp pipeline, building it on first use.

    The pipeline is stored in the module-level ``_nlp`` variable once it has
    been fully assembled, and that same object is returned afterwards.

    Raises:
        RuntimeError: when edsnlp is not installed.
    """
    global _nlp
    if _nlp is None:
        if not is_available():
            raise RuntimeError("edsnlp n'est pas installé")
        import edsnlp

        logger.info("Initialisation du pipeline edsnlp...")
        pipeline = edsnlp.blank("eds")
        # (component name, config) pairs, registered in this exact order.
        # None means the component is added with its default configuration.
        components = (
            ("eds.normalizer", None),
            ("eds.sentences", None),
            ("eds.cim10", dict(attr="NORM", term_matcher="simstring")),
            ("eds.drugs", dict(attr="NORM", term_matcher="exact")),
            ("eds.negation", None),
            ("eds.hypothesis", None),
            ("eds.dates", None),
        )
        for component_name, component_config in components:
            if component_config is None:
                pipeline.add_pipe(component_name)
            else:
                pipeline.add_pipe(component_name, config=component_config)
        # Publish the pipeline only after every component was added.
        _nlp = pipeline
        logger.info("Pipeline edsnlp initialisé avec succès")
    return _nlp
def analyze(text: str) -> EdsnlpResult:
    """Run edsnlp on a medical text and collect the detected entities.

    Args:
        text: Text to analyse.

    Returns:
        An ``EdsnlpResult`` with the CIM-10 (ICD-10), drug and date entities
        found. The result is empty when edsnlp is not available, or when
        building or running the pipeline raises (the error is logged and not
        propagated).
    """
    result = EdsnlpResult()
    if not is_available():
        return result

    try:
        doc = get_pipeline()(text)
    except Exception:
        logger.exception("Erreur lors de l'analyse edsnlp")
        return result

    for entity in doc.ents:
        # Missing or None qualifiers are normalised to False.
        is_negated = getattr(entity._, "negation", False) or False
        is_hypothetical = getattr(entity._, "hypothesis", False) or False
        label = entity.label_

        if label == "cim10":
            kb_code = entity.kb_id_ or ""
            if not kb_code:
                # A CIM-10 match without a code is dropped.
                continue
            result.cim10_entities.append(
                CIM10Entity(
                    texte=entity.text,
                    code=kb_code,
                    negation=is_negated,
                    hypothese=is_hypothetical,
                )
            )
        elif label == "drug":
            # Drugs are kept even without a code; the code is then None.
            result.drug_entities.append(
                DrugEntity(
                    texte=entity.text,
                    code_atc=entity.kb_id_ or None,
                    negation=is_negated,
                )
            )

    # Dates are read from the "dates" span group, not from doc.ents.
    for date_span in doc.spans.get("dates", []):
        parsed = getattr(date_span._, "date", None)
        result.date_entities.append(
            DateEntity(
                texte=date_span.text,
                value=None if parsed is None else str(parsed),
            )
        )

    return result
def reset():
    """Clear the cached pipeline and availability flag (useful for tests).

    After this call the next ``is_available()`` / ``get_pipeline()`` call
    starts from scratch.
    """
    global _nlp, _available
    _nlp = _available = None