Pipeline complet de traitement de documents médicaux PDF : - Extraction texte (pdfplumber) et classification (Trackare/CRH) - Anonymisation multi-couche (regex + NER CamemBERT + sweep) - Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les diagnostics, médicaments (codes ATC via Romedi) et négation, avec fallback regex pour les patterns spécifiques - Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
141 lines
3.4 KiB
Python
141 lines
3.4 KiB
Python
"""Pipeline edsnlp pour l'extraction médicale (CIM-10, médicaments, négation)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional
|
|
|
|
# Logger named after this module.
logger = logging.getLogger(__name__)

# Cached edsnlp pipeline; stays None until get_pipeline() has finished building it.
_nlp = None
# Cached outcome of the edsnlp import check; None means "not checked yet".
_available: Optional[bool] = None
|
|
|
|
|
|
@dataclass
class CIM10Entity:
    """A CIM-10 (ICD-10) diagnosis mention together with its qualifiers."""

    # Text of the matched mention.
    texte: str
    # CIM-10 code attached to the mention.
    code: str
    # True when the mention was flagged as negated.
    negation: bool = False
    # True when the mention was flagged as hypothetical.
    hypothese: bool = False
|
|
|
|
|
|
@dataclass
class DrugEntity:
    """A drug mention, optionally linked to an ATC code."""

    # Text of the matched mention.
    texte: str
    # ATC code attached to the mention, or None when there is none.
    code_atc: Optional[str] = None
    # True when the mention was flagged as negated.
    negation: bool = False
|
|
|
|
|
|
@dataclass
class DateEntity:
    """A date mention and, when one is available, its string value."""

    # Text of the matched mention.
    texte: str
    # String form of the parsed date, or None when no parsed value exists.
    value: Optional[str] = None
|
|
|
|
|
|
@dataclass
class EdsnlpResult:
    """Container for the entities gathered from one analysed text.

    Every list defaults to a fresh empty list, so instances never share
    their containers.
    """

    # CIM-10 (ICD-10) diagnosis mentions.
    cim10_entities: list[CIM10Entity] = field(default_factory=list)
    # Drug mentions.
    drug_entities: list[DrugEntity] = field(default_factory=list)
    # Date mentions.
    date_entities: list[DateEntity] = field(default_factory=list)
|
|
|
|
|
|
def is_available() -> bool:
    """Report whether the ``edsnlp`` package can be imported.

    The outcome of the first import attempt is stored in the module-level
    ``_available`` flag; later calls return that stored value without
    trying the import again.
    """
    global _available

    if _available is None:
        try:
            import edsnlp  # noqa: F401
        except ImportError:
            _available = False
        else:
            _available = True

    return _available
|
|
|
|
|
|
def get_pipeline():
    """Return the shared edsnlp pipeline, building it on the first call.

    The pipeline is stored in the module-level ``_nlp`` variable once all
    of its components have been added; later calls return that same object.

    Raises:
        RuntimeError: if ``is_available()`` reports that edsnlp cannot be
            imported.
    """
    global _nlp

    if _nlp is None:
        if not is_available():
            raise RuntimeError("edsnlp n'est pas installé")

        import edsnlp

        logger.info("Initialisation du pipeline edsnlp...")
        pipeline = edsnlp.blank("eds")

        # (component name, keyword arguments for add_pipe), listed in the
        # order in which the components are added.
        components = (
            ("eds.normalizer", {}),
            ("eds.sentences", {}),
            ("eds.cim10", {"config": {"attr": "NORM", "term_matcher": "simstring"}}),
            ("eds.drugs", {"config": {"attr": "NORM", "term_matcher": "exact"}}),
            ("eds.negation", {}),
            ("eds.hypothesis", {}),
            ("eds.dates", {}),
        )
        for component, options in components:
            pipeline.add_pipe(component, **options)

        # Cache only after every component was added, so a failure above
        # leaves ``_nlp`` unset.
        _nlp = pipeline
        logger.info("Pipeline edsnlp initialisé avec succès")

    return _nlp
|
|
|
|
|
|
def analyze(text: str) -> EdsnlpResult:
    """Run the edsnlp pipeline on a medical text and collect its entities.

    Returns an ``EdsnlpResult`` with the CIM-10 (ICD-10), drug and date
    entities that were found. The result is empty when ``is_available()``
    is false, or when obtaining or running the pipeline raises; in that
    case the exception is logged and not propagated.
    """
    found = EdsnlpResult()

    if not is_available():
        return found

    try:
        doc = get_pipeline()(text)
    except Exception:
        logger.exception("Erreur lors de l'analyse edsnlp")
        return found

    for entity in doc.ents:
        # A missing or falsy qualifier (e.g. None) is recorded as False.
        negated = getattr(entity._, "negation", False) or False
        hypothetical = getattr(entity._, "hypothesis", False) or False
        label = entity.label_

        if label == "cim10":
            icd_code = entity.kb_id_
            # CIM-10 matches without a code are dropped.
            if icd_code:
                diagnosis = CIM10Entity(
                    texte=entity.text,
                    code=icd_code,
                    negation=negated,
                    hypothese=hypothetical,
                )
                found.cim10_entities.append(diagnosis)
        elif label == "drug":
            # Drugs are kept even without a code; an empty code becomes None.
            atc_code = entity.kb_id_ or None
            drug = DrugEntity(texte=entity.text, code_atc=atc_code, negation=negated)
            found.drug_entities.append(drug)

    # Dates are read from the "dates" span group, not from doc.ents.
    for span in doc.spans.get("dates", []):
        parsed = getattr(span._, "date", None)
        rendered = None if parsed is None else str(parsed)
        found.date_entities.append(DateEntity(texte=span.text, value=rendered))

    return found
|
|
|
|
|
|
def reset():
    """Clear the cached pipeline and availability flag (useful in tests).

    After this call the next ``is_available()`` / ``get_pipeline()`` call
    recomputes its cached value.
    """
    global _nlp, _available
    _nlp = _available = None
|