feat: pipeline T2A - anonymisation, extraction CIM-10 et intégration edsnlp
Pipeline complet de traitement de documents médicaux PDF : - Extraction texte (pdfplumber) et classification (Trackare/CRH) - Anonymisation multi-couche (regex + NER CamemBERT + sweep) - Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les diagnostics, médicaments (codes ATC via Romedi) et négation, avec fallback regex pour les patterns spécifiques - Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
0
src/medical/__init__.py
Normal file
0
src/medical/__init__.py
Normal file
606
src/medical/cim10_extractor.py
Normal file
606
src/medical/cim10_extractor.py
Normal file
@@ -0,0 +1,606 @@
|
||||
"""Extraction d'informations médicales structurées pour le codage CIM-10."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from ..config import (
|
||||
ActeCCAM,
|
||||
BiologieCle,
|
||||
Diagnostic,
|
||||
DossierMedical,
|
||||
Imagerie,
|
||||
Sejour,
|
||||
Traitement,
|
||||
)
|
||||
|
||||
try:
|
||||
from .edsnlp_pipeline import EdsnlpResult
|
||||
except ImportError:
|
||||
EdsnlpResult = None # type: ignore[assignment,misc]
|
||||
|
||||
# Frequent diagnosis wordings (lower-case) -> suggested CIM-10 code.
# Insertion order is significant: the lookups in this module scan the mapping
# in order and keep the first substring hit, so specific wordings must stay
# ahead of the generic ones they contain.
CIM10_MAP: dict[str, str] = {
    # Pancreatitis
    "pancréatite aiguë biliaire": "K85.1",
    "pancréatite aigue biliaire": "K85.1",
    "pancréatite aiguë lithiasique": "K85.1",
    "pancréatite aigue lithiasique": "K85.1",
    "pancréatite aiguë": "K85.9",
    "pancréatite aigue": "K85.9",
    "pancréatite": "K85.9",
    # Biliary stones
    "lithiase cholédoque": "K80.5",
    "lithiase du cholédoque": "K80.5",
    "calcul des canaux biliaires": "K80.5",
    "lithiase vésiculaire": "K80.2",
    "lithiases vésiculaires": "K80.2",
    "vésicule lithiasique": "K80.2",
    "colique hépatique": "K80.2",
    # Cholecystitis / cholangitis
    "cholécystite aiguë": "K81.0",
    "cholecystite aigue": "K81.0",
    "angiocholite": "K83.0",
    # Obesity
    "obésité": "E66.0",
    "obesite": "E66.0",
    "surpoids": "E66.0",
    # Drug reactions
    "éruption médicamenteuse": "L27.0",
    "eruption medicamenteuse": "L27.0",
    "éruption cutanée médicamenteuse": "L27.0",
    "toxidermie": "L27.0",
    "réaction au tramadol": "L27.0",
    "allergie médicamenteuse": "T88.7",
    # Pain
    "douleur abdominale": "R10.4",
    "douleur hypochondre droit": "R10.1",
    # Jaundice
    "ictère": "R17",
    "jaunisse": "R17",
    # Arterial hypertension
    "hypertension artérielle": "I10",
    "hta": "I10",
    # Diabetes
    "diabète type 2": "E11.9",
    "diabète de type 2": "E11.9",
    "diabète type 1": "E10.9",
}
|
||||
|
||||
# Procedure wordings (lower-case) -> suggested CCAM code.
CCAM_MAP: dict[str, str] = {
    # Cholecystectomy
    "cholécystectomie": "HMFC004",
    "cholecystectomie": "HMFC004",
    "cholécystectomie par cœlioscopie": "HMFC004",
    "cholecystectomie par coelioscopie": "HMFC004",
    # Biliary tract exploration
    "cholangiographie": "HHHE002",
    "cholangiographie peropératoire": "HHHE002",
    "cpre": "HHHE002",
    "sphinctérotomie endoscopique": "HHHE003",
    # Abdominal imaging
    "scanner abdominal": "ZCQK002",
    "tdm abdominal": "ZCQK002",
    "échographie abdominale": "ZCQJ001",
    "echo abdominale": "ZCQJ001",
    "irm abdominale": "ZCQN001",
}
|
||||
|
||||
|
||||
def extract_medical_info(
    parsed_data: dict,
    anonymized_text: str,
    edsnlp_result: Optional[EdsnlpResult] = None,
) -> DossierMedical:
    """Build a ``DossierMedical`` from parsed fields and anonymised free text.

    Args:
        parsed_data: Structured parser output; its ``"type"`` entry becomes
            the dossier's ``document_type`` (empty string when absent).
        anonymized_text: Anonymised document text scanned by the extractors.
        edsnlp_result: Optional edsnlp analysis, forwarded to the diagnosis,
            treatment and complication extractors.

    Returns:
        The dossier filled in by each extraction step, run in a fixed order.
    """
    dossier = DossierMedical()
    dossier.document_type = parsed_data.get("type", "")

    # Each step mutates ``dossier`` in place; the call order is kept as-is.
    _extract_sejour(parsed_data, dossier)
    _extract_diagnostics(
        parsed_data, anonymized_text, dossier, edsnlp_result=edsnlp_result
    )
    _extract_actes(anonymized_text, dossier)
    _extract_antecedents(anonymized_text, dossier)
    _extract_traitements(
        parsed_data, anonymized_text, dossier, edsnlp_result=edsnlp_result
    )
    _extract_biologie(anonymized_text, dossier)
    _extract_imagerie(anonymized_text, dossier)
    _extract_complications(anonymized_text, dossier, edsnlp_result=edsnlp_result)

    return dossier
|
||||
|
||||
|
||||
def _extract_sejour(parsed: dict, dossier: DossierMedical) -> None:
    """Fill ``dossier.sejour`` with stay data, age, length of stay and vitals.

    Reads the ``"patient"``, ``"sejour"``, ``"urgences"`` and
    ``"signes_vitaux"`` entries of ``parsed``. Dates are expected as
    ``DD/MM/YYYY``; when one cannot be parsed the derived field is left
    untouched.
    """
    date_format = "%d/%m/%Y"
    patient = parsed.get("patient", {})
    stay = parsed.get("sejour", {})
    admitted_on = stay.get("date_entree")
    discharged_on = stay.get("date_sortie")

    dossier.sejour = Sejour(
        sexe=patient.get("sexe"),
        date_entree=admitted_on,
        date_sortie=discharged_on,
        mode_entree=parsed.get("urgences", {}).get("mode_entree"),
    )

    # Age in completed years on the admission date.
    born_on = patient.get("date_naissance")
    if born_on and admitted_on:
        try:
            birth = datetime.strptime(born_on, date_format)
            admission = datetime.strptime(admitted_on, date_format)
            birthday_pending = (admission.month, admission.day) < (birth.month, birth.day)
            dossier.sejour.age = admission.year - birth.year - (1 if birthday_pending else 0)
        except ValueError:
            pass

    # Length of stay in days.
    if admitted_on and discharged_on:
        try:
            start = datetime.strptime(admitted_on, date_format)
            end = datetime.strptime(discharged_on, date_format)
            dossier.sejour.duree_sejour = (end - start).days
        except ValueError:
            pass

    # BMI, weight, height: vital signs win, patient-level data is the fallback.
    vitals = parsed.get("signes_vitaux", {})
    for attribute, key in (("imc", "imc"), ("poids", "poids_kg"), ("taille", "taille_cm")):
        measured = vitals.get(key) or patient.get(key)
        if measured:
            setattr(dossier.sejour, attribute, measured)
|
||||
|
||||
|
||||
def _extract_diagnostics(
    parsed: dict,
    text: str,
    dossier: DossierMedical,
    edsnlp_result: Optional[EdsnlpResult] = None,
) -> None:
    """Fill the principal diagnosis (DP) and associated diagnoses (DAS).

    Sources, in decreasing priority:
      1. diagnoses already coded in ``parsed["diagnostics"]``;
      2. keyword/regex rules over the text and its "Au total" conclusion;
      3. affirmed (neither negated nor hypothetical) edsnlp CIM-10 entities.

    Args:
        parsed: Structured parser output.
        text: Anonymised document text.
        dossier: Dossier updated in place.
        edsnlp_result: Optional edsnlp analysis used as enrichment.
    """
    text_lower = text.lower()

    # Pre-coded diagnoses take precedence over anything found in free text.
    for diag in parsed.get("diagnostics", []):
        coded = Diagnostic(
            texte=diag.get("libelle", ""),
            cim10_suggestion=diag.get("code_cim10"),
        )
        # ``or ""`` also covers an explicit None stored under "type".
        if (diag.get("type") or "").lower() == "principal":
            dossier.diagnostic_principal = coded
        else:
            dossier.diagnostics_associes.append(coded)

    # Conclusion block introduced by "Au total". It ends at the next known
    # section header or at the end of the text; the ``\Z`` branch lets a
    # conclusion on the very last line match without a trailing newline.
    conclusion = ""
    m = re.search(
        r"Au total\s*[::]?\s*(.*?)(?=\n\s*(?:Devenir|TTT|Sortie)|\s*\Z)",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if m:
        conclusion = m.group(1).strip()

    # Affirmed edsnlp codes, keyed by code so the first code seen stays first.
    edsnlp_codes: dict[str, str] = {}
    if edsnlp_result:
        for ent in edsnlp_result.cim10_entities:
            if not ent.negation and not ent.hypothese:
                edsnlp_codes[ent.code] = ent.texte

    # No coded DP: try the regex rules first, then the first edsnlp code.
    if not dossier.diagnostic_principal:
        dp = _find_diagnostic_principal(text_lower, conclusion)
        if dp:
            dossier.diagnostic_principal = dp
        elif edsnlp_codes:
            code, texte = next(iter(edsnlp_codes.items()))
            dossier.diagnostic_principal = Diagnostic(
                texte=texte.capitalize(), cim10_suggestion=code,
            )

    # Regex-derived associated diagnoses.
    dossier.diagnostics_associes.extend(
        _find_diagnostics_associes(text_lower, conclusion, dossier)
    )

    # edsnlp-derived associated diagnoses, skipping codes already present.
    if edsnlp_result:
        existing_codes = {d.cim10_suggestion for d in dossier.diagnostics_associes}
        if dossier.diagnostic_principal:
            existing_codes.add(dossier.diagnostic_principal.cim10_suggestion)

        for ent in edsnlp_result.cim10_entities:
            if ent.negation or ent.hypothese:
                continue
            if ent.code not in existing_codes:
                dossier.diagnostics_associes.append(Diagnostic(
                    texte=ent.texte.capitalize(),
                    cim10_suggestion=ent.code,
                ))
                existing_codes.add(ent.code)
|
||||
|
||||
|
||||
def _find_diagnostic_principal(text_lower: str, conclusion: str) -> Diagnostic | None:
    """Return the principal diagnosis found in the text, or ``None``.

    The conclusion is scanned first for any ``CIM10_MAP`` wording (first hit
    in mapping order wins). Failing that, acute-pancreatitis patterns are
    searched in the whole lower-cased text, most specific first.

    Args:
        text_lower: Full document text, already lower-cased.
        conclusion: Conclusion excerpt (any case); may be empty.
    """
    conclusion_lower = conclusion.lower()

    for terme, code in CIM10_MAP.items():
        if terme in conclusion_lower:
            return Diagnostic(texte=terme.capitalize(), cim10_suggestion=code)

    dp_patterns = [
        r"pancréatite\s+aigu[eë]\s+(?:d'origine\s+)?lithiasique",
        r"pancréatite\s+aigu[eë]\s+biliaire",
        r"pancréatite\s+aigu[eë]",
    ]
    for pat in dp_patterns:
        m = re.search(pat, text_lower)
        if m is None:
            continue
        # ``\s+`` may have matched line breaks or repeated spaces; collapse
        # them so both the label and the lookup see single spaces.
        matched = " ".join(m.group(0).split())
        # CIM10_MAP has no "d'origine" variant: drop the filler so that
        # "pancréatite aiguë d'origine lithiasique" resolves to the specific
        # lithiasic code instead of the generic acute-pancreatitis one.
        lookup_key = matched.replace("d'origine ", "")
        return Diagnostic(
            texte=matched.capitalize(),
            cim10_suggestion=_lookup_cim10(lookup_key),
        )

    return None
|
||||
|
||||
|
||||
def _find_diagnostics_associes(
    text_lower: str, conclusion: str, dossier: DossierMedical
) -> list[Diagnostic]:
    """Return rule-based associated diagnoses not already held by the dossier.

    Args:
        text_lower: Full document text, already lower-cased.
        conclusion: Conclusion excerpt; accepted for interface compatibility
            and not used by the current rules.
        dossier: Read only, to skip codes already carried by its principal
            or associated diagnoses.

    Returns:
        New ``Diagnostic`` objects, at most one per CIM-10 code, in rule order.
    """
    das: list[Diagnostic] = []
    existing_codes = {d.cim10_suggestion for d in dossier.diagnostics_associes}
    if dossier.diagnostic_principal:
        existing_codes.add(dossier.diagnostic_principal.cim10_suggestion)

    def add_once(texte: str, code: str) -> None:
        # Record a diagnosis unless its code is already known.
        if code not in existing_codes:
            das.append(Diagnostic(texte=texte, cim10_suggestion=code))
            existing_codes.add(code)

    # Common bile duct stone
    if re.search(r"lithiase\s+(?:du\s+)?(?:bas\s+)?cholédoque", text_lower):
        add_once("Lithiase du cholédoque", "K80.5")

    # Drug eruption
    if re.search(
        r"éruption\s+cutanée|eruption\s+cutanée|toxidermie|réaction\s+au\s+tramadol",
        text_lower,
    ):
        add_once("Éruption cutanée médicamenteuse", "L27.0")

    # Obesity (BMI >= 30). Decimals are optional so "imc 32" is recognised;
    # ``(?!\d)`` keeps longer digit runs (e.g. a year) from being truncated
    # into a plausible BMI.
    imc = re.search(r"imc\s*[:=]?\s*(\d{2,3}(?:[.,]\d+)?)(?!\d)", text_lower)
    if imc:
        imc_val = float(imc.group(1).replace(",", "."))
        if imc_val >= 30:
            add_once(f"Obésité (IMC {imc_val})", "E66.0")

    # Gallbladder stones
    if re.search(r"vésicule\s+lithiasique|lithiases?\s+vésiculaire", text_lower):
        add_once("Lithiase vésiculaire", "K80.2")

    return das
|
||||
|
||||
|
||||
def _extract_actes(text: str, dossier: DossierMedical) -> None:
    """Append the CCAM procedures mentioned in ``text`` to ``dossier.actes_ccam``.

    Detection is keyword-based on the lower-cased text; each act gets the
    date returned by ``_find_act_date`` (possibly ``None``).
    """
    text_lower = text.lower()

    chole = r"chol[ée]cystectomie"
    # Accepts "coelioscopie", the ligature spelling "cœlioscopie" (which the
    # previous ``c[oœ][ea]lioscopie`` could not match) and "c[ée]lioscopie".
    laparoscopy = r"c(?:[oœ][ea]|œ|[eé])lioscopie"

    # Cholecystectomy, labelled as laparoscopic when the text says so.
    if re.search(chole, text_lower):
        by_laparoscopy = re.search(rf"{chole}\s+par\s+{laparoscopy}", text_lower)
        dossier.actes_ccam.append(ActeCCAM(
            texte="Cholécystectomie par cœlioscopie" if by_laparoscopy else "Cholécystectomie",
            code_ccam_suggestion="HMFC004",
            date=_find_act_date(text, chole),
        ))

    # Remaining acts: one pattern serves both detection and date lookup, so
    # a "tomodensitométrie" mention can now be dated too.
    keyword_acts = (
        (r"cholangiographie", "Cholangiographie peropératoire", "HHHE002"),
        (r"(?:tdm|scanner|tomodensitométrie)", "TDM abdominal", "ZCQK002"),
    )
    for pattern, texte, code in keyword_acts:
        if re.search(pattern, text_lower):
            dossier.actes_ccam.append(ActeCCAM(
                texte=texte,
                code_ccam_suggestion=code,
                date=_find_act_date(text, pattern),
            ))
|
||||
|
||||
|
||||
def _extract_antecedents(text: str, dossier: DossierMedical) -> None:
    """Append the lines of the "Antécédents" section to ``dossier.antecedents``.

    The section runs from the "Antécédent(s)" header to the next recognised
    header. Lines are stripped of bullet characters, then dropped when they
    are short, start with a digit or contain a known noise marker.
    """
    section = re.search(
        r"Antécédents?\s*[::]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[::]|Allergie|Histoire de la maladie|Examen clinique|\n\n))",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if section is None:
        return

    # Case-sensitive fragments marking vitals-table or boilerplate lines.
    noise_markers = (
        "Item de",
        "Température",
        "Signes Vitaux",
        "Pouls",
        "Type de note",
        "Aucune donnée",
        "renseignée",
        "habitudes de vie",
        "Systolique",
        "Diastolique",
        "Saturation",
    )

    for raw in section.group(1).strip().split("\n"):
        entry = raw.strip().lstrip("- •")
        if len(entry) <= 5:
            continue
        if re.match(r"^\d", entry):
            continue
        if "surveillance" in entry.lower():
            continue
        if any(marker in entry for marker in noise_markers):
            continue
        dossier.antecedents.append(entry)
|
||||
|
||||
|
||||
def _extract_traitements(
    parsed: dict,
    text: str,
    dossier: DossierMedical,
    edsnlp_result: Optional[EdsnlpResult] = None,
) -> None:
    """Append discharge treatments to ``dossier.traitements_sortie``.

    Primary source is the "TTT/Traitement de sortie" section (first ten
    lines); when it yields nothing, lines tagged "Presc. de Sortie" are used.
    ATC codes come from affirmed edsnlp drug entities when available.

    Args:
        parsed: Structured parser output (not read by this function).
        text: Anonymised document text.
        dossier: Dossier updated in place.
        edsnlp_result: Optional edsnlp analysis providing drug/ATC pairs.
    """
    # Lower-cased drug text -> ATC code, affirmed entities with a code only.
    atc_by_name: dict[str, str] = {}
    if edsnlp_result:
        atc_by_name = {
            drug.texte.lower(): drug.code_atc
            for drug in edsnlp_result.drug_entities
            if not drug.negation and drug.code_atc
        }

    section = re.search(
        r"(?:TTT|Traitement)\s+de\s+sortie\s*[::]?\s*\n?(.*?)(?=\n\s*(?:Devenir|Rédigé|Cordialement|Patient:|Episode|Le \d{2}/\d{2}|\n\n)|$)",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if section:
        # Only the first ten lines of the section are considered.
        for raw in section.group(1).strip().split("\n")[:10]:
            entry = raw.strip().lstrip("- •")
            if not entry or len(entry) <= 2:
                continue
            # A page footer or header marks the end of the medication list.
            if re.match(r"^(Patient|Episode|Le \d|Page|V\d)", entry):
                break
            # Split "<drug> <posology>" at the first posology cue.
            cue = re.search(
                r"\s+(si besoin|matin|soir|midi|\d+\s*(?:mg|cp|gel).*)",
                entry,
                re.IGNORECASE,
            )
            if cue:
                name = entry[:cue.start()].strip()
                posology = cue.group(1).strip()
            else:
                name, posology = entry, None
            dossier.traitements_sortie.append(Traitement(
                medicament=name,
                posologie=posology,
                code_atc=_match_drug_atc(name, atc_by_name),
            ))

    if dossier.traitements_sortie:
        return

    # Fallback: prescriptions tagged "Presc. de Sortie".
    # NOTE(review): the lazy name group can stop after two characters when
    # the tag sits on the same line as the drug name, in which case the
    # ``len > 3`` filter discards the hit — confirm against real exports.
    for hit in re.finditer(
        r"([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂ0-9\s\-/%.]+?)(?:\s+\d+\s*(?:mg|G|CPR|GEL))?.*?Presc\.\s*de\s*Sortie",
        text,
    ):
        name = hit.group(1).strip()
        if len(name) > 3:
            dossier.traitements_sortie.append(Traitement(
                medicament=name, code_atc=_match_drug_atc(name, atc_by_name),
            ))
|
||||
|
||||
|
||||
def _match_drug_atc(med_name: str, drug_atc: dict[str, str]) -> Optional[str]:
    """Return the ATC code matching ``med_name`` among edsnlp drug mentions.

    Args:
        med_name: Drug name as written in the document.
        drug_atc: Lower-cased drug text -> ATC code.

    Returns:
        The code of an exact (case-insensitive) match, else the code of the
        first entry whose text contains, or is contained in, the name; else
        ``None``. Empty names and empty keys never match.
    """
    if not drug_atc or not med_name:
        return None
    med_lower = med_name.lower().strip()
    # An empty string is a substring of everything: without this guard a
    # blank name would pick up the first ATC code in the mapping.
    if not med_lower:
        return None

    # Exact match
    if med_lower in drug_atc:
        return drug_atc[med_lower]

    # Partial match, in either direction
    for drug_text, atc in drug_atc.items():
        if not drug_text:
            continue  # same empty-substring pitfall on the key side
        if drug_text in med_lower or med_lower in drug_text:
            return atc
    return None
|
||||
|
||||
|
||||
def _extract_biologie(text: str, dossier: DossierMedical) -> None:
    """Append key lab results found in ``text`` to ``dossier.biologie_cle``.

    Only the first occurrence of each test is kept. The captured value is
    stored as written and flagged through ``_is_abnormal``.
    """
    number = r"(\d+(?:[.,]\d+)?)"
    bio_patterns = [
        (r"[Ll]ipas[ée]mie\s*(?:[àa=:])?\s*(\d+)", "Lipasémie"),
        (r"CRP\s*[=:à]?\s*" + number, "CRP"),
        # A strict number avoids capturing trailing punctuation, e.g. the
        # comma in "ASAT 45, ALAT 60".
        (r"ASAT\s*[=:à]?\s*" + number + r"\s*(?:N|U/L)?", "ASAT"),
        (r"ALAT\s*[=:à]?\s*" + number + r"\s*(?:N|U/L)?", "ALAT"),
        (r"GGT\s*[=:à]?\s*(\d+)\s*(?:U/L)?", "GGT"),
        # ``\b`` keeps "PAL" from matching the tail of words such as
        # "PRINCIPAL 2".
        (r"\bPAL\s*[=:à]?\s*(\d+)\s*(?:U/L)?", "PAL"),
        (r"[Bb]ilirubine\s+(?:totale\s+)?[àa=:]\s*(\d+)\s*(?:µmol/L)?", "Bilirubine totale"),
        # Capitalised form accepted, like the other spelled-out test names.
        (r"[Tt]roponine\s+(négative|positive|normale)", "Troponine"),
    ]
    # NOTE(review): a trailing "N" on ASAT/ALAT presumably means "times the
    # normal value"; the figure is still stored and compared as an absolute
    # value — confirm the intended handling.

    for pattern, test_name in bio_patterns:
        m = re.search(pattern, text)
        if m is None:
            continue
        value = m.group(1)
        dossier.biologie_cle.append(BiologieCle(
            test=test_name,
            valeur=value,
            anomalie=_is_abnormal(test_name, value),
        ))
|
||||
|
||||
|
||||
def _extract_imagerie(text: str, dossier: DossierMedical) -> None:
    """Append CT and ultrasound findings found in ``text`` to ``dossier.imagerie``.

    Each finding is the text following a reporting verb ("retrouve",
    "montre", "objective"), truncated to 500 characters. A Balthazar score
    found anywhere in the text is attached to the CT entry.
    """
    flags = re.DOTALL | re.IGNORECASE

    # CT scan
    ct_report = re.search(
        r"(?:TDM|[Ss]canner|tomodensitométrie).*?(?:retrouve|montre|objective)\s*[::]?\s*(.*?)(?=\n\s*(?:Cholécystectomie|Au total|Devenir|\n\n))",
        text,
        flags,
    )
    if ct_report:
        findings = ct_report.group(1).strip()
        # Balthazar score, searched over the whole text (case-sensitive grade).
        grade = re.search(r"[Bb]althazar\s*(?:[àa=:])?\s*(\d+|[A-E])", text)
        dossier.imagerie.append(Imagerie(
            type="TDM abdominal",
            conclusion=findings[:500],
            score=f"Balthazar {grade.group(1)}" if grade else None,
        ))

    # Ultrasound
    us_report = re.search(
        r"(?:[ée]cho(?:graphie)?)\s*.*?(?:retrouve|montre|objective)\s*[::]?\s*(.*?)(?=\n\n)",
        text,
        flags,
    )
    if us_report:
        dossier.imagerie.append(Imagerie(
            type="Échographie",
            conclusion=us_report.group(1).strip()[:500],
        ))
|
||||
|
||||
|
||||
def _extract_complications(
    text: str,
    dossier: DossierMedical,
    edsnlp_result: Optional[EdsnlpResult] = None,
) -> None:
    """Append the complications mentioned in ``text`` to ``dossier.complications``.

    A term is reported when it occurs in the text and is negated neither
    according to edsnlp (when a result is supplied) nor by a French negation
    cue placed right before it anywhere in the text.

    Args:
        text: Anonymised document text.
        dossier: Dossier updated in place.
        edsnlp_result: Optional edsnlp analysis; its negated CIM-10 entities
            are used as a first negation filter.
    """
    text_lower = text.lower()

    # Surface forms of the CIM-10 entities that edsnlp flagged as negated.
    edsnlp_negated_terms: set[str] = set()
    if edsnlp_result:
        edsnlp_negated_terms = {
            ent.texte.lower()
            for ent in edsnlp_result.cim10_entities
            if ent.negation
        }

    complication_terms = (
        "éruption cutanée",
        "eruption cutanée",
        "fièvre",
        "infection",
        "hémorragie",
        "hématome",
        "abcès",
        "fistule",
        "iléus",
        "occlusion",
    )

    # Negation cues. French elides "de" before a vowel or mute h ("pas
    # d'infection", "absence d'hématome"), which covers most terms above, so
    # both "de " and "d'" (straight or typographic apostrophe) are accepted.
    negation_cue = r"(?:(?:pas|absence)\s+d(?:e\s+|['’]\s*)|sans\s+|aucune?\s+)"

    for term in complication_terms:
        if term not in text_lower:
            continue
        # edsnlp negation first
        if edsnlp_result and _is_negated_by_edsnlp(term, edsnlp_negated_terms):
            continue
        # Regex fallback for negation
        if re.search(negation_cue + re.escape(term), text_lower):
            continue
        dossier.complications.append(term.capitalize())
|
||||
|
||||
|
||||
def _is_negated_by_edsnlp(term: str, negated_terms: set[str]) -> bool:
    """Tell whether ``term`` overlaps one of the negated entity texts.

    The term is lower-cased, then compared by substring containment in
    either direction with each element of ``negated_terms``.
    """
    needle = term.lower()
    return any(
        needle in negated or negated in needle
        for negated in negated_terms
    )
|
||||
|
||||
|
||||
def _find_act_date(text: str, act_pattern: str) -> str | None:
    """Return the date written next to an act mention, or ``None``.

    Args:
        text: Document text (searched case-insensitively, line by line since
            ``.`` does not cross newlines here).
        act_pattern: Regex fragment identifying the act; may contain
            top-level alternation.

    Returns:
        A ``DD/MM`` or ``DD/MM/YYYY`` date following the act on the same
        line, else a ``DD/MM/YYYY`` date preceding it on the same line, else
        ``None``.
    """
    # Group the caller's fragment: interpolated bare, an alternation such as
    # "a|b" would split the whole expression and could match without ever
    # reaching the date group.
    act = f"(?:{act_pattern})"

    # "<act> ... le DD/MM" or "<act> ... DD/MM/YYYY". The named group keeps
    # working even if the fragment brings capture groups of its own.
    after = re.search(
        rf"{act}.*?(?:le\s+)?(?P<act_date>\d{{2}}/\d{{2}}(?:/\d{{4}})?)",
        text,
        re.IGNORECASE,
    )
    if after:
        return after.group("act_date")

    # Date at the start of the observation line, before the act.
    before = re.search(
        rf"(?P<act_date>\d{{2}}/\d{{2}}/\d{{4}}).*?{act}",
        text,
        re.IGNORECASE,
    )
    if before:
        return before.group("act_date")
    return None
|
||||
|
||||
|
||||
def _lookup_cim10(text: str) -> str | None:
    """Return the CIM-10 code of the first ``CIM10_MAP`` wording found in ``text``.

    The text is lower-cased and stripped, then wordings are tried in mapping
    order by substring test. Returns ``None`` when none occurs.
    """
    haystack = text.lower().strip()
    return next(
        (code for terme, code in CIM10_MAP.items() if terme in haystack),
        None,
    )
|
||||
|
||||
|
||||
def _is_abnormal(test: str, value: str) -> bool | None:
    """Tell whether a lab result falls outside its reference range.

    Args:
        test: Test label as used in the reference table below.
        value: Result as text (decimal comma accepted) or a qualitative
            word; plain numbers are tolerated as well.

    Returns:
        ``True`` / ``False`` when the value can be judged, ``None`` when the
        value is missing, unrecognised, or the test has no reference range.
    """
    if value is None:
        return None

    if isinstance(value, (int, float)):
        # Already numeric: there is no text to normalise.
        val = float(value)
    else:
        label = str(value).strip().lower()
        try:
            val = float(label.replace(",", "."))
        except ValueError:
            # Qualitative results
            if label in ("négative", "negative", "normale", "normal"):
                return False
            if label in ("positive", "positif", "élevée", "elevee"):
                return True
            return None

    # (low, high) bounds; a value is abnormal outside the closed interval.
    normals: dict[str, tuple[float, float]] = {
        "Lipasémie": (0, 60),
        "CRP": (0, 5),
        "ASAT": (0, 40),
        "ALAT": (0, 40),
        "GGT": (0, 60),
        "PAL": (0, 150),
        "Bilirubine totale": (0, 17),
    }

    if test in normals:
        lo, hi = normals[test]
        return val > hi or val < lo
    return None
|
||||
140
src/medical/edsnlp_pipeline.py
Normal file
140
src/medical/edsnlp_pipeline.py
Normal file
@@ -0,0 +1,140 @@
|
||||
"""Pipeline edsnlp pour l'extraction médicale (CIM-10, médicaments, négation)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Shared edsnlp pipeline, built on first use by get_pipeline().
_nlp = None
# Cached outcome of the edsnlp import probe; None means "not probed yet".
_available: Optional[bool] = None
|
||||
|
||||
|
||||
@dataclass
class CIM10Entity:
    """CIM-10 mention reported by the edsnlp pipeline."""

    # Text of the matched span.
    texte: str
    # Knowledge-base identifier attached to the span (the CIM-10 code).
    code: str
    # True when the negation qualifier is set on the span.
    negation: bool = False
    # True when the hypothesis qualifier is set on the span.
    hypothese: bool = False
|
||||
|
||||
|
||||
@dataclass
class DrugEntity:
    """Drug mention reported by the edsnlp pipeline."""

    # Text of the matched span.
    texte: str
    # Knowledge-base identifier attached to the span (ATC code), if any.
    code_atc: Optional[str] = None
    # True when the negation qualifier is set on the span.
    negation: bool = False
|
||||
|
||||
|
||||
@dataclass
class DateEntity:
    """Date mention reported by the edsnlp pipeline."""

    # Text of the matched span.
    texte: str
    # String form of the parsed date object, when one is available.
    value: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class EdsnlpResult:
    """Entities collected from one edsnlp analysis; every list starts empty."""

    # CIM-10 mentions, in document order.
    cim10_entities: list[CIM10Entity] = field(default_factory=list)
    # Drug mentions, in document order.
    drug_entities: list[DrugEntity] = field(default_factory=list)
    # Date mentions taken from the "dates" span group.
    date_entities: list[DateEntity] = field(default_factory=list)
|
||||
|
||||
|
||||
def is_available() -> bool:
    """Tell whether edsnlp can be imported, caching the answer.

    The probe runs once; later calls return the cached value until
    ``reset()`` clears it.

    Returns:
        ``True`` when ``import edsnlp`` succeeds, ``False`` otherwise.
    """
    global _available
    if _available is not None:
        return _available
    try:
        import edsnlp  # noqa: F401
    except ImportError:
        _available = False
    except Exception:
        # Installed but unusable (e.g. a broken binary dependency): report
        # it and degrade like the "not installed" case instead of crashing
        # every caller.
        logger.warning("edsnlp est installé mais son import a échoué", exc_info=True)
        _available = False
    else:
        _available = True
    return _available
|
||||
|
||||
|
||||
def get_pipeline():
    """Return the shared edsnlp pipeline, building it on first call.

    Raises:
        RuntimeError: When edsnlp is not available.
    """
    global _nlp
    if _nlp is None:
        if not is_available():
            raise RuntimeError("edsnlp n'est pas installé")

        import edsnlp

        logger.info("Initialisation du pipeline edsnlp...")
        pipeline = edsnlp.blank("eds")

        # Components in registration order, with their add_pipe keyword
        # arguments (empty when the component takes no configuration here).
        components = (
            ("eds.normalizer", {}),
            ("eds.sentences", {}),
            ("eds.cim10", {"config": {"attr": "NORM", "term_matcher": "simstring"}}),
            ("eds.drugs", {"config": {"attr": "NORM", "term_matcher": "exact"}}),
            ("eds.negation", {}),
            ("eds.hypothesis", {}),
            ("eds.dates", {}),
        )
        for name, options in components:
            pipeline.add_pipe(name, **options)

        # Published only once fully assembled.
        _nlp = pipeline
        logger.info("Pipeline edsnlp initialisé avec succès")
    return _nlp
|
||||
|
||||
|
||||
def analyze(text: str) -> EdsnlpResult:
    """Run edsnlp over a medical text and collect its entities.

    Returns an empty result when edsnlp is unavailable or when building or
    running the pipeline raises (the error is logged).

    Returns:
        The CIM-10, drug and date entities detected in ``text``.
    """
    result = EdsnlpResult()
    if not is_available():
        return result

    try:
        doc = get_pipeline()(text)
    except Exception:
        logger.exception("Erreur lors de l'analyse edsnlp")
        return result

    for ent in doc.ents:
        # Qualifiers may be absent or None; both collapse to False.
        negation = getattr(ent._, "negation", False) or False
        hypothese = getattr(ent._, "hypothesis", False) or False

        label = ent.label_
        if label == "cim10":
            code = ent.kb_id_ or ""
            # Entities without a code are dropped.
            if code:
                result.cim10_entities.append(CIM10Entity(
                    texte=ent.text,
                    code=code,
                    negation=negation,
                    hypothese=hypothese,
                ))
        elif label == "drug":
            result.drug_entities.append(DrugEntity(
                texte=ent.text,
                code_atc=ent.kb_id_ or None,
                negation=negation,
            ))

    # Dates live in their own span group.
    for span in doc.spans.get("dates", []):
        parsed = span._.date if hasattr(span._, "date") else None
        result.date_entities.append(DateEntity(
            texte=span.text,
            value=None if parsed is None else str(parsed),
        ))

    return result
|
||||
|
||||
|
||||
def reset():
    """Forget the cached pipeline and availability probe (handy in tests)."""
    global _nlp, _available
    _nlp = _available = None
|
||||
Reference in New Issue
Block a user