feat: dictionnaire CIM-10 complet (10 893 codes) + robustesse regex
- Nouveau module cim10_dict.py : extraction depuis metadata.json FAISS, lookup intelligent avec normalisation Unicode (accents, trémas, apostrophes)
- cim10_extractor : _lookup_cim10 utilise le dictionnaire complet, _find_dp normalisé, _find_das élargi à 20 patterns (cardio, métabo, infectieux, rénal...), biologie +6 tests (TGO/TGP, Hb, créatinine), traitements sans limite de lignes
- document_classifier : scoring pondéré, classify_with_confidence(), scan 5000 chars
- CLI --build-dict pour régénérer data/cim10_dict.json
- 32 nouveaux tests unitaires (124 total, 0 échec)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,8 @@ from src.medical.cim10_extractor import (
|
||||
_lookup_cim10,
|
||||
_is_abnormal,
|
||||
)
|
||||
from src.medical.cim10_dict import normalize_text, load_dict, lookup, reset_cache
|
||||
from src.extraction.document_classifier import classify, classify_with_confidence
|
||||
|
||||
|
||||
class TestCIM10Lookup:
|
||||
@@ -236,3 +238,221 @@ Devenir : sortie le 03/03."""
|
||||
complication_terms = [c.lower() for c in dossier.complications]
|
||||
assert "fièvre" not in complication_terms
|
||||
assert "infection" not in complication_terms
|
||||
|
||||
|
||||
# === Nouveaux tests : dictionnaire CIM-10, normalisation, robustesse ===
|
||||
|
||||
|
||||
class TestCIM10Dict:
    """Checks on loading the complete CIM-10 dictionary."""

    def test_load_dict_not_empty(self):
        """The loaded dictionary holds more than 10000 entries."""
        entries = load_dict()
        assert len(entries) > 10000

    def test_known_codes_present(self):
        """A few specific codes are present as keys of the dictionary."""
        entries = load_dict()
        for expected_code in ("K85.1", "K80.5", "I10", "E66.0", "L27.0"):
            assert expected_code in entries

    def test_labels_non_empty(self):
        """The first 100 entries all carry a truthy label."""
        entries = load_dict()
        # Only a leading sample of the dictionary is inspected.
        sample = list(entries.items())[:100]
        for code, label in sample:
            assert label, f"Label vide pour {code}"
|
||||
|
||||
|
||||
class TestNormalizeText:
    """Tests for normalize_text: accents, case, whitespace."""

    def test_accents_removed(self):
        """Accented letters come back without their accents."""
        normalized = normalize_text("Pancréatite")
        assert normalized == "pancreatite"

    def test_lowercase(self):
        """Upper-case input is lowered (and accents dropped)."""
        normalized = normalize_text("PANCRÉATITE AIGUË")
        assert normalized == "pancreatite aigue"

    def test_whitespace_collapsed(self):
        """Surrounding whitespace does not appear in the result."""
        normalized = normalize_text(" pancréatite aiguë ")
        assert normalized == "pancreatite aigue"

    def test_trema(self):
        """A diaeresis on 'u' is stripped like any other accent."""
        normalized = normalize_text("aigüe")
        assert normalized == "aigue"

    def test_mixed(self):
        """Mixed case and several accents in one multi-word phrase."""
        normalized = normalize_text("Éruption Cutanée Médicamenteuse")
        assert normalized == "eruption cutanee medicamenteuse"
|
||||
|
||||
|
||||
class TestDictLookup:
    """Tests for lookup: domain override priority, exact match, substring."""

    def test_domain_override_priority(self):
        """A term present in the domain overrides resolves to the override code."""
        overrides = {"pancréatite aiguë biliaire": "K85.1"}
        code = lookup("pancréatite aiguë biliaire", domain_overrides=overrides)
        assert code == "K85.1"

    def test_exact_normalized_match(self):
        """Exact normalized match against the complete dictionary."""
        # "Hypertension essentielle (primitive)" is expected to be the exact
        # label of I10 in the dictionary.
        code = lookup("Hypertension essentielle (primitive)")
        assert code == "I10"

    def test_substring_match(self):
        """Normalized substring match inside a longer sentence."""
        code = lookup("patient avec cholécystite aiguë sévère")
        assert code == "K81.0"

    def test_unknown_returns_none(self):
        """Text matching nothing yields None."""
        code = lookup("texte complètement inconnu xyz123")
        assert code is None

    def test_accent_insensitive(self):
        """The search ignores accents."""
        code = lookup("pancreatite aigue d'origine biliaire")
        assert code == "K85.1"
|
||||
|
||||
|
||||
class TestDiagnosticAccentVariations:
    """Tests for diagnosis detection across accent variations."""

    def _extract(self, text: str) -> DossierMedical:
        """Run extract_medical_info on *text* with a minimal parsed 'crh' record."""
        minimal_record = {
            "type": "crh",
            "patient": {"sexe": "M"},
            "sejour": {},
            "diagnostics": [],
        }
        return extract_medical_info(minimal_record, text)

    def test_pancreatite_sans_accents(self):
        """Unaccented spelling still yields K85.1 as principal diagnosis."""
        record = self._extract("Pancreatite aigue biliaire.\nDevenir : retour.")
        assert record.diagnostic_principal is not None
        assert record.diagnostic_principal.cim10_suggestion == "K85.1"

    def test_pancreatite_trema(self):
        """The 'aigüe' diaeresis spelling still yields K85.1."""
        record = self._extract("Pancréatite aigüe biliaire.\nDevenir : retour.")
        assert record.diagnostic_principal is not None
        assert record.diagnostic_principal.cim10_suggestion == "K85.1"

    def test_pancreatite_majuscules(self):
        """All-caps unaccented spelling still yields K85.1."""
        record = self._extract("PANCREATITE AIGUE BILIAIRE.\nDevenir : retour.")
        assert record.diagnostic_principal is not None
        assert record.diagnostic_principal.cim10_suggestion == "K85.1"

    def test_hta_as_das(self):
        """Hypertension is picked up as an associated diagnosis even unaccented."""
        record = self._extract(
            "Douleur abdominale.\nhypertension arterielle connue.\nDevenir : retour."
        )
        associated_codes = {
            diag.cim10_suggestion for diag in record.diagnostics_associes
        }
        assert "I10" in associated_codes
|
||||
|
||||
|
||||
class TestBiologieEdgeCases:
    """Tests for lab-result (biology) extraction with spelling variants."""

    def _extract_bio(self, text: str) -> list:
        """Return biologie_cle from extracting *text* with a minimal 'crh' record."""
        minimal_record = {
            "type": "crh",
            "patient": {"sexe": "M"},
            "sejour": {},
            "diagnostics": [],
        }
        return extract_medical_info(minimal_record, text).biologie_cle

    @staticmethod
    def _has_result(results, test_name, value):
        """Tell whether any item has the given ``test`` and ``valeur`` attributes."""
        return any(
            item.test == test_name and item.valeur == value for item in results
        )

    def test_crp_with_unit(self):
        """'CRP=45 mg/L' produces a CRP result valued '45'."""
        results = self._extract_bio("CRP=45 mg/L")
        assert self._has_result(results, "CRP", "45")

    def test_lipasemie_ui_l(self):
        """'Lipasémie à 850 UI/L' produces a Lipasémie result valued '850'."""
        results = self._extract_bio("Lipasémie à 850 UI/L")
        assert self._has_result(results, "Lipasémie", "850")

    def test_troponine_us(self):
        """A qualitative 'négative' troponin is kept as the value."""
        results = self._extract_bio("Troponine us négative")
        assert self._has_result(results, "Troponine", "négative")

    def test_hb_shorthand(self):
        """The 'Hb' shorthand is reported under the name 'Hémoglobine'."""
        results = self._extract_bio("Hb = 11.5 g/dL")
        assert self._has_result(results, "Hémoglobine", "11.5")

    def test_tgo_alias(self):
        """The 'TGO' alias is reported under the name 'ASAT'."""
        results = self._extract_bio("TGO = 120 UI/L")
        assert self._has_result(results, "ASAT", "120")

    def test_creatinine(self):
        """'Créatinine à 95 µmol/L' produces a Créatinine result valued '95'."""
        results = self._extract_bio("Créatinine à 95 µmol/L")
        assert self._has_result(results, "Créatinine", "95")
|
||||
|
||||
|
||||
class TestTraitementEdgeCases:
    """Tests for discharge-treatment extraction."""

    def _extract_ttt(self, text: str) -> list:
        """Return traitements_sortie from extracting *text* with a minimal 'crh' record."""
        minimal_record = {
            "type": "crh",
            "patient": {"sexe": "M"},
            "sejour": {},
            "diagnostics": [],
        }
        return extract_medical_info(minimal_record, text).traitements_sortie

    def test_more_than_10_medications(self):
        """Check that the former cap of 10 treatments is gone."""
        meds = "\n".join(f"Médicament{i} 100mg matin" for i in range(15))
        text = f"TTT de sortie :\n{meds}\n\nDevenir : retour."
        prescriptions = self._extract_ttt(text)
        assert len(prescriptions) >= 15

    def test_posologie_sachet(self):
        """A line dosed in 'sachet' is extracted as a treatment."""
        prescriptions = self._extract_ttt(
            "TTT de sortie :\nMovicol 1 sachet matin\n\nDevenir : retour."
        )
        assert len(prescriptions) >= 1

    def test_posologie_x_par_jour(self):
        """A '3x/jour' frequency results in a non-None posologie."""
        prescriptions = self._extract_ttt(
            "TTT de sortie :\nParacétamol 1g 3x/jour\n\nDevenir : retour."
        )
        assert len(prescriptions) >= 1
        assert prescriptions[0].posologie is not None

    def test_stop_on_footer(self):
        """Lines after a signature-like footer line are not extracted."""
        prescriptions = self._extract_ttt(
            "TTT de sortie :\nParacétamol\nDoliprane\nDr Martin signature\nAutre médicament\n\nDevenir : retour."
        )
        extracted_names = [item.medicament for item in prescriptions]
        assert "Autre médicament" not in extracted_names

    def test_pendant_x_jours(self):
        """A 'pendant 7 jours' duration is kept inside the posologie."""
        prescriptions = self._extract_ttt(
            "TTT de sortie :\nAmoxicilline 1g pendant 7 jours\n\nDevenir : retour."
        )
        assert len(prescriptions) >= 1
        assert prescriptions[0].posologie is not None
        assert "7 jours" in prescriptions[0].posologie
|
||||
|
||||
|
||||
class TestClassifierConfidence:
    """Tests for classify_with_confidence."""

    def test_high_confidence_trackare(self):
        """Text carrying several Trackare markers is classified 'trackare' at >= 0.7."""
        sample = (
            "Dossier Patient\n"
            "IPP: 12345\n"
            "Détails épisode\n"
            "Episode No: 67890\n"
            "Signes vitaux\n"
        )
        outcome = classify_with_confidence(sample)
        assert outcome.doc_type == "trackare"
        assert outcome.confidence >= 0.7

    def test_high_confidence_crh(self):
        """A letter-style hospitalisation report is classified 'crh' at >= 0.7."""
        sample = (
            "Mon cher confrère,\n"
            "Compte rendu d'hospitalisation\n"
            "Votre patient a été admis dans le service de gastro\n"
        )
        outcome = classify_with_confidence(sample)
        assert outcome.doc_type == "crh"
        assert outcome.confidence >= 0.7

    def test_ambiguous_case(self):
        """Text without specific markers gets a confidence of at most 0.6."""
        outcome = classify_with_confidence(
            "Document médical quelconque sans marqueurs spécifiques."
        )
        assert outcome.confidence <= 0.6

    def test_backward_compatible(self):
        """classify() still returns a plain string."""
        label = classify("Dossier Patient\nIPP: 12345\n")
        assert isinstance(label, str)
        assert label in ("crh", "trackare")
|
||||
|
||||
Reference in New Issue
Block a user