feat: dictionnaire CIM-10 complet (10 893 codes) + robustesse regex
- Nouveau module cim10_dict.py : extraction depuis metadata.json FAISS, lookup intelligent avec normalisation Unicode (accents, trémas, apostrophes)
- cim10_extractor : _lookup_cim10 utilise le dictionnaire complet, _find_dp normalisé, _find_das élargi à 20 patterns (cardio, métabo, infectieux, rénal...), biologie +6 tests (TGO/TGP, Hb, créatinine), traitements sans limite de lignes
- document_classifier : scoring pondéré, classify_with_confidence(), scan 5000 chars
- CLI --build-dict pour régénérer data/cim10_dict.json
- 32 nouveaux tests unitaires (124 total, 0 échec)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,44 +2,93 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class ClassificationResult:
    """Outcome of a document classification, with a confidence score.

    Attributes:
        doc_type: Label selected for the document.
        confidence: Confidence attached to ``doc_type``.
        scores: Raw score obtained by each candidate type, keyed by type
            name.
    """

    doc_type: str
    confidence: float
    scores: dict[str, float]
|
||||
|
||||
|
||||
# Weighted markers: (lowercase text to search for, weight added when found).
# Markers counted towards the "trackare" score.
_TRACKARE_MARKERS: list[tuple[str, int]] = [
    ("ipp:", 3),
    ("episode no:", 3),
    ("dossier patient", 2),
    ("détails des patients", 2),
    ("détails épisode", 2),
    ("liste des contacts", 1),
    ("notes paramédicales", 1),
    ("signes vitaux", 1),
    ("traitements médicamenteux", 1),
    ("observations médicales", 1),
    ("constantes", 1),
    ("prescriptions", 1),
    ("presc. de sortie", 2),
    ("type de note", 1),
]

# Markers counted towards the "crh" score.
_CRH_MARKERS: list[tuple[str, int]] = [
    ("mon cher confrère", 3),
    ("cher confrère", 3),
    ("chère consœur", 3),
    ("compte rendu d'hospitalisation", 3),
    ("compte-rendu", 2),
    ("service de gastro", 2),
    ("service de chirurgie", 2),
    ("service de médecine", 2),
    ("pôle spécialités", 1),
    ("votre patient", 2),
    ("votre patiente", 2),
    ("au total", 1),
    ("ttt de sortie", 1),
    ("devenir", 1),
    ("cordialement", 1),
]

# Number of leading characters of a document inspected for markers.
_SCAN_LENGTH = 5000
|
||||
|
||||
|
||||
def classify_with_confidence(text: str) -> ClassificationResult:
    """Classify a document as CRH or Trackare and report a confidence.

    Only the first ``_SCAN_LENGTH`` characters of ``text`` are inspected.
    Each marker of ``_TRACKARE_MARKERS`` / ``_CRH_MARKERS`` found in that
    window (case-insensitively) adds its weight to the matching score.

    Args:
        text: Extracted document text.

    Returns:
        A ``ClassificationResult`` whose ``doc_type`` is ``"trackare"`` or
        ``"crh"``, whose ``confidence`` is the winning score's share of the
        total (rounded to 2 decimals, 0.5 on a tie or when nothing
        matched), and whose ``scores`` holds both raw scores.
    """
    # Function-scope import so the block does not depend on the file header.
    import unicodedata

    # The marker literals use precomposed accents and ASCII apostrophes.
    # Text that carries decomposed accents (NFD) or typographic apostrophes
    # would otherwise never match them, so fold both before searching.
    head = unicodedata.normalize("NFC", text[:_SCAN_LENGTH])
    head = head.replace("\u2019", "'").lower()

    trackare_score = sum(w for marker, w in _TRACKARE_MARKERS if marker in head)
    crh_score = sum(w for marker, w in _CRH_MARKERS if marker in head)

    if trackare_score > crh_score:
        doc_type = "trackare"
        confidence = trackare_score / (trackare_score + crh_score)
    elif crh_score > trackare_score:
        doc_type = "crh"
        confidence = crh_score / (trackare_score + crh_score)
    else:
        # Tie, including the case where no marker matched: default to CRH.
        doc_type = "crh"
        confidence = 0.5

    return ClassificationResult(
        doc_type=doc_type,
        confidence=round(confidence, 2),
        scores={"trackare": trackare_score, "crh": crh_score},
    )
|
||||
|
||||
|
||||
def classify(text: str) -> str:
    """Classify an extracted document as CRH or Trackare.

    Thin wrapper around ``classify_with_confidence`` that drops the
    confidence and detailed scores; the signature is kept unchanged for
    backward compatibility.

    Args:
        text: Extracted document text.

    Returns:
        ``"crh"`` or ``"trackare"``.
    """
    # Delegate so both entry points share one set of weighted markers and
    # one scan window instead of maintaining a second, diverging heuristic.
    return classify_with_confidence(text).doc_type
|
||||
|
||||
Reference in New Issue
Block a user