feat: dictionnaire CIM-10 complet (10 893 codes) + robustesse regex

- Nouveau module cim10_dict.py : extraction depuis metadata.json FAISS, lookup intelligent avec normalisation Unicode (accents, trémas, apostrophes)
- cim10_extractor : _lookup_cim10 utilise le dictionnaire complet, _find_dp normalisé, _find_das élargi à 20 patterns (cardio, métabo, infectieux, rénal...), biologie +6 tests (TGO/TGP, Hb, créatinine), traitements sans limite de lignes
- document_classifier : scoring pondéré, classify_with_confidence(), scan 5000 chars
- CLI --build-dict pour régénérer data/cim10_dict.json
- 32 nouveaux tests unitaires (124 total, 0 échec)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 08:09:32 +01:00
parent 037d255aa0
commit 12f4479cd2
6 changed files with 608 additions and 91 deletions
--- a/src/extraction/document_classifier.py
+++ b/src/extraction/document_classifier.py
@@ -2,44 +2,93 @@

 from __future__ import annotations

+from dataclasses import dataclass
+
+
+@dataclass
+class ClassificationResult:
+    """Classification result with a confidence score."""
+    doc_type: str  # "crh" or "trackare" as produced by classify_with_confidence
+    confidence: float  # 0.5 on no match or a tie, otherwise the winner's share of the total weight
+    scores: dict[str, float]  # summed marker weights under the "trackare" and "crh" keys
+
+
+# Weighted markers: (lowercase text, weight), tested as substrings of the lowercased document head
+_TRACKARE_MARKERS: list[tuple[str, int]] = [
+    ("ipp:", 3),
+    ("episode no:", 3),
+    ("dossier patient", 2),
+    ("détails des patients", 2),
+    ("détails épisode", 2),
+    ("liste des contacts", 1),
+    ("notes paramédicales", 1),
+    ("signes vitaux", 1),
+    ("traitements médicamenteux", 1),
+    ("observations médicales", 1),
+    ("constantes", 1),
+    ("prescriptions", 1),
+    ("presc. de sortie", 2),
+    ("type de note", 1),
+]
+
+_CRH_MARKERS: list[tuple[str, int]] = [
+    ("mon cher confrère", 3),
+    ("cher confrère", 3),  # also a substring of "mon cher confrère", so that greeting scores 3 + 3
+    ("chère consœur", 3),
+    ("compte rendu d'hospitalisation", 3),
+    ("compte-rendu", 2),
+    ("service de gastro", 2),
+    ("service de chirurgie", 2),
+    ("service de médecine", 2),
+    ("pôle spécialités", 1),
+    ("votre patient", 2),
+    ("votre patiente", 2),  # "votre patient" above matches this text too, so the feminine form scores 2 + 2
+    ("au total", 1),
+    ("ttt de sortie", 1),
+    ("devenir", 1),
+    ("cordialement", 1),
+]
+
+_SCAN_LENGTH = 5000  # number of leading characters of the text that get scanned for markers
+
+
+def classify_with_confidence(text: str) -> ClassificationResult:
+    """Classify a document as "crh" or "trackare" and attach a confidence score.
+
+    Returns a ClassificationResult with the type, the confidence (0.5 on no match
+    or a tie, otherwise winner / total rounded to 2 decimals) and both raw scores.
+    """
+    text_lower = text[:_SCAN_LENGTH].lower()  # scan only the head; lowercased to match the lowercase markers
+
+    trackare_score = sum(weight for marker, weight in _TRACKARE_MARKERS if marker in text_lower)
+    crh_score = sum(weight for marker, weight in _CRH_MARKERS if marker in text_lower)
+
+    total = trackare_score + crh_score
+    if total == 0:  # no marker found at all: fall back to CRH with a neutral confidence
+        return ClassificationResult(doc_type="crh", confidence=0.5, scores={"trackare": 0, "crh": 0})
+
+    if trackare_score > crh_score:
+        confidence = trackare_score / total
+        doc_type = "trackare"
+    elif crh_score > trackare_score:
+        confidence = crh_score / total
+        doc_type = "crh"
+    else:
+        # Tie — default to CRH
+        confidence = 0.5
+        doc_type = "crh"
+
+    return ClassificationResult(
+        doc_type=doc_type,
+        confidence=round(confidence, 2),
+        scores={"trackare": trackare_score, "crh": crh_score},
+    )
+

 def classify(text: str) -> str:
     """Classifie un document extrait en CRH ou Trackare.
 
     Retourne "crh" ou "trackare".
+    Signature inchangée pour rétrocompatibilité.
     """
-    text_lower = text[:3000].lower()
-
-    trackare_markers = [
-        "dossier patient",
-        "détails des patients",
-        "détails épisode",
-        "liste des contacts",
-        "notes paramédicales",
-        "signes vitaux",
-        "traitements médicamenteux",
-        "observations médicales",
-    ]
-    trackare_score = sum(1 for m in trackare_markers if m in text_lower)
-
-    crh_markers = [
-        "mon cher confrère",
-        "cher confrère",
-        "compte rendu d'hospitalisation",
-        "compte-rendu",
-        "service de gastro",
-        "pôle spécialités",
-        "votre patient",
-    ]
-    crh_score = sum(1 for m in crh_markers if m in text_lower)
-
-    if trackare_score >= 2:
-        return "trackare"
-    if crh_score >= 2:
-        return "crh"
-
-    # Heuristique : Trackare contient des tableaux avec IPP
-    if "ipp:" in text_lower or "episode no:" in text_lower:
-        return "trackare"
-
-    return "crh"
+    return classify_with_confidence(text).doc_type  # thin wrapper: keeps only the label, drops confidence and scores