feat: dictionnaire de codage + détection anomalies statistiques
- Script build_coding_dict.py génère le dictionnaire depuis le batch (240 dossiers)
- coding_dictionary.json : co-occurrences DP→DAS, fréquences, associations bio
- anomaly_stats.py : 8 checks (DP/DAS rare, DAS manquant, bio-DAS, âge atypique)
- Intégré dans le pipeline cim10_extractor post-DIM-senior

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
206
src/medical/anomaly_stats.py
Normal file
206
src/medical/anomaly_stats.py
Normal file
@@ -0,0 +1,206 @@
|
||||
"""Detection d'anomalies statistiques dans le codage PMSI.
|
||||
|
||||
Compare le codage d'un dossier au dictionnaire de codage construit
|
||||
a partir du batch pour detecter les ecarts significatifs.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from ..config import DossierMedical
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# The coding dictionary lives in "config/coding_dictionary.json", three
# directory levels above this file.
_DICT_PATH = (
    Path(__file__).resolve().parent.parent.parent.joinpath("config", "coding_dictionary.json")
)

# Parsed dictionary kept in memory after the first load attempt; None means
# no load has been attempted yet.
_dict_cache: dict | None = None
|
||||
|
||||
|
||||
def _load_dict() -> dict:
    """Return the coding dictionary, loading it from disk on first use.

    The parsed content of ``_DICT_PATH`` is cached in the module-level
    ``_dict_cache``. Loading is best-effort: any failure is logged and an
    empty dict is cached instead, so the file is read at most once per
    process and a failed load is not retried.

    Returns:
        The parsed dictionary, or ``{}`` when the file is missing, unreadable,
        not valid JSON, or does not contain a JSON object at its root.
    """
    global _dict_cache
    if _dict_cache is not None:
        return _dict_cache

    try:
        loaded = json.loads(_DICT_PATH.read_text(encoding="utf-8"))
    except FileNotFoundError:
        logger.warning("coding_dictionary.json introuvable — anomalies stats desactivees")
        loaded = {}
    except Exception:
        # Still best-effort, but do not report a corrupt or unreadable file as
        # "not found", and keep the traceback so the real cause is visible.
        logger.warning(
            "coding_dictionary.json illisible ou invalide — anomalies stats desactivees",
            exc_info=True,
        )
        loaded = {}

    if not isinstance(loaded, dict):
        # Callers use .get() on the result; a list/str/number root would
        # make them fail later with an unrelated AttributeError.
        logger.warning(
            "coding_dictionary.json invalide (objet JSON attendu) — anomalies stats desactivees"
        )
        loaded = {}

    _dict_cache = loaded
    return _dict_cache
|
||||
|
||||
|
||||
# Lab test name -> CIM-10 code prefixes expected somewhere in the coding (DP
# or DAS) when that test is abnormal. An empty list disables the check for
# tests considered too unspecific.
_BIO_DAS_EXPECTED = {
    "Créatinine": ["N17", "N18", "N19"],
    "Hémoglobine": ["D50", "D62", "D64"],
    "Plaquettes": ["D69"],
    "Troponine": ["I21", "I20", "I25"],
    "BNP": ["I50", "I11"],
    "ALAT": ["K71", "K72", "K73", "K74", "K75", "K76"],
    "ASAT": ["K71", "K72", "K73", "K74", "K75", "K76"],
    "TSH": ["E03", "E05"],
    "Lipasémie": ["K85"],
    "CRP": [],  # too unspecific
    "Leucocytes": [],  # too unspecific
}


def check_statistical_anomalies(dossier: DossierMedical) -> list[str]:
    """Compare a dossier's coding with the batch-derived coding dictionary.

    Runs eight independent checks (unseen DP, unseen DAS, singleton DAS,
    frequently co-occurring DAS that is missing, unseen DP+DAS pair, DAS count
    versus length of stay, abnormal lab result without a matching code,
    patient age far outside the range observed for the DP) and collects one
    alert string per finding.

    Alerts are emitted in a reproducible order: DAS codes follow their order
    in the dossier and lab tests follow the order of ``_BIO_DAS_EXPECTED``.

    Args:
        dossier: The dossier to check. The function reads
            ``diagnostic_principal``, ``diagnostics_associes``,
            ``biologie_cle`` and ``sejour``.

    Returns:
        The list of alert messages; empty when the dictionary is unavailable
        or nothing was flagged.

    Side effects:
        Stores the number of alerts in
        ``dossier.quality_flags["stats_anomaly_count"]``. This is skipped
        when the dictionary is unavailable.
    """
    cd = _load_dict()
    if not cd:
        return []

    alerts: list[str] = []
    n_dossiers = cd.get("metadata", {}).get("n_dossiers", 1)

    # --- Collect the codes carried by the dossier ---
    principal = dossier.diagnostic_principal
    dp_code = principal.cim10_suggestion if principal and principal.cim10_suggestion else ""

    # dict.fromkeys de-duplicates while keeping the dossier order and still
    # gives O(1) membership tests; iterating a plain set of strings would make
    # the alert order change from one run to the next.
    das_codes = dict.fromkeys(
        das.cim10_suggestion
        for das in dossier.diagnostics_associes
        if das.cim10_suggestion
    )

    bio_abnormal = {bio.test for bio in dossier.biologie_cle if bio.anomalie}

    sejour = dossier.sejour
    duree = sejour.duree_sejour if sejour else None
    # Counts every associated diagnosis, including those without a code.
    nb_das = len(dossier.diagnostics_associes)

    dp_dict = cd.get("dp", {})
    das_dict = cd.get("das", {})
    cooc = cd.get("dp_das_cooccurrence", {})
    das_bio = cd.get("das_bio_association", {})

    dp_has_cooc = bool(dp_code) and dp_code in cooc
    # 0 when the DP is unknown to the dictionary; the checks below treat that
    # as "no usable frequency".
    dp_freq = dp_dict.get(dp_code, {}).get("freq", 0)

    # --- 1. DP never seen in the batch ---
    if dp_code and dp_code not in dp_dict:
        alerts.append(
            f"STATS [DP rare]: {dp_code} jamais observe dans le batch "
            f"({n_dossiers} dossiers) — verifier le codage"
        )

    # --- 2. DAS never seen in the batch ---
    for code in das_codes:
        if code not in das_dict:
            alerts.append(
                f"STATS [DAS rare]: {code} jamais observe dans le batch — "
                f"code potentiellement errone"
            )

    # --- 3. Singleton DAS ---
    # Disabled for small batches (< 500 dossiers): too many false positives.
    if n_dossiers >= 500:
        for code in das_codes:
            entry = das_dict.get(code, {})
            if entry.get("freq", 0) == 1:
                alerts.append(
                    f"STATS [DAS singleton]: {code} ({entry.get('texte', '?')}) "
                    f"n'apparait qu'une seule fois dans le batch — a verifier"
                )

    # --- 4. DAS usually coded with this DP but missing here ---
    # Skipped when the DP frequency is missing or zero: dividing by it would
    # either raise ZeroDivisionError (aborting every check) or produce
    # meaningless ratios.
    if dp_has_cooc and dp_freq > 0:
        for das_code, count in cooc[dp_code].items():
            # Flag a DAS present in at least 60% of the dossiers with this DP.
            if count / dp_freq >= 0.6 and das_code not in das_codes:
                das_texte = das_dict.get(das_code, {}).get("texte", "")
                alerts.append(
                    f"STATS [DAS attendu manquant]: {das_code} ({das_texte}) "
                    f"present dans {count}/{dp_freq} dossiers avec DP {dp_code} "
                    f"mais absent ici"
                )

    # --- 5. DP+DAS pair never seen together ---
    # Only meaningful when the DP itself was observed at least 10 times.
    if dp_has_cooc and dp_freq >= 10:
        known_das = cooc[dp_code]
        for code in das_codes:
            if code not in das_dict:
                continue  # already reported by check 2
            if code not in known_das:
                das_texte = das_dict.get(code, {}).get("texte", "")
                alerts.append(
                    f"STATS [combinaison inedite]: {dp_code} + {code} ({das_texte}) "
                    f"jamais observe ensemble dans le batch"
                )

    # --- 6. DAS count versus length of stay ---
    if duree is not None and duree >= 3:
        # Heuristic bounds: at least one DAS per 4 days of stay (minimum 1),
        # at most 3 DAS per day (never below 10).
        expected_min = max(1, duree // 4)
        expected_max = max(10, duree * 3)

        # Under-coding is only reported for stays of 7 days or more.
        if nb_das < expected_min and duree >= 7:
            alerts.append(
                f"STATS [sous-codage]: {nb_das} DAS pour {duree} jours de sejour "
                f"(attendu >= {expected_min})"
            )
        elif nb_das > expected_max:
            alerts.append(
                f"STATS [sur-codage]: {nb_das} DAS pour {duree} jours de sejour "
                f"(attendu <= {expected_max}) — exces de codes"
            )

    # --- 7. Abnormal lab result without a matching code ---
    all_codes = set(das_codes)
    if dp_code:
        all_codes.add(dp_code)

    for bio_test, expected_prefixes in _BIO_DAS_EXPECTED.items():
        if not expected_prefixes or bio_test not in bio_abnormal:
            continue
        if any(code.startswith(tuple(expected_prefixes)) for code in all_codes):
            continue
        # Only report when the batch itself shows the association often enough.
        # NOTE(review): only the first expected prefix is looked up in the
        # dictionary, as in the original implementation — confirm whether the
        # other prefixes should be consulted as well.
        observed = das_bio.get(expected_prefixes[0], {}).get(bio_test, 0)
        if observed >= 5:
            alerts.append(
                f"STATS [bio-DAS incoherent]: {bio_test} anormal mais aucun code "
                f"{'/'.join(expected_prefixes[:3])} dans le codage "
                f"(observe {observed}x dans le batch)"
            )

    # --- 8. Age far outside the range observed for this DP ---
    if dp_code and dp_code in dp_dict:
        dp_entry = dp_dict[dp_code]
        age = sejour.age if sejour else None
        age_min = dp_entry.get("age_min")
        age_max = dp_entry.get("age_max")
        age_moy = dp_entry.get("age_moy")
        if age is not None and age_min is not None and age_max is not None:
            # Tolerance around the observed range: 30% of its width, at
            # least 10 years.
            margin = max(10, (age_max - age_min) * 0.3)
            if age < age_min - margin or age > age_max + margin:
                alerts.append(
                    f"STATS [age atypique]: Patient {age} ans pour DP {dp_code} "
                    f"(observe {age_min}-{age_max} ans, moy {age_moy})"
                )

    # Expose the number of findings as a quality flag on the dossier.
    dossier.quality_flags["stats_anomaly_count"] = len(alerts)

    return alerts
|
||||
@@ -219,6 +219,14 @@ def extract_medical_info(
|
||||
except Exception:
|
||||
logger.error("DIM-SENIOR: erreur détection erreurs fréquentes", exc_info=True)
|
||||
|
||||
# Post-processing : anomalies statistiques (dictionnaire de codage)
|
||||
try:
|
||||
from .anomaly_stats import check_statistical_anomalies
|
||||
stats_alerts = check_statistical_anomalies(dossier)
|
||||
dossier.alertes_codage.extend(stats_alerts)
|
||||
except Exception:
|
||||
logger.error("STATS: erreur détection anomalies statistiques", exc_info=True)
|
||||
|
||||
# Post-processing : resélection DP si exclu par vetos/exclusions
|
||||
if dossier.document_type != "trackare":
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user