feat: dictionnaire de codage + détection anomalies statistiques
- Script build_coding_dict.py : génère le dictionnaire depuis le batch (240 dossiers)
- coding_dictionary.json : co-occurrences DP→DAS, fréquences, associations bio
- anomaly_stats.py : 8 checks (DP/DAS rare, DAS manquant, bio-DAS, âge atypique)
- Intégré dans le pipeline cim10_extractor après l'étape DIM-senior

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
314
scripts/build_coding_dict.py
Normal file
314
scripts/build_coding_dict.py
Normal file
@@ -0,0 +1,314 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Construit le dictionnaire de codage a partir des resultats du batch.
|
||||
|
||||
Parcourt output/structured/ et genere config/coding_dictionary.json
|
||||
avec les co-occurrences, frequences et associations observees.
|
||||
|
||||
Usage:
|
||||
python3 scripts/build_coding_dict.py [--output config/coding_dictionary.json]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
# Heuristique : filtrer les vrais medicaments dans les traitements
|
||||
_MED_SUFFIXES = re.compile(
|
||||
r"(ine|ide|ol|one|ate|ase|mab|nib|zol|pam|lam|zide|pine|pril|tan|"
|
||||
r"oxine|xone|dine|mide|fene|phene|mine|sone|lone|done|cine|il|"
|
||||
r"lin|ril|mox|tine|zine|vir|cin)$",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_MED_KNOWN = {
|
||||
"insuline", "heparine", "paracetamol", "doliprane", "aspirine",
|
||||
"augmentin", "ceftriaxone", "amoxicilline", "metformine", "amlodipine",
|
||||
"ramipril", "bisoprolol", "furosemide", "lasilix", "kardegic",
|
||||
"lovenox", "spasfon", "perfalgan", "morphine", "tramadol",
|
||||
"ketoprofene", "profenid", "omeprazole", "pantoprazole", "lanzor",
|
||||
"atorvastatine", "simvastatine", "levothyrox", "cordarone",
|
||||
"amiodarone", "digoxine", "warfarine", "coumadine", "xarelto",
|
||||
"eliquis", "pradaxa", "dabigatran", "rivaroxaban", "apixaban",
|
||||
"methotrexate", "salbutamol", "ventoline", "seretide", "spiriva",
|
||||
"cortancyl", "prednisone", "prednisolone", "solupred", "celestene",
|
||||
"dexamethasone", "hydrocortisone", "zymad", "uvedose", "calcidose",
|
||||
"diffu-k", "potassium", "magnesium", "fer", "tardyferon", "speciafoldine",
|
||||
"acide folique", "vitamine", "enoxaparine", "tinzaparine", "fondaparinux",
|
||||
"arixtra", "clopidogrel", "plavix", "ticagrelor", "brilique",
|
||||
}
|
||||
|
||||
|
||||
def _is_medication(text: str) -> str | None:
|
||||
"""Extrait le nom du medicament si c'est un vrai traitement."""
|
||||
if not text or len(text) < 3:
|
||||
return None
|
||||
# Nettoyer
|
||||
words = text.strip().lower().split()
|
||||
if not words:
|
||||
return None
|
||||
first = words[0].rstrip(".,;:")
|
||||
|
||||
# Rejeter les phrases (>4 mots sans chiffre de posologie)
|
||||
if len(words) > 6 and not any(c.isdigit() for c in text):
|
||||
return None
|
||||
|
||||
# Rejeter les patterns evidents de non-medicament
|
||||
reject_starts = (
|
||||
"ce document", "parents", "il pourra", "document",
|
||||
"prévoir", "réévaluation", "evènement", "transfusion",
|
||||
"note", "consultation", "histoire", "pas de", "suite",
|
||||
"dr.", "mme", "mr.", "bilan", "a revoir", "rdv",
|
||||
)
|
||||
text_lower = text.lower().strip()
|
||||
if any(text_lower.startswith(r) for r in reject_starts):
|
||||
return None
|
||||
|
||||
# Check connu
|
||||
if first in _MED_KNOWN:
|
||||
return first
|
||||
for known in _MED_KNOWN:
|
||||
if known in text_lower[:40]:
|
||||
return known
|
||||
|
||||
# Check suffixe
|
||||
if _MED_SUFFIXES.search(first) and len(first) >= 4:
|
||||
return first
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def load_dossiers(structured_dir: str) -> list[dict]:
    """Load all unique dossiers from ``output/structured/``.

    Each sub-directory is expected to be named ``<prefix>_<NDA>`` and to
    contain at least one ``*_cim10.json`` file.  Only one dossier is kept
    per NDA (admission number): the first parseable file of the first
    directory seen, in sorted directory order.

    Args:
        structured_dir: Directory containing one sub-directory per dossier.

    Returns:
        List of parsed JSON documents, one per unique NDA.
    """
    dossiers: list[dict] = []
    seen_nda: set[str] = set()

    for d in sorted(os.listdir(structured_dir)):
        full = os.path.join(structured_dir, d)
        # Skip plain files, the pseudonymised mirror, and directories whose
        # name does not follow the "<prefix>_<NDA>" convention.
        if not os.path.isdir(full) or d == "pseudonymise":
            continue
        if "_" not in d:
            continue

        nda = d.split("_", 1)[1]
        if nda in seen_nda:
            # Same admission already loaded from another directory: skip
            # before parsing anything (the original parsed then discarded).
            continue

        for f in os.listdir(full):
            if not f.endswith("_cim10.json"):
                continue
            try:
                # Context manager closes the handle (the original leaked it
                # via json.load(open(...))); only I/O and JSON errors are
                # swallowed, so real bugs still surface.
                with open(os.path.join(full, f), encoding="utf-8") as fh:
                    data = json.load(fh)
            except (OSError, json.JSONDecodeError):
                continue  # corrupt or unreadable file: try the next one
            seen_nda.add(nda)
            dossiers.append(data)
            break  # one dossier per NDA

    return dossiers
|
||||
|
||||
|
||||
def build_dictionary(dossiers: list[dict]) -> dict:
    """Build the coding dictionary from structured batch results.

    Aggregates, over all dossiers:
      * DP / DAS frequencies and the most frequent wording per code,
      * DP -> DAS and DP -> CCAM-act co-occurrences,
      * abnormal-biology -> DAS and discharge-treatment -> DAS associations
        (keyed on the 3-character CIM-10 prefix),
      * observed age statistics per DP.

    Args:
        dossiers: Parsed ``*_cim10.json`` documents (see ``load_dossiers``).

    Returns:
        JSON-serialisable dict with keys ``metadata``, ``dp``, ``das``,
        ``dp_das_cooccurrence``, ``dp_acte_cooccurrence``,
        ``das_bio_association`` and ``das_treatment_association``.
    """
    dp_freq = Counter()
    das_freq = Counter()
    dp_das = defaultdict(Counter)
    dp_acte = defaultdict(Counter)
    das_bio = defaultdict(Counter)
    das_treatment = defaultdict(Counter)
    dp_texte_counter = defaultdict(Counter)  # dp_code -> Counter of wordings
    das_texte_counter = defaultdict(Counter)
    age_dp = defaultdict(list)
    # (A (duree_sejour, n_DAS) accumulator existed here but was never read;
    # it has been removed as dead code.)

    for data in dossiers:
        dp = data.get("diagnostic_principal", {})
        dp_code = (dp.get("cim10_final") or dp.get("cim10_suggestion") or "").strip()
        dp_text = (dp.get("texte") or "").strip()

        das_codes = []
        for das in data.get("diagnostics_associes", []):
            c = (das.get("cim10_final") or das.get("cim10_suggestion") or "").strip()
            t = (das.get("texte") or "").strip()
            if c:
                das_codes.append(c)
                # NOTE: counted per occurrence; a code repeated inside one
                # dossier can push "pct" above a strict per-dossier rate.
                das_freq[c] += 1
                if t:
                    das_texte_counter[c][t] += 1

        if dp_code:
            dp_freq[dp_code] += 1
            if dp_text:
                dp_texte_counter[dp_code][dp_text] += 1
            for c in das_codes:
                dp_das[dp_code][c] += 1

        # CCAM acts: tolerate the three field names seen in batch outputs.
        for a in data.get("actes_ccam", []):
            code = (
                a.get("code_ccam")
                or a.get("ccam_suggestion")
                or a.get("code_ccam_suggestion")
                or ""
            ).strip()
            if code and dp_code:
                dp_acte[dp_code][code] += 1

        # Abnormal biology -> DAS, aggregated on the 3-char CIM-10 prefix.
        abnormal = [
            b.get("test", "")
            for b in data.get("biologie_cle", [])
            if b.get("anomalie")
        ]
        for c in das_codes:
            c3 = c[:3]
            for bt in abnormal:
                if bt:
                    das_bio[c3][bt] += 1

        # Discharge treatments -> DAS (real drugs only, see _is_medication).
        for t in data.get("traitements_sortie", []):
            med = _is_medication(t.get("medicament", ""))
            if med:
                for c in das_codes:
                    das_treatment[c[:3]][med] += 1

        # Stay metadata: age per DP (used downstream for atypical-age checks).
        sejour = data.get("sejour", {})
        age = sejour.get("age")
        if age is not None and dp_code:
            age_dp[dp_code].append(age)

    # Most frequent wording observed for each code.
    dp_texte = {code: cnt.most_common(1)[0][0] for code, cnt in dp_texte_counter.items()}
    das_texte = {code: cnt.most_common(1)[0][0] for code, cnt in das_texte_counter.items()}

    dictionary = {
        "metadata": {
            "n_dossiers": len(dossiers),
            "n_dp_distinct": len(dp_freq),
            "n_das_distinct": len(das_freq),
            "version": 1,
        },
        "dp": {},
        "das": {},
        "dp_das_cooccurrence": {},
        "dp_acte_cooccurrence": {},
        "das_bio_association": {},
        "das_treatment_association": {},
    }

    # DP entries, most frequent first.
    for code, n in dp_freq.most_common():
        entry = {"freq": n, "texte": dp_texte.get(code, "")}
        ages = age_dp.get(code, [])
        if ages:
            entry["age_moy"] = round(sum(ages) / len(ages), 1)
            entry["age_min"] = min(ages)
            entry["age_max"] = max(ages)
        dictionary["dp"][code] = entry

    # DAS entries.  len(dossiers) > 0 is guaranteed in this loop: das_freq
    # can only be non-empty if at least one dossier was processed.
    for code, n in das_freq.most_common():
        dictionary["das"][code] = {
            "freq": n,
            "texte": das_texte.get(code, ""),
            "pct": round(100 * n / len(dossiers), 1),
        }

    # DP -> DAS co-occurrences (top 30 per DP, threshold >= 2).
    for dp_code, das_counter in dp_das.items():
        pairs = {
            das_code: count
            for das_code, count in das_counter.most_common(30)
            if count >= 2
        }
        if pairs:
            dictionary["dp_das_cooccurrence"][dp_code] = pairs

    # DP -> CCAM act co-occurrences (top 10 per DP, no threshold).
    for dp_code, acte_counter in dp_acte.items():
        pairs = dict(acte_counter.most_common(10))
        if pairs:
            dictionary["dp_acte_cooccurrence"][dp_code] = pairs

    # DAS -> abnormal biology (top 5 per 3-char prefix, threshold >= 3).
    for das3, bio_counter in das_bio.items():
        top = {
            test: count
            for test, count in bio_counter.most_common(5)
            if count >= 3
        }
        if top:
            dictionary["das_bio_association"][das3] = top

    # DAS -> discharge treatments (top 5 per 3-char prefix, threshold >= 3).
    for das3, trt_counter in das_treatment.items():
        top = {
            med: count
            for med, count in trt_counter.most_common(5)
            if count >= 3
        }
        if top:
            dictionary["das_treatment_association"][das3] = top

    return dictionary
|
||||
|
||||
|
||||
def main():
    """CLI entry point: read batch outputs and write the coding dictionary.

    Paths given via --input/--output are resolved relative to the project
    root (the parent of the scripts/ directory holding this file).
    """
    arg_parser = argparse.ArgumentParser(description="Build coding dictionary from batch results")
    arg_parser.add_argument(
        "--input",
        default="output/structured",
        help="Directory containing structured outputs",
    )
    arg_parser.add_argument(
        "--output",
        default="config/coding_dictionary.json",
        help="Output dictionary JSON path",
    )
    opts = arg_parser.parse_args()

    root = Path(__file__).resolve().parent.parent
    in_dir = root / opts.input
    out_file = root / opts.output

    print(f"Loading dossiers from {in_dir}...")
    dossiers = load_dossiers(str(in_dir))
    print(f"Loaded {len(dossiers)} dossiers")

    print("Building dictionary...")
    dictionary = build_dictionary(dossiers)

    out_file.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(dictionary, ensure_ascii=False, indent=2)
    out_file.write_text(payload, encoding="utf-8")

    # Operator-facing summary of what was written.
    meta = dictionary["metadata"]
    print(f"\nDictionary written to {out_file}")
    print(f" {meta['n_dossiers']} dossiers")
    print(f" {meta['n_dp_distinct']} DP distincts")
    print(f" {meta['n_das_distinct']} DAS distincts")
    print(f" {len(dictionary['dp_das_cooccurrence'])} DP avec co-occurrences")
    print(f" {len(dictionary['das_bio_association'])} DAS3 avec associations bio")
    print(f" {len(dictionary['das_treatment_association'])} DAS3 avec associations traitement")
|
||||
|
||||
|
||||
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user