#!/usr/bin/env python3
"""Build the coding dictionary from the batch results.

Walks output/structured/ and generates config/coding_dictionary.json with the
observed co-occurrences, frequencies and associations.

Usage:
    python3 scripts/build_coding_dict.py [--input output/structured]
        [--output config/coding_dictionary.json]
"""

from __future__ import annotations

import argparse
import json
import os
import re
import sys
from collections import Counter, defaultdict
from pathlib import Path

# Heuristic: pick out actual medications among the free-text treatment entries.
_MED_SUFFIXES = re.compile(
    r"(ine|ide|ol|one|ate|ase|mab|nib|zol|pam|lam|zide|pine|pril|tan|"
    r"oxine|xone|dine|mide|fene|phene|mine|sone|lone|done|cine|il|"
    r"lin|ril|mox|tine|zine|vir|cin)$",
    re.IGNORECASE,
)

_MED_KNOWN = {
    "insuline", "heparine", "paracetamol", "doliprane", "aspirine",
    "augmentin", "ceftriaxone", "amoxicilline", "metformine", "amlodipine",
    "ramipril", "bisoprolol", "furosemide", "lasilix", "kardegic", "lovenox",
    "spasfon", "perfalgan", "morphine", "tramadol", "ketoprofene", "profenid",
    "omeprazole", "pantoprazole", "lanzor", "atorvastatine", "simvastatine",
    "levothyrox", "cordarone", "amiodarone", "digoxine", "warfarine",
    "coumadine", "xarelto", "eliquis", "pradaxa", "dabigatran", "rivaroxaban",
    "apixaban", "methotrexate", "salbutamol", "ventoline", "seretide",
    "spiriva", "cortancyl", "prednisone", "prednisolone", "solupred",
    "celestene", "dexamethasone", "hydrocortisone", "zymad", "uvedose",
    "calcidose", "diffu-k", "potassium", "magnesium", "fer", "tardyferon",
    "speciafoldine", "acide folique", "vitamine", "enoxaparine",
    "tinzaparine", "fondaparinux", "arixtra", "clopidogrel", "plavix",
    "ticagrelor", "brilique",
}

# Lower-case prefixes that mark an entry as obviously not a medication.
_REJECT_PREFIXES = (
    "ce document", "parents", "il pourra", "document", "prévoir",
    "réévaluation", "evènement", "transfusion", "note", "consultation",
    "histoire", "pas de", "suite", "dr.", "mme", "mr.", "bilan", "a revoir",
    "rdv",
)


def _is_medication(text: str) -> str | None:
    """Return the medication name found in *text*, or None.

    The checks are applied in this order:

    1. reject entries shorter than 3 characters, sentences of more than
       6 words that contain no digit, and entries starting with one of
       ``_REJECT_PREFIXES``;
    2. accept the first word when it is in ``_MED_KNOWN``;
    3. accept a ``_MED_KNOWN`` name found in the first 40 characters;
    4. accept the first word when it is at least 4 characters long and ends
       with one of the ``_MED_SUFFIXES``.

    The returned name is always lower-case.
    """
    if not isinstance(text, str) or len(text) < 3:
        return None

    text_lower = text.lower().strip()
    words = text_lower.split()
    if not words:
        return None
    first = words[0].rstrip(".,;:")

    # Reject sentences (more than 6 words without any dosage digit).
    if len(words) > 6 and not any(ch.isdigit() for ch in text):
        return None

    # Reject the obvious non-medication patterns.
    if text_lower.startswith(_REJECT_PREFIXES):
        return None

    # Known medication as first word.
    if first in _MED_KNOWN:
        return first

    # Known medication anywhere in the head of the text.  Several names may
    # match and set iteration order is not stable across runs, so pick the
    # earliest occurrence (longest name on ties) to keep the output
    # reproducible.
    head = text_lower[:40]
    hits = [
        (head.find(known), -len(known), known)
        for known in _MED_KNOWN
        if known in head
    ]
    if hits:
        return min(hits)[2]

    # Suffix heuristic on the first word.
    if len(first) >= 4 and _MED_SUFFIXES.search(first):
        return first
    return None


def load_dossiers(structured_dir: str) -> list[dict]:
    """Load the unique dossiers found under *structured_dir*.

    Only sub-directories whose name contains an underscore are considered,
    and the directory named ``pseudonymise`` is skipped.  The part of the
    directory name after the first underscore (the "NDA" in this file) is the
    de-duplication key: directories are visited in sorted order and only the
    first successfully loaded ``*_cim10.json`` file per NDA is kept.

    Files that cannot be read or parsed, or that do not contain a JSON
    object, are reported on stderr and skipped.

    Raises:
        OSError: if *structured_dir* (or one of its sub-directories) cannot
            be listed.
    """
    dossiers: list[dict] = []
    seen_nda: set[str] = set()

    for dirname in sorted(os.listdir(structured_dir)):
        full = os.path.join(structured_dir, dirname)
        if dirname == "pseudonymise" or "_" not in dirname:
            continue
        if not os.path.isdir(full):
            continue

        nda = dirname.split("_", 1)[1]
        if nda in seen_nda:
            continue

        # Sorted so that the file chosen for a directory is reproducible.
        for filename in sorted(os.listdir(full)):
            if not filename.endswith("_cim10.json"):
                continue
            path = os.path.join(full, filename)
            try:
                with open(path, encoding="utf-8") as handle:
                    data = json.load(handle)
            except (OSError, ValueError) as exc:
                # JSONDecodeError and UnicodeDecodeError are ValueErrors.
                print(f"Warning: skipping {path}: {exc}", file=sys.stderr)
                continue
            if not isinstance(data, dict):
                print(
                    f"Warning: skipping {path}: not a JSON object",
                    file=sys.stderr,
                )
                continue
            seen_nda.add(nda)
            dossiers.append(data)
            break

    return dossiers


def _first_code(entry: dict, *keys: str) -> str:
    """Return the stripped value of the first truthy key of *entry*, or ''."""
    for key in keys:
        value = entry.get(key)
        if value:
            return value.strip()
    return ""


def _dict_items(value) -> list[dict]:
    """Return the dict elements of *value*, tolerating null or malformed data."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, dict)]


def _associations(table: dict, limit: int, min_count: int = 1) -> dict:
    """Keep, per key, the *limit* most common items counted >= *min_count*.

    Keys left with no item are dropped from the result.
    """
    result = {}
    for key, counter in table.items():
        top = {
            item: count
            for item, count in counter.most_common(limit)
            if count >= min_count
        }
        if top:
            result[key] = top
    return result


def build_dictionary(dossiers: list[dict]) -> dict:
    """Build the coding dictionary from the loaded dossiers.

    Args:
        dossiers: dossier dicts as returned by ``load_dossiers``.  The keys
            read are ``diagnostic_principal`` (DP), ``diagnostics_associes``
            (DAS), ``actes_ccam``, ``biologie_cle``, ``traitements_sortie``
            and ``sejour``; missing, null or malformed sections are treated
            as empty.

    Returns:
        A JSON-serialisable dict with the sections ``metadata``, ``dp``,
        ``das``, ``dp_das_cooccurrence``, ``dp_acte_cooccurrence``,
        ``das_bio_association`` and ``das_treatment_association``.  The bio
        and treatment associations are keyed by the first three characters
        of the DAS code.
    """
    dp_freq: Counter = Counter()
    das_freq: Counter = Counter()
    dp_das = defaultdict(Counter)
    dp_acte = defaultdict(Counter)
    das_bio = defaultdict(Counter)
    das_treatment = defaultdict(Counter)
    dp_texte_counter = defaultdict(Counter)
    das_texte_counter = defaultdict(Counter)
    age_dp = defaultdict(list)

    for data in dossiers:
        dp = data.get("diagnostic_principal")
        if not isinstance(dp, dict):
            dp = {}
        dp_code = _first_code(dp, "cim10_final", "cim10_suggestion")
        dp_text = (dp.get("texte") or "").strip()

        das_codes = []
        for das in _dict_items(data.get("diagnostics_associes")):
            code = _first_code(das, "cim10_final", "cim10_suggestion")
            if not code:
                continue
            das_codes.append(code)
            das_freq[code] += 1
            das_text = (das.get("texte") or "").strip()
            if das_text:
                das_texte_counter[code][das_text] += 1

        if dp_code:
            dp_freq[dp_code] += 1
            if dp_text:
                dp_texte_counter[dp_code][dp_text] += 1
            for code in das_codes:
                dp_das[dp_code][code] += 1

            # Procedures (actes) observed with this DP.
            for acte in _dict_items(data.get("actes_ccam")):
                acte_code = _first_code(
                    acte, "code_ccam", "ccam_suggestion", "code_ccam_suggestion"
                )
                if acte_code:
                    dp_acte[dp_code][acte_code] += 1

        # Abnormal lab tests -> DAS (3-character prefix).
        abnormal = [
            bio.get("test")
            for bio in _dict_items(data.get("biologie_cle"))
            if bio.get("anomalie") and bio.get("test")
        ]
        for code in das_codes:
            for test in abnormal:
                das_bio[code[:3]][test] += 1

        # Discharge treatments -> DAS (3-character prefix).
        for treatment in _dict_items(data.get("traitements_sortie")):
            med = _is_medication(treatment.get("medicament", ""))
            if med:
                for code in das_codes:
                    das_treatment[code[:3]][med] += 1

        # Stay metadata.
        sejour = data.get("sejour")
        if not isinstance(sejour, dict):
            sejour = {}
        age = sejour.get("age")
        if age is not None and dp_code:
            age_dp[dp_code].append(age)

    # Most frequent text per code.
    dp_texte = {
        code: counter.most_common(1)[0][0]
        for code, counter in dp_texte_counter.items()
    }
    das_texte = {
        code: counter.most_common(1)[0][0]
        for code, counter in das_texte_counter.items()
    }

    dictionary = {
        "metadata": {
            "n_dossiers": len(dossiers),
            "n_dp_distinct": len(dp_freq),
            "n_das_distinct": len(das_freq),
            "version": 1,
        },
        "dp": {},
        "das": {},
        "dp_das_cooccurrence": {},
        "dp_acte_cooccurrence": {},
        "das_bio_association": {},
        "das_treatment_association": {},
    }

    # DP entries, most frequent first.
    for code, n in dp_freq.most_common():
        entry = {"freq": n, "texte": dp_texte.get(code, "")}
        ages = age_dp.get(code, [])
        if ages:
            entry["age_moy"] = round(sum(ages) / len(ages), 1)
            entry["age_min"] = min(ages)
            entry["age_max"] = max(ages)
        dictionary["dp"][code] = entry

    # DAS entries, most frequent first.  das_freq is only non-empty when
    # there is at least one dossier, so the division below is safe.
    for code, n in das_freq.most_common():
        dictionary["das"][code] = {
            "freq": n,
            "texte": das_texte.get(code, ""),
            "pct": round(100 * n / len(dossiers), 1),
        }

    # DP -> DAS co-occurrences (top 30, count >= 2).
    dictionary["dp_das_cooccurrence"] = _associations(dp_das, 30, 2)
    # DP -> procedure co-occurrences (top 10, no threshold).
    dictionary["dp_acte_cooccurrence"] = _associations(dp_acte, 10)
    # DAS -> lab tests (top 5 per DAS prefix, count >= 3).
    dictionary["das_bio_association"] = _associations(das_bio, 5, 3)
    # DAS -> treatments (top 5 per DAS prefix, count >= 3).
    dictionary["das_treatment_association"] = _associations(das_treatment, 5, 3)

    return dictionary


def main() -> None:
    """Command-line entry point: load the dossiers and write the dictionary.

    ``--input`` and ``--output`` are resolved against the parent of the
    directory containing this script.
    """
    parser = argparse.ArgumentParser(
        description="Build coding dictionary from batch results"
    )
    parser.add_argument(
        "--input",
        default="output/structured",
        help="Directory containing structured outputs",
    )
    parser.add_argument(
        "--output",
        default="config/coding_dictionary.json",
        help="Output dictionary JSON path",
    )
    args = parser.parse_args()

    project_root = Path(__file__).resolve().parent.parent
    input_dir = project_root / args.input
    output_path = project_root / args.output

    if not input_dir.is_dir():
        # Fail with a usage error rather than a raw traceback.
        parser.error(f"input directory not found: {input_dir}")

    print(f"Loading dossiers from {input_dir}...")
    dossiers = load_dossiers(str(input_dir))
    print(f"Loaded {len(dossiers)} dossiers")

    print("Building dictionary...")
    dictionary = build_dictionary(dossiers)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(
        json.dumps(dictionary, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # Summary.
    meta = dictionary["metadata"]
    print(f"\nDictionary written to {output_path}")
    print(f" {meta['n_dossiers']} dossiers")
    print(f" {meta['n_dp_distinct']} DP distincts")
    print(f" {meta['n_das_distinct']} DAS distincts")
    print(f" {len(dictionary['dp_das_cooccurrence'])} DP avec co-occurrences")
    print(f" {len(dictionary['das_bio_association'])} DAS3 avec associations bio")
    print(f" {len(dictionary['das_treatment_association'])} DAS3 avec associations traitement")


if __name__ == "__main__":
    main()