Files
t2a_v2/scripts/build_coding_dict.py
dom a371626f40 feat: dictionnaire de codage + détection anomalies statistiques
- Script build_coding_dict.py génère le dictionnaire depuis le batch (240 dossiers)
- coding_dictionary.json : co-occurrences DP→DAS, fréquences, associations bio
- anomaly_stats.py : 8 checks (DP/DAS rare, DAS manquant, bio-DAS, âge atypique)
- Intégré dans le pipeline cim10_extractor post-DIM-senior

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 23:48:36 +01:00

315 lines
10 KiB
Python

#!/usr/bin/env python3
"""Build the coding dictionary from the batch results.

Walks output/structured/ and generates config/coding_dictionary.json
with the observed co-occurrences, frequencies and associations.

Usage:
    python3 scripts/build_coding_dict.py [--input output/structured] [--output config/coding_dictionary.json]
"""
from __future__ import annotations
import argparse
import json
import os
import re
import sys
from collections import Counter, defaultdict
from pathlib import Path
# Heuristique : filtrer les vrais medicaments dans les traitements
_MED_SUFFIXES = re.compile(
r"(ine|ide|ol|one|ate|ase|mab|nib|zol|pam|lam|zide|pine|pril|tan|"
r"oxine|xone|dine|mide|fene|phene|mine|sone|lone|done|cine|il|"
r"lin|ril|mox|tine|zine|vir|cin)$",
re.IGNORECASE,
)
_MED_KNOWN = {
"insuline", "heparine", "paracetamol", "doliprane", "aspirine",
"augmentin", "ceftriaxone", "amoxicilline", "metformine", "amlodipine",
"ramipril", "bisoprolol", "furosemide", "lasilix", "kardegic",
"lovenox", "spasfon", "perfalgan", "morphine", "tramadol",
"ketoprofene", "profenid", "omeprazole", "pantoprazole", "lanzor",
"atorvastatine", "simvastatine", "levothyrox", "cordarone",
"amiodarone", "digoxine", "warfarine", "coumadine", "xarelto",
"eliquis", "pradaxa", "dabigatran", "rivaroxaban", "apixaban",
"methotrexate", "salbutamol", "ventoline", "seretide", "spiriva",
"cortancyl", "prednisone", "prednisolone", "solupred", "celestene",
"dexamethasone", "hydrocortisone", "zymad", "uvedose", "calcidose",
"diffu-k", "potassium", "magnesium", "fer", "tardyferon", "speciafoldine",
"acide folique", "vitamine", "enoxaparine", "tinzaparine", "fondaparinux",
"arixtra", "clopidogrel", "plavix", "ticagrelor", "brilique",
}
def _is_medication(text: str) -> str | None:
"""Extrait le nom du medicament si c'est un vrai traitement."""
if not text or len(text) < 3:
return None
# Nettoyer
words = text.strip().lower().split()
if not words:
return None
first = words[0].rstrip(".,;:")
# Rejeter les phrases (>4 mots sans chiffre de posologie)
if len(words) > 6 and not any(c.isdigit() for c in text):
return None
# Rejeter les patterns evidents de non-medicament
reject_starts = (
"ce document", "parents", "il pourra", "document",
"prévoir", "réévaluation", "evènement", "transfusion",
"note", "consultation", "histoire", "pas de", "suite",
"dr.", "mme", "mr.", "bilan", "a revoir", "rdv",
)
text_lower = text.lower().strip()
if any(text_lower.startswith(r) for r in reject_starts):
return None
# Check connu
if first in _MED_KNOWN:
return first
for known in _MED_KNOWN:
if known in text_lower[:40]:
return known
# Check suffixe
if _MED_SUFFIXES.search(first) and len(first) >= 4:
return first
return None
def load_dossiers(structured_dir: str) -> list[dict]:
    """Load one ``*_cim10.json`` document per distinct NDA.

    Sub-directories of ``structured_dir`` are visited in sorted order. A
    sub-directory is considered only if its name contains an underscore and
    is not ``pseudonymise``; the part after the first underscore is used as
    the NDA key for de-duplication. Inside a directory, files are tried in
    sorted order and the first one that parses to a JSON object is kept.

    Unreadable, malformed or non-object files are skipped with a warning on
    stderr instead of aborting the whole load.

    Args:
        structured_dir: Directory containing one sub-directory per dossier.

    Returns:
        The parsed JSON objects, at most one per NDA.

    Raises:
        OSError: If ``structured_dir`` or a dossier directory cannot be listed.
    """
    dossiers: list[dict] = []
    seen_nda: set[str] = set()

    for dirname in sorted(os.listdir(structured_dir)):
        full = os.path.join(structured_dir, dirname)
        if dirname == "pseudonymise" or "_" not in dirname:
            continue
        if not os.path.isdir(full):
            continue

        nda = dirname.split("_", 1)[1]
        if nda in seen_nda:
            continue

        # Sorted so that the file kept for a dossier does not depend on the
        # filesystem's listing order.
        for filename in sorted(os.listdir(full)):
            if not filename.endswith("_cim10.json"):
                continue
            path = os.path.join(full, filename)
            try:
                # Binary mode lets the json module detect UTF-8/16/32 itself
                # instead of relying on the locale's default encoding.
                with open(path, "rb") as fh:
                    data = json.load(fh)
            except (OSError, ValueError) as exc:
                print(f"WARNING: skipping {path}: {exc}", file=sys.stderr)
                continue
            if not isinstance(data, dict):
                print(
                    f"WARNING: skipping {path}: top-level JSON value is not an object",
                    file=sys.stderr,
                )
                continue
            seen_nda.add(nda)
            dossiers.append(data)
            break

    return dossiers
def _as_dict(value) -> dict:
    """Return ``value`` if it is a dict, else an empty dict (tolerates JSON null)."""
    return value if isinstance(value, dict) else {}


def _dict_items(value) -> list:
    """Return the dict elements of a list value; any other value yields []."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, dict)]


def _top_text(counter) -> str:
    """Return the most frequent key of a Counter, or "" when it is empty/None."""
    return counter.most_common(1)[0][0] if counter else ""


def build_dictionary(dossiers: list[dict]) -> dict:
    """Aggregate the dossiers into the coding dictionary.

    For every dossier the function reads ``diagnostic_principal``,
    ``diagnostics_associes``, ``actes_ccam``, ``biologie_cle``,
    ``traitements_sortie`` and ``sejour``. Sections that are missing, null or
    of an unexpected type are treated as empty rather than raising.

    Args:
        dossiers: Parsed ``*_cim10.json`` objects.

    Returns:
        A JSON-serialisable dict with the keys:

        * ``metadata``: dossier count, distinct DP / DAS counts, version;
        * ``dp``: per DP code, frequency, most frequent text and, when ages
          are available, mean / min / max age;
        * ``das``: per DAS code, frequency, most frequent text and ``pct``
          (frequency as a percentage of the number of dossiers);
        * ``dp_das_cooccurrence``: per DP, DAS seen at least twice among the
          30 most frequent;
        * ``dp_acte_cooccurrence``: per DP, the 10 most frequent CCAM codes;
        * ``das_bio_association`` / ``das_treatment_association``: per
          3-character DAS prefix, entries seen at least 3 times among the
          top 5 abnormal lab tests / medications.
    """
    dp_freq = Counter()
    das_freq = Counter()
    dp_das = defaultdict(Counter)
    dp_acte = defaultdict(Counter)
    das_bio = defaultdict(Counter)
    das_treatment = defaultdict(Counter)
    dp_texte_counter = defaultdict(Counter)
    das_texte_counter = defaultdict(Counter)
    age_dp = defaultdict(list)

    for data in dossiers:
        dp = _as_dict(data.get("diagnostic_principal"))
        dp_code = (dp.get("cim10_final") or dp.get("cim10_suggestion") or "").strip()
        dp_text = (dp.get("texte") or "").strip()

        # Associated diagnoses.
        # NOTE(review): a code listed twice in one dossier is counted twice,
        # so "pct" below is occurrences / dossiers and may exceed 100.
        das_codes = []
        for das in _dict_items(data.get("diagnostics_associes")):
            code = (das.get("cim10_final") or das.get("cim10_suggestion") or "").strip()
            if not code:
                continue
            das_codes.append(code)
            das_freq[code] += 1
            text = (das.get("texte") or "").strip()
            if text:
                das_texte_counter[code][text] += 1

        if dp_code:
            dp_freq[dp_code] += 1
            if dp_text:
                dp_texte_counter[dp_code][dp_text] += 1
            for code in das_codes:
                dp_das[dp_code][code] += 1

        # Procedures (CCAM); the code may be stored under several keys.
        for acte in _dict_items(data.get("actes_ccam")):
            acte_code = (
                acte.get("code_ccam")
                or acte.get("ccam_suggestion")
                or acte.get("code_ccam_suggestion")
                or ""
            ).strip()
            if acte_code and dp_code:
                dp_acte[dp_code][acte_code] += 1

        # Abnormal lab tests -> DAS (grouped by 3-character code prefix).
        abnormal = [
            bio.get("test", "")
            for bio in _dict_items(data.get("biologie_cle"))
            if bio.get("anomalie")
        ]
        abnormal = [test for test in abnormal if test]
        for code in das_codes:
            for test in abnormal:
                das_bio[code[:3]][test] += 1

        # Discharge treatments -> DAS (grouped by 3-character code prefix).
        for treatment in _dict_items(data.get("traitements_sortie")):
            med = _is_medication(treatment.get("medicament", ""))
            if med:
                for code in das_codes:
                    das_treatment[code[:3]][med] += 1

        # Stay metadata: only the age is aggregated, per DP.
        age = _as_dict(data.get("sejour")).get("age")
        if age is not None and dp_code:
            age_dp[dp_code].append(age)

    dictionary = {
        "metadata": {
            "n_dossiers": len(dossiers),
            "n_dp_distinct": len(dp_freq),
            "n_das_distinct": len(das_freq),
            "version": 1,
        },
        "dp": {},
        "das": {},
        "dp_das_cooccurrence": {},
        "dp_acte_cooccurrence": {},
        "das_bio_association": {},
        "das_treatment_association": {},
    }

    # DP, most frequent first.
    for code, n in dp_freq.most_common():
        entry = {"freq": n, "texte": _top_text(dp_texte_counter.get(code))}
        ages = age_dp.get(code, [])
        if ages:
            entry["age_moy"] = round(sum(ages) / len(ages), 1)
            entry["age_min"] = min(ages)
            entry["age_max"] = max(ages)
        dictionary["dp"][code] = entry

    # DAS, most frequent first. das_freq is empty when dossiers is empty, so
    # the division by len(dossiers) cannot hit zero.
    for code, n in das_freq.most_common():
        dictionary["das"][code] = {
            "freq": n,
            "texte": _top_text(das_texte_counter.get(code)),
            "pct": round(100 * n / len(dossiers), 1),
        }

    # DP -> DAS co-occurrences (top 30, threshold >= 2).
    for dp_code, das_counter in dp_das.items():
        pairs = {
            das_code: count
            for das_code, count in das_counter.most_common(30)
            if count >= 2
        }
        if pairs:
            dictionary["dp_das_cooccurrence"][dp_code] = pairs

    # DP -> procedure co-occurrences (top 10, no threshold).
    for dp_code, acte_counter in dp_acte.items():
        pairs = dict(acte_counter.most_common(10))
        if pairs:
            dictionary["dp_acte_cooccurrence"][dp_code] = pairs

    # DAS -> lab tests (top 5 per DAS prefix, threshold >= 3).
    for das3, bio_counter in das_bio.items():
        top = {
            test: count
            for test, count in bio_counter.most_common(5)
            if count >= 3
        }
        if top:
            dictionary["das_bio_association"][das3] = top

    # DAS -> treatments (top 5 per DAS prefix, threshold >= 3).
    for das3, trt_counter in das_treatment.items():
        top = {
            med: count
            for med, count in trt_counter.most_common(5)
            if count >= 3
        }
        if top:
            dictionary["das_treatment_association"][das3] = top

    return dictionary
def main():
    """Command-line entry point: load the batch, build and write the dictionary.

    ``--input`` and ``--output`` are resolved against the project root (the
    parent of the directory containing this script). The command stops with
    an error, without touching the output file, when the input directory
    does not exist or when no dossier could be loaded, so that an existing
    dictionary is never replaced by an empty one.
    """
    parser = argparse.ArgumentParser(description="Build coding dictionary from batch results")
    parser.add_argument(
        "--input",
        default="output/structured",
        help="Directory containing structured outputs",
    )
    parser.add_argument(
        "--output",
        default="config/coding_dictionary.json",
        help="Output dictionary JSON path",
    )
    args = parser.parse_args()

    project_root = Path(__file__).resolve().parent.parent
    input_dir = project_root / args.input
    output_path = project_root / args.output

    # Fail with a usage error rather than a FileNotFoundError traceback.
    if not input_dir.is_dir():
        parser.error(f"input directory not found: {input_dir}")

    print(f"Loading dossiers from {input_dir}...")
    dossiers = load_dossiers(str(input_dir))
    print(f"Loaded {len(dossiers)} dossiers")

    if not dossiers:
        print(
            f"ERROR: no dossier found in {input_dir}; {output_path} left unchanged",
            file=sys.stderr,
        )
        sys.exit(1)

    print("Building dictionary...")
    dictionary = build_dictionary(dossiers)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(
        json.dumps(dictionary, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # Summary
    meta = dictionary["metadata"]
    print(f"\nDictionary written to {output_path}")
    print(f" {meta['n_dossiers']} dossiers")
    print(f" {meta['n_dp_distinct']} DP distincts")
    print(f" {meta['n_das_distinct']} DAS distincts")
    print(f" {len(dictionary['dp_das_cooccurrence'])} DP avec co-occurrences")
    print(f" {len(dictionary['das_bio_association'])} DAS3 avec associations bio")
    print(f" {len(dictionary['das_treatment_association'])} DAS3 avec associations traitement")


if __name__ == "__main__":
    main()