#!/usr/bin/env python3 """ Exercices de discrimination entre codes CIM-10 siblings (même parent). Stratégie : - Utilise la hiérarchie FHIR pour identifier les groupes de siblings - Focus sur les top 100 familles CIM-10 les plus fréquentes du pipeline - Pour chaque groupe, génère un scénario clinique via Claude Opus 4.6 - La réponse explique pourquoi un code et pas l'autre Cible : 800 exemples Sources : smt_cim10_fhir.json + cache Ollama (codes fréquents) Nécessite : ANTHROPIC_API_KEY en variable d'environnement Usage : python scripts/15_generate_discrimination.py [--dry-run] [--max N] """ import argparse import json import os import random import re import sys import time from collections import Counter from pathlib import Path random.seed(42) BASE = Path(__file__).resolve().parent.parent T2A = BASE.parent / "t2a" RAW = BASE / "data" / "raw" OUTPUT = BASE / "data" / "processed" / "discrimination_chatml.jsonl" OUTPUT.parent.mkdir(parents=True, exist_ok=True) MODEL = "claude-opus-4-6" SYSTEM_PROMPT = ( "Tu es un médecin DIM expert en codage CIM-10 pour le PMSI français. " "Tu sais discriminer les codes CIM-10 proches (siblings) et choisir le plus approprié." ) GENERATION_PROMPT = """Tu es un formateur DIM. Génère un exercice de discrimination entre codes CIM-10 proches. CODES À DISCRIMINER (même catégorie {parent}) : {codes_list} Génère un objet JSON avec : 1. "scenario" : un cas clinique réaliste (2-3 phrases) où le choix entre ces codes est subtil 2. "reponse" : un objet JSON contenant : - "analyse_clinique" : interprétation du cas clinique - "codes_candidats" : les 2-3 codes candidats et pourquoi chacun est envisagé - "discrimination" : la différence clé entre ces codes (inclusions, exclusions, spécificité) - "code" : le code correct pour ce scénario - "confidence" : "high" - "justification" : pourquoi CE code et pas les autres Réponds UNIQUEMENT avec le JSON, sans texte avant/après.""" def load_fhir() -> tuple[list, dict]: """Charger les concepts FHIR.""" fhir_path = RAW / "smt_cim10_fhir.json" data = json.loads(fhir_path.read_text()) concepts = data["concept"] by_code = {c["code"]: c for c in concepts} return concepts, by_code def get_parent(concept: dict) -> str: for p in concept.get("property", []): if p["code"] == "parent": return p.get("valueCode", "") return "" def get_type(concept: dict) -> str: for p in concept.get("property", []): if p["code"] == "type": return p.get("valueString", "") return "" def get_inclusion_note(concept: dict) -> str: for p in concept.get("property", []): if p["code"] == "inclusionNote": return p.get("valueString", "") return "" def get_exclusion_note(concept: dict) -> str: for p in concept.get("property", []): if p["code"] == "exclusionNote": return p.get("valueString", "") return "" def get_frequent_families() -> Counter: """Extraire les familles CIM-10 les plus fréquentes depuis le cache Ollama.""" families = Counter() for cache_path in [T2A / "data" / "ollama_cache.json", T2A / "data" / "ollama_cache_gemma3.bak"]: if not cache_path.exists(): continue data = json.loads(cache_path.read_text()) for entry in data.get("entries", {}).values(): code = entry.get("code", "") if code and len(code) >= 3 and code[0].isalpha(): families[code[:3]] += 1 return families def build_sibling_groups(concepts: list, by_code: dict) -> dict[str, list[dict]]: """Grouper les codes par parent (siblings).""" children_by_parent = {} for c in concepts: if get_type(c) != "category": continue parent = get_parent(c) if parent and parent in by_code: children_by_parent.setdefault(parent, []).append(c) return children_by_parent def format_codes_for_prompt(siblings: list[dict]) -> str: """Formater les codes pour le prompt LLM.""" lines = [] for sib in siblings: code = sib["code"] display = sib["display"] incl = get_inclusion_note(sib) excl = get_exclusion_note(sib) line = f"- {code} : {display}" if incl: line += f"\n Comprend : {incl[:200]}" if excl: line += f"\n Exclut : {excl[:200]}" lines.append(line) return "\n".join(lines) def call_claude(client, prompt: str, max_retries: int = 2) -> str | None: """Appel Claude Opus 4.6 via API Anthropic avec retry.""" for attempt in range(max_retries + 1): try: response = client.messages.create( model=MODEL, max_tokens=2048, temperature=0.7, messages=[{"role": "user", "content": prompt}], ) return response.content[0].text except Exception as e: if attempt < max_retries: wait = 2 ** (attempt + 1) print(f" Retry in {wait}s: {e}") time.sleep(wait) else: print(f" Claude error: {e}") return None def parse_llm_response(response_text: str) -> dict | None: """Parse la réponse JSON du LLM.""" if not response_text: return None text = response_text.strip() if "```json" in text: text = text.split("```json", 1)[1].split("```", 1)[0].strip() elif "```" in text: text = text.split("```", 1)[1].split("```", 1)[0].strip() try: data = json.loads(text) if "scenario" in data and "reponse" in data: return data except json.JSONDecodeError: pass # Fallback brace_start = text.find("{") if brace_start >= 0: depth = 0 for i in range(brace_start, len(text)): if text[i] == "{": depth += 1 elif text[i] == "}": depth -= 1 if depth == 0: try: data = json.loads(text[brace_start:i+1]) if "scenario" in data: return data except json.JSONDecodeError: break return None def make_chatml(scenario: str, response: dict, parent_code: str, siblings_desc: str) -> dict: """Créer un exemple ChatML.""" user_content = ( f"Cas clinique :\n{scenario}\n\n" f"Codes CIM-10 candidats (catégorie {parent_code}) :\n{siblings_desc}\n\n" "Quel code est le plus approprié ? Explique ton raisonnement de discrimination." ) assistant_content = json.dumps(response, ensure_ascii=False) return { "messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_content}, {"role": "assistant", "content": assistant_content}, ] } def main(): parser = argparse.ArgumentParser() parser.add_argument("--dry-run", action="store_true", help="Pas d'appel LLM") parser.add_argument("--max", type=int, default=800, help="Max exemples à générer") args = parser.parse_args() print("=" * 60) print("Génération d'exercices de discrimination CIM-10") print(f"Modèle : {MODEL}") print("=" * 60) # Vérifier la clé API if not args.dry_run: api_key = os.environ.get("ANTHROPIC_API_KEY") if not api_key: print("Erreur: ANTHROPIC_API_KEY non définie.") print(" export ANTHROPIC_API_KEY='sk-ant-...'") sys.exit(1) import anthropic client = anthropic.Anthropic(api_key=api_key) else: client = None print("\nChargement FHIR...") concepts, by_code = load_fhir() print(f" {len(concepts)} concepts") print("\nIdentification des familles fréquentes (cache Ollama)...") freq_families = get_frequent_families() top_families = [code for code, _ in freq_families.most_common(150)] print(f" Top familles : {len(top_families)} (ex: {', '.join(top_families[:10])})") print("\nConstruction des groupes de siblings...") sibling_groups = build_sibling_groups(concepts, by_code) print(f" {len(sibling_groups)} groupes") # Filtrer : 2-8 siblings, prioriser les familles fréquentes candidates = [] for parent_code, siblings in sibling_groups.items(): if len(siblings) < 2 or len(siblings) > 8: continue priority = 2 if parent_code[:3] in top_families else 0 n_with_notes = sum(1 for s in siblings if get_inclusion_note(s) or get_exclusion_note(s)) priority += n_with_notes candidates.append((parent_code, siblings, priority)) candidates.sort(key=lambda x: -x[2]) print(f" Candidats filtrés (2-8 siblings) : {len(candidates)}") target = min(args.max, len(candidates)) candidates = candidates[:target] if args.dry_run: for parent_code, siblings, prio in candidates[:20]: parent_display = by_code[parent_code]["display"] if parent_code in by_code else "?" sib_codes = ", ".join(s["code"] for s in siblings) print(f" [{prio}] {parent_code} ({parent_display}): {sib_codes}") print(f"\n[DRY RUN] {len(candidates)} groupes à traiter. Relancez sans --dry-run.") return # Générer les exercices via Claude examples = [] n_ok = 0 n_fail = 0 for i, (parent_code, siblings, _) in enumerate(candidates): if len(siblings) > 4: selected = random.sample(siblings, 4) else: selected = siblings codes_list = format_codes_for_prompt(selected) parent_display = by_code[parent_code]["display"] if parent_code in by_code else parent_code prompt = GENERATION_PROMPT.format(parent=f"{parent_code} ({parent_display})", codes_list=codes_list) response_text = call_claude(client, prompt) parsed = parse_llm_response(response_text) if parsed and "scenario" in parsed and "reponse" in parsed: siblings_desc = "\n".join(f"- {s['code']} : {s['display']}" for s in selected) example = make_chatml(parsed["scenario"], parsed["reponse"], parent_code, siblings_desc) examples.append(example) n_ok += 1 else: n_fail += 1 if (i + 1) % 50 == 0: print(f" Progression : {i+1}/{len(candidates)} (ok={n_ok}, fail={n_fail})") # Mélanger et sauvegarder random.shuffle(examples) with open(OUTPUT, "w") as f: for ex in examples: f.write(json.dumps(ex, ensure_ascii=False) + "\n") print(f"\n{'='*60}") print(f"Total : {len(examples)} exemples → {OUTPUT}") print(f" OK : {n_ok}, Échecs : {n_fail}") print(f"Taille : {OUTPUT.stat().st_size / 1024:.0f} Ko") if __name__ == "__main__": main()