feat: rééquilibrage dataset LoRA — raisonnement DIM vs mémorisation
Passe de 95/3/2 (lookups/raisonnement/règles) à ~31/49/20. Dataset cible ~16K exemples denses (vs 66K de lookups avant). Modifiés : - 03_convert_cache.py : cache complet 1840 entrées (actuel + backup) - 04_build_dataset.py : subsampling agressif (CIM-10 1.5K, CCAM 1.5K, CoCoA 2K) + sélection intelligente priorisant le raisonnement - 12_generate_pipeline_examples.py : 3 templates (court + long + CPAM), cache actuel, cible ~2800 exemples Créés : - 13_generate_fascicule_reasoning.py : parsing 10 fascicules ATIH, génération Q&A raisonnement via Claude Opus 4.6 (~450 exemples) - 14_generate_negative_examples.py : 1000 exemples négatifs (symptômes/DP, redondances sémantiques, DAS non significatifs) - 15_generate_discrimination.py : 800 exercices de discrimination entre codes siblings CIM-10 via Claude Opus 4.6 - 16_parse_guide_metho.py : extraction Guide Méthodologique MCO 2026, Q&A directes + raisonnement via Claude Opus 4.6 (~500 exemples) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
321
scripts/15_generate_discrimination.py
Normal file
321
scripts/15_generate_discrimination.py
Normal file
@@ -0,0 +1,321 @@
|
||||
#!/usr/bin/env python3
"""
Discrimination exercises between sibling ICD-10 (CIM-10) codes (same parent).

Strategy:
- Use the FHIR hierarchy to identify groups of sibling codes
- Focus on the top ICD-10 families most frequent in the pipeline
- For each group, generate a clinical scenario via Claude Opus 4.6
- The answer explains why one code applies and not the other

Target: 800 examples

Sources: smt_cim10_fhir.json + Ollama cache (frequent codes)
Requires: ANTHROPIC_API_KEY environment variable

Usage:
    python scripts/15_generate_discrimination.py [--dry-run] [--max N]
"""

import argparse
import json
import os
import random
import re
import sys
import time
from collections import Counter
from pathlib import Path

# Fixed seed so sibling sampling and final shuffling are reproducible.
random.seed(42)

# Repository layout: this script lives in <repo>/scripts/.
BASE = Path(__file__).resolve().parent.parent
# Sibling project directory holding the Ollama caches — TODO confirm layout.
T2A = BASE.parent / "t2a"
RAW = BASE / "data" / "raw"
OUTPUT = BASE / "data" / "processed" / "discrimination_chatml.jsonl"
OUTPUT.parent.mkdir(parents=True, exist_ok=True)

# Anthropic model identifier used for generation.
MODEL = "claude-opus-4-6"

# System message injected into every generated ChatML example (French, by design).
SYSTEM_PROMPT = (
    "Tu es un médecin DIM expert en codage CIM-10 pour le PMSI français. "
    "Tu sais discriminer les codes CIM-10 proches (siblings) et choisir le plus approprié."
)

# User-facing generation prompt sent to Claude; {parent} and {codes_list}
# are filled per sibling group. Kept in French — it is runtime text.
GENERATION_PROMPT = """Tu es un formateur DIM. Génère un exercice de discrimination entre codes CIM-10 proches.

CODES À DISCRIMINER (même catégorie {parent}) :
{codes_list}

Génère un objet JSON avec :
1. "scenario" : un cas clinique réaliste (2-3 phrases) où le choix entre ces codes est subtil
2. "reponse" : un objet JSON contenant :
- "analyse_clinique" : interprétation du cas clinique
- "codes_candidats" : les 2-3 codes candidats et pourquoi chacun est envisagé
- "discrimination" : la différence clé entre ces codes (inclusions, exclusions, spécificité)
- "code" : le code correct pour ce scénario
- "confidence" : "high"
- "justification" : pourquoi CE code et pas les autres

Réponds UNIQUEMENT avec le JSON, sans texte avant/après."""
|
||||
|
||||
|
||||
def load_fhir() -> tuple[list, dict]:
    """Load the ICD-10 FHIR CodeSystem concepts.

    Returns:
        (concepts, by_code): the raw ``concept`` list from the FHIR JSON and
        an index of those concepts keyed by their ``code`` field.

    Raises:
        FileNotFoundError / json.JSONDecodeError / KeyError if the file is
        missing, malformed, or lacks a top-level "concept" array.
    """
    fhir_path = RAW / "smt_cim10_fhir.json"
    # Decode explicitly as UTF-8: read_text() otherwise uses the locale
    # encoding, which corrupts accented French labels under e.g. cp1252.
    data = json.loads(fhir_path.read_text(encoding="utf-8"))
    concepts = data["concept"]
    by_code = {c["code"]: c for c in concepts}
    return concepts, by_code
|
||||
|
||||
|
||||
def get_parent(concept: dict) -> str:
    """Return the 'parent' property code of a FHIR concept, or '' if absent."""
    properties = concept.get("property", [])
    return next(
        (prop.get("valueCode", "") for prop in properties if prop["code"] == "parent"),
        "",
    )
|
||||
|
||||
|
||||
def get_type(concept: dict) -> str:
    """Return the 'type' property value of a FHIR concept, or '' if absent."""
    matching = (
        prop.get("valueString", "")
        for prop in concept.get("property", [])
        if prop["code"] == "type"
    )
    return next(matching, "")
|
||||
|
||||
|
||||
def get_inclusion_note(concept: dict) -> str:
    """Return the 'inclusionNote' property value of a FHIR concept, or ''."""
    for prop in concept.get("property", []):
        if prop["code"] != "inclusionNote":
            continue
        return prop.get("valueString", "")
    return ""
|
||||
|
||||
|
||||
def get_exclusion_note(concept: dict) -> str:
    """Return the 'exclusionNote' property value of a FHIR concept, or ''."""
    notes = (
        prop.get("valueString", "")
        for prop in concept.get("property", [])
        if prop["code"] == "exclusionNote"
    )
    return next(notes, "")
|
||||
|
||||
|
||||
def get_frequent_families() -> Counter:
    """Count 3-character ICD-10 family prefixes seen in the Ollama caches.

    Scans both the current cache and the gemma3 backup; missing files are
    skipped silently. A code is counted only if it is at least 3 characters
    long and starts with a letter (ICD-10 family shape: letter + two digits).

    Returns:
        Counter mapping family prefix (e.g. "E11") to occurrence count.
    """
    families: Counter = Counter()
    cache_paths = [
        T2A / "data" / "ollama_cache.json",
        T2A / "data" / "ollama_cache_gemma3.bak",
    ]
    for cache_path in cache_paths:
        if not cache_path.exists():
            continue
        # Explicit UTF-8: cache entries contain accented French text and
        # read_text() would otherwise use the platform locale encoding.
        data = json.loads(cache_path.read_text(encoding="utf-8"))
        for entry in data.get("entries", {}).values():
            code = entry.get("code", "")
            if code and len(code) >= 3 and code[0].isalpha():
                families[code[:3]] += 1
    return families
|
||||
|
||||
|
||||
def build_sibling_groups(concepts: list, by_code: dict) -> dict[str, list[dict]]:
    """Group 'category'-type concepts by their parent code.

    Concepts sharing a parent are siblings; parents not present in
    *by_code* (and concepts without a parent) are dropped.
    """
    groups: dict[str, list[dict]] = {}
    for concept in concepts:
        if get_type(concept) != "category":
            continue
        parent_code = get_parent(concept)
        if not parent_code or parent_code not in by_code:
            continue
        groups.setdefault(parent_code, []).append(concept)
    return groups
|
||||
|
||||
|
||||
def format_codes_for_prompt(siblings: list[dict]) -> str:
    """Render sibling codes as bullet lines for the generation prompt.

    Each line shows "code : display"; inclusion/exclusion notes, when
    present, are appended (truncated to 200 characters each).
    """
    rendered = []
    for concept in siblings:
        inclusion = get_inclusion_note(concept)
        exclusion = get_exclusion_note(concept)
        segments = [f"- {concept['code']} : {concept['display']}"]
        if inclusion:
            segments.append(f"\n  Comprend : {inclusion[:200]}")
        if exclusion:
            segments.append(f"\n  Exclut : {exclusion[:200]}")
        rendered.append("".join(segments))
    return "\n".join(rendered)
|
||||
|
||||
|
||||
def call_claude(client, prompt: str, max_retries: int = 2) -> str | None:
    """Call Claude via the Anthropic API with exponential-backoff retries.

    Returns the text of the first content block on success, or None once
    all retries are exhausted.
    """
    attempt = 0
    while attempt <= max_retries:
        try:
            reply = client.messages.create(
                model=MODEL,
                max_tokens=2048,
                temperature=0.7,
                messages=[{"role": "user", "content": prompt}],
            )
            return reply.content[0].text
        except Exception as e:  # any API/network error triggers a retry
            if attempt >= max_retries:
                print(f"  Claude error: {e}")
                return None
            wait = 2 ** (attempt + 1)
            print(f"  Retry in {wait}s: {e}")
            time.sleep(wait)
            attempt += 1
    return None
|
||||
|
||||
|
||||
def parse_llm_response(response_text: str) -> dict | None:
|
||||
"""Parse la réponse JSON du LLM."""
|
||||
if not response_text:
|
||||
return None
|
||||
text = response_text.strip()
|
||||
if "```json" in text:
|
||||
text = text.split("```json", 1)[1].split("```", 1)[0].strip()
|
||||
elif "```" in text:
|
||||
text = text.split("```", 1)[1].split("```", 1)[0].strip()
|
||||
|
||||
try:
|
||||
data = json.loads(text)
|
||||
if "scenario" in data and "reponse" in data:
|
||||
return data
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Fallback
|
||||
brace_start = text.find("{")
|
||||
if brace_start >= 0:
|
||||
depth = 0
|
||||
for i in range(brace_start, len(text)):
|
||||
if text[i] == "{":
|
||||
depth += 1
|
||||
elif text[i] == "}":
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
try:
|
||||
data = json.loads(text[brace_start:i+1])
|
||||
if "scenario" in data:
|
||||
return data
|
||||
except json.JSONDecodeError:
|
||||
break
|
||||
return None
|
||||
|
||||
|
||||
def make_chatml(scenario: str, response: dict, parent_code: str, siblings_desc: str) -> dict:
    """Wrap one discrimination exercise as a ChatML "messages" dict.

    The user turn presents the clinical case and candidate codes; the
    assistant turn is the expected JSON answer, serialized with accents kept.
    """
    question = (
        f"Cas clinique :\n{scenario}\n\n"
        f"Codes CIM-10 candidats (catégorie {parent_code}) :\n{siblings_desc}\n\n"
        "Quel code est le plus approprié ? Explique ton raisonnement de discrimination."
    )
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": question},
            {"role": "assistant", "content": json.dumps(response, ensure_ascii=False)},
        ]
    }
|
||||
|
||||
|
||||
def main():
    """CLI entry point: build sibling groups, query Claude, write ChatML JSONL.

    With --dry-run, only lists the candidate groups without calling the API.
    With --max N, caps the number of generated examples (default 800).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true", help="Pas d'appel LLM")
    parser.add_argument("--max", type=int, default=800, help="Max exemples à générer")
    args = parser.parse_args()

    print("=" * 60)
    print("Génération d'exercices de discrimination CIM-10")
    print(f"Modèle : {MODEL}")
    print("=" * 60)

    # Check the API key up front so we fail fast before any heavy loading.
    if not args.dry_run:
        api_key = os.environ.get("ANTHROPIC_API_KEY")
        if not api_key:
            print("Erreur: ANTHROPIC_API_KEY non définie.")
            print("  export ANTHROPIC_API_KEY='sk-ant-...'")
            sys.exit(1)
        # Local import: anthropic is only needed when actually calling the API.
        import anthropic
        client = anthropic.Anthropic(api_key=api_key)
    else:
        client = None

    print("\nChargement FHIR...")
    concepts, by_code = load_fhir()
    print(f"  {len(concepts)} concepts")

    print("\nIdentification des familles fréquentes (cache Ollama)...")
    freq_families = get_frequent_families()
    top_families = [code for code, _ in freq_families.most_common(150)]
    print(f"  Top familles : {len(top_families)} (ex: {', '.join(top_families[:10])})")

    print("\nConstruction des groupes de siblings...")
    sibling_groups = build_sibling_groups(concepts, by_code)
    print(f"  {len(sibling_groups)} groupes")

    # Filter: keep groups of 2-8 siblings; prioritize frequent families and
    # groups whose codes carry inclusion/exclusion notes (richer exercises).
    # Set for O(1) membership instead of scanning the list per group.
    top_family_set = set(top_families)
    candidates = []
    for parent_code, siblings in sibling_groups.items():
        if len(siblings) < 2 or len(siblings) > 8:
            continue
        priority = 2 if parent_code[:3] in top_family_set else 0
        n_with_notes = sum(1 for s in siblings if get_inclusion_note(s) or get_exclusion_note(s))
        priority += n_with_notes
        candidates.append((parent_code, siblings, priority))

    candidates.sort(key=lambda x: -x[2])
    print(f"  Candidats filtrés (2-8 siblings) : {len(candidates)}")

    target = min(args.max, len(candidates))
    candidates = candidates[:target]

    if args.dry_run:
        # Preview the 20 highest-priority groups, then stop before any API call.
        for parent_code, siblings, prio in candidates[:20]:
            parent_display = by_code[parent_code]["display"] if parent_code in by_code else "?"
            sib_codes = ", ".join(s["code"] for s in siblings)
            print(f"  [{prio}] {parent_code} ({parent_display}): {sib_codes}")
        print(f"\n[DRY RUN] {len(candidates)} groupes à traiter. Relancez sans --dry-run.")
        return

    # Generate one exercise per candidate group via Claude.
    examples = []
    n_ok = 0
    n_fail = 0

    for i, (parent_code, siblings, _) in enumerate(candidates):
        # Cap at 4 codes per prompt to keep the discrimination focused.
        if len(siblings) > 4:
            selected = random.sample(siblings, 4)
        else:
            selected = siblings

        codes_list = format_codes_for_prompt(selected)
        parent_display = by_code[parent_code]["display"] if parent_code in by_code else parent_code

        prompt = GENERATION_PROMPT.format(parent=f"{parent_code} ({parent_display})", codes_list=codes_list)
        response_text = call_claude(client, prompt)
        parsed = parse_llm_response(response_text)

        if parsed and "scenario" in parsed and "reponse" in parsed:
            siblings_desc = "\n".join(f"- {s['code']} : {s['display']}" for s in selected)
            example = make_chatml(parsed["scenario"], parsed["reponse"], parent_code, siblings_desc)
            examples.append(example)
            n_ok += 1
        else:
            n_fail += 1

        if (i + 1) % 50 == 0:
            print(f"  Progression : {i+1}/{len(candidates)} (ok={n_ok}, fail={n_fail})")

    # Shuffle so training order does not follow priority order, then save.
    random.shuffle(examples)

    # Explicit UTF-8: we serialize with ensure_ascii=False, so the file must
    # not be written with a locale-dependent encoding.
    with open(OUTPUT, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    print(f"\n{'='*60}")
    print(f"Total : {len(examples)} exemples → {OUTPUT}")
    print(f"  OK : {n_ok}, Échecs : {n_fail}")
    print(f"Taille : {OUTPUT.stat().st_size / 1024:.0f} Ko")
|
||||
|
||||
|
||||
# Script entry point (no side effects on import).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user