feat: rééquilibrage dataset LoRA — raisonnement DIM vs mémorisation
Passe de 95/3/2 (lookups/raisonnement/règles) à ~31/49/20. Dataset cible ~16K exemples denses (vs 66K de lookups avant).

Modifiés :
- 03_convert_cache.py : cache complet 1840 entrées (actuel + backup)
- 04_build_dataset.py : subsampling agressif (CIM-10 1.5K, CCAM 1.5K, CoCoA 2K) + sélection intelligente priorisant le raisonnement
- 12_generate_pipeline_examples.py : 3 templates (court + long + CPAM), cache actuel, cible ~2800 exemples

Créés :
- 13_generate_fascicule_reasoning.py : parsing 10 fascicules ATIH, génération Q&A raisonnement via Claude Opus 4.6 (~450 exemples)
- 14_generate_negative_examples.py : 1000 exemples négatifs (symptômes/DP, redondances sémantiques, DAS non significatifs)
- 15_generate_discrimination.py : 800 exercices de discrimination entre codes siblings CIM-10 via Claude Opus 4.6
- 16_parse_guide_metho.py : extraction Guide Méthodologique MCO 2026, Q&A directes + raisonnement via Claude Opus 4.6 (~500 exemples)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
170
scripts/02_generate_ccam_pairs.py
Normal file
170
scripts/02_generate_ccam_pairs.py
Normal file
@@ -0,0 +1,170 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 1B — Génération de paires ChatML CCAM depuis ccam_dict.json.
|
||||
|
||||
Sources : ccam_dict.json (8 257 codes) du projet T2A
|
||||
Produit : data/processed/ccam_chatml.jsonl
|
||||
|
||||
Types d'exemples générés :
|
||||
1. code → description (lookup)
|
||||
2. description → code (codage)
|
||||
3. discrimination par regroupement (codes du même regroupement)
|
||||
"""
|
||||
|
||||
import json
|
||||
import random
|
||||
from pathlib import Path
|
||||
|
||||
# Fixed seed: template selection and the final shuffle are reproducible.
random.seed(42)

# Location of the T2A project, which provides data/ccam_dict.json.
T2A = Path("/home/dom/ai/t2a")

# Parent of the directory that contains this script.
BASE = Path(__file__).resolve().parent.parent

# Destination directory for the generated dataset; created at import time.
OUT = BASE.joinpath("data", "processed")
OUT.mkdir(parents=True, exist_ok=True)

# System prompt placed at the head of every generated ChatML example.
SYSTEM_MSG = "Tu es un médecin DIM expert en codage CCAM pour le PMSI français."
|
||||
|
||||
|
||||
def load_ccam(path=None):
    """Load the CCAM dictionary from its JSON file.

    Args:
        path: Optional location of the JSON file. When omitted, the file
            ``data/ccam_dict.json`` under the module-level ``T2A`` directory
            is read.

    Returns:
        The deserialized JSON content. The generators in this script use it
        as a mapping of CCAM code -> dict of attributes.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    source = T2A / "data" / "ccam_dict.json" if path is None else Path(path)
    # Pin the encoding: with the platform default, accented French labels
    # would be mis-decoded (or fail to decode) on non-UTF-8 locales.
    with open(source, encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
def make_chatml(system, user, assistant):
    """Wrap three texts into a single ChatML-style conversation record.

    Args:
        system: Content of the system turn.
        user: Content of the user turn.
        assistant: Content of the assistant turn.

    Returns:
        A dict with one key, ``"messages"``, holding the three turns in the
        order system, user, assistant; each turn is a dict with ``"role"``
        and ``"content"`` keys.
    """
    turns = (
        ("system", system),
        ("user", user),
        ("assistant", assistant),
    )
    return {"messages": [{"role": role, "content": text} for role, text in turns]}
|
||||
|
||||
|
||||
def generate_lookup_pairs(ccam):
    """Build the type-1 examples: CCAM code -> description (lookup).

    One example is produced for every entry whose ``description`` is at
    least 5 characters long. The question is one of three phrasings picked
    with ``random.choice``. The answer starts with ``"<code> — <description>"``
    and adds one line for each of the ``regroupement``, ``activite`` and
    ``tarif_s1`` fields that is truthy.

    Args:
        ccam: Mapping of CCAM code to a dict of attributes.

    Returns:
        A list of ChatML records as built by ``make_chatml``.
    """
    examples = []
    for code, entry in ccam.items():
        label = entry.get("description", "")
        # Entries without a usable label are skipped.
        if not (label and len(label) >= 5):
            continue

        grouping = entry.get("regroupement", "")
        activity = entry.get("activite", "")
        price = entry.get("tarif_s1")

        # (keep?, line) pairs: a line is emitted only if its source is truthy.
        candidate_lines = (
            (True, f"{code} — {label}"),
            (grouping, f"Regroupement : {grouping}"),
            (activity, f"Activité : {activity}"),
            (price, f"Tarif secteur 1 : {price} €"),
        )
        answer = "\n".join(line for keep, line in candidate_lines if keep)

        # Exactly one random draw per kept entry.
        question = random.choice(
            [
                f"Que désigne le code CCAM {code} ?",
                f"Quel est le libellé de l'acte CCAM {code} ?",
                f"Décris l'acte CCAM {code}.",
            ]
        )
        examples.append(make_chatml(SYSTEM_MSG, question, answer))

    return examples
|
||||
|
||||
|
||||
def generate_coding_pairs(ccam):
    """Build the type-2 examples: description -> CCAM code (coding).

    One example is produced for every entry whose ``description`` is at
    least 10 characters long. The question is one of three phrasings picked
    with ``random.choice``. The answer is a JSON string with the keys
    ``code``, ``confidence`` (always ``"high"``) and ``justification``.

    Args:
        ccam: Mapping of CCAM code to a dict of attributes.

    Returns:
        A list of ChatML records as built by ``make_chatml``.
    """

    def _example(code, desc):
        # Non-ASCII characters are kept as-is in the serialized answer.
        answer = json.dumps(
            {
                "code": code,
                "confidence": "high",
                "justification": f"Correspondance directe avec le libellé CCAM : {code} {desc}.",
            },
            ensure_ascii=False,
        )
        question = random.choice(
            [
                f"Quel est le code CCAM pour : {desc} ?",
                f"Code CCAM pour « {desc} » ?",
                f"Codage CCAM de l'acte : {desc}",
            ]
        )
        return make_chatml(SYSTEM_MSG, question, answer)

    # Lazy pairing keeps one random draw per kept entry, in dictionary order.
    labelled = ((code, entry.get("description", "")) for code, entry in ccam.items())
    return [
        _example(code, desc)
        for code, desc in labelled
        if desc and len(desc) >= 10
    ]
|
||||
|
||||
|
||||
def generate_regroupement_pairs(ccam):
    """Build the type-3 examples: regroupement -> list of its CCAM acts.

    Entries are grouped by their truthy ``regroupement`` value. A group
    yields one example when it has between 2 and 20 acts (inclusive) and
    the rendered answer is at most 2000 characters long. The answer lists
    at most the first 15 acts, one bullet each, with the ``tarif_s1`` value
    in parentheses when truthy, then a trailing line counting the acts that
    were left out.

    Args:
        ccam: Mapping of CCAM code to a dict of attributes.

    Returns:
        A list of ChatML records as built by ``make_chatml``.
    """
    max_listed = 15

    # Bucket the acts by regroupement, keeping dictionary order.
    groups = {}
    for code, entry in ccam.items():
        key = entry.get("regroupement", "")
        if not key:
            continue
        if key not in groups:
            groups[key] = []
        groups[key].append((code, entry))

    examples = []
    for key, members in groups.items():
        total = len(members)
        if not 2 <= total <= 20:
            continue

        bullets = []
        for code, entry in members[:max_listed]:
            label = entry.get("description", "")
            price = entry.get("tarif_s1")
            suffix = f" ({price} €)" if price else ""
            bullets.append(f"- {code} : {label}" + suffix)

        hidden = total - max_listed
        tail = [f" ... et {hidden} autres actes."] if hidden > 0 else []

        # The header ends with "\n", so a blank line follows it once joined.
        header = f"Le regroupement {key} comprend {total} actes :\n"
        answer = "\n".join([header, *bullets, *tail])

        # Over-long answers are dropped.
        if len(answer) <= 2000:
            question = f"Quels sont les actes CCAM du regroupement {key} ?"
            examples.append(make_chatml(SYSTEM_MSG, question, answer))

    return examples
|
||||
|
||||
|
||||
def main():
    """Generate all CCAM ChatML examples and write them to a JSONL file.

    Loads the CCAM dictionary, runs the three generators, shuffles the
    combined examples with the module's seeded ``random`` state, and writes
    one JSON object per line to ``OUT / "ccam_chatml.jsonl"``. Progress, the
    total count and the output size are printed to stdout.
    """
    print("Chargement du dictionnaire CCAM...")
    ccam = load_ccam()
    print(f" {len(ccam)} codes chargés")

    print("\nGénération des paires...")

    print(" Type 1 : code → description (lookup)")
    lookup = generate_lookup_pairs(ccam)
    print(f" → {len(lookup)} exemples")

    print(" Type 2 : description → code (codage)")
    coding = generate_coding_pairs(ccam)
    print(f" → {len(coding)} exemples")

    print(" Type 3 : regroupement")
    regroup = generate_regroupement_pairs(ccam)
    print(f" → {len(regroup)} exemples")

    all_pairs = lookup + coding + regroup
    random.shuffle(all_pairs)

    output_path = OUT / "ccam_chatml.jsonl"
    # ensure_ascii=False writes raw non-ASCII characters (accents, "—", "€");
    # pin the encoding so the file is UTF-8 whatever the platform locale is.
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in all_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    print(f"\nTotal : {len(all_pairs)} exemples → {output_path}")
    print(f"Taille : {output_path.stat().st_size / 1024 / 1024:.1f} Mo")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user