feat: rééquilibrage dataset LoRA — raisonnement DIM vs mémorisation
Passe de 95/3/2 (lookups/raisonnement/règles) à ~31/49/20. Dataset cible ~16K exemples denses (vs 66K de lookups avant). Modifiés : - 03_convert_cache.py : cache complet 1840 entrées (actuel + backup) - 04_build_dataset.py : subsampling agressif (CIM-10 1.5K, CCAM 1.5K, CoCoA 2K) + sélection intelligente priorisant le raisonnement - 12_generate_pipeline_examples.py : 3 templates (court + long + CPAM), cache actuel, cible ~2800 exemples Créés : - 13_generate_fascicule_reasoning.py : parsing 10 fascicules ATIH, génération Q&A raisonnement via Claude Opus 4.6 (~450 exemples) - 14_generate_negative_examples.py : 1000 exemples négatifs (symptômes/DP, redondances sémantiques, DAS non significatifs) - 15_generate_discrimination.py : 800 exercices de discrimination entre codes siblings CIM-10 via Claude Opus 4.6 - 16_parse_guide_metho.py : extraction Guide Méthodologique MCO 2026, Q&A directes + raisonnement via Claude Opus 4.6 (~500 exemples) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
206
scripts/03_convert_cache.py
Normal file
206
scripts/03_convert_cache.py
Normal file
@@ -0,0 +1,206 @@
|
||||
#!/usr/bin/env python3
"""
Phase 1C — Convert the Ollama cache into ChatML reasoning examples.

Source : ollama_cache.json (1,840 entries with full reasoning)
Output : data/processed/reasoning_chatml.jsonl

V2: uses the complete current cache (1,840 entries vs 100 before).
Keeps only entries with structured reasoning when available.
Also supports the das_llm::das_extract:: keys of the extended pipeline.

Each cache entry contains a structured chain of reasoning:
- analyse_clinique → codes_candidats → discrimination → regle_pmsi → code + justification
These examples are the most valuable because they show the full DIM reasoning.
"""

import json
import random
from pathlib import Path

# Fixed seed so the final shuffle (and thus the dataset order) is reproducible.
random.seed(42)

# Project root: parent of the scripts/ directory containing this file.
BASE = Path(__file__).resolve().parent.parent
# External t2a project holding the Ollama cache files — NOTE(review): hard-coded
# absolute path; confirm it exists on the target machine.
T2A = Path("/home/dom/ai/t2a")
# Destination directory for the generated JSONL dataset (created if missing).
OUT = BASE / "data" / "processed"
OUT.mkdir(parents=True, exist_ok=True)

# System prompt (French runtime string — must not be translated): expert DIM
# physician coding diagnoses in ICD-10 with a structured approach.
SYSTEM_MSG = "Tu es un médecin DIM expert en codage PMSI. Tu codes les diagnostics en CIM-10 en suivant une démarche structurée : analyse clinique, identification des codes candidats, discrimination, vérification des règles PMSI."
def make_chatml(system, user, assistant):
    """Wrap a (system, user, assistant) triple into a ChatML messages dict."""
    triple = (("system", system), ("user", user), ("assistant", assistant))
    return {"messages": [{"role": role, "content": text} for role, text in triple]}
|
||||
|
||||
|
||||
def load_cache():
    """Load the Ollama cache (current file plus backup when available).

    Returns:
        dict: cache key -> entry dict. Backup entries are merged after the
        current cache, so on a key collision the backup value wins (this
        preserves the original ``dict.update`` order).
    """
    entries = {}

    # Current cache (1,840 entries).
    cache_path = T2A / "data" / "ollama_cache.json"
    if cache_path.exists():
        # Explicit UTF-8: JSON is UTF-8 by spec and entries contain accented
        # French text; the locale default encoding could mis-decode it.
        with open(cache_path, encoding="utf-8") as f:
            data = json.load(f)
        entries.update(data.get("entries", {}))
        print(f" Cache actuel : {len(data.get('entries', {}))} entrées")

    # Backup cache (may contain additional entries).
    backup_path = T2A / "data" / "ollama_cache_gemma3.bak"
    if backup_path.exists():
        with open(backup_path, encoding="utf-8") as f:
            data = json.load(f)
        backup_entries = data.get("entries", {})
        # Count keys absent from the current cache, for the log line only.
        new_count = sum(1 for k in backup_entries if k not in entries)
        entries.update(backup_entries)
        print(f" Cache backup : {len(backup_entries)} entrées (+{new_count} nouvelles)")

    return entries
|
||||
|
||||
|
||||
def parse_cache_key(key):
    """Split a cache key into (diagnosis type, diagnosis text).

    Supported formats:
    - "dp::diagnosis text"
    - "das::diagnosis text"
    - "das_llm::das_extract::hash::text" (extended pipeline)
    Keys with no recognizable prefix default to the "das" type.
    """
    extended_prefix = "das_llm::das_extract::"
    if key.startswith(extended_prefix):
        # Layout: das_llm::das_extract::HASH::text — text is the 4th piece
        # when present, otherwise fall back to the last piece (the hash).
        pieces = key.split("::", 3)
        raw_text = pieces[3] if len(pieces) > 3 else pieces[-1]
        return "das", raw_text.strip()

    head, sep, tail = key.partition("::")
    if sep:
        return head.strip(), tail.strip()
    return "das", key.strip()
|
||||
|
||||
|
||||
def build_user_prompt(diag_type, texte):
    """Build the user prompt from the diagnosis type and its free text."""
    if diag_type == "dp":
        type_label = "Diagnostic Principal (DP)"
    else:
        type_label = "Diagnostic Associé Significatif (DAS)"

    # Same layout as before: instruction, blank line, then the two fields.
    lines = [
        "Code ce diagnostic en CIM-10.",
        "",
        f"DIAGNOSTIC : {texte.capitalize()}",
        f"TYPE : {type_label}",
    ]
    return "\n".join(lines)
|
||||
|
||||
|
||||
def build_assistant_response(entry):
    """Render a cache entry as the assistant's structured JSON answer.

    When the entry carries a free-text ``raisonnement``, its named sections
    (ANALYSE CLINIQUE, CODES CANDIDATS, DISCRIMINATION, REGLE/RÈGLE PMSI)
    are sliced out into individual JSON fields; otherwise a minimal
    code/confidence answer is produced. Returns a compact JSON string.
    """
    code = entry.get("code", "")
    confidence = entry.get("confidence", "medium")
    justification = entry.get("justification", "")
    raisonnement = entry.get("raisonnement", "")

    # No structured reasoning: emit the short answer and stop.
    if not raisonnement:
        simple = {"code": code, "confidence": confidence}
        if justification:
            simple["justification"] = justification
        return json.dumps(simple, ensure_ascii=False, indent=None)

    # Section header -> output key ("REGLE"/"RÈGLE" are spelling variants).
    sections = {
        "ANALYSE CLINIQUE": "analyse_clinique",
        "CODES CANDIDATS": "codes_candidats",
        "DISCRIMINATION": "discrimination",
        "REGLE PMSI": "regle_pmsi",
        "RÈGLE PMSI": "regle_pmsi",
    }

    response = {}
    text = raisonnement
    for header, out_key in sections.items():
        # Prefer the "HEADER :" form (French spacing), fall back to "HEADER:".
        marker = f"{header} :"
        if marker not in text:
            marker = f"{header}:"
        if marker not in text:
            continue

        start = text.index(marker) + len(marker)
        tail = text[start:]
        # The section ends at the earliest occurrence of any other header
        # after this marker, or at the end of the text.
        end = len(text)
        for other in sections:
            for other_marker in (f"{other} :", f"{other}:"):
                if other_marker in tail:
                    pos = start + tail.index(other_marker)
                    if pos < end:
                        end = pos

        value = text[start:end].strip()
        if value:
            response[out_key] = value

    response["code"] = code
    response["confidence"] = confidence
    if justification:
        response["justification"] = justification

    return json.dumps(response, ensure_ascii=False, indent=None)
|
||||
|
||||
|
||||
def main():
    """Convert every usable cache entry into a ChatML example and write JSONL.

    Reads the merged Ollama cache, skips entries without a code or with a
    degenerate text, shuffles the resulting pairs deterministically and
    writes them to data/processed/reasoning_chatml.jsonl, printing counters.
    """
    print("Chargement du cache Ollama (toutes sources)...")
    entries = load_cache()
    print(f" Total fusionné : {len(entries)} entrées")

    pairs = []
    with_reasoning = 0
    without_reasoning = 0
    skipped_no_code = 0
    skipped_no_text = 0
    by_type = {"dp": 0, "das": 0}

    for key, entry in entries.items():
        diag_type, texte = parse_cache_key(key)

        # Skip empty or degenerate texts (under 3 characters).
        if not texte or len(texte) < 3:
            skipped_no_text += 1
            continue
        # Without a code there is no training target to learn.
        if not entry.get("code"):
            skipped_no_code += 1
            continue

        user_prompt = build_user_prompt(diag_type, texte)
        assistant_response = build_assistant_response(entry)

        if entry.get("raisonnement"):
            with_reasoning += 1
        else:
            without_reasoning += 1

        # .get-based increment tolerates unexpected types from parse_cache_key.
        by_type[diag_type] = by_type.get(diag_type, 0) + 1
        pairs.append(make_chatml(SYSTEM_MSG, user_prompt, assistant_response))

    # Deterministic shuffle (module-level seed) so training order is mixed
    # but reproducible.
    random.shuffle(pairs)

    output_path = OUT / "reasoning_chatml.jsonl"
    # Explicit UTF-8: the lines contain accented French written with
    # ensure_ascii=False; relying on the locale default encoding could
    # raise UnicodeEncodeError on non-UTF-8 systems.
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    print(f"\nTotal : {len(pairs)} exemples → {output_path}")
    print(f" DP : {by_type.get('dp', 0)}, DAS : {by_type.get('das', 0)}")
    print(f" Avec raisonnement complet : {with_reasoning}")
    print(f" Sans raisonnement (code seul) : {without_reasoning}")
    print(f" Ignorés (pas de code) : {skipped_no_code}")
    print(f" Ignorés (pas de texte) : {skipped_no_text}")
    print(f"Taille : {output_path.stat().st_size / 1024:.0f} Ko")
|
||||
|
||||
|
||||
# Script entry point: run the conversion only when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user