Shifts the dataset mix from 95/3/2 (lookups/reasoning/rules) to ~31/49/20. Target dataset of ~16K dense examples (vs 66K mostly-lookup examples before).

Modified:
- 03_convert_cache.py: full cache of 1840 entries (current + backup)
- 04_build_dataset.py: aggressive subsampling (CIM-10 1.5K, CCAM 1.5K, CoCoA 2K) plus smart selection prioritizing reasoning
- 12_generate_pipeline_examples.py: 3 templates (short + long + CPAM), current cache, target ~2800 examples

Created:
- 13_generate_fascicule_reasoning.py: parses 10 ATIH fascicles, generates reasoning Q&A via Claude Opus 4.6 (~450 examples)
- 14_generate_negative_examples.py: 1000 negative examples (symptoms as DP, semantic redundancies, non-significant DAS)
- 15_generate_discrimination.py: 800 discrimination exercises between sibling CIM-10 codes via Claude Opus 4.6
- 16_parse_guide_metho.py: extracts the Guide Méthodologique MCO 2026, direct + reasoning Q&A via Claude Opus 4.6 (~500 examples)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
#!/usr/bin/env python3
"""
Phase 2: QLoRA fine-tuning of gemma3:12b with Unsloth.

IMPORTANT: stop Ollama before running this script!
    sudo systemctl stop ollama

The script:
1. Loads gemma3:12b quantized in 4-bit
2. Attaches a LoRA adapter
3. Trains on the PMSI dataset (ChatML)
4. Saves the LoRA adapter
5. (Optional) Exports to GGUF for Ollama

Prerequisites:
- bash scripts/07_setup_unsloth.sh (installs the dependencies)
- data/datasets/pmsi_train.jsonl + pmsi_eval.jsonl

Usage:
    python scripts/08_train_lora.py [--epochs 3] [--lr 2e-4] [--batch 1] [--export-gguf]

Target hardware: RTX 5070 (12 GB VRAM)
- 4-bit QLoRA on a 12B model ≈ 8-9 GB VRAM
- batch_size=1 + gradient_accumulation=8 → effective batch of 8
- gradient_checkpointing to save VRAM
"""

import argparse
import json
from pathlib import Path

BASE = Path(__file__).resolve().parent.parent
DATASETS = BASE / "data" / "datasets"
OUTPUT = BASE / "models"
OUTPUT.mkdir(parents=True, exist_ok=True)
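# Each JSONL line is assumed (inferred from format_chat below, which reads
# example["messages"]) to hold one ChatML-style record, e.g.:
#   {"messages": [{"role": "user", "content": "...clinical summary..."},
#                 {"role": "assistant", "content": "...coding answer..."}]}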

def check_prerequisites():
    """Check that everything is ready."""
    import torch

    # GPU available?
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA not available. Check your GPU installation.")

    gpu_name = torch.cuda.get_device_name(0)
    # mem_get_info reports device-wide free/total memory, so it also accounts
    # for VRAM held by other processes (e.g. a still-running Ollama), unlike
    # memory_allocated, which only sees this process's tensors.
    vram_free_b, vram_total_b = torch.cuda.mem_get_info(0)
    vram_total = vram_total_b / 1024**3
    vram_free = vram_free_b / 1024**3

    print(f"GPU: {gpu_name}")
    print(f"VRAM: {vram_total:.1f} GB total, {vram_free:.1f} GB free")

    if vram_free < 10:
        print("⚠ Less than 10 GB of free VRAM.")
        print("  → Stop Ollama: sudo systemctl stop ollama")

    # Does the dataset exist?
    train_path = DATASETS / "pmsi_train.jsonl"
    eval_path = DATASETS / "pmsi_eval.jsonl"
    if not train_path.exists() or not eval_path.exists():
        raise FileNotFoundError(
            "Dataset not found. Run first: python scripts/04_build_dataset.py"
        )

    # Count the examples
    with open(train_path) as f:
        n_train = sum(1 for _ in f)
    with open(eval_path) as f:
        n_eval = sum(1 for _ in f)

    print(f"Dataset: {n_train} train + {n_eval} eval")
    return train_path, eval_path

def load_model(model_name, max_seq_length, load_in_4bit=True):
    """Load the model with Unsloth."""
    from unsloth import FastLanguageModel

    print(f"\nLoading {model_name} (4-bit={load_in_4bit})...")

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=None,  # auto-detect
        load_in_4bit=load_in_4bit,
    )

    print(f"  Model loaded: {model.config._name_or_path}")
    print(f"  Parameters: {model.num_parameters() / 1e9:.1f}B")

    return model, tokenizer

def attach_lora(model, r=32, alpha=64, dropout=0.05):
    """Attach the LoRA adapter."""
    from unsloth import FastLanguageModel

    print(f"\nAttaching LoRA (r={r}, alpha={alpha}, dropout={dropout})...")

    model = FastLanguageModel.get_peft_model(
        model,
        r=r,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha=alpha,
        lora_dropout=dropout,
        bias="none",
        use_gradient_checkpointing="unsloth",  # saves ~30% VRAM
        random_state=42,
    )

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"  Trainable parameters: {trainable / 1e6:.1f}M / {total / 1e9:.1f}B ({100 * trainable / total:.2f}%)")

    return model
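# Sanity check for the percentage printed above: each adapted weight W of
# shape (d_out, d_in) gains two low-rank factors, A of shape (r, d_in) and
# B of shape (d_out, r), i.e. r * (d_in + d_out) extra trainable parameters
# per module, summed over the 7 target projections of every layer. On a 12B
# base with r=32 the printed figure should land around 1%.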

def load_dataset(train_path, eval_path):
    """Load the ChatML-format dataset."""
    from datasets import Dataset

    def load_jsonl(path):
        examples = []
        with open(path) as f:
            for line in f:
                examples.append(json.loads(line.strip()))
        return examples

    train_data = load_jsonl(train_path)
    eval_data = load_jsonl(eval_path)

    train_ds = Dataset.from_list(train_data)
    eval_ds = Dataset.from_list(eval_data)

    print("\nDataset loaded:")
    print(f"  Train: {len(train_ds)} examples")
    print(f"  Eval:  {len(eval_ds)} examples")

    return train_ds, eval_ds


def format_chat(example, tokenizer):
    """Format a ChatML example for training."""
    messages = example["messages"]
    # Use the tokenizer's chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": text}
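# For reference, Gemma-style checkpoints render the messages roughly as
# follows (the exact markers come from the tokenizer's own chat template):
#   <start_of_turn>user
#   ...question...<end_of_turn>
#   <start_of_turn>model
#   ...answer...<end_of_turn>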

def train(model, tokenizer, train_ds, eval_ds, args):
    """Run the training."""
    from trl import SFTTrainer, SFTConfig
    from aim.hugging_face import AimCallback

    print("\nTraining configuration:")
    print(f"  Epochs: {args.epochs}")
    print(f"  Learning rate: {args.lr}")
    print(f"  Batch size: {args.batch} (gradient_accumulation={args.grad_accum})")
    print(f"  Effective batch: {args.batch * args.grad_accum}")
    print(f"  Max seq length: {args.max_seq_length}")

    # Format the dataset
    train_ds = train_ds.map(lambda x: format_chat(x, tokenizer), num_proc=4)
    eval_ds = eval_ds.map(lambda x: format_chat(x, tokenizer), num_proc=4)

    output_dir = OUTPUT / "pmsi-lora-checkpoints"

    # Aim callback for metric tracking
    aim_callback = AimCallback(
        repo=str(BASE),
        experiment="pmsi-coder-v2",
    )

    training_args = SFTConfig(
        output_dir=str(output_dir),
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch,
        per_device_eval_batch_size=args.batch,
        gradient_accumulation_steps=args.grad_accum,
        learning_rate=args.lr,
        weight_decay=0.01,
        warmup_ratio=0.05,
        lr_scheduler_type="cosine",
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=1000,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=3,
        fp16=False,
        bf16=True,
        max_seq_length=args.max_seq_length,
        dataset_text_field="text",
        seed=42,
        report_to="none",
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        args=training_args,
        callbacks=[aim_callback],
    )

    print("\nStarting training...")
    print(f"  Output: {output_dir}")
    print(f"  Estimated steps: ~{len(train_ds) * args.epochs // (args.batch * args.grad_accum)}")

    if args.resume:
        print("  Resuming from the last checkpoint...")
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()

    # Save the final model
    final_dir = OUTPUT / "pmsi-lora-final"
    model.save_pretrained(str(final_dir))
    tokenizer.save_pretrained(str(final_dir))
    print(f"\nLoRA model saved: {final_dir}")

    return trainer, final_dir
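# To reload the adapter later for inference, a minimal sketch (same Unsloth
# API as in load_model; the path matches the save_pretrained call above):
#   from unsloth import FastLanguageModel
#   model, tokenizer = FastLanguageModel.from_pretrained("models/pmsi-lora-final")
#   FastLanguageModel.for_inference(model)  # switch to Unsloth's fast inference mode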

def export_gguf(model, tokenizer, final_dir, quantization="q4_k_m"):
    """Export to GGUF for Ollama."""
    print(f"\nGGUF export ({quantization})...")

    gguf_dir = OUTPUT / "pmsi-gguf"
    gguf_dir.mkdir(parents=True, exist_ok=True)

    # Unsloth export
    model.save_pretrained_gguf(
        str(gguf_dir),
        tokenizer,
        quantization_method=quantization,
    )

    # Find the GGUF file
    gguf_files = list(gguf_dir.glob("*.gguf"))
    if gguf_files:
        gguf_path = gguf_files[0]
        print(f"  GGUF exported: {gguf_path} ({gguf_path.stat().st_size / 1024**3:.1f} GB)")

        # Create the Modelfile for Ollama
        modelfile_path = gguf_dir / "Modelfile"
        modelfile_content = f"""FROM {gguf_path.name}

PARAMETER temperature 0.3
PARAMETER top_p 0.9
PARAMETER num_ctx 8192
"""
        with open(modelfile_path, "w") as f:
            f.write(modelfile_content)

        print(f"  Modelfile created: {modelfile_path}")
        print("\n  To import into Ollama:")
        print(f"    cd {gguf_dir}")
        print("    ollama create pmsi-coder -f Modelfile")
    else:
        print("  No GGUF file found!")
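# Quick smoke test once imported into Ollama (the prompt is only an
# illustration, not part of the pipeline):
#   ollama run pmsi-coder "Séjour pour pneumopathie bactérienne : quel DP ?"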

def main():
    parser = argparse.ArgumentParser(description="QLoRA fine-tuning with Unsloth")

    # Model
    parser.add_argument("--model", default="unsloth/gemma-3-12b-it-bnb-4bit",
                        help="HuggingFace model name")
    parser.add_argument("--max-seq-length", type=int, default=512,
                        help="Maximum sequence length")

    # LoRA
    parser.add_argument("--lora-r", type=int, default=32, help="LoRA rank")
    parser.add_argument("--lora-alpha", type=int, default=64, help="LoRA alpha")
    parser.add_argument("--lora-dropout", type=float, default=0.0, help="LoRA dropout (0 = Unsloth fast patching)")

    # Training
    parser.add_argument("--epochs", type=int, default=3, help="Number of epochs")
    parser.add_argument("--lr", type=float, default=2e-4, help="Learning rate")
    parser.add_argument("--batch", type=int, default=1, help="Batch size per GPU")
    parser.add_argument("--grad-accum", type=int, default=8, help="Gradient accumulation steps")

    # Resume
    parser.add_argument("--resume", action="store_true", help="Resume from the last checkpoint")

    # Export
    parser.add_argument("--export-gguf", action="store_true", help="Export to GGUF after training")
    parser.add_argument("--gguf-quant", default="q4_k_m", help="GGUF quantization method")

    args = parser.parse_args()

    # Checks
    train_path, eval_path = check_prerequisites()

    # Load the model
    model, tokenizer = load_model(args.model, args.max_seq_length)

    # Attach LoRA
    model = attach_lora(model, r=args.lora_r, alpha=args.lora_alpha, dropout=args.lora_dropout)

    # Load the dataset
    train_ds, eval_ds = load_dataset(train_path, eval_path)

    # Train
    trainer, final_dir = train(model, tokenizer, train_ds, eval_ds, args)

    # Optional GGUF export
    if args.export_gguf:
        export_gguf(model, tokenizer, final_dir, args.gguf_quant)

    print("\n" + "=" * 50)
    print("Fine-tuning complete!")
    print(f"  LoRA adapter: {final_dir}")
    if args.export_gguf:
        print(f"  GGUF: {OUTPUT / 'pmsi-gguf'}")
    print("=" * 50)


if __name__ == "__main__":
    main()