#!/usr/bin/env python3 """ Phase 2 — Fine-tuning QLoRA de gemma3:12b avec Unsloth. IMPORTANT : Arrêter Ollama avant de lancer ce script ! sudo systemctl stop ollama Le script : 1. Charge gemma3:12b en 4-bit quantifié 2. Attache un adaptateur LoRA 3. Entraîne sur le dataset PMSI (ChatML) 4. Sauvegarde l'adaptateur LoRA 5. (Optionnel) Exporte en GGUF pour Ollama Prérequis : - bash scripts/07_setup_unsloth.sh (installer les dépendances) - data/datasets/pmsi_train.jsonl + pmsi_eval.jsonl Usage : python scripts/08_train_lora.py [--epochs 3] [--lr 2e-4] [--batch 1] [--export-gguf] Hardware cible : RTX 5070 (12 Go VRAM) - QLoRA 4-bit sur 12B ≈ 8-9 Go VRAM - batch_size=1 + gradient_accumulation=8 → batch effectif de 8 - gradient_checkpointing pour économiser la VRAM """ import argparse import json import os from pathlib import Path BASE = Path(__file__).resolve().parent.parent DATASETS = BASE / "data" / "datasets" OUTPUT = BASE / "models" OUTPUT.mkdir(parents=True, exist_ok=True) def check_prerequisites(): """Vérifier que tout est prêt.""" import torch # GPU disponible ? if not torch.cuda.is_available(): raise RuntimeError("CUDA non disponible. Vérifiez votre installation GPU.") gpu_name = torch.cuda.get_device_name(0) vram_total = torch.cuda.get_device_properties(0).total_memory / 1024**3 vram_free = (torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0)) / 1024**3 print(f"GPU: {gpu_name}") print(f"VRAM: {vram_total:.1f} Go total, {vram_free:.1f} Go libre") if vram_free < 10: print("⚠ Moins de 10 Go de VRAM libre.") print(" → Arrêtez Ollama : sudo systemctl stop ollama") print(" → Ou utilisez --offload-cpu pour décharger sur la RAM") # Dataset existe ? train_path = DATASETS / "pmsi_train.jsonl" eval_path = DATASETS / "pmsi_eval.jsonl" if not train_path.exists() or not eval_path.exists(): raise FileNotFoundError( f"Dataset non trouvé. Lancez d'abord : python scripts/04_build_dataset.py" ) # Compter les exemples with open(train_path) as f: n_train = sum(1 for _ in f) with open(eval_path) as f: n_eval = sum(1 for _ in f) print(f"Dataset: {n_train} train + {n_eval} eval") return train_path, eval_path def load_model(model_name, max_seq_length, load_in_4bit=True): """Charger le modèle avec Unsloth.""" from unsloth import FastLanguageModel print(f"\nChargement de {model_name} (4-bit={load_in_4bit})...") model, tokenizer = FastLanguageModel.from_pretrained( model_name=model_name, max_seq_length=max_seq_length, dtype=None, # Auto-détection load_in_4bit=load_in_4bit, ) print(f" Modèle chargé : {model.config._name_or_path}") print(f" Paramètres : {model.num_parameters() / 1e9:.1f}B") return model, tokenizer def attach_lora(model, r=32, alpha=64, dropout=0.05): """Attacher l'adaptateur LoRA.""" from unsloth import FastLanguageModel print(f"\nAttachement LoRA (r={r}, alpha={alpha}, dropout={dropout})...") model = FastLanguageModel.get_peft_model( model, r=r, target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", ], lora_alpha=alpha, lora_dropout=dropout, bias="none", use_gradient_checkpointing="unsloth", # Économise 30% VRAM random_state=42, ) trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) total = sum(p.numel() for p in model.parameters()) print(f" Paramètres entraînables : {trainable / 1e6:.1f}M / {total / 1e9:.1f}B ({100 * trainable / total:.2f}%)") return model def load_dataset(train_path, eval_path): """Charger le dataset au format ChatML.""" from datasets import Dataset def load_jsonl(path): examples = [] with open(path) as f: for line in f: examples.append(json.loads(line.strip())) return examples train_data = load_jsonl(train_path) eval_data = load_jsonl(eval_path) train_ds = Dataset.from_list(train_data) eval_ds = Dataset.from_list(eval_data) print(f"\nDataset chargé :") print(f" Train : {len(train_ds)} exemples") print(f" Eval : {len(eval_ds)} exemples") return train_ds, eval_ds def format_chat(example, tokenizer): """Formater un exemple ChatML pour l'entraînement.""" messages = example["messages"] # Utiliser le template de chat du tokenizer text = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=False, ) return {"text": text} def train(model, tokenizer, train_ds, eval_ds, args): """Lancer l'entraînement.""" from trl import SFTTrainer, SFTConfig from aim.hugging_face import AimCallback print(f"\nConfiguration d'entraînement :") print(f" Epochs : {args.epochs}") print(f" Learning rate : {args.lr}") print(f" Batch size : {args.batch} (gradient_accumulation={args.grad_accum})") print(f" Batch effectif : {args.batch * args.grad_accum}") print(f" Max seq length : {args.max_seq_length}") # Formater le dataset train_ds = train_ds.map(lambda x: format_chat(x, tokenizer), num_proc=4) eval_ds = eval_ds.map(lambda x: format_chat(x, tokenizer), num_proc=4) output_dir = OUTPUT / "pmsi-lora-checkpoints" # Callback Aim pour le tracking des métriques aim_callback = AimCallback( repo=str(BASE), experiment="pmsi-coder-v2", ) training_args = SFTConfig( output_dir=str(output_dir), num_train_epochs=args.epochs, per_device_train_batch_size=args.batch, per_device_eval_batch_size=args.batch, gradient_accumulation_steps=args.grad_accum, learning_rate=args.lr, weight_decay=0.01, warmup_ratio=0.05, lr_scheduler_type="cosine", logging_steps=10, eval_strategy="steps", eval_steps=1000, save_strategy="steps", save_steps=500, save_total_limit=3, fp16=False, bf16=True, max_seq_length=args.max_seq_length, dataset_text_field="text", seed=42, report_to="none", ) trainer = SFTTrainer( model=model, tokenizer=tokenizer, train_dataset=train_ds, eval_dataset=eval_ds, args=training_args, callbacks=[aim_callback], ) print(f"\nDémarrage de l'entraînement...") print(f" Output : {output_dir}") print(f" Steps estimés : ~{len(train_ds) * args.epochs // (args.batch * args.grad_accum)}") if args.resume: print(f" Reprise depuis le dernier checkpoint...") trainer.train(resume_from_checkpoint=True) else: trainer.train() # Sauvegarder le modèle final final_dir = OUTPUT / "pmsi-lora-final" model.save_pretrained(str(final_dir)) tokenizer.save_pretrained(str(final_dir)) print(f"\nModèle LoRA sauvegardé : {final_dir}") return trainer, final_dir def export_gguf(model, tokenizer, final_dir, quantization="q4_k_m"): """Exporter en GGUF pour Ollama.""" print(f"\nExport GGUF ({quantization})...") gguf_dir = OUTPUT / "pmsi-gguf" gguf_dir.mkdir(parents=True, exist_ok=True) # Unsloth export model.save_pretrained_gguf( str(gguf_dir), tokenizer, quantization_method=quantization, ) # Trouver le fichier GGUF gguf_files = list(gguf_dir.glob("*.gguf")) if gguf_files: gguf_path = gguf_files[0] print(f" GGUF exporté : {gguf_path} ({gguf_path.stat().st_size / 1024**3:.1f} Go)") # Créer le Modelfile pour Ollama modelfile_path = gguf_dir / "Modelfile" modelfile_content = f"""FROM {gguf_path.name} PARAMETER temperature 0.3 PARAMETER top_p 0.9 PARAMETER num_ctx 8192 """ with open(modelfile_path, "w") as f: f.write(modelfile_content) print(f" Modelfile créé : {modelfile_path}") print(f"\n Pour importer dans Ollama :") print(f" cd {gguf_dir}") print(f" ollama create pmsi-coder -f Modelfile") else: print(" Aucun fichier GGUF trouvé !") def main(): parser = argparse.ArgumentParser(description="Fine-tuning QLoRA avec Unsloth") # Modèle parser.add_argument("--model", default="unsloth/gemma-3-12b-it-bnb-4bit", help="Nom du modèle HuggingFace") parser.add_argument("--max-seq-length", type=int, default=512, help="Longueur max des séquences") # LoRA parser.add_argument("--lora-r", type=int, default=32, help="Rang LoRA") parser.add_argument("--lora-alpha", type=int, default=64, help="Alpha LoRA") parser.add_argument("--lora-dropout", type=float, default=0.0, help="Dropout LoRA (0=fast patching Unsloth)") # Entraînement parser.add_argument("--epochs", type=int, default=3, help="Nombre d'epochs") parser.add_argument("--lr", type=float, default=2e-4, help="Learning rate") parser.add_argument("--batch", type=int, default=1, help="Batch size par GPU") parser.add_argument("--grad-accum", type=int, default=8, help="Gradient accumulation steps") # Resume parser.add_argument("--resume", action="store_true", help="Reprendre depuis le dernier checkpoint") # Export parser.add_argument("--export-gguf", action="store_true", help="Exporter en GGUF après entraînement") parser.add_argument("--gguf-quant", default="q4_k_m", help="Méthode de quantification GGUF") args = parser.parse_args() # Vérifications train_path, eval_path = check_prerequisites() # Charger le modèle model, tokenizer = load_model(args.model, args.max_seq_length) # Attacher LoRA model = attach_lora(model, r=args.lora_r, alpha=args.lora_alpha, dropout=args.lora_dropout) # Charger le dataset train_ds, eval_ds = load_dataset(train_path, eval_path) # Entraîner trainer, final_dir = train(model, tokenizer, train_ds, eval_ds, args) # Export GGUF optionnel if args.export_gguf: export_gguf(model, tokenizer, final_dir, args.gguf_quant) print("\n" + "=" * 50) print("Fine-tuning terminé !") print(f" Adaptateur LoRA : {final_dir}") if args.export_gguf: print(f" GGUF : {OUTPUT / 'pmsi-gguf'}") print("=" * 50) if __name__ == "__main__": main()