#!/usr/bin/env python3
"""Machine resource checks (GPU, RAM, CPU) before running heavy jobs.

Usable as a module or standalone:

    from scripts.check_resources import check_resources, require_resources

    # Simple check (raises RuntimeError when insufficient)
    require_resources(vram_free_mb=2000, ram_free_gb=4)

    # Informative check (returns a status object)
    status = check_resources()
    print(status)

    # Standalone
    python scripts/check_resources.py
    python scripts/check_resources.py --vram 2000 --ram 4 --wait
"""

import argparse
import os
import shutil
import subprocess
import sys
import time
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class GpuProcess:
    """One process currently holding GPU memory, as reported by nvidia-smi."""
    pid: int        # OS process id
    name: str       # full executable path reported by the driver
    vram_mb: int    # GPU memory held by this process, in MB


@dataclass
class ResourceStatus:
    """Snapshot of machine resources; all fields default to zero/unknown."""
    # GPU
    gpu_available: bool = False
    gpu_name: str = ""
    vram_total_mb: int = 0
    vram_used_mb: int = 0
    vram_free_mb: int = 0
    gpu_util_pct: int = 0
    gpu_processes: List[GpuProcess] = field(default_factory=list)
    # RAM
    ram_total_gb: float = 0.0
    ram_used_gb: float = 0.0
    ram_free_gb: float = 0.0
    ram_available_gb: float = 0.0
    # CPU
    cpu_count: int = 0
    load_avg_1m: float = 0.0
    load_avg_5m: float = 0.0

    def summary(self) -> str:
        """Return a human-readable multi-line report of this snapshot."""
        lines = []
        lines.append("=" * 55)
        lines.append(" ÉTAT DES RESSOURCES MACHINE")
        lines.append("=" * 55)
        # GPU section
        if self.gpu_available:
            lines.append(f"\n GPU : {self.gpu_name}")
            lines.append(f" VRAM totale : {self.vram_total_mb} Mo")
            lines.append(f" VRAM utilisée: {self.vram_used_mb} Mo ({self._pct(self.vram_used_mb, self.vram_total_mb)}%)")
            lines.append(f" VRAM libre : {self.vram_free_mb} Mo")
            lines.append(f" Utilisation : {self.gpu_util_pct}%")
            if self.gpu_processes:
                lines.append(f" Processus GPU ({len(self.gpu_processes)}) :")
                for p in self.gpu_processes:
                    short_name = p.name.split("/")[-1] if "/" in p.name else p.name
                    project = self._guess_project(p.name)
                    label = f" ({project})" if project else ""
                    lines.append(f" PID {p.pid}: {short_name}{label} — {p.vram_mb} Mo")
            else:
                lines.append(" Aucun processus GPU actif")
        else:
            lines.append("\n GPU : non disponible (nvidia-smi absent)")
        # RAM section
        lines.append(f"\n RAM : {self.ram_total_gb:.1f} Go total")
        lines.append(f" Utilisée : {self.ram_used_gb:.1f} Go")
        lines.append(f" Disponible : {self.ram_available_gb:.1f} Go")
        # CPU section
        lines.append(f"\n CPU : {self.cpu_count} cœurs")
        lines.append(f" Load avg : {self.load_avg_1m:.1f} (1m) / {self.load_avg_5m:.1f} (5m)")
        lines.append("=" * 55)
        return "\n".join(lines)

    @staticmethod
    def _pct(used: int, total: int) -> int:
        """Integer percentage used/total; 0 when total is unknown (avoids ZeroDivisionError)."""
        return round(used * 100 / total) if total > 0 else 0

    @staticmethod
    def _guess_project(path: str) -> str:
        """Guess a project name from a process path.

        Convention: the path component following an 'ai' directory is the
        project name (extension stripped). Returns "" when no match.
        """
        parts = path.split("/")
        for i, p in enumerate(parts):
            if p == "ai" and i + 1 < len(parts):
                return parts[i + 1].split(".")[0]
        return ""


def check_resources() -> ResourceStatus:
    """Collect the current machine resource state.

    Best-effort: every probe (nvidia-smi, /proc/meminfo, load average) is
    individually guarded, so a missing tool or unsupported platform leaves
    the corresponding fields at their zero defaults instead of raising.

    Returns:
        A populated ResourceStatus snapshot.
    """
    status = ResourceStatus()

    # --- GPU ---
    if shutil.which("nvidia-smi"):
        status.gpu_available = True
        try:
            out = subprocess.run(
                ["nvidia-smi",
                 "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5
            )
            if out.returncode == 0:
                # rsplit: the GPU name (first field) may itself contain a
                # comma; the four numeric fields are always the last four.
                parts = [p.strip() for p in out.stdout.strip().rsplit(",", 4)]
                if len(parts) >= 5:
                    status.gpu_name = parts[0]
                    status.vram_total_mb = int(parts[1])
                    status.vram_used_mb = int(parts[2])
                    status.vram_free_mb = int(parts[3])
                    status.gpu_util_pct = int(parts[4])
        except Exception:
            pass  # best-effort: leave GPU fields at their defaults

        # GPU processes
        try:
            out = subprocess.run(
                ["nvidia-smi",
                 "--query-compute-apps=pid,process_name,used_gpu_memory",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5
            )
            if out.returncode == 0 and out.stdout.strip():
                for line in out.stdout.strip().splitlines():
                    # pid is the first field and memory the last; the process
                    # path in the middle may contain commas, so split from
                    # both ends instead of on every comma.
                    try:
                        pid_part, rest = line.split(",", 1)
                        name_part, mem_part = rest.rsplit(",", 1)
                        status.gpu_processes.append(GpuProcess(
                            pid=int(pid_part.strip()),
                            name=name_part.strip(),
                            vram_mb=int(mem_part.strip()),
                        ))
                    except ValueError:
                        pass  # malformed line: skip it
        except Exception:
            pass

    # --- RAM (Linux only: reads /proc/meminfo) ---
    try:
        with open("/proc/meminfo") as f:
            meminfo = {}
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    key = parts[0].rstrip(":")
                    meminfo[key] = int(parts[1])  # values are in kB
        status.ram_total_gb = meminfo.get("MemTotal", 0) / 1048576
        status.ram_free_gb = meminfo.get("MemFree", 0) / 1048576
        status.ram_available_gb = meminfo.get("MemAvailable", 0) / 1048576
        # "used" counts buffers/cache as used (total - free, not total - available)
        status.ram_used_gb = status.ram_total_gb - status.ram_free_gb
    except Exception:
        pass

    # --- CPU ---
    try:
        status.cpu_count = os.cpu_count() or 0
        load = os.getloadavg()  # not available on Windows -> caught below
        status.load_avg_1m = load[0]
        status.load_avg_5m = load[1]
    except Exception:
        pass

    return status


def require_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    fail_if_gpu_busy: bool = False,
) -> ResourceStatus:
    """Check that the minimum required resources are available.

    Args:
        vram_free_mb: Minimum free VRAM required (MB). 0 = skip the GPU check.
        ram_free_gb: Minimum available RAM (GB).
        max_gpu_util_pct: Maximum tolerated GPU utilization (%).
        fail_if_gpu_busy: If True, fail when other processes use the GPU.

    Returns:
        ResourceStatus when everything is OK.

    Raises:
        RuntimeError: with details when resources are insufficient.
    """
    status = check_resources()
    errors = []

    if vram_free_mb > 0:
        if not status.gpu_available:
            errors.append(f"GPU requis ({vram_free_mb} Mo VRAM) mais nvidia-smi non disponible")
        elif status.vram_free_mb < vram_free_mb:
            # List the processes currently holding VRAM to help diagnosis.
            procs = ""
            if status.gpu_processes:
                procs = "\n Processus occupant le GPU :"
                for p in status.gpu_processes:
                    short = p.name.split("/")[-1]
                    project = status._guess_project(p.name)
                    label = f" ({project})" if project else ""
                    procs += f"\n PID {p.pid}: {short}{label} — {p.vram_mb} Mo"
            errors.append(
                f"VRAM insuffisante : {status.vram_free_mb} Mo libre, "
                f"{vram_free_mb} Mo requis (utilisé: {status.vram_used_mb}/{status.vram_total_mb} Mo)"
                f"{procs}"
            )

    if max_gpu_util_pct < 100 and status.gpu_available:
        if status.gpu_util_pct > max_gpu_util_pct:
            errors.append(
                f"GPU trop chargé : {status.gpu_util_pct}% d'utilisation "
                f"(max toléré: {max_gpu_util_pct}%)"
            )

    if fail_if_gpu_busy and status.gpu_processes:
        names = [f"{p.name.split('/')[-1]} (PID {p.pid})" for p in status.gpu_processes]
        errors.append(f"GPU occupé par : {', '.join(names)}")

    if ram_free_gb > 0 and status.ram_available_gb < ram_free_gb:
        errors.append(
            f"RAM insuffisante : {status.ram_available_gb:.1f} Go disponible, "
            f"{ram_free_gb:.1f} Go requis"
        )

    if errors:
        msg = "Ressources insuffisantes :\n " + "\n ".join(errors)
        msg += "\n\n" + status.summary()
        raise RuntimeError(msg)

    return status


def wait_for_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    timeout_minutes: int = 30,
    check_interval_seconds: int = 30,
) -> ResourceStatus:
    """Wait until resources become available (with a timeout).

    Prints a periodic message while resources are insufficient. Useful before
    a fine-tuning run or a heavy batch job.

    Args:
        vram_free_mb: Minimum free VRAM required (MB). 0 = skip the GPU check.
        ram_free_gb: Minimum available RAM (GB).
        max_gpu_util_pct: Maximum tolerated GPU utilization (%).
        timeout_minutes: Give up after this many minutes.
        check_interval_seconds: Delay between two checks.

    Returns:
        ResourceStatus once resources are available.

    Raises:
        TimeoutError: when the timeout is reached.
    """
    deadline = time.time() + timeout_minutes * 60
    attempt = 0

    while time.time() < deadline:
        try:
            status = require_resources(
                vram_free_mb=vram_free_mb,
                ram_free_gb=ram_free_gb,
                max_gpu_util_pct=max_gpu_util_pct,
            )
            if attempt > 0:
                print(f"\nRessources disponibles après {attempt * check_interval_seconds}s d'attente.")
            return status
        except RuntimeError:
            attempt += 1
            if attempt == 1:
                print(f"En attente de ressources (timeout: {timeout_minutes}min)...")
                print(f" Requis: VRAM >= {vram_free_mb} Mo, RAM >= {ram_free_gb} Go")
            remaining = int((deadline - time.time()) / 60)
            status = check_resources()
            gpu_info = f"VRAM libre: {status.vram_free_mb} Mo" if status.gpu_available else "pas de GPU"
            print(
                f" [{attempt}] {gpu_info}, RAM dispo: {status.ram_available_gb:.1f} Go "
                f"— encore {remaining}min max",
                flush=True,
            )
            time.sleep(check_interval_seconds)

    raise TimeoutError(
        f"Timeout ({timeout_minutes}min) : ressources toujours insuffisantes.\n"
        + check_resources().summary()
    )


def main():
    """CLI entry point: print the current state, then enforce thresholds if given."""
    parser = argparse.ArgumentParser(description="Vérification des ressources machine")
    parser.add_argument("--vram", type=int, default=0, help="VRAM libre minimale requise (Mo)")
    parser.add_argument("--ram", type=float, default=0, help="RAM disponible minimale (Go)")
    parser.add_argument("--gpu-util", type=int, default=100, help="Utilisation GPU max tolérée (%%)")
    parser.add_argument("--wait", action="store_true", help="Attendre que les ressources soient disponibles")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout d'attente en minutes (défaut: 30)")
    args = parser.parse_args()

    # Always show the current state first.
    status = check_resources()
    print(status.summary())

    # Enforce thresholds only when at least one was requested.
    if args.vram > 0 or args.ram > 0 or args.gpu_util < 100:
        if args.wait:
            try:
                wait_for_resources(
                    vram_free_mb=args.vram,
                    ram_free_gb=args.ram,
                    max_gpu_util_pct=args.gpu_util,
                    timeout_minutes=args.timeout,
                )
                print("\nOK — ressources disponibles.")
            except TimeoutError as e:
                print(f"\nERREUR : {e}", file=sys.stderr)
                sys.exit(1)
        else:
            try:
                require_resources(
                    vram_free_mb=args.vram,
                    ram_free_gb=args.ram,
                    max_gpu_util_pct=args.gpu_util,
                )
                print("\nOK — ressources suffisantes.")
            except RuntimeError as e:
                print(f"\nERREUR : {e}", file=sys.stderr)
                sys.exit(1)


if __name__ == "__main__":
    main()