- New module scripts/check_resources.py: GPU/VRAM/RAM/CPU status, require_resources() and wait_for_resources() with polling
- Integrated into finetune_camembert_bio.py (8 GB VRAM + 8 GB RAM)
- Integrated into run_batch_silver_export.py (workers × 4 GB RAM)
- Evaluator: EVA and RAI added to the medical terms (score 100.0/100)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
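A minimal sketch of the call-site guard described above, assuming the 8 GB thresholds map directly to require_resources() arguments (the exact code in finetune_camembert_bio.py may differ):

    from scripts.check_resources import require_resources

    # Fail fast before loading the model: 8 GB VRAM + 8 GB RAM
    # (8192 MB is an assumed conversion of "8 GB VRAM")
    require_resources(vram_free_mb=8192, ram_free_gb=8)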
348 lines
12 KiB
Python
#!/usr/bin/env python3
"""Machine resource check (GPU, RAM, CPU) before running a job.

Usable as a module or standalone:

    from scripts.check_resources import check_resources, require_resources

    # Hard check (raises RuntimeError if insufficient)
    require_resources(vram_free_mb=2000, ram_free_gb=4)

    # Informative check (returns a ResourceStatus)
    status = check_resources()
    print(status)

    # Standalone
    python scripts/check_resources.py
    python scripts/check_resources.py --vram 2000 --ram 4 --wait
"""

import subprocess
import shutil
import time
import sys
from dataclasses import dataclass, field
from typing import List


@dataclass
class GpuProcess:
    pid: int
    name: str
    vram_mb: int


@dataclass
class ResourceStatus:
    # GPU
    gpu_available: bool = False
    gpu_name: str = ""
    vram_total_mb: int = 0
    vram_used_mb: int = 0
    vram_free_mb: int = 0
    gpu_util_pct: int = 0
    gpu_processes: List[GpuProcess] = field(default_factory=list)
    # RAM
    ram_total_gb: float = 0.0
    ram_used_gb: float = 0.0
    ram_free_gb: float = 0.0
    ram_available_gb: float = 0.0
    # CPU
    cpu_count: int = 0
    load_avg_1m: float = 0.0
    load_avg_5m: float = 0.0

    def summary(self) -> str:
        lines = []
        lines.append("=" * 55)
        lines.append(" MACHINE RESOURCE STATUS")
        lines.append("=" * 55)

        # GPU
        if self.gpu_available:
            lines.append(f"\n GPU : {self.gpu_name}")
            lines.append(f"   Total VRAM  : {self.vram_total_mb} MB")
            lines.append(f"   Used VRAM   : {self.vram_used_mb} MB ({self._pct(self.vram_used_mb, self.vram_total_mb)}%)")
            lines.append(f"   Free VRAM   : {self.vram_free_mb} MB")
            lines.append(f"   Utilization : {self.gpu_util_pct}%")
            if self.gpu_processes:
                lines.append(f"   GPU processes ({len(self.gpu_processes)}):")
                for p in self.gpu_processes:
                    short_name = p.name.split("/")[-1] if "/" in p.name else p.name
                    project = self._guess_project(p.name)
                    label = f" ({project})" if project else ""
                    lines.append(f"     PID {p.pid}: {short_name}{label} — {p.vram_mb} MB")
            else:
                lines.append("   No active GPU process")
        else:
            lines.append("\n GPU : not available (nvidia-smi missing)")

        # RAM
        lines.append(f"\n RAM : {self.ram_total_gb:.1f} GB total")
        lines.append(f"   Used      : {self.ram_used_gb:.1f} GB")
        lines.append(f"   Available : {self.ram_available_gb:.1f} GB")

        # CPU
        lines.append(f"\n CPU : {self.cpu_count} cores")
        lines.append(f"   Load avg : {self.load_avg_1m:.1f} (1m) / {self.load_avg_5m:.1f} (5m)")

        lines.append("=" * 55)
        return "\n".join(lines)

    @staticmethod
    def _pct(used: int, total: int) -> int:
        return round(used * 100 / total) if total > 0 else 0

    @staticmethod
    def _guess_project(path: str) -> str:
        """Guess the project from the process path."""
        parts = path.split("/")
        for i, p in enumerate(parts):
            if p == "ai" and i + 1 < len(parts):
                return parts[i + 1].split(".")[0]
        return ""


def check_resources() -> ResourceStatus:
    """Collect the current state of the machine's resources."""
    status = ResourceStatus()

    # --- GPU ---
    if shutil.which("nvidia-smi"):
        status.gpu_available = True
        try:
            out = subprocess.run(
                ["nvidia-smi", "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5
            )
            if out.returncode == 0 and out.stdout.strip():
                # First GPU only: take the first output line before splitting
                # on commas (multi-GPU machines return one line per GPU)
                first_line = out.stdout.strip().splitlines()[0]
                parts = [p.strip() for p in first_line.split(",")]
                if len(parts) >= 5:
                    status.gpu_name = parts[0]
                    status.vram_total_mb = int(parts[1])
                    status.vram_used_mb = int(parts[2])
                    status.vram_free_mb = int(parts[3])
                    status.gpu_util_pct = int(parts[4])
        except Exception:
            pass

        # GPU processes
        try:
            out = subprocess.run(
                ["nvidia-smi", "--query-compute-apps=pid,process_name,used_gpu_memory",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5
            )
            if out.returncode == 0 and out.stdout.strip():
                for line in out.stdout.strip().splitlines():
                    parts = [p.strip() for p in line.split(",")]
                    if len(parts) >= 3:
                        try:
                            status.gpu_processes.append(GpuProcess(
                                pid=int(parts[0]),
                                name=parts[1],
                                vram_mb=int(parts[2]),
                            ))
                        except ValueError:
                            pass
        except Exception:
            pass

    # --- RAM ---
    try:
        with open("/proc/meminfo") as f:
            meminfo = {}
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    key = parts[0].rstrip(":")
                    meminfo[key] = int(parts[1])  # in kB

        status.ram_total_gb = meminfo.get("MemTotal", 0) / 1048576  # kB -> GB (1024**2)
        status.ram_free_gb = meminfo.get("MemFree", 0) / 1048576
        status.ram_available_gb = meminfo.get("MemAvailable", 0) / 1048576
        status.ram_used_gb = status.ram_total_gb - status.ram_free_gb
    except Exception:
        pass

    # --- CPU ---
    try:
        import os
        status.cpu_count = os.cpu_count() or 0
        load = os.getloadavg()
        status.load_avg_1m = load[0]
        status.load_avg_5m = load[1]
    except Exception:
        pass

    return status


def require_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    fail_if_gpu_busy: bool = False,
) -> ResourceStatus:
    """Check that the minimum required resources are available.

    Args:
        vram_free_mb: Minimum free VRAM required (MB). 0 = skip the GPU check.
        ram_free_gb: Minimum available RAM (GB).
        max_gpu_util_pct: Maximum tolerated GPU utilization (%).
        fail_if_gpu_busy: If True, fail when other processes are using the GPU.

    Returns:
        ResourceStatus if everything is fine.

    Raises:
        RuntimeError with details if resources are insufficient.
    """
    status = check_resources()
    errors = []

    if vram_free_mb > 0:
        if not status.gpu_available:
            errors.append(f"GPU required ({vram_free_mb} MB VRAM) but nvidia-smi is not available")
        elif status.vram_free_mb < vram_free_mb:
            procs = ""
            if status.gpu_processes:
                procs = "\n  Processes occupying the GPU:"
                for p in status.gpu_processes:
                    short = p.name.split("/")[-1]
                    project = status._guess_project(p.name)
                    label = f" ({project})" if project else ""
                    procs += f"\n    PID {p.pid}: {short}{label} — {p.vram_mb} MB"
            errors.append(
                f"Insufficient VRAM: {status.vram_free_mb} MB free, "
                f"{vram_free_mb} MB required (used: {status.vram_used_mb}/{status.vram_total_mb} MB)"
                f"{procs}"
            )

    if max_gpu_util_pct < 100 and status.gpu_available:
        if status.gpu_util_pct > max_gpu_util_pct:
            errors.append(
                f"GPU too busy: {status.gpu_util_pct}% utilization "
                f"(max tolerated: {max_gpu_util_pct}%)"
            )

    if fail_if_gpu_busy and status.gpu_processes:
        names = [f"{p.name.split('/')[-1]} (PID {p.pid})" for p in status.gpu_processes]
        errors.append(f"GPU busy with: {', '.join(names)}")

    if ram_free_gb > 0 and status.ram_available_gb < ram_free_gb:
        errors.append(
            f"Insufficient RAM: {status.ram_available_gb:.1f} GB available, "
            f"{ram_free_gb:.1f} GB required"
        )

    if errors:
        msg = "Insufficient resources:\n  " + "\n  ".join(errors)
        msg += "\n\n" + status.summary()
        raise RuntimeError(msg)

    return status


def wait_for_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    timeout_minutes: int = 30,
    check_interval_seconds: int = 30,
) -> ResourceStatus:
    """Wait until the resources are available (with a timeout).

    Prints a periodic message while the resources are insufficient.
    Useful before a fine-tuning run or a heavy batch job.

    Returns:
        ResourceStatus once the resources are available.

    Raises:
        TimeoutError if the timeout is reached.
    """
    deadline = time.time() + timeout_minutes * 60
    attempt = 0

    while time.time() < deadline:
        try:
            status = require_resources(
                vram_free_mb=vram_free_mb,
                ram_free_gb=ram_free_gb,
                max_gpu_util_pct=max_gpu_util_pct,
            )
            if attempt > 0:
                print(f"\nResources available after {attempt * check_interval_seconds}s of waiting.")
            return status
        except RuntimeError:
            attempt += 1
            if attempt == 1:
                print(f"Waiting for resources (timeout: {timeout_minutes}min)...")
                print(f"  Required: VRAM >= {vram_free_mb} MB, RAM >= {ram_free_gb} GB")

            remaining = int((deadline - time.time()) / 60)
            status = check_resources()
            gpu_info = f"free VRAM: {status.vram_free_mb} MB" if status.gpu_available else "no GPU"
            print(
                f"  [{attempt}] {gpu_info}, available RAM: {status.ram_available_gb:.1f} GB "
                f"— at most {remaining}min left",
                flush=True,
            )
            time.sleep(check_interval_seconds)

    raise TimeoutError(
        f"Timeout ({timeout_minutes}min): resources still insufficient.\n"
        + check_resources().summary()
    )
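

# Hypothetical call-site sketch (not defined in this module): the commit
# message says run_batch_silver_export.py reserves "workers × 4 GB RAM";
# the guard there presumably looks roughly like this, though the exact
# worker count and timeout are assumptions:
#
#     n_workers = 4
#     wait_for_resources(ram_free_gb=n_workers * 4, timeout_minutes=60)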


def main():
    import argparse

    parser = argparse.ArgumentParser(description="Machine resource check")
    parser.add_argument("--vram", type=int, default=0,
                        help="Minimum free VRAM required (MB)")
    parser.add_argument("--ram", type=float, default=0,
                        help="Minimum available RAM (GB)")
    parser.add_argument("--gpu-util", type=int, default=100,
                        help="Maximum tolerated GPU utilization (%%)")
    parser.add_argument("--wait", action="store_true",
                        help="Wait until the resources are available")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Wait timeout in minutes (default: 30)")
    args = parser.parse_args()

    # Show the current state
    status = check_resources()
    print(status.summary())

    # Check the thresholds if requested
    if args.vram > 0 or args.ram > 0 or args.gpu_util < 100:
        if args.wait:
            try:
                wait_for_resources(
                    vram_free_mb=args.vram,
                    ram_free_gb=args.ram,
                    max_gpu_util_pct=args.gpu_util,
                    timeout_minutes=args.timeout,
                )
                print("\nOK — resources available.")
            except TimeoutError as e:
                print(f"\nERROR: {e}", file=sys.stderr)
                sys.exit(1)
        else:
            try:
                require_resources(
                    vram_free_mb=args.vram,
                    ram_free_gb=args.ram,
                    max_gpu_util_pct=args.gpu_util,
                )
                print("\nOK — resources sufficient.")
            except RuntimeError as e:
                print(f"\nERROR: {e}", file=sys.stderr)
                sys.exit(1)


if __name__ == "__main__":
    main()
|