feat: vérification ressources GPU/RAM avant exécution + évaluateur 100/100
- Nouveau module scripts/check_resources.py : état GPU/VRAM/RAM/CPU, require_resources() et wait_for_resources() avec polling - Intégré dans finetune_camembert_bio.py (8 Go VRAM + 8 Go RAM) - Intégré dans run_batch_silver_export.py (workers × 4 Go RAM) - Évaluateur : EVA et RAI ajoutés aux termes médicaux (score 100.0/100) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
347
scripts/check_resources.py
Normal file
347
scripts/check_resources.py
Normal file
@@ -0,0 +1,347 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Vérification des ressources machine (GPU, RAM, CPU) avant exécution.
|
||||
|
||||
Utilisable comme module ou en standalone :
|
||||
from scripts.check_resources import check_resources, require_resources
|
||||
|
||||
# Vérification simple (lève RuntimeError si insuffisant)
|
||||
require_resources(vram_free_mb=2000, ram_free_gb=4)
|
||||
|
||||
# Vérification informative (retourne un dict)
|
||||
status = check_resources()
|
||||
print(status)
|
||||
|
||||
# En standalone
|
||||
python scripts/check_resources.py
|
||||
python scripts/check_resources.py --vram 2000 --ram 4 --wait
|
||||
"""
|
||||
import os
import shutil
import subprocess
import sys
import time
from dataclasses import dataclass, field
from typing import List, Optional
|
||||
|
||||
|
||||
@dataclass
class GpuProcess:
    """A single process currently holding GPU memory, as reported by nvidia-smi."""
    pid: int       # process id
    name: str      # full executable path reported by nvidia-smi
    vram_mb: int   # GPU memory held by this process, in MB
|
||||
|
||||
|
||||
@dataclass
class ResourceStatus:
    """Snapshot of machine resources (GPU, RAM, CPU) at collection time.

    All fields default to "nothing measured"; check_resources() fills in
    whatever it can probe and leaves the rest at these defaults.
    """
    # GPU
    gpu_available: bool = False  # True iff nvidia-smi was found on PATH
    gpu_name: str = ""
    vram_total_mb: int = 0
    vram_used_mb: int = 0
    vram_free_mb: int = 0
    gpu_util_pct: int = 0
    gpu_processes: List[GpuProcess] = field(default_factory=list)
    # RAM (from /proc/meminfo, in GB)
    ram_total_gb: float = 0.0
    ram_used_gb: float = 0.0
    ram_free_gb: float = 0.0
    ram_available_gb: float = 0.0  # MemAvailable — the figure the RAM checks use
    # CPU
    cpu_count: int = 0
    load_avg_1m: float = 0.0
    load_avg_5m: float = 0.0

    def summary(self) -> str:
        """Return a human-readable multi-line report of all fields (in French)."""
        lines = []
        lines.append("=" * 55)
        lines.append(" ÉTAT DES RESSOURCES MACHINE")
        lines.append("=" * 55)

        # GPU
        if self.gpu_available:
            lines.append(f"\n GPU : {self.gpu_name}")
            lines.append(f" VRAM totale : {self.vram_total_mb} Mo")
            lines.append(f" VRAM utilisée: {self.vram_used_mb} Mo ({self._pct(self.vram_used_mb, self.vram_total_mb)}%)")
            lines.append(f" VRAM libre : {self.vram_free_mb} Mo")
            lines.append(f" Utilisation : {self.gpu_util_pct}%")
            if self.gpu_processes:
                lines.append(f" Processus GPU ({len(self.gpu_processes)}) :")
                for p in self.gpu_processes:
                    # Show only the executable name plus a best-effort project label.
                    short_name = p.name.split("/")[-1] if "/" in p.name else p.name
                    project = self._guess_project(p.name)
                    label = f" ({project})" if project else ""
                    lines.append(f" PID {p.pid}: {short_name}{label} — {p.vram_mb} Mo")
            else:
                lines.append(" Aucun processus GPU actif")
        else:
            lines.append("\n GPU : non disponible (nvidia-smi absent)")

        # RAM
        lines.append(f"\n RAM : {self.ram_total_gb:.1f} Go total")
        lines.append(f" Utilisée : {self.ram_used_gb:.1f} Go")
        lines.append(f" Disponible : {self.ram_available_gb:.1f} Go")

        # CPU
        lines.append(f"\n CPU : {self.cpu_count} cœurs")
        lines.append(f" Load avg : {self.load_avg_1m:.1f} (1m) / {self.load_avg_5m:.1f} (5m)")

        lines.append("=" * 55)
        return "\n".join(lines)

    @staticmethod
    def _pct(used: int, total: int) -> int:
        # Integer percentage; 0 when total is unknown, avoiding ZeroDivisionError.
        return round(used * 100 / total) if total > 0 else 0

    @staticmethod
    def _guess_project(path: str) -> str:
        """Guess a project name from a process executable path.

        Looks for an ".../ai/<project>/..." segment and returns <project>
        without its extension; returns "" when no such segment exists.
        """
        parts = path.split("/")
        for i, p in enumerate(parts):
            if p == "ai" and i + 1 < len(parts):
                return parts[i + 1].split(".")[0]
        return ""
|
||||
|
||||
|
||||
def check_resources() -> ResourceStatus:
    """Collect the current machine resource state (GPU, RAM, CPU).

    Best-effort by design: every probe failure (missing nvidia-smi output,
    non-Linux /proc, unsupported getloadavg, timeouts) is swallowed and
    leaves the corresponding fields at their ResourceStatus defaults, so
    this function never raises.

    Returns:
        ResourceStatus populated with whatever could be measured.
    """
    status = ResourceStatus()

    # --- GPU (via nvidia-smi, if present on PATH) ---
    if shutil.which("nvidia-smi"):
        status.gpu_available = True
        try:
            out = subprocess.run(
                ["nvidia-smi",
                 "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5,
            )
            if out.returncode == 0 and out.stdout.strip():
                # nvidia-smi emits one CSV line per GPU. The previous code
                # split the whole stdout on commas, which on multi-GPU
                # machines fused "util\n<next name>" into one field and made
                # int() raise, leaving the status partially populated.
                # Report the first GPU only, parsed from its own line.
                first = out.stdout.strip().splitlines()[0]
                parts = [p.strip() for p in first.split(",")]
                if len(parts) >= 5:
                    status.gpu_name = parts[0]
                    status.vram_total_mb = int(parts[1])
                    status.vram_used_mb = int(parts[2])
                    status.vram_free_mb = int(parts[3])
                    status.gpu_util_pct = int(parts[4])
        except Exception:
            pass  # best-effort: keep GPU fields at defaults

        # GPU compute processes (one CSV line per process)
        try:
            out = subprocess.run(
                ["nvidia-smi",
                 "--query-compute-apps=pid,process_name,used_gpu_memory",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5,
            )
            if out.returncode == 0 and out.stdout.strip():
                for line in out.stdout.strip().splitlines():
                    parts = [p.strip() for p in line.split(",")]
                    if len(parts) >= 3:
                        try:
                            status.gpu_processes.append(GpuProcess(
                                pid=int(parts[0]),
                                name=parts[1],
                                vram_mb=int(parts[2]),
                            ))
                        except ValueError:
                            pass  # skip unparsable rows (e.g. "[N/A]" values)
        except Exception:
            pass  # best-effort

    # --- RAM (Linux only: /proc/meminfo, values in kB) ---
    try:
        meminfo = {}
        with open("/proc/meminfo") as f:
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    meminfo[parts[0].rstrip(":")] = int(parts[1])  # kB

        # 1048576 kB per GB.
        status.ram_total_gb = meminfo.get("MemTotal", 0) / 1048576
        status.ram_free_gb = meminfo.get("MemFree", 0) / 1048576
        status.ram_available_gb = meminfo.get("MemAvailable", 0) / 1048576
        status.ram_used_gb = status.ram_total_gb - status.ram_free_gb
    except Exception:
        pass  # non-Linux or unreadable /proc

    # --- CPU ---
    try:
        status.cpu_count = os.cpu_count() or 0
        load = os.getloadavg()  # raises OSError where unsupported
        status.load_avg_1m = load[0]
        status.load_avg_5m = load[1]
    except Exception:
        pass

    return status
|
||||
|
||||
|
||||
def require_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    fail_if_gpu_busy: bool = False,
) -> ResourceStatus:
    """Check that the minimal required resources are available.

    Args:
        vram_free_mb: Minimum free VRAM required (MB). 0 = skip GPU check.
        ram_free_gb: Minimum available RAM (GB). 0 = skip RAM check.
        max_gpu_util_pct: Maximum tolerated GPU utilisation (%). 100 = no check.
        fail_if_gpu_busy: If True, fail when other processes hold the GPU.

    Returns:
        The current ResourceStatus when every requested threshold is met.

    Raises:
        RuntimeError: with an aggregated, detailed (French) message — including
            the full resource summary — when any threshold is not met.
    """
    status = check_resources()
    errors = []

    # VRAM check (only when a VRAM requirement was given).
    if vram_free_mb > 0:
        if not status.gpu_available:
            errors.append(f"GPU requis ({vram_free_mb} Mo VRAM) mais nvidia-smi non disponible")
        elif status.vram_free_mb < vram_free_mb:
            # List the processes currently holding VRAM, to help the user
            # decide what to stop.
            procs = ""
            if status.gpu_processes:
                procs = "\n Processus occupant le GPU :"
                for p in status.gpu_processes:
                    short = p.name.split("/")[-1]
                    project = status._guess_project(p.name)
                    label = f" ({project})" if project else ""
                    procs += f"\n PID {p.pid}: {short}{label} — {p.vram_mb} Mo"
            errors.append(
                f"VRAM insuffisante : {status.vram_free_mb} Mo libre, "
                f"{vram_free_mb} Mo requis (utilisé: {status.vram_used_mb}/{status.vram_total_mb} Mo)"
                f"{procs}"
            )

    # GPU utilisation ceiling (only meaningful when a GPU is present).
    if max_gpu_util_pct < 100 and status.gpu_available:
        if status.gpu_util_pct > max_gpu_util_pct:
            errors.append(
                f"GPU trop chargé : {status.gpu_util_pct}% d'utilisation "
                f"(max toléré: {max_gpu_util_pct}%)"
            )

    # Exclusive-GPU requirement.
    if fail_if_gpu_busy and status.gpu_processes:
        names = [f"{p.name.split('/')[-1]} (PID {p.pid})" for p in status.gpu_processes]
        errors.append(f"GPU occupé par : {', '.join(names)}")

    # RAM check — compares against MemAvailable, not MemFree.
    if ram_free_gb > 0 and status.ram_available_gb < ram_free_gb:
        errors.append(
            f"RAM insuffisante : {status.ram_available_gb:.1f} Go disponible, "
            f"{ram_free_gb:.1f} Go requis"
        )

    # All failures are aggregated into a single exception message.
    if errors:
        msg = "Ressources insuffisantes :\n " + "\n ".join(errors)
        msg += "\n\n" + status.summary()
        raise RuntimeError(msg)

    return status
|
||||
|
||||
|
||||
def wait_for_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    timeout_minutes: int = 30,
    check_interval_seconds: int = 30,
) -> ResourceStatus:
    """Block until the requested resources are available (with a timeout).

    Polls require_resources() every ``check_interval_seconds``, printing a
    periodic progress line while resources are insufficient. Useful before a
    fine-tuning run or a heavy batch job.

    Args:
        vram_free_mb: Minimum free VRAM required (MB). 0 = skip GPU check.
        ram_free_gb: Minimum available RAM (GB). 0 = skip RAM check.
        max_gpu_util_pct: Maximum tolerated GPU utilisation (%).
        timeout_minutes: How long to keep waiting before giving up.
        check_interval_seconds: Delay between two polls.

    Returns:
        ResourceStatus once the resources are available.

    Raises:
        TimeoutError: if the timeout elapses while resources are still
            insufficient (message includes the final resource summary).
    """
    deadline = time.time() + timeout_minutes * 60
    attempt = 0

    while time.time() < deadline:
        try:
            status = require_resources(
                vram_free_mb=vram_free_mb,
                ram_free_gb=ram_free_gb,
                max_gpu_util_pct=max_gpu_util_pct,
            )
            if attempt > 0:
                print(f"\nRessources disponibles après {attempt * check_interval_seconds}s d'attente.")
            return status
        except RuntimeError:  # resources still insufficient; keep polling
            attempt += 1
            if attempt == 1:
                print(f"En attente de ressources (timeout: {timeout_minutes}min)...")
                print(f" Requis: VRAM >= {vram_free_mb} Mo, RAM >= {ram_free_gb} Go")

            remaining = int((deadline - time.time()) / 60)
            status = check_resources()
            gpu_info = f"VRAM libre: {status.vram_free_mb} Mo" if status.gpu_available else "pas de GPU"
            print(
                f" [{attempt}] {gpu_info}, RAM dispo: {status.ram_available_gb:.1f} Go "
                f"— encore {remaining}min max",
                flush=True,
            )
            # Never sleep past the deadline: the previous version always
            # slept the full interval, overshooting the requested timeout
            # by up to check_interval_seconds.
            time.sleep(max(0.0, min(check_interval_seconds, deadline - time.time())))

    raise TimeoutError(
        f"Timeout ({timeout_minutes}min) : ressources toujours insuffisantes.\n"
        + check_resources().summary()
    )
|
||||
|
||||
|
||||
def main():
    """CLI entry point: print the resource summary, then optionally enforce
    the thresholds given on the command line (exiting 1 on failure)."""
    import argparse

    parser = argparse.ArgumentParser(description="Vérification des ressources machine")
    parser.add_argument("--vram", type=int, default=0,
                        help="VRAM libre minimale requise (Mo)")
    parser.add_argument("--ram", type=float, default=0,
                        help="RAM disponible minimale (Go)")
    parser.add_argument("--gpu-util", type=int, default=100,
                        help="Utilisation GPU max tolérée (%%)")
    parser.add_argument("--wait", action="store_true",
                        help="Attendre que les ressources soient disponibles")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Timeout d'attente en minutes (défaut: 30)")
    opts = parser.parse_args()

    # Always display the current state first.
    print(check_resources().summary())

    # Nothing more to do unless at least one threshold was requested.
    if not (opts.vram > 0 or opts.ram > 0 or opts.gpu_util < 100):
        return

    limits = dict(
        vram_free_mb=opts.vram,
        ram_free_gb=opts.ram,
        max_gpu_util_pct=opts.gpu_util,
    )
    try:
        if opts.wait:
            # Poll until available; only TimeoutError can escape here.
            wait_for_resources(timeout_minutes=opts.timeout, **limits)
            print("\nOK — ressources disponibles.")
        else:
            # One-shot check; only RuntimeError can escape here.
            require_resources(**limits)
            print("\nOK — ressources suffisantes.")
    except (TimeoutError, RuntimeError) as e:
        print(f"\nERREUR : {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Standalone usage: `python scripts/check_resources.py [--vram N --ram N --wait]`.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user