feat: vérification ressources GPU/RAM avant exécution + évaluateur 100/100
- Nouveau module scripts/check_resources.py : état GPU/VRAM/RAM/CPU, require_resources() et wait_for_resources() avec polling - Intégré dans finetune_camembert_bio.py (8 Go VRAM + 8 Go RAM) - Intégré dans run_batch_silver_export.py (workers × 4 Go RAM) - Évaluateur : EVA et RAI ajoutés aux termes médicaux (score 100.0/100) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
347
scripts/check_resources.py
Normal file
347
scripts/check_resources.py
Normal file
@@ -0,0 +1,347 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Vérification des ressources machine (GPU, RAM, CPU) avant exécution.
|
||||
|
||||
Utilisable comme module ou en standalone :
|
||||
from scripts.check_resources import check_resources, require_resources
|
||||
|
||||
# Vérification simple (lève RuntimeError si insuffisant)
|
||||
require_resources(vram_free_mb=2000, ram_free_gb=4)
|
||||
|
||||
# Vérification informative (retourne un dict)
|
||||
status = check_resources()
|
||||
print(status)
|
||||
|
||||
# En standalone
|
||||
python scripts/check_resources.py
|
||||
python scripts/check_resources.py --vram 2000 --ram 4 --wait
|
||||
"""
|
||||
import os
import shutil
import subprocess
import sys
import time
from dataclasses import dataclass, field
from typing import List, Optional
|
||||
|
||||
|
||||
@dataclass
class GpuProcess:
    """A single process currently holding GPU memory, as reported by nvidia-smi."""
    pid: int       # process id
    name: str      # full executable path reported by nvidia-smi
    vram_mb: int   # GPU memory held by this process, in MB
|
||||
|
||||
|
||||
@dataclass
class ResourceStatus:
    """Snapshot of machine resources (GPU, RAM, CPU) at collection time.

    All fields default to "nothing measured"; check_resources() fills in
    whatever it can probe and leaves the rest at these defaults.
    """
    # GPU
    gpu_available: bool = False  # True iff nvidia-smi was found on PATH
    gpu_name: str = ""
    vram_total_mb: int = 0
    vram_used_mb: int = 0
    vram_free_mb: int = 0
    gpu_util_pct: int = 0
    gpu_processes: List[GpuProcess] = field(default_factory=list)
    # RAM (from /proc/meminfo, in GB)
    ram_total_gb: float = 0.0
    ram_used_gb: float = 0.0
    ram_free_gb: float = 0.0
    ram_available_gb: float = 0.0  # MemAvailable — the figure the RAM checks use
    # CPU
    cpu_count: int = 0
    load_avg_1m: float = 0.0
    load_avg_5m: float = 0.0

    def summary(self) -> str:
        """Return a human-readable multi-line report of all fields (in French)."""
        lines = []
        lines.append("=" * 55)
        lines.append(" ÉTAT DES RESSOURCES MACHINE")
        lines.append("=" * 55)

        # GPU
        if self.gpu_available:
            lines.append(f"\n GPU : {self.gpu_name}")
            lines.append(f" VRAM totale : {self.vram_total_mb} Mo")
            lines.append(f" VRAM utilisée: {self.vram_used_mb} Mo ({self._pct(self.vram_used_mb, self.vram_total_mb)}%)")
            lines.append(f" VRAM libre : {self.vram_free_mb} Mo")
            lines.append(f" Utilisation : {self.gpu_util_pct}%")
            if self.gpu_processes:
                lines.append(f" Processus GPU ({len(self.gpu_processes)}) :")
                for p in self.gpu_processes:
                    # Show only the executable name plus a best-effort project label.
                    short_name = p.name.split("/")[-1] if "/" in p.name else p.name
                    project = self._guess_project(p.name)
                    label = f" ({project})" if project else ""
                    lines.append(f" PID {p.pid}: {short_name}{label} — {p.vram_mb} Mo")
            else:
                lines.append(" Aucun processus GPU actif")
        else:
            lines.append("\n GPU : non disponible (nvidia-smi absent)")

        # RAM
        lines.append(f"\n RAM : {self.ram_total_gb:.1f} Go total")
        lines.append(f" Utilisée : {self.ram_used_gb:.1f} Go")
        lines.append(f" Disponible : {self.ram_available_gb:.1f} Go")

        # CPU
        lines.append(f"\n CPU : {self.cpu_count} cœurs")
        lines.append(f" Load avg : {self.load_avg_1m:.1f} (1m) / {self.load_avg_5m:.1f} (5m)")

        lines.append("=" * 55)
        return "\n".join(lines)

    @staticmethod
    def _pct(used: int, total: int) -> int:
        # Integer percentage; 0 when total is unknown, avoiding ZeroDivisionError.
        return round(used * 100 / total) if total > 0 else 0

    @staticmethod
    def _guess_project(path: str) -> str:
        """Guess a project name from a process executable path.

        Looks for an ".../ai/<project>/..." segment and returns <project>
        without its extension; returns "" when no such segment exists.
        """
        parts = path.split("/")
        for i, p in enumerate(parts):
            if p == "ai" and i + 1 < len(parts):
                return parts[i + 1].split(".")[0]
        return ""
|
||||
|
||||
|
||||
def check_resources() -> ResourceStatus:
    """Collect the current machine resource state (GPU, RAM, CPU).

    Best-effort by design: every probe failure (missing nvidia-smi output,
    non-Linux /proc, unsupported getloadavg, timeouts) is swallowed and
    leaves the corresponding fields at their ResourceStatus defaults, so
    this function never raises.

    Returns:
        ResourceStatus populated with whatever could be measured.
    """
    status = ResourceStatus()

    # --- GPU (via nvidia-smi, if present on PATH) ---
    if shutil.which("nvidia-smi"):
        status.gpu_available = True
        try:
            out = subprocess.run(
                ["nvidia-smi",
                 "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5,
            )
            if out.returncode == 0 and out.stdout.strip():
                # nvidia-smi emits one CSV line per GPU. The previous code
                # split the whole stdout on commas, which on multi-GPU
                # machines fused "util\n<next name>" into one field and made
                # int() raise, leaving the status partially populated.
                # Report the first GPU only, parsed from its own line.
                first = out.stdout.strip().splitlines()[0]
                parts = [p.strip() for p in first.split(",")]
                if len(parts) >= 5:
                    status.gpu_name = parts[0]
                    status.vram_total_mb = int(parts[1])
                    status.vram_used_mb = int(parts[2])
                    status.vram_free_mb = int(parts[3])
                    status.gpu_util_pct = int(parts[4])
        except Exception:
            pass  # best-effort: keep GPU fields at defaults

        # GPU compute processes (one CSV line per process)
        try:
            out = subprocess.run(
                ["nvidia-smi",
                 "--query-compute-apps=pid,process_name,used_gpu_memory",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5,
            )
            if out.returncode == 0 and out.stdout.strip():
                for line in out.stdout.strip().splitlines():
                    parts = [p.strip() for p in line.split(",")]
                    if len(parts) >= 3:
                        try:
                            status.gpu_processes.append(GpuProcess(
                                pid=int(parts[0]),
                                name=parts[1],
                                vram_mb=int(parts[2]),
                            ))
                        except ValueError:
                            pass  # skip unparsable rows (e.g. "[N/A]" values)
        except Exception:
            pass  # best-effort

    # --- RAM (Linux only: /proc/meminfo, values in kB) ---
    try:
        meminfo = {}
        with open("/proc/meminfo") as f:
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    meminfo[parts[0].rstrip(":")] = int(parts[1])  # kB

        # 1048576 kB per GB.
        status.ram_total_gb = meminfo.get("MemTotal", 0) / 1048576
        status.ram_free_gb = meminfo.get("MemFree", 0) / 1048576
        status.ram_available_gb = meminfo.get("MemAvailable", 0) / 1048576
        status.ram_used_gb = status.ram_total_gb - status.ram_free_gb
    except Exception:
        pass  # non-Linux or unreadable /proc

    # --- CPU ---
    try:
        status.cpu_count = os.cpu_count() or 0
        load = os.getloadavg()  # raises OSError where unsupported
        status.load_avg_1m = load[0]
        status.load_avg_5m = load[1]
    except Exception:
        pass

    return status
|
||||
|
||||
|
||||
def require_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    fail_if_gpu_busy: bool = False,
) -> ResourceStatus:
    """Check that the minimal required resources are available.

    Args:
        vram_free_mb: Minimum free VRAM required (MB). 0 = skip GPU check.
        ram_free_gb: Minimum available RAM (GB). 0 = skip RAM check.
        max_gpu_util_pct: Maximum tolerated GPU utilisation (%). 100 = no check.
        fail_if_gpu_busy: If True, fail when other processes hold the GPU.

    Returns:
        The current ResourceStatus when every requested threshold is met.

    Raises:
        RuntimeError: with an aggregated, detailed (French) message — including
            the full resource summary — when any threshold is not met.
    """
    status = check_resources()
    errors = []

    # VRAM check (only when a VRAM requirement was given).
    if vram_free_mb > 0:
        if not status.gpu_available:
            errors.append(f"GPU requis ({vram_free_mb} Mo VRAM) mais nvidia-smi non disponible")
        elif status.vram_free_mb < vram_free_mb:
            # List the processes currently holding VRAM, to help the user
            # decide what to stop.
            procs = ""
            if status.gpu_processes:
                procs = "\n Processus occupant le GPU :"
                for p in status.gpu_processes:
                    short = p.name.split("/")[-1]
                    project = status._guess_project(p.name)
                    label = f" ({project})" if project else ""
                    procs += f"\n PID {p.pid}: {short}{label} — {p.vram_mb} Mo"
            errors.append(
                f"VRAM insuffisante : {status.vram_free_mb} Mo libre, "
                f"{vram_free_mb} Mo requis (utilisé: {status.vram_used_mb}/{status.vram_total_mb} Mo)"
                f"{procs}"
            )

    # GPU utilisation ceiling (only meaningful when a GPU is present).
    if max_gpu_util_pct < 100 and status.gpu_available:
        if status.gpu_util_pct > max_gpu_util_pct:
            errors.append(
                f"GPU trop chargé : {status.gpu_util_pct}% d'utilisation "
                f"(max toléré: {max_gpu_util_pct}%)"
            )

    # Exclusive-GPU requirement.
    if fail_if_gpu_busy and status.gpu_processes:
        names = [f"{p.name.split('/')[-1]} (PID {p.pid})" for p in status.gpu_processes]
        errors.append(f"GPU occupé par : {', '.join(names)}")

    # RAM check — compares against MemAvailable, not MemFree.
    if ram_free_gb > 0 and status.ram_available_gb < ram_free_gb:
        errors.append(
            f"RAM insuffisante : {status.ram_available_gb:.1f} Go disponible, "
            f"{ram_free_gb:.1f} Go requis"
        )

    # All failures are aggregated into a single exception message.
    if errors:
        msg = "Ressources insuffisantes :\n " + "\n ".join(errors)
        msg += "\n\n" + status.summary()
        raise RuntimeError(msg)

    return status
|
||||
|
||||
|
||||
def wait_for_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    timeout_minutes: int = 30,
    check_interval_seconds: int = 30,
) -> ResourceStatus:
    """Block until the requested resources are available (with a timeout).

    Polls require_resources() every ``check_interval_seconds``, printing a
    periodic progress line while resources are insufficient. Useful before a
    fine-tuning run or a heavy batch job.

    Args:
        vram_free_mb: Minimum free VRAM required (MB). 0 = skip GPU check.
        ram_free_gb: Minimum available RAM (GB). 0 = skip RAM check.
        max_gpu_util_pct: Maximum tolerated GPU utilisation (%).
        timeout_minutes: How long to keep waiting before giving up.
        check_interval_seconds: Delay between two polls.

    Returns:
        ResourceStatus once the resources are available.

    Raises:
        TimeoutError: if the timeout elapses while resources are still
            insufficient (message includes the final resource summary).
    """
    deadline = time.time() + timeout_minutes * 60
    attempt = 0

    while time.time() < deadline:
        try:
            status = require_resources(
                vram_free_mb=vram_free_mb,
                ram_free_gb=ram_free_gb,
                max_gpu_util_pct=max_gpu_util_pct,
            )
            if attempt > 0:
                print(f"\nRessources disponibles après {attempt * check_interval_seconds}s d'attente.")
            return status
        except RuntimeError:  # resources still insufficient; keep polling
            attempt += 1
            if attempt == 1:
                print(f"En attente de ressources (timeout: {timeout_minutes}min)...")
                print(f" Requis: VRAM >= {vram_free_mb} Mo, RAM >= {ram_free_gb} Go")

            remaining = int((deadline - time.time()) / 60)
            status = check_resources()
            gpu_info = f"VRAM libre: {status.vram_free_mb} Mo" if status.gpu_available else "pas de GPU"
            print(
                f" [{attempt}] {gpu_info}, RAM dispo: {status.ram_available_gb:.1f} Go "
                f"— encore {remaining}min max",
                flush=True,
            )
            # Never sleep past the deadline: the previous version always
            # slept the full interval, overshooting the requested timeout
            # by up to check_interval_seconds.
            time.sleep(max(0.0, min(check_interval_seconds, deadline - time.time())))

    raise TimeoutError(
        f"Timeout ({timeout_minutes}min) : ressources toujours insuffisantes.\n"
        + check_resources().summary()
    )
|
||||
|
||||
|
||||
def main():
    """CLI entry point: print the resource summary, then optionally enforce
    the thresholds given on the command line (exiting 1 on failure)."""
    import argparse

    parser = argparse.ArgumentParser(description="Vérification des ressources machine")
    parser.add_argument("--vram", type=int, default=0,
                        help="VRAM libre minimale requise (Mo)")
    parser.add_argument("--ram", type=float, default=0,
                        help="RAM disponible minimale (Go)")
    parser.add_argument("--gpu-util", type=int, default=100,
                        help="Utilisation GPU max tolérée (%%)")
    parser.add_argument("--wait", action="store_true",
                        help="Attendre que les ressources soient disponibles")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Timeout d'attente en minutes (défaut: 30)")
    opts = parser.parse_args()

    # Always display the current state first.
    print(check_resources().summary())

    # Nothing more to do unless at least one threshold was requested.
    if not (opts.vram > 0 or opts.ram > 0 or opts.gpu_util < 100):
        return

    limits = dict(
        vram_free_mb=opts.vram,
        ram_free_gb=opts.ram,
        max_gpu_util_pct=opts.gpu_util,
    )
    try:
        if opts.wait:
            # Poll until available; only TimeoutError can escape here.
            wait_for_resources(timeout_minutes=opts.timeout, **limits)
            print("\nOK — ressources disponibles.")
        else:
            # One-shot check; only RuntimeError can escape here.
            require_resources(**limits)
            print("\nOK — ressources suffisantes.")
    except (TimeoutError, RuntimeError) as e:
        print(f"\nERREUR : {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Standalone usage: `python scripts/check_resources.py [--vram N --ram N --wait]`.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user