"""
GPU preflight check: verify the machine before launching anything.

Checks that the GPU and its VRAM are sufficiently free before starting
tests, replays, or any other resource-hungry process.

Usage:
    from core.gpu.preflight import check_machine_ready, require_gpu_ready

    # Simple check
    result = check_machine_ready()
    if not result.ready:
        print(f"Machine pas prête : {result.reason}")

    # With custom thresholds
    result = check_machine_ready(min_free_vram_mb=2000, max_gpu_util_percent=50)

    # As a decorator (skips the test when the GPU is not available)
    @require_gpu_ready(min_free_vram_mb=1000)
    def test_something():
        ...
"""

import functools
import logging
import subprocess
from dataclasses import dataclass, field
from typing import List, Optional

import pytest

logger = logging.getLogger(__name__)

# Default thresholds
DEFAULT_MIN_FREE_VRAM_MB = 1000  # at least 1 GB free
DEFAULT_MAX_GPU_UTIL_PERCENT = 80  # GPU not loaded above 80%
DEFAULT_MAX_FOREIGN_PROCESSES = 5  # warn when too many GPU processes

# Upper bound (seconds) on each nvidia-smi invocation.
_NVIDIA_SMI_TIMEOUT_S = 5


@dataclass
class GPUProcess:
    """A process reported by nvidia-smi as using the GPU."""

    pid: int
    name: str
    vram_mb: int
    is_own: bool  # True when the process name contains "rpa_vision_v3"


@dataclass
class PreflightResult:
    """Outcome of the machine check."""

    ready: bool
    reason: Optional[str] = None

    # GPU state
    gpu_name: str = ""
    total_vram_mb: int = 0
    used_vram_mb: int = 0
    free_vram_mb: int = 0
    gpu_utilization_percent: int = 0

    # Processes
    gpu_processes: List[GPUProcess] = field(default_factory=list)
    foreign_processes: List[GPUProcess] = field(default_factory=list)

    # Non-blocking warnings
    warnings: List[str] = field(default_factory=list)

    def __str__(self) -> str:
        """Render a multi-line, human-readable report of the check."""
        status = "PRÊT" if self.ready else "PAS PRÊT"
        lines = [
            f"[GPU Preflight: {status}]",
            f" GPU: {self.gpu_name}",
            f" VRAM: {self.used_vram_mb}/{self.total_vram_mb} MB "
            f"(libre: {self.free_vram_mb} MB)",
            f" Utilisation GPU: {self.gpu_utilization_percent}%",
            f" Processus GPU: {len(self.gpu_processes)} "
            f"(dont {len(self.foreign_processes)} externes)",
        ]
        if not self.ready:
            lines.append(f" Raison: {self.reason}")
        lines.extend(f" ⚠ {w}" for w in self.warnings)
        if self.foreign_processes:
            lines.append(" Processus externes:")
            lines.extend(
                f" - PID {p.pid}: {p.name} ({p.vram_mb} MB)"
                for p in self.foreign_processes
            )
        return "\n".join(lines)


def _run_nvidia_smi(query: str) -> Optional[str]:
    """Run ``nvidia-smi <query>`` in headerless, unit-less CSV mode.

    Returns:
        The command's stdout, or None when it exits with a non-zero status.

    Raises:
        OSError: nvidia-smi cannot be executed (e.g. it is not installed).
        subprocess.TimeoutExpired: the command exceeded the timeout.
    """
    completed = subprocess.run(
        ["nvidia-smi", query, "--format=csv,noheader,nounits"],
        capture_output=True,
        text=True,
        timeout=_NVIDIA_SMI_TIMEOUT_S,
    )
    if completed.returncode != 0:
        return None
    return completed.stdout


def _parse_gpu_info(stdout: str) -> Optional[dict]:
    """Parse the ``--query-gpu`` CSV output into a dict for the first GPU.

    nvidia-smi prints one row per GPU. Only the first non-empty row is
    parsed; splitting the whole output on commas would merge the last field
    of one row with the first field of the next on multi-GPU machines.

    Returns:
        A dict with keys ``name``, ``total_mb``, ``used_mb``, ``free_mb`` and
        ``utilization``, or None when the output is empty, has too few
        fields, or carries non-numeric memory values.
    """
    first_row = next((ln for ln in stdout.splitlines() if ln.strip()), None)
    if first_row is None:
        return None

    # Anchor the four numeric fields from the right so that a comma inside
    # the GPU name cannot shift them.
    parts = [p.strip() for p in first_row.rsplit(",", 4)]
    if len(parts) < 5:
        return None

    try:
        total_mb, used_mb, free_mb = (int(v) for v in parts[1:4])
    except ValueError as e:
        logger.error("nvidia-smi échoué : %s", e)
        return None

    return {
        "name": parts[0],
        "total_mb": total_mb,
        "used_mb": used_mb,
        "free_mb": free_mb,
        # A non-numeric utilisation (e.g. "[N/A]") is treated as 0.
        "utilization": int(parts[4]) if parts[4].isdigit() else 0,
    }


def _parse_gpu_processes(stdout: str) -> List[GPUProcess]:
    """Parse the ``--query-compute-apps`` CSV output into GPUProcess items.

    Rows with fewer than three fields or a non-numeric pid are skipped
    individually instead of discarding the whole listing.
    """
    processes = []
    for line in stdout.splitlines():
        if not line.strip():
            continue
        # pid is the first field and memory the last; whatever sits between
        # is the process name, which may itself contain commas.
        pid_text, first_sep, rest = line.partition(",")
        name, last_sep, vram_text = rest.rpartition(",")
        if not first_sep or not last_sep:
            continue
        try:
            pid = int(pid_text.strip())
        except ValueError:
            logger.warning("Ligne nvidia-smi ignorée : %r", line)
            continue
        name = name.strip()
        vram_text = vram_text.strip()
        processes.append(GPUProcess(
            pid=pid,
            name=name,
            # A non-numeric memory value (e.g. "[N/A]") is treated as 0.
            vram_mb=int(vram_text) if vram_text.isdigit() else 0,
            is_own="rpa_vision_v3" in name,
        ))
    return processes


def _get_gpu_info() -> Optional[dict]:
    """Fetch GPU name, memory and utilisation through nvidia-smi.

    Returns:
        The dict built by ``_parse_gpu_info``, or None when nvidia-smi
        cannot be run, fails, or produces unparsable output.
    """
    try:
        stdout = _run_nvidia_smi(
            "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu"
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        logger.error("nvidia-smi échoué : %s", e)
        return None
    if stdout is None:
        return None
    return _parse_gpu_info(stdout)


def _get_gpu_processes() -> List[GPUProcess]:
    """List the compute processes currently using the GPU.

    Returns:
        The parsed processes, or an empty list when nvidia-smi cannot be
        run or exits with a non-zero status.
    """
    try:
        stdout = _run_nvidia_smi(
            "--query-compute-apps=pid,process_name,used_gpu_memory"
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        logger.error("Impossible de lister les processus GPU : %s", e)
        return []
    if stdout is None:
        return []
    return _parse_gpu_processes(stdout)


def _apply_thresholds(
    result: PreflightResult,
    min_free_vram_mb: int,
    max_gpu_util_percent: int,
    max_foreign_processes: int,
) -> PreflightResult:
    """Evaluate blocking thresholds and warnings on a populated result.

    Mutates and returns ``result``. The first blocking failure (free VRAM,
    then GPU utilisation) sets ``ready`` to False and ``reason``, and
    returns immediately; warnings are only computed when both checks pass.
    """
    # Blocking: free VRAM
    if result.free_vram_mb < min_free_vram_mb:
        result.ready = False
        result.reason = (
            f"VRAM insuffisante : {result.free_vram_mb} MB libre "
            f"(minimum requis : {min_free_vram_mb} MB)"
        )
        logger.warning(result.reason)
        return result

    # Blocking: GPU utilisation
    if result.gpu_utilization_percent > max_gpu_util_percent:
        result.ready = False
        result.reason = (
            f"GPU surchargé : {result.gpu_utilization_percent}% "
            f"(maximum toléré : {max_gpu_util_percent}%)"
        )
        logger.warning(result.reason)
        return result

    # Non-blocking warnings
    foreign_count = len(result.foreign_processes)
    if foreign_count > max_foreign_processes:
        result.warnings.append(
            f"{foreign_count} processus externes sur le GPU"
        )

    foreign_vram = sum(p.vram_mb for p in result.foreign_processes)
    # The share is undefined when the reported total is 0; skipping the
    # warning avoids a ZeroDivisionError in the percentage below.
    if result.total_vram_mb > 0 and foreign_vram > result.total_vram_mb * 0.5:
        result.warnings.append(
            f"Processus externes utilisent {foreign_vram} MB "
            f"({foreign_vram * 100 // result.total_vram_mb}% de la VRAM)"
        )

    if result.free_vram_mb < min_free_vram_mb * 2:
        result.warnings.append(
            f"VRAM libre ({result.free_vram_mb} MB) proche du seuil minimum"
        )

    for w in result.warnings:
        logger.info("Preflight warning: %s", w)

    logger.info(
        "GPU preflight OK: %s MB libre, %s%% utilisation",
        result.free_vram_mb,
        result.gpu_utilization_percent,
    )
    return result


def check_machine_ready(
    min_free_vram_mb: int = DEFAULT_MIN_FREE_VRAM_MB,
    max_gpu_util_percent: int = DEFAULT_MAX_GPU_UTIL_PERCENT,
    max_foreign_processes: int = DEFAULT_MAX_FOREIGN_PROCESSES,
) -> PreflightResult:
    """
    Check that the machine is ready for a GPU launch.

    Args:
        min_free_vram_mb: Minimum free VRAM required (default: 1000 MB).
        max_gpu_util_percent: Highest tolerated GPU utilisation (default: 80%).
        max_foreign_processes: Number of foreign processes above which a
            warning is added (non-blocking).

    Returns:
        A PreflightResult describing the GPU state. ``ready`` is False when
        nvidia-smi is unavailable, free VRAM is below the minimum, or
        utilisation exceeds the maximum; ``reason`` then explains why.
    """
    result = PreflightResult(ready=True)

    # 1. Make sure the GPU can be queried at all
    gpu_info = _get_gpu_info()
    if gpu_info is None:
        result.ready = False
        result.reason = "GPU inaccessible (nvidia-smi échoué)"
        logger.warning(result.reason)
        return result

    result.gpu_name = gpu_info["name"]
    result.total_vram_mb = gpu_info["total_mb"]
    result.used_vram_mb = gpu_info["used_mb"]
    result.free_vram_mb = gpu_info["free_mb"]
    result.gpu_utilization_percent = gpu_info["utilization"]

    # 2. List the GPU processes
    result.gpu_processes = _get_gpu_processes()
    result.foreign_processes = [
        p for p in result.gpu_processes if not p.is_own
    ]

    # 3-5. Blocking thresholds, then non-blocking warnings
    return _apply_thresholds(
        result,
        min_free_vram_mb,
        max_gpu_util_percent,
        max_foreign_processes,
    )


def require_gpu_ready(
    min_free_vram_mb: int = DEFAULT_MIN_FREE_VRAM_MB,
    max_gpu_util_percent: int = DEFAULT_MAX_GPU_UTIL_PERCENT,
):
    """
    Pytest decorator: skip the test when the GPU is not ready.

    The check runs each time the decorated function is called, not at
    decoration time.

    Usage:
        @require_gpu_ready(min_free_vram_mb=2000)
        def test_heavy_gpu_operation():
            ...
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            result = check_machine_ready(
                min_free_vram_mb=min_free_vram_mb,
                max_gpu_util_percent=max_gpu_util_percent,
            )
            if not result.ready:
                pytest.skip(f"GPU pas prêt : {result.reason}")
            return func(*args, **kwargs)
        return wrapper
    return decorator