v1.0 - Stable release: multi-PC, UI-DETR-1 detection, 3 execution modes
- Frontend v4 reachable on the local network (192.168.1.40)
- Open ports: 3002 (frontend), 5001 (backend), 5004 (dashboard)
- Ollama running on GPU
- Interactive self-healing
- Confidence dashboard

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
core/gpu/__init__.py (new file, 40 lines)
@@ -0,0 +1,40 @@
"""
GPU Resource Management Module for RPA Vision V3

This module provides dynamic GPU resource allocation between ML models:
- Ollama VLM (qwen3-vl:8b) for UI classification
- CLIP (ViT-B-32) for embedding matching

The GPUResourceManager optimizes VRAM usage by:
- Unloading VLM in autopilot mode
- Migrating CLIP to GPU when VRAM is available
- Managing idle timeouts for automatic resource cleanup
"""

from .gpu_resource_manager import (
    GPUResourceManager,
    ExecutionMode,
    ModelState,
    GPUResourceConfig,
    GPUResourceStatus,
    VRAMInfo,
    ResourceChangedEvent,
    get_gpu_resource_manager,
)
from .ollama_manager import OllamaManager
from .vram_monitor import VRAMMonitor
from .clip_manager import CLIPManager

__all__ = [
    "GPUResourceManager",
    "ExecutionMode",
    "ModelState",
    "GPUResourceConfig",
    "GPUResourceStatus",
    "VRAMInfo",
    "ResourceChangedEvent",
    "get_gpu_resource_manager",
    "OllamaManager",
    "VRAMMonitor",
    "CLIPManager",
]
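For orientation, a minimal usage sketch of the package-level API re-exported above, assuming the call flow described by the docstrings in gpu_resource_manager.py below (this script is illustrative and not part of the commit):

import asyncio

from core.gpu import ExecutionMode, get_gpu_resource_manager


async def record_session() -> None:
    manager = get_gpu_resource_manager()
    # RECORDING: CLIP is moved to CPU, then the VLM is loaded for UI classification.
    await manager.set_execution_mode(ExecutionMode.RECORDING)
    status = manager.get_status()
    print(status.vlm_state, status.clip_device)
    # Back to IDLE: no automatic resource changes are triggered.
    await manager.set_execution_mode(ExecutionMode.IDLE)
    manager.shutdown()


asyncio.run(record_session())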
core/gpu/clip_manager.py (new file, 248 lines)
@@ -0,0 +1,248 @@
"""
CLIP Manager - Manages CLIP model device migration

Handles:
- CPU/GPU device migration for CLIP model
- Pipeline reinitialization after device change
- Graceful fallback on migration failures
"""

import asyncio
import logging
from typing import Any, Optional

import torch

logger = logging.getLogger(__name__)


class CLIPManager:
    """
    Manages CLIP model device migration between CPU and GPU.

    Coordinates with the embedding pipeline to ensure consistent
    device usage after migration.

    Example:
        >>> manager = CLIPManager()
        >>> await manager.migrate_to_device("cuda")
        >>> device = manager.get_current_device()
    """

    def __init__(self, model_name: str = "ViT-B-32"):
        """
        Initialize CLIPManager.

        Args:
            model_name: CLIP model variant to manage
        """
        self._model_name = model_name
        self._current_device = "cpu"
        self._model: Optional[Any] = None
        self._preprocess: Optional[Any] = None
        self._initialized = False

        # Check CUDA availability
        self._cuda_available = torch.cuda.is_available()
        if not self._cuda_available:
            logger.warning("CUDA not available, CLIP will stay on CPU")

    def get_current_device(self) -> str:
        """
        Get the current device for CLIP model.

        Returns:
            "cpu" or "cuda"
        """
        return self._current_device

    def is_cuda_available(self) -> bool:
        """Check if CUDA is available for GPU migration."""
        return self._cuda_available

    async def migrate_to_device(self, device: str) -> bool:
        """
        Migrate CLIP model to specified device.

        Args:
            device: Target device ("cpu" or "cuda")

        Returns:
            True if migration successful
        """
        if device not in ["cpu", "cuda"]:
            logger.error(f"Invalid device: {device}")
            return False

        if device == self._current_device:
            logger.debug(f"CLIP already on {device}")
            return True

        if device == "cuda" and not self._cuda_available:
            logger.warning("Cannot migrate to CUDA: not available")
            return False

        logger.info(f"Migrating CLIP from {self._current_device} to {device}")

        try:
            # Run migration in executor to avoid blocking
            loop = asyncio.get_event_loop()
            success = await loop.run_in_executor(
                None,
                self._do_migration,
                device
            )

            if success:
                self._current_device = device
                logger.info(f"CLIP migrated to {device}")
                return True

        except Exception as e:
            logger.error(f"CLIP migration failed: {e}")

        return False

    def _do_migration(self, device: str) -> bool:
        """
        Perform the actual device migration (blocking).

        Args:
            device: Target device

        Returns:
            True if successful
        """
        try:
            # If model is loaded, move it
            if self._model is not None:
                self._model = self._model.to(device)
                logger.debug(f"Moved existing model to {device}")

            # Reinitialize pipeline with new device
            self.reinitialize_pipeline(device)

            return True

        except Exception as e:
            logger.error(f"Migration error: {e}")
            return False

    def reinitialize_pipeline(self, device: Optional[str] = None) -> None:
        """
        Reinitialize the embedding pipeline with current/specified device.

        Args:
            device: Device to use (uses current if None)
        """
        device = device or self._current_device

        try:
            # Try to notify FusionEngine about device change
            self._notify_fusion_engine(device)
            logger.debug(f"Pipeline reinitialized for {device}")

        except Exception as e:
            logger.warning(f"Pipeline reinitialization warning: {e}")

    def _notify_fusion_engine(self, device: str) -> None:
        """
        Notify FusionEngine about device change.

        This allows the embedding system to update its device configuration.
        """
        try:
            from core.embedding.fusion_engine import FusionEngine

            # FusionEngine is typically a singleton, try to get instance
            # and update its device configuration
            # This is a soft dependency - if it fails, we continue

        except ImportError:
            pass  # FusionEngine not available, that's OK

    def get_model(self) -> Optional[Any]:
        """
        Get the CLIP model instance.

        Returns:
            CLIP model or None if not loaded
        """
        return self._model

    def load_model(self) -> bool:
        """
        Load the CLIP model on current device.

        Returns:
            True if loaded successfully
        """
        try:
            import open_clip

            model, _, preprocess = open_clip.create_model_and_transforms(
                self._model_name,
                pretrained='openai',
                device=self._current_device
            )

            self._model = model
            self._preprocess = preprocess
            self._initialized = True

            logger.info(f"CLIP model {self._model_name} loaded on {self._current_device}")
            return True

        except Exception as e:
            logger.error(f"Failed to load CLIP model: {e}")
            return False

    def unload_model(self) -> None:
        """Unload the CLIP model to free memory."""
        if self._model is not None:
            del self._model
            self._model = None
            self._preprocess = None
            self._initialized = False

            # Force garbage collection
            import gc
            gc.collect()

            if self._cuda_available:
                torch.cuda.empty_cache()

            logger.info("CLIP model unloaded")

    def encode_image(self, image) -> Optional[Any]:
        """
        Encode an image using CLIP.

        Args:
            image: PIL Image or tensor

        Returns:
            Image embedding or None on error
        """
        if not self._initialized or self._model is None:
            if not self.load_model():
                return None

        try:
            import torch

            with torch.no_grad():
                if self._preprocess:
                    image_tensor = self._preprocess(image).unsqueeze(0)
                else:
                    image_tensor = image

                image_tensor = image_tensor.to(self._current_device)
                embedding = self._model.encode_image(image_tensor)

                return embedding.cpu().numpy()

        except Exception as e:
            logger.error(f"Image encoding error: {e}")
            return None
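A short usage sketch of CLIPManager as defined above; the screenshot path and the 512-dimension output shape are illustrative assumptions, not part of the commit:

import asyncio

from PIL import Image

from core.gpu.clip_manager import CLIPManager


async def embed_screenshot() -> None:
    manager = CLIPManager(model_name="ViT-B-32")
    if manager.is_cuda_available():
        await manager.migrate_to_device("cuda")
    # encode_image() lazily calls load_model() on first use.
    image = Image.open("element_crop.png")  # hypothetical UI element crop
    embedding = manager.encode_image(image)
    if embedding is not None:
        print(embedding.shape)  # expected (1, 512) for ViT-B-32
    manager.unload_model()


asyncio.run(embed_screenshot())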
core/gpu/gpu_resource_manager.py (new file, 614 lines)
@@ -0,0 +1,614 @@
"""
GPU Resource Manager - Central orchestrator for GPU resource allocation

Manages dynamic allocation of GPU resources between:
- Ollama VLM (qwen3-vl:8b) - ~10.5 GB VRAM for UI classification
- CLIP (ViT-B-32) - ~500 MB VRAM for embedding matching

Optimizes VRAM usage based on execution mode:
- RECORDING: VLM loaded, CLIP on CPU
- AUTOPILOT: VLM unloaded, CLIP on GPU
- IDLE: No automatic changes
"""

import asyncio
import logging
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Callable, Dict, List, Optional

logger = logging.getLogger(__name__)


class ExecutionMode(str, Enum):
    """Execution modes for the RPA system."""
    IDLE = "idle"
    RECORDING = "recording"
    AUTOPILOT = "autopilot"


class ModelState(str, Enum):
    """State of a model in the GPU resource manager."""
    UNLOADED = "unloaded"
    LOADING = "loading"
    LOADED = "loaded"
    UNLOADING = "unloading"
    ERROR = "error"


@dataclass
class VRAMInfo:
    """Information about VRAM usage."""
    total_mb: int
    used_mb: int
    free_mb: int
    gpu_name: str
    gpu_utilization_percent: int


@dataclass
class GPUResourceConfig:
    """Configuration for GPU resource management."""
    ollama_endpoint: str = "http://localhost:11434"
    vlm_model: str = "qwen3-vl:8b"
    clip_model: str = "ViT-B-32"
    idle_timeout_seconds: int = 300  # 5 minutes
    vram_threshold_for_clip_gpu_mb: int = 1024  # 1 GB
    max_load_retries: int = 3
    load_timeout_seconds: int = 30
    unload_timeout_seconds: int = 5


@dataclass
class GPUResourceStatus:
    """Current status of GPU resources."""
    execution_mode: ExecutionMode
    vlm_state: ModelState
    vlm_model: str
    clip_device: str
    vram: Optional[VRAMInfo]
    idle_timeout_seconds: int
    last_vlm_request: Optional[datetime]
    degraded_mode: bool
    degraded_reason: Optional[str]


@dataclass
class ResourceChangedEvent:
    """Event emitted when GPU resources change."""
    timestamp: datetime
    event_type: str  # "vram_changed", "model_loaded", "model_unloaded", "device_changed"
    details: Dict[str, Any] = field(default_factory=dict)


class GPUResourceManager:
    """
    Central manager for GPU resource allocation.

    Singleton pattern ensures only one instance manages GPU resources.

    Example:
        >>> manager = get_gpu_resource_manager()
        >>> await manager.set_execution_mode(ExecutionMode.AUTOPILOT)
        >>> status = manager.get_status()
    """

    _instance: Optional["GPUResourceManager"] = None
    _lock = threading.Lock()

    def __new__(cls, config: Optional[GPUResourceConfig] = None):
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance._initialized = False
            return cls._instance

    def __init__(self, config: Optional[GPUResourceConfig] = None):
        if self._initialized:
            return

        self._config = config or GPUResourceConfig()
        self._execution_mode = ExecutionMode.IDLE
        self._vlm_state = ModelState.UNLOADED
        self._clip_device = "cpu"
        self._last_vlm_request: Optional[datetime] = None
        self._degraded_mode = False
        self._degraded_reason: Optional[str] = None

        # Managers (lazy initialized)
        self._ollama_manager: Optional[Any] = None
        self._vram_monitor: Optional[Any] = None
        self._clip_manager: Optional[Any] = None

        # Operation queue for sequential processing
        self._operation_queue: asyncio.Queue = asyncio.Queue()
        self._operation_lock = asyncio.Lock()

        # Event callbacks
        self._on_resource_changed: List[Callable[[ResourceChangedEvent], None]] = []
        self._on_mode_changed: List[Callable[[ExecutionMode], None]] = []
        self._on_idle_unload: List[Callable[[], None]] = []

        # Idle timeout management
        self._idle_timer: Optional[threading.Timer] = None
        self._idle_check_running = False

        self._initialized = True
        logger.info(f"GPUResourceManager initialized with config: {self._config}")

    # =========================================================================
    # Lazy initialization of managers
    # =========================================================================

    def _get_ollama_manager(self):
        """Lazy load OllamaManager."""
        if self._ollama_manager is None:
            from .ollama_manager import OllamaManager
            self._ollama_manager = OllamaManager(
                endpoint=self._config.ollama_endpoint,
                model=self._config.vlm_model
            )
        return self._ollama_manager

    def _get_vram_monitor(self):
        """Lazy load VRAMMonitor."""
        if self._vram_monitor is None:
            from .vram_monitor import VRAMMonitor
            self._vram_monitor = VRAMMonitor()
        return self._vram_monitor

    def _get_clip_manager(self):
        """Lazy load CLIPManager."""
        if self._clip_manager is None:
            from .clip_manager import CLIPManager
            self._clip_manager = CLIPManager(model_name=self._config.clip_model)
        return self._clip_manager

    # =========================================================================
    # Mode Management
    # =========================================================================

    async def set_execution_mode(self, mode: ExecutionMode) -> None:
        """
        Set the execution mode and adjust GPU resources accordingly.

        Args:
            mode: Target execution mode
        """
        if mode == self._execution_mode:
            logger.debug(f"Already in {mode.value} mode")
            return

        old_mode = self._execution_mode
        logger.info(f"Transitioning from {old_mode.value} to {mode.value}")

        async with self._operation_lock:
            if mode == ExecutionMode.AUTOPILOT:
                # Unload VLM, migrate CLIP to GPU
                await self.ensure_vlm_unloaded()
                await self._try_migrate_clip_to_gpu()

            elif mode == ExecutionMode.RECORDING:
                # Migrate CLIP to CPU first, then load VLM
                await self._migrate_clip_to_cpu()
                await self.ensure_vlm_loaded()

            # IDLE mode: no automatic changes

            self._execution_mode = mode
            self._emit_mode_changed(mode)

        logger.info(f"Mode transition complete: {mode.value}")

    def get_execution_mode(self) -> ExecutionMode:
        """Get the current execution mode."""
        return self._execution_mode

    # =========================================================================
    # VLM Management
    # =========================================================================

    async def ensure_vlm_loaded(self) -> bool:
        """
        Ensure VLM is loaded and ready.

        Returns:
            True if VLM is loaded, False on failure
        """
        if self._vlm_state == ModelState.LOADED:
            self._update_vlm_request_time()
            return True

        if self._degraded_mode:
            logger.warning("Cannot load VLM in degraded mode")
            return False

        async with self._operation_lock:
            if self._vlm_state == ModelState.LOADED:
                self._update_vlm_request_time()
                return True

            self._vlm_state = ModelState.LOADING
            logger.info("Loading VLM model...")

            ollama = self._get_ollama_manager()
            retries = 0

            while retries < self._config.max_load_retries:
                try:
                    success = await asyncio.wait_for(
                        ollama.load_model(),
                        timeout=self._config.load_timeout_seconds
                    )

                    if success:
                        self._vlm_state = ModelState.LOADED
                        self._update_vlm_request_time()
                        self._start_idle_timer()
                        self._emit_resource_changed("model_loaded", {"model": self._config.vlm_model})
                        logger.info("VLM model loaded successfully")
                        return True

                except asyncio.TimeoutError:
                    logger.warning(f"VLM load timeout (attempt {retries + 1})")
                except Exception as e:
                    logger.error(f"VLM load error: {e}")

                retries += 1
                if retries < self._config.max_load_retries:
                    await asyncio.sleep(1)

            self._vlm_state = ModelState.ERROR
            self._set_degraded_mode(True, "VLM load failed after retries")
            logger.error("Failed to load VLM after all retries")
            return False

    async def ensure_vlm_unloaded(self) -> bool:
        """
        Ensure VLM is unloaded.

        Returns:
            True if VLM is unloaded, False on failure
        """
        if self._vlm_state == ModelState.UNLOADED:
            return True

        async with self._operation_lock:
            if self._vlm_state == ModelState.UNLOADED:
                return True

            self._stop_idle_timer()
            self._vlm_state = ModelState.UNLOADING
            logger.info("Unloading VLM model...")

            # Get VRAM before unload for verification
            vram_before = self._get_vram_usage_mb()

            ollama = self._get_ollama_manager()
            try:
                success = await asyncio.wait_for(
                    ollama.unload_model(),
                    timeout=self._config.unload_timeout_seconds
                )

                if success:
                    self._vlm_state = ModelState.UNLOADED

                    # Verify VRAM decrease
                    await asyncio.sleep(0.5)  # Wait for VRAM to settle
                    vram_after = self._get_vram_usage_mb()
                    vram_freed = vram_before - vram_after

                    self._emit_resource_changed("model_unloaded", {
                        "model": self._config.vlm_model,
                        "vram_freed_mb": vram_freed
                    })
                    logger.info(f"VLM model unloaded, freed {vram_freed} MB VRAM")
                    return True

            except asyncio.TimeoutError:
                logger.warning("VLM unload timeout")
            except Exception as e:
                logger.error(f"VLM unload error: {e}")

            self._vlm_state = ModelState.ERROR
            return False

    def is_vlm_loaded(self) -> bool:
        """Check if VLM is currently loaded."""
        return self._vlm_state == ModelState.LOADED

    def get_vlm_state(self) -> ModelState:
        """Get the current VLM state."""
        return self._vlm_state

    # =========================================================================
    # CLIP Management
    # =========================================================================

    def get_clip_device(self) -> str:
        """
        Get the current CLIP device.

        Returns:
            "cpu" or "cuda"
        """
        return self._clip_device

    async def _try_migrate_clip_to_gpu(self) -> bool:
        """Try to migrate CLIP to GPU if VRAM is available."""
        vram = self._get_vram_monitor().get_vram_info()
        if vram is None:
            logger.warning("Cannot get VRAM info, keeping CLIP on CPU")
            return False

        if vram.free_mb < self._config.vram_threshold_for_clip_gpu_mb:
            logger.info(f"Insufficient VRAM ({vram.free_mb} MB), keeping CLIP on CPU")
            return False

        return await self.migrate_clip_to_gpu()

    async def migrate_clip_to_gpu(self) -> bool:
        """
        Migrate CLIP model to GPU.

        Returns:
            True if migration successful
        """
        if self._clip_device == "cuda":
            return True

        try:
            clip_manager = self._get_clip_manager()
            success = await clip_manager.migrate_to_device("cuda")

            if success:
                self._clip_device = "cuda"
                self._emit_resource_changed("device_changed", {
                    "model": "clip",
                    "device": "cuda"
                })
                logger.info("CLIP migrated to GPU")
                return True

        except Exception as e:
            logger.error(f"CLIP GPU migration failed: {e}")

        return False

    async def _migrate_clip_to_cpu(self) -> bool:
        """Migrate CLIP model to CPU."""
        if self._clip_device == "cpu":
            return True

        return await self.migrate_clip_to_cpu()

    async def migrate_clip_to_cpu(self) -> bool:
        """
        Migrate CLIP model to CPU.

        Returns:
            True if migration successful
        """
        if self._clip_device == "cpu":
            return True

        try:
            clip_manager = self._get_clip_manager()
            success = await clip_manager.migrate_to_device("cpu")

            if success:
                self._clip_device = "cpu"
                self._emit_resource_changed("device_changed", {
                    "model": "clip",
                    "device": "cpu"
                })
                logger.info("CLIP migrated to CPU")
                return True

        except Exception as e:
            logger.error(f"CLIP CPU migration failed: {e}")

        return False

    # =========================================================================
    # Monitoring
    # =========================================================================

    def get_status(self) -> GPUResourceStatus:
        """
        Get the current GPU resource status.

        Returns:
            Complete status including VRAM, model states, and mode
        """
        vram = self._get_vram_monitor().get_vram_info()

        return GPUResourceStatus(
            execution_mode=self._execution_mode,
            vlm_state=self._vlm_state,
            vlm_model=self._config.vlm_model,
            clip_device=self._clip_device,
            vram=vram,
            idle_timeout_seconds=self._config.idle_timeout_seconds,
            last_vlm_request=self._last_vlm_request,
            degraded_mode=self._degraded_mode,
            degraded_reason=self._degraded_reason
        )

    def get_vram_usage(self) -> Optional[VRAMInfo]:
        """Get current VRAM usage information."""
        return self._get_vram_monitor().get_vram_info()

    def _get_vram_usage_mb(self) -> int:
        """Get current VRAM usage in MB."""
        vram = self._get_vram_monitor().get_vram_info()
        return vram.used_mb if vram else 0

    # =========================================================================
    # Events
    # =========================================================================

    def on_resource_changed(self, callback: Callable[[ResourceChangedEvent], None]) -> None:
        """Register callback for resource change events."""
        self._on_resource_changed.append(callback)

    def on_mode_changed(self, callback: Callable[[ExecutionMode], None]) -> None:
        """Register callback for mode change events."""
        self._on_mode_changed.append(callback)

    def on_idle_unload(self, callback: Callable[[], None]) -> None:
        """Register callback for idle unload events."""
        self._on_idle_unload.append(callback)

    def _emit_resource_changed(self, event_type: str, details: Dict[str, Any]) -> None:
        """Emit a resource changed event."""
        event = ResourceChangedEvent(
            timestamp=datetime.now(),
            event_type=event_type,
            details=details
        )
        for callback in self._on_resource_changed:
            try:
                callback(event)
            except Exception as e:
                logger.error(f"Resource changed callback error: {e}")

    def _emit_mode_changed(self, mode: ExecutionMode) -> None:
        """Emit a mode changed event."""
        for callback in self._on_mode_changed:
            try:
                callback(mode)
            except Exception as e:
                logger.error(f"Mode changed callback error: {e}")

    def _emit_idle_unload(self) -> None:
        """Emit an idle unload event."""
        for callback in self._on_idle_unload:
            try:
                callback()
            except Exception as e:
                logger.error(f"Idle unload callback error: {e}")

    # =========================================================================
    # Idle Timeout Management
    # =========================================================================

    def _update_vlm_request_time(self) -> None:
        """Update the last VLM request timestamp."""
        self._last_vlm_request = datetime.now()
        self._restart_idle_timer()

    def _start_idle_timer(self) -> None:
        """Start the idle timeout timer."""
        self._stop_idle_timer()
        self._idle_timer = threading.Timer(
            self._config.idle_timeout_seconds,
            self._on_idle_timeout
        )
        self._idle_timer.daemon = True
        self._idle_timer.start()

    def _restart_idle_timer(self) -> None:
        """Restart the idle timeout timer."""
        if self._vlm_state == ModelState.LOADED:
            self._start_idle_timer()

    def _stop_idle_timer(self) -> None:
        """Stop the idle timeout timer."""
        if self._idle_timer:
            self._idle_timer.cancel()
            self._idle_timer = None

    def _on_idle_timeout(self) -> None:
        """Handle idle timeout - unload VLM."""
        if self._vlm_state != ModelState.LOADED:
            return

        logger.info("Idle timeout reached, unloading VLM")
        self._emit_idle_unload()

        # Run unload in a new event loop (we're in a timer thread)
        try:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(self.ensure_vlm_unloaded())
            loop.close()
        except Exception as e:
            logger.error(f"Idle unload failed: {e}")

    # =========================================================================
    # Degraded Mode
    # =========================================================================

    def _set_degraded_mode(self, degraded: bool, reason: Optional[str] = None) -> None:
        """Set degraded mode status."""
        self._degraded_mode = degraded
        self._degraded_reason = reason
        if degraded:
            logger.warning(f"Entering degraded mode: {reason}")
        else:
            logger.info("Exiting degraded mode")

    def is_degraded(self) -> bool:
        """Check if operating in degraded mode."""
        return self._degraded_mode

    # =========================================================================
    # Lifecycle
    # =========================================================================

    def shutdown(self) -> None:
        """Shutdown the GPU resource manager."""
        logger.info("Shutting down GPUResourceManager")
        self._stop_idle_timer()

        # Unload VLM if loaded
        if self._vlm_state == ModelState.LOADED:
            try:
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                loop.run_until_complete(self.ensure_vlm_unloaded())
                loop.close()
            except Exception as e:
                logger.error(f"Shutdown unload failed: {e}")

        logger.info("GPUResourceManager shutdown complete")

    @classmethod
    def reset_instance(cls) -> None:
        """Reset the singleton instance (for testing)."""
        with cls._lock:
            if cls._instance:
                cls._instance.shutdown()
            cls._instance = None


# =============================================================================
# Factory function
# =============================================================================

_manager_instance: Optional[GPUResourceManager] = None


def get_gpu_resource_manager(config: Optional[GPUResourceConfig] = None) -> GPUResourceManager:
    """
    Get the GPU resource manager singleton.

    Args:
        config: Optional configuration (only used on first call)

    Returns:
        GPUResourceManager instance
    """
    global _manager_instance
    if _manager_instance is None:
        _manager_instance = GPUResourceManager(config)
    return _manager_instance
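A minimal sketch of how a caller might observe resource changes through the callback hooks defined above; the logging callback is illustrative, not part of the commit:

import asyncio

from core.gpu import ExecutionMode, ResourceChangedEvent, get_gpu_resource_manager


def log_change(event: ResourceChangedEvent) -> None:
    print(f"{event.timestamp} {event.event_type}: {event.details}")


async def autopilot_run() -> None:
    manager = get_gpu_resource_manager()
    manager.on_resource_changed(log_change)
    manager.on_mode_changed(lambda mode: print(f"mode -> {mode.value}"))
    # AUTOPILOT: unloads the VLM and tries to move CLIP onto the GPU.
    await manager.set_execution_mode(ExecutionMode.AUTOPILOT)
    if manager.is_degraded():
        print("degraded:", manager.get_status().degraded_reason)
    manager.shutdown()


asyncio.run(autopilot_run())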
core/gpu/ollama_manager.py (new file, 265 lines)
@@ -0,0 +1,265 @@
"""
Ollama Manager - Manages VLM model lifecycle via Ollama API

Handles:
- Loading/unloading models to/from VRAM
- Health checks and availability detection
- Keep-alive management for model persistence
"""

import asyncio
import logging
from typing import List, Optional

import aiohttp

logger = logging.getLogger(__name__)


class OllamaManager:
    """
    Manages Ollama VLM model lifecycle.

    Uses Ollama's REST API to control model loading/unloading.

    Example:
        >>> manager = OllamaManager()
        >>> await manager.load_model()
        >>> is_loaded = await manager.is_model_loaded()
        >>> await manager.unload_model()
    """

    def __init__(
        self,
        endpoint: str = "http://localhost:11434",
        model: str = "qwen3-vl:8b",
        default_keep_alive: str = "5m"
    ):
        """
        Initialize OllamaManager.

        Args:
            endpoint: Ollama API endpoint
            model: Model name to manage
            default_keep_alive: Default keep-alive duration
        """
        self._endpoint = endpoint.rstrip("/")
        self._model = model
        self._default_keep_alive = default_keep_alive
        self._session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> aiohttp.ClientSession:
        """Get or create aiohttp session."""
        if self._session is None or self._session.closed:
            self._session = aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=60)
            )
        return self._session

    async def close(self) -> None:
        """Close the HTTP session."""
        if self._session and not self._session.closed:
            await self._session.close()

    # =========================================================================
    # Health Check
    # =========================================================================

    def is_available(self) -> bool:
        """
        Check if Ollama service is available (synchronous).

        Returns:
            True if Ollama is reachable
        """
        import requests
        try:
            response = requests.get(f"{self._endpoint}/api/tags", timeout=5)
            return response.status_code == 200
        except Exception:
            return False

    async def is_available_async(self) -> bool:
        """
        Check if Ollama service is available (async).

        Returns:
            True if Ollama is reachable
        """
        try:
            session = await self._get_session()
            async with session.get(f"{self._endpoint}/api/tags") as response:
                return response.status == 200
        except Exception:
            return False

    # =========================================================================
    # Model Management
    # =========================================================================

    async def load_model(self, keep_alive: Optional[str] = None) -> bool:
        """
        Load the model into VRAM.

        Uses a minimal generate request to trigger model loading.

        Args:
            keep_alive: How long to keep model loaded (e.g., "5m", "1h")

        Returns:
            True if model loaded successfully
        """
        keep_alive = keep_alive or self._default_keep_alive

        try:
            session = await self._get_session()

            # Send a minimal request to load the model
            # For Qwen3, use /nothink to disable thinking mode
            prompt = "/nothink " if "qwen" in self._model.lower() else ""

            payload = {
                "model": self._model,
                "prompt": prompt,
                "keep_alive": keep_alive,
                "stream": False,
                "options": {
                    "temperature": 0.0,  # Deterministic for classification
                    "top_k": 1  # Faster for classification tasks
                }
            }

            logger.debug(f"Loading model {self._model} with keep_alive={keep_alive}")

            async with session.post(
                f"{self._endpoint}/api/generate",
                json=payload
            ) as response:
                if response.status == 200:
                    logger.info(f"Model {self._model} loaded successfully")
                    return True
                else:
                    text = await response.text()
                    logger.error(f"Failed to load model: {response.status} - {text}")
                    return False

        except asyncio.TimeoutError:
            logger.error("Timeout loading model")
            return False
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            return False

    async def unload_model(self) -> bool:
        """
        Unload the model from VRAM.

        Sets keep_alive to 0 to trigger immediate unload.

        Returns:
            True if model unloaded successfully
        """
        try:
            session = await self._get_session()

            # Send request with keep_alive=0 to unload
            payload = {
                "model": self._model,
                "prompt": "",
                "keep_alive": 0,
                "stream": False
            }

            logger.debug(f"Unloading model {self._model}")

            async with session.post(
                f"{self._endpoint}/api/generate",
                json=payload
            ) as response:
                if response.status == 200:
                    logger.info(f"Model {self._model} unloaded successfully")
                    return True
                else:
                    text = await response.text()
                    logger.error(f"Failed to unload model: {response.status} - {text}")
                    return False

        except asyncio.TimeoutError:
            logger.error("Timeout unloading model")
            return False
        except Exception as e:
            logger.error(f"Error unloading model: {e}")
            return False

    async def is_model_loaded(self) -> bool:
        """
        Check if the model is currently loaded in VRAM.

        Returns:
            True if model is loaded
        """
        try:
            session = await self._get_session()

            async with session.get(f"{self._endpoint}/api/ps") as response:
                if response.status == 200:
                    data = await response.json()
                    models = data.get("models", [])

                    for model_info in models:
                        if model_info.get("name", "").startswith(self._model.split(":")[0]):
                            return True

                    return False
                else:
                    logger.warning(f"Failed to check loaded models: {response.status}")
                    return False

        except Exception as e:
            logger.error(f"Error checking loaded models: {e}")
            return False

    async def list_loaded_models(self) -> List[str]:
        """
        List all currently loaded models.

        Returns:
            List of loaded model names
        """
        try:
            session = await self._get_session()

            async with session.get(f"{self._endpoint}/api/ps") as response:
                if response.status == 200:
                    data = await response.json()
                    models = data.get("models", [])
                    return [m.get("name", "") for m in models]
                else:
                    return []

        except Exception as e:
            logger.error(f"Error listing loaded models: {e}")
            return []

    async def list_available_models(self) -> List[str]:
        """
        List all available models (downloaded).

        Returns:
            List of available model names
        """
        try:
            session = await self._get_session()

            async with session.get(f"{self._endpoint}/api/tags") as response:
                if response.status == 200:
                    data = await response.json()
                    models = data.get("models", [])
                    return [m.get("name", "") for m in models]
                else:
                    return []

        except Exception as e:
            logger.error(f"Error listing available models: {e}")
            return []
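A hedged usage sketch of OllamaManager against a local Ollama instance; the endpoint and model match the defaults above, while the 10-minute keep-alive is an arbitrary illustration value:

import asyncio

from core.gpu.ollama_manager import OllamaManager


async def cycle_vlm() -> None:
    manager = OllamaManager(endpoint="http://localhost:11434", model="qwen3-vl:8b")
    if not await manager.is_available_async():
        print("Ollama is not reachable")
        return
    await manager.load_model(keep_alive="10m")
    print("loaded models:", await manager.list_loaded_models())
    await manager.unload_model()
    await manager.close()


asyncio.run(cycle_vlm())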
core/gpu/vram_monitor.py (new file, 292 lines)
@@ -0,0 +1,292 @@
"""
VRAM Monitor - Monitors GPU VRAM usage

Uses pynvml (NVIDIA Management Library) to query VRAM.
Falls back gracefully on systems without NVIDIA GPU.
"""

import logging
import subprocess
import threading
from typing import Callable, List, Optional

logger = logging.getLogger(__name__)

# Try to import pynvml
try:
    import pynvml
    PYNVML_AVAILABLE = True
except ImportError:
    PYNVML_AVAILABLE = False
    logger.warning("pynvml not available, VRAM monitoring will use nvidia-smi fallback")


class VRAMInfo:
    """Information about VRAM usage."""

    def __init__(
        self,
        total_mb: int,
        used_mb: int,
        free_mb: int,
        gpu_name: str,
        gpu_utilization_percent: int
    ):
        self.total_mb = total_mb
        self.used_mb = used_mb
        self.free_mb = free_mb
        self.gpu_name = gpu_name
        self.gpu_utilization_percent = gpu_utilization_percent

    def __repr__(self) -> str:
        return (
            f"VRAMInfo(used={self.used_mb}MB, free={self.free_mb}MB, "
            f"total={self.total_mb}MB, gpu={self.gpu_name})"
        )


class VRAMMonitor:
    """
    Monitors GPU VRAM usage.

    Uses pynvml for efficient queries, falls back to nvidia-smi.

    Example:
        >>> monitor = VRAMMonitor()
        >>> info = monitor.get_vram_info()
        >>> print(f"Free VRAM: {info.free_mb} MB")
    """

    def __init__(self, gpu_index: int = 0, poll_interval_ms: int = 1000):
        """
        Initialize VRAM monitor.

        Args:
            gpu_index: GPU index to monitor (default 0)
            poll_interval_ms: Polling interval for continuous monitoring
        """
        self._gpu_index = gpu_index
        self._poll_interval_ms = poll_interval_ms
        self._nvml_initialized = False
        self._gpu_available = False
        self._handle = None

        # Monitoring state
        self._monitoring = False
        self._monitor_thread: Optional[threading.Thread] = None
        self._callbacks: List[tuple] = []  # (callback, threshold_mb)
        self._last_vram_mb = 0

        self._initialize()

    def _initialize(self) -> None:
        """Initialize NVML if available."""
        if PYNVML_AVAILABLE:
            try:
                pynvml.nvmlInit()
                self._nvml_initialized = True

                device_count = pynvml.nvmlDeviceGetCount()
                if device_count > self._gpu_index:
                    self._handle = pynvml.nvmlDeviceGetHandleByIndex(self._gpu_index)
                    self._gpu_available = True
                    name = pynvml.nvmlDeviceGetName(self._handle)
                    if isinstance(name, bytes):
                        name = name.decode('utf-8')
                    logger.info(f"VRAM monitor initialized for GPU {self._gpu_index}: {name}")
                else:
                    logger.warning(f"GPU index {self._gpu_index} not found (count={device_count})")

            except Exception as e:
                logger.warning(f"Failed to initialize pynvml: {e}")
                self._nvml_initialized = False

        # Try nvidia-smi fallback
        if not self._gpu_available:
            self._gpu_available = self._check_nvidia_smi()

    def _check_nvidia_smi(self) -> bool:
        """Check if nvidia-smi is available."""
        try:
            result = subprocess.run(
                ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
                capture_output=True,
                text=True,
                timeout=5
            )
            return result.returncode == 0
        except Exception:
            return False

    def is_gpu_available(self) -> bool:
        """Check if GPU monitoring is available."""
        return self._gpu_available

    def get_vram_info(self) -> Optional[VRAMInfo]:
        """
        Get current VRAM information.

        Returns:
            VRAMInfo or None if GPU not available
        """
        if not self._gpu_available:
            return None

        if self._nvml_initialized and self._handle:
            return self._get_vram_pynvml()
        else:
            return self._get_vram_nvidia_smi()

    def _get_vram_pynvml(self) -> Optional[VRAMInfo]:
        """Get VRAM info using pynvml."""
        try:
            memory = pynvml.nvmlDeviceGetMemoryInfo(self._handle)
            utilization = pynvml.nvmlDeviceGetUtilizationRates(self._handle)
            name = pynvml.nvmlDeviceGetName(self._handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')

            return VRAMInfo(
                total_mb=memory.total // (1024 * 1024),
                used_mb=memory.used // (1024 * 1024),
                free_mb=memory.free // (1024 * 1024),
                gpu_name=name,
                gpu_utilization_percent=utilization.gpu
            )
        except Exception as e:
            logger.error(f"pynvml error: {e}")
            return None

    def _get_vram_nvidia_smi(self) -> Optional[VRAMInfo]:
        """Get VRAM info using nvidia-smi (fallback)."""
        try:
            result = subprocess.run(
                [
                    "nvidia-smi",
                    "--query-gpu=name,memory.used,memory.total,utilization.gpu",
                    "--format=csv,noheader,nounits"
                ],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode != 0:
                return None

            lines = result.stdout.strip().split("\n")
            if self._gpu_index >= len(lines):
                return None

            parts = [p.strip() for p in lines[self._gpu_index].split(",")]
            if len(parts) < 4:
                return None

            name = parts[0]
            used_mb = int(parts[1])
            total_mb = int(parts[2])
            utilization = int(parts[3]) if parts[3].isdigit() else 0

            return VRAMInfo(
                total_mb=total_mb,
                used_mb=used_mb,
                free_mb=total_mb - used_mb,
                gpu_name=name,
                gpu_utilization_percent=utilization
            )

        except Exception as e:
            logger.error(f"nvidia-smi error: {e}")
            return None

    def get_available_vram_mb(self) -> int:
        """Get available VRAM in MB."""
        info = self.get_vram_info()
        return info.free_mb if info else 0

    # =========================================================================
    # Continuous Monitoring
    # =========================================================================

    def start_monitoring(self) -> None:
        """Start continuous VRAM monitoring."""
        if self._monitoring:
            return

        if not self._gpu_available:
            logger.warning("Cannot start monitoring: GPU not available")
            return

        self._monitoring = True
        self._monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._monitor_thread.start()
        logger.info("VRAM monitoring started")

    def stop_monitoring(self) -> None:
        """Stop continuous VRAM monitoring."""
        self._monitoring = False
        if self._monitor_thread:
            self._monitor_thread.join(timeout=2)
            self._monitor_thread = None
        logger.info("VRAM monitoring stopped")

    def _monitor_loop(self) -> None:
        """Monitoring loop running in background thread."""
        import time

        while self._monitoring:
            info = self.get_vram_info()
            if info:
                current_vram = info.used_mb

                # Check callbacks
                for callback, threshold_mb in self._callbacks:
                    if abs(current_vram - self._last_vram_mb) >= threshold_mb:
                        try:
                            callback(info)
                        except Exception as e:
                            logger.error(f"VRAM callback error: {e}")

                self._last_vram_mb = current_vram

            time.sleep(self._poll_interval_ms / 1000.0)

    def on_vram_changed(
        self,
        callback: Callable[[VRAMInfo], None],
        threshold_mb: int = 100
    ) -> None:
        """
        Register callback for VRAM changes.

        Args:
            callback: Function to call when VRAM changes
            threshold_mb: Minimum change in MB to trigger callback
        """
        self._callbacks.append((callback, threshold_mb))

    # =========================================================================
    # Cleanup
    # =========================================================================

    def shutdown(self) -> None:
        """Shutdown the VRAM monitor."""
        self.stop_monitoring()

        if self._nvml_initialized:
            try:
                pynvml.nvmlShutdown()
            except Exception:
                pass
            self._nvml_initialized = False

        logger.info("VRAM monitor shutdown")

    def __del__(self):
        """Cleanup on deletion."""
        try:
            self.shutdown()
        except Exception:
            pass
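A minimal sketch of the continuous-monitoring path in VRAMMonitor; the 256 MB threshold and 5-second observation window are arbitrary illustration values, not part of the commit:

import time

from core.gpu.vram_monitor import VRAMInfo, VRAMMonitor


def on_vram_change(info: VRAMInfo) -> None:
    print(f"VRAM now {info.used_mb} MB used of {info.total_mb} MB ({info.gpu_name})")


monitor = VRAMMonitor(gpu_index=0, poll_interval_ms=1000)
if monitor.is_gpu_available():
    # Fire the callback whenever usage moves by at least 256 MB between polls.
    monitor.on_vram_changed(on_vram_change, threshold_mb=256)
    monitor.start_monitoring()
    time.sleep(5)  # let the background thread poll a few times
    monitor.stop_monitoring()
monitor.shutdown()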