v1.0 - Stable release: multi-PC, UI-DETR-1 detection, 3 execution modes

- Frontend v4 reachable on the local network (192.168.1.40)
- Open ports: 3002 (frontend), 5001 (backend), 5004 (dashboard)
- Ollama running on GPU
- Interactive self-healing
- Confidence dashboard

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: Dom
Date: 2026-01-29 11:23:51 +01:00
parent 21bfa3b337
commit a27b74cf22
1595 changed files with 412691 additions and 400 deletions

core/gpu/__init__.py · Normal file · 40 lines

@@ -0,0 +1,40 @@
"""
GPU Resource Management Module for RPA Vision V3
This module provides dynamic GPU resource allocation between ML models:
- Ollama VLM (qwen3-vl:8b) for UI classification
- CLIP (ViT-B-32) for embedding matching
The GPUResourceManager optimizes VRAM usage by:
- Unloading VLM in autopilot mode
- Migrating CLIP to GPU when VRAM is available
- Managing idle timeouts for automatic resource cleanup
"""
from .gpu_resource_manager import (
GPUResourceManager,
ExecutionMode,
ModelState,
GPUResourceConfig,
GPUResourceStatus,
VRAMInfo,
ResourceChangedEvent,
get_gpu_resource_manager,
)
from .ollama_manager import OllamaManager
from .vram_monitor import VRAMMonitor
from .clip_manager import CLIPManager
__all__ = [
"GPUResourceManager",
"ExecutionMode",
"ModelState",
"GPUResourceConfig",
"GPUResourceStatus",
"VRAMInfo",
"ResourceChangedEvent",
"get_gpu_resource_manager",
"OllamaManager",
"VRAMMonitor",
"CLIPManager",
]
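
The package façade above is the only import surface the rest of the codebase needs. A minimal sketch of the intended call flow, assuming core.gpu is importable and its dependencies (torch, aiohttp) are installed:

# Sketch only: drive the public API exported above.
import asyncio

from core.gpu import ExecutionMode, get_gpu_resource_manager

async def main() -> None:
    manager = get_gpu_resource_manager()
    # Recording: CLIP moves to CPU, then the VLM is loaded.
    await manager.set_execution_mode(ExecutionMode.RECORDING)
    # Autopilot: the VLM is unloaded, CLIP migrates to GPU if VRAM allows.
    await manager.set_execution_mode(ExecutionMode.AUTOPILOT)
    print(manager.get_status())

asyncio.run(main())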

core/gpu/clip_manager.py · Normal file · 248 lines

@@ -0,0 +1,248 @@
"""
CLIP Manager - Manages CLIP model device migration
Handles:
- CPU/GPU device migration for CLIP model
- Pipeline reinitialization after device change
- Graceful fallback on migration failures
"""
import asyncio
import logging
from typing import Any, Optional
import torch
logger = logging.getLogger(__name__)
class CLIPManager:
"""
Manages CLIP model device migration between CPU and GPU.
Coordinates with the embedding pipeline to ensure consistent
device usage after migration.
Example:
>>> manager = CLIPManager()
>>> await manager.migrate_to_device("cuda")
>>> device = manager.get_current_device()
"""
def __init__(self, model_name: str = "ViT-B-32"):
"""
Initialize CLIPManager.
Args:
model_name: CLIP model variant to manage
"""
self._model_name = model_name
self._current_device = "cpu"
self._model: Optional[Any] = None
self._preprocess: Optional[Any] = None
self._initialized = False
# Check CUDA availability
self._cuda_available = torch.cuda.is_available()
if not self._cuda_available:
logger.warning("CUDA not available, CLIP will stay on CPU")
def get_current_device(self) -> str:
"""
Get the current device for CLIP model.
Returns:
"cpu" or "cuda"
"""
return self._current_device
def is_cuda_available(self) -> bool:
"""Check if CUDA is available for GPU migration."""
return self._cuda_available
async def migrate_to_device(self, device: str) -> bool:
"""
Migrate CLIP model to specified device.
Args:
device: Target device ("cpu" or "cuda")
Returns:
True if migration successful
"""
if device not in ["cpu", "cuda"]:
logger.error(f"Invalid device: {device}")
return False
if device == self._current_device:
logger.debug(f"CLIP already on {device}")
return True
if device == "cuda" and not self._cuda_available:
logger.warning("Cannot migrate to CUDA: not available")
return False
logger.info(f"Migrating CLIP from {self._current_device} to {device}")
try:
# Run migration in an executor so the event loop is not blocked
loop = asyncio.get_running_loop()
success = await loop.run_in_executor(
None,
self._do_migration,
device
)
if success:
self._current_device = device
logger.info(f"CLIP migrated to {device}")
return True
except Exception as e:
logger.error(f"CLIP migration failed: {e}")
return False
def _do_migration(self, device: str) -> bool:
"""
Perform the actual device migration (blocking).
Args:
device: Target device
Returns:
True if successful
"""
try:
# If model is loaded, move it
if self._model is not None:
self._model = self._model.to(device)
logger.debug(f"Moved existing model to {device}")
# Reinitialize pipeline with new device
self.reinitialize_pipeline(device)
return True
except Exception as e:
logger.error(f"Migration error: {e}")
return False
def reinitialize_pipeline(self, device: Optional[str] = None) -> None:
"""
Reinitialize the embedding pipeline with current/specified device.
Args:
device: Device to use (uses current if None)
"""
device = device or self._current_device
try:
# Try to notify FusionEngine about device change
self._notify_fusion_engine(device)
logger.debug(f"Pipeline reinitialized for {device}")
except Exception as e:
logger.warning(f"Pipeline reinitialization warning: {e}")
def _notify_fusion_engine(self, device: str) -> None:
"""
Notify FusionEngine about device change.
This allows the embedding system to update its device configuration.
"""
try:
# Soft dependency: the import only verifies that the embedding stack
# is present; no device update is pushed here yet. If the module is
# absent we simply continue on the current pipeline.
from core.embedding.fusion_engine import FusionEngine  # noqa: F401
except ImportError:
pass  # FusionEngine not available, that's OK
def get_model(self) -> Optional[Any]:
"""
Get the CLIP model instance.
Returns:
CLIP model or None if not loaded
"""
return self._model
def load_model(self) -> bool:
"""
Load the CLIP model on current device.
Returns:
True if loaded successfully
"""
try:
import open_clip
model, _, preprocess = open_clip.create_model_and_transforms(
self._model_name,
pretrained='openai',
device=self._current_device
)
self._model = model
self._preprocess = preprocess
self._initialized = True
logger.info(f"CLIP model {self._model_name} loaded on {self._current_device}")
return True
except Exception as e:
logger.error(f"Failed to load CLIP model: {e}")
return False
def unload_model(self) -> None:
"""Unload the CLIP model to free memory."""
if self._model is not None:
del self._model
self._model = None
self._preprocess = None
self._initialized = False
# Force garbage collection
import gc
gc.collect()
if self._cuda_available:
torch.cuda.empty_cache()
logger.info("CLIP model unloaded")
def encode_image(self, image) -> Optional[Any]:
"""
Encode an image using CLIP.
Args:
image: PIL Image or tensor
Returns:
Image embedding or None on error
"""
if not self._initialized or self._model is None:
if not self.load_model():
return None
try:
with torch.no_grad():
if self._preprocess:
image_tensor = self._preprocess(image).unsqueeze(0)
else:
image_tensor = image
image_tensor = image_tensor.to(self._current_device)
embedding = self._model.encode_image(image_tensor)
return embedding.cpu().numpy()
except Exception as e:
logger.error(f"Image encoding error: {e}")
return None
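
For reference, a sketch of a typical CLIPManager lifecycle. The image file name is hypothetical, and open_clip plus Pillow are assumed to be installed; encode_image lazy-loads the model on first use, as implemented above:

# Sketch only: migrate, encode, unload.
import asyncio

from PIL import Image

from core.gpu.clip_manager import CLIPManager

async def demo() -> None:
    manager = CLIPManager(model_name="ViT-B-32")
    if manager.is_cuda_available():
        await manager.migrate_to_device("cuda")
    # "button.png" is a hypothetical screenshot crop.
    embedding = manager.encode_image(Image.open("button.png"))
    if embedding is not None:
        print(embedding.shape)  # (1, 512) for ViT-B-32
    manager.unload_model()

asyncio.run(demo())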

core/gpu/gpu_resource_manager.py · Normal file · 614 lines

@@ -0,0 +1,614 @@
"""
GPU Resource Manager - Central orchestrator for GPU resource allocation
Manages dynamic allocation of GPU resources between:
- Ollama VLM (qwen3-vl:8b) - ~10.5 GB VRAM for UI classification
- CLIP (ViT-B-32) - ~500 MB VRAM for embedding matching
Optimizes VRAM usage based on execution mode:
- RECORDING: VLM loaded, CLIP on CPU
- AUTOPILOT: VLM unloaded, CLIP on GPU
- IDLE: No automatic changes
"""
import asyncio
import logging
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Callable, Dict, List, Optional
logger = logging.getLogger(__name__)
class ExecutionMode(str, Enum):
"""Execution modes for the RPA system."""
IDLE = "idle"
RECORDING = "recording"
AUTOPILOT = "autopilot"
class ModelState(str, Enum):
"""State of a model in the GPU resource manager."""
UNLOADED = "unloaded"
LOADING = "loading"
LOADED = "loaded"
UNLOADING = "unloading"
ERROR = "error"
@dataclass
class VRAMInfo:
"""Information about VRAM usage."""
total_mb: int
used_mb: int
free_mb: int
gpu_name: str
gpu_utilization_percent: int
@dataclass
class GPUResourceConfig:
"""Configuration for GPU resource management."""
ollama_endpoint: str = "http://localhost:11434"
vlm_model: str = "qwen3-vl:8b"
clip_model: str = "ViT-B-32"
idle_timeout_seconds: int = 300 # 5 minutes
vram_threshold_for_clip_gpu_mb: int = 1024 # 1 GB
max_load_retries: int = 3
load_timeout_seconds: int = 30
unload_timeout_seconds: int = 5
@dataclass
class GPUResourceStatus:
"""Current status of GPU resources."""
execution_mode: ExecutionMode
vlm_state: ModelState
vlm_model: str
clip_device: str
vram: Optional[VRAMInfo]
idle_timeout_seconds: int
last_vlm_request: Optional[datetime]
degraded_mode: bool
degraded_reason: Optional[str]
@dataclass
class ResourceChangedEvent:
"""Event emitted when GPU resources change."""
timestamp: datetime
event_type: str # "vram_changed", "model_loaded", "model_unloaded", "device_changed"
details: Dict[str, Any] = field(default_factory=dict)
class GPUResourceManager:
"""
Central manager for GPU resource allocation.
Singleton pattern ensures only one instance manages GPU resources.
Example:
>>> manager = get_gpu_resource_manager()
>>> await manager.set_execution_mode(ExecutionMode.AUTOPILOT)
>>> status = manager.get_status()
"""
_instance: Optional["GPUResourceManager"] = None
_lock = threading.Lock()
def __new__(cls, config: Optional[GPUResourceConfig] = None):
with cls._lock:
if cls._instance is None:
cls._instance = super().__new__(cls)
cls._instance._initialized = False
return cls._instance
def __init__(self, config: Optional[GPUResourceConfig] = None):
if self._initialized:
return
self._config = config or GPUResourceConfig()
self._execution_mode = ExecutionMode.IDLE
self._vlm_state = ModelState.UNLOADED
self._clip_device = "cpu"
self._last_vlm_request: Optional[datetime] = None
self._degraded_mode = False
self._degraded_reason: Optional[str] = None
# Managers (lazy initialized)
self._ollama_manager: Optional[Any] = None
self._vram_monitor: Optional[Any] = None
self._clip_manager: Optional[Any] = None
# Operation queue reserved for future sequential processing; today the
# lock below is the only serialization mechanism actually used
self._operation_queue: asyncio.Queue = asyncio.Queue()
self._operation_lock = asyncio.Lock()
# Event callbacks
self._on_resource_changed: List[Callable[[ResourceChangedEvent], None]] = []
self._on_mode_changed: List[Callable[[ExecutionMode], None]] = []
self._on_idle_unload: List[Callable[[], None]] = []
# Idle timeout management
self._idle_timer: Optional[threading.Timer] = None
self._idle_check_running = False
self._initialized = True
logger.info(f"GPUResourceManager initialized with config: {self._config}")
# =========================================================================
# Lazy initialization of managers
# =========================================================================
def _get_ollama_manager(self):
"""Lazy load OllamaManager."""
if self._ollama_manager is None:
from .ollama_manager import OllamaManager
self._ollama_manager = OllamaManager(
endpoint=self._config.ollama_endpoint,
model=self._config.vlm_model
)
return self._ollama_manager
def _get_vram_monitor(self):
"""Lazy load VRAMMonitor."""
if self._vram_monitor is None:
from .vram_monitor import VRAMMonitor
self._vram_monitor = VRAMMonitor()
return self._vram_monitor
def _get_clip_manager(self):
"""Lazy load CLIPManager."""
if self._clip_manager is None:
from .clip_manager import CLIPManager
self._clip_manager = CLIPManager(model_name=self._config.clip_model)
return self._clip_manager
# =========================================================================
# Mode Management
# =========================================================================
async def set_execution_mode(self, mode: ExecutionMode) -> None:
"""
Set the execution mode and adjust GPU resources accordingly.
Args:
mode: Target execution mode
"""
if mode == self._execution_mode:
logger.debug(f"Already in {mode.value} mode")
return
old_mode = self._execution_mode
logger.info(f"Transitioning from {old_mode.value} to {mode.value}")
# Note: ensure_vlm_loaded/ensure_vlm_unloaded acquire _operation_lock
# themselves; asyncio.Lock is not reentrant, so holding the lock around
# these calls would deadlock the transition.
if mode == ExecutionMode.AUTOPILOT:
# Unload VLM, then migrate CLIP to GPU
await self.ensure_vlm_unloaded()
await self._try_migrate_clip_to_gpu()
elif mode == ExecutionMode.RECORDING:
# Migrate CLIP to CPU first, then load VLM
await self._migrate_clip_to_cpu()
await self.ensure_vlm_loaded()
# IDLE mode: no automatic changes
self._execution_mode = mode
self._emit_mode_changed(mode)
logger.info(f"Mode transition complete: {mode.value}")
def get_execution_mode(self) -> ExecutionMode:
"""Get the current execution mode."""
return self._execution_mode
# =========================================================================
# VLM Management
# =========================================================================
async def ensure_vlm_loaded(self) -> bool:
"""
Ensure VLM is loaded and ready.
Returns:
True if VLM is loaded, False on failure
"""
if self._vlm_state == ModelState.LOADED:
self._update_vlm_request_time()
return True
if self._degraded_mode:
logger.warning("Cannot load VLM in degraded mode")
return False
async with self._operation_lock:
if self._vlm_state == ModelState.LOADED:
self._update_vlm_request_time()
return True
self._vlm_state = ModelState.LOADING
logger.info("Loading VLM model...")
ollama = self._get_ollama_manager()
retries = 0
while retries < self._config.max_load_retries:
try:
success = await asyncio.wait_for(
ollama.load_model(),
timeout=self._config.load_timeout_seconds
)
if success:
self._vlm_state = ModelState.LOADED
self._update_vlm_request_time()
self._start_idle_timer()
self._emit_resource_changed("model_loaded", {"model": self._config.vlm_model})
logger.info("VLM model loaded successfully")
return True
except asyncio.TimeoutError:
logger.warning(f"VLM load timeout (attempt {retries + 1})")
except Exception as e:
logger.error(f"VLM load error: {e}")
retries += 1
if retries < self._config.max_load_retries:
await asyncio.sleep(1)
self._vlm_state = ModelState.ERROR
self._set_degraded_mode(True, "VLM load failed after retries")
logger.error("Failed to load VLM after all retries")
return False
async def ensure_vlm_unloaded(self) -> bool:
"""
Ensure VLM is unloaded.
Returns:
True if VLM is unloaded, False on failure
"""
if self._vlm_state == ModelState.UNLOADED:
return True
async with self._operation_lock:
if self._vlm_state == ModelState.UNLOADED:
return True
self._stop_idle_timer()
self._vlm_state = ModelState.UNLOADING
logger.info("Unloading VLM model...")
# Get VRAM before unload for verification
vram_before = self._get_vram_usage_mb()
ollama = self._get_ollama_manager()
try:
success = await asyncio.wait_for(
ollama.unload_model(),
timeout=self._config.unload_timeout_seconds
)
if success:
self._vlm_state = ModelState.UNLOADED
# Verify VRAM decrease
await asyncio.sleep(0.5) # Wait for VRAM to settle
vram_after = self._get_vram_usage_mb()
vram_freed = vram_before - vram_after
self._emit_resource_changed("model_unloaded", {
"model": self._config.vlm_model,
"vram_freed_mb": vram_freed
})
logger.info(f"VLM model unloaded, freed {vram_freed} MB VRAM")
return True
except asyncio.TimeoutError:
logger.warning("VLM unload timeout")
except Exception as e:
logger.error(f"VLM unload error: {e}")
self._vlm_state = ModelState.ERROR
return False
def is_vlm_loaded(self) -> bool:
"""Check if VLM is currently loaded."""
return self._vlm_state == ModelState.LOADED
def get_vlm_state(self) -> ModelState:
"""Get the current VLM state."""
return self._vlm_state
# =========================================================================
# CLIP Management
# =========================================================================
def get_clip_device(self) -> str:
"""
Get the current CLIP device.
Returns:
"cpu" or "cuda"
"""
return self._clip_device
async def _try_migrate_clip_to_gpu(self) -> bool:
"""Try to migrate CLIP to GPU if VRAM is available."""
vram = self._get_vram_monitor().get_vram_info()
if vram is None:
logger.warning("Cannot get VRAM info, keeping CLIP on CPU")
return False
if vram.free_mb < self._config.vram_threshold_for_clip_gpu_mb:
logger.info(f"Insufficient VRAM ({vram.free_mb} MB), keeping CLIP on CPU")
return False
return await self.migrate_clip_to_gpu()
async def migrate_clip_to_gpu(self) -> bool:
"""
Migrate CLIP model to GPU.
Returns:
True if migration successful
"""
if self._clip_device == "cuda":
return True
try:
clip_manager = self._get_clip_manager()
success = await clip_manager.migrate_to_device("cuda")
if success:
self._clip_device = "cuda"
self._emit_resource_changed("device_changed", {
"model": "clip",
"device": "cuda"
})
logger.info("CLIP migrated to GPU")
return True
except Exception as e:
logger.error(f"CLIP GPU migration failed: {e}")
return False
async def _migrate_clip_to_cpu(self) -> bool:
"""Migrate CLIP model to CPU."""
if self._clip_device == "cpu":
return True
return await self.migrate_clip_to_cpu()
async def migrate_clip_to_cpu(self) -> bool:
"""
Migrate CLIP model to CPU.
Returns:
True if migration successful
"""
if self._clip_device == "cpu":
return True
try:
clip_manager = self._get_clip_manager()
success = await clip_manager.migrate_to_device("cpu")
if success:
self._clip_device = "cpu"
self._emit_resource_changed("device_changed", {
"model": "clip",
"device": "cpu"
})
logger.info("CLIP migrated to CPU")
return True
except Exception as e:
logger.error(f"CLIP CPU migration failed: {e}")
return False
# =========================================================================
# Monitoring
# =========================================================================
def get_status(self) -> GPUResourceStatus:
"""
Get the current GPU resource status.
Returns:
Complete status including VRAM, model states, and mode
"""
vram = self._get_vram_monitor().get_vram_info()
return GPUResourceStatus(
execution_mode=self._execution_mode,
vlm_state=self._vlm_state,
vlm_model=self._config.vlm_model,
clip_device=self._clip_device,
vram=vram,
idle_timeout_seconds=self._config.idle_timeout_seconds,
last_vlm_request=self._last_vlm_request,
degraded_mode=self._degraded_mode,
degraded_reason=self._degraded_reason
)
def get_vram_usage(self) -> Optional[VRAMInfo]:
"""Get current VRAM usage information."""
return self._get_vram_monitor().get_vram_info()
def _get_vram_usage_mb(self) -> int:
"""Get current VRAM usage in MB."""
vram = self._get_vram_monitor().get_vram_info()
return vram.used_mb if vram else 0
# =========================================================================
# Events
# =========================================================================
def on_resource_changed(self, callback: Callable[[ResourceChangedEvent], None]) -> None:
"""Register callback for resource change events."""
self._on_resource_changed.append(callback)
def on_mode_changed(self, callback: Callable[[ExecutionMode], None]) -> None:
"""Register callback for mode change events."""
self._on_mode_changed.append(callback)
def on_idle_unload(self, callback: Callable[[], None]) -> None:
"""Register callback for idle unload events."""
self._on_idle_unload.append(callback)
def _emit_resource_changed(self, event_type: str, details: Dict[str, Any]) -> None:
"""Emit a resource changed event."""
event = ResourceChangedEvent(
timestamp=datetime.now(),
event_type=event_type,
details=details
)
for callback in self._on_resource_changed:
try:
callback(event)
except Exception as e:
logger.error(f"Resource changed callback error: {e}")
def _emit_mode_changed(self, mode: ExecutionMode) -> None:
"""Emit a mode changed event."""
for callback in self._on_mode_changed:
try:
callback(mode)
except Exception as e:
logger.error(f"Mode changed callback error: {e}")
def _emit_idle_unload(self) -> None:
"""Emit an idle unload event."""
for callback in self._on_idle_unload:
try:
callback()
except Exception as e:
logger.error(f"Idle unload callback error: {e}")
# =========================================================================
# Idle Timeout Management
# =========================================================================
def _update_vlm_request_time(self) -> None:
"""Update the last VLM request timestamp."""
self._last_vlm_request = datetime.now()
self._restart_idle_timer()
def _start_idle_timer(self) -> None:
"""Start the idle timeout timer."""
self._stop_idle_timer()
self._idle_timer = threading.Timer(
self._config.idle_timeout_seconds,
self._on_idle_timeout
)
self._idle_timer.daemon = True
self._idle_timer.start()
def _restart_idle_timer(self) -> None:
"""Restart the idle timeout timer."""
if self._vlm_state == ModelState.LOADED:
self._start_idle_timer()
def _stop_idle_timer(self) -> None:
"""Stop the idle timeout timer."""
if self._idle_timer:
self._idle_timer.cancel()
self._idle_timer = None
def _on_idle_timeout(self) -> None:
"""Handle idle timeout - unload VLM."""
if self._vlm_state != ModelState.LOADED:
return
logger.info("Idle timeout reached, unloading VLM")
self._emit_idle_unload()
# Run unload in a new event loop (we're in a timer thread). Caveat:
# _operation_lock binds to the first loop that awaits it, so this path
# assumes the main loop is not holding the lock at timeout time.
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(self.ensure_vlm_unloaded())
loop.close()
except Exception as e:
logger.error(f"Idle unload failed: {e}")
# =========================================================================
# Degraded Mode
# =========================================================================
def _set_degraded_mode(self, degraded: bool, reason: Optional[str] = None) -> None:
"""Set degraded mode status."""
self._degraded_mode = degraded
self._degraded_reason = reason
if degraded:
logger.warning(f"Entering degraded mode: {reason}")
else:
logger.info("Exiting degraded mode")
def is_degraded(self) -> bool:
"""Check if operating in degraded mode."""
return self._degraded_mode
# =========================================================================
# Lifecycle
# =========================================================================
def shutdown(self) -> None:
"""Shutdown the GPU resource manager."""
logger.info("Shutting down GPUResourceManager")
self._stop_idle_timer()
# Unload VLM if loaded
if self._vlm_state == ModelState.LOADED:
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(self.ensure_vlm_unloaded())
loop.close()
except Exception as e:
logger.error(f"Shutdown unload failed: {e}")
logger.info("GPUResourceManager shutdown complete")
@classmethod
def reset_instance(cls) -> None:
"""Reset the singleton instance (for testing)."""
with cls._lock:
if cls._instance:
cls._instance.shutdown()
cls._instance = None
# =============================================================================
# Factory function
# =============================================================================
_manager_instance: Optional[GPUResourceManager] = None
def get_gpu_resource_manager(config: Optional[GPUResourceConfig] = None) -> GPUResourceManager:
"""
Get the GPU resource manager singleton.
Args:
config: Optional configuration (only used on first call)
Returns:
GPUResourceManager instance
"""
global _manager_instance
if _manager_instance is None:
_manager_instance = GPUResourceManager(config)
return _manager_instance
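
A sketch of how a caller might observe the transitions this manager emits, using only the hooks defined above; model names and VRAM thresholds come from the GPUResourceConfig defaults:

# Sketch only: subscribe to resource and mode events.
import asyncio

from core.gpu import (
    ExecutionMode,
    ResourceChangedEvent,
    get_gpu_resource_manager,
)

def log_change(event: ResourceChangedEvent) -> None:
    print(f"{event.timestamp:%H:%M:%S} {event.event_type}: {event.details}")

async def demo() -> None:
    manager = get_gpu_resource_manager()
    manager.on_resource_changed(log_change)
    manager.on_mode_changed(lambda mode: print(f"mode -> {mode.value}"))
    await manager.set_execution_mode(ExecutionMode.AUTOPILOT)

asyncio.run(demo())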

core/gpu/ollama_manager.py · Normal file · 265 lines

@@ -0,0 +1,265 @@
"""
Ollama Manager - Manages VLM model lifecycle via Ollama API
Handles:
- Loading/unloading models to/from VRAM
- Health checks and availability detection
- Keep-alive management for model persistence
"""
import asyncio
import logging
from typing import List, Optional
import aiohttp
logger = logging.getLogger(__name__)
class OllamaManager:
"""
Manages Ollama VLM model lifecycle.
Uses Ollama's REST API to control model loading/unloading.
Example:
>>> manager = OllamaManager()
>>> await manager.load_model()
>>> is_loaded = await manager.is_model_loaded()
>>> await manager.unload_model()
"""
def __init__(
self,
endpoint: str = "http://localhost:11434",
model: str = "qwen3-vl:8b",
default_keep_alive: str = "5m"
):
"""
Initialize OllamaManager.
Args:
endpoint: Ollama API endpoint
model: Model name to manage
default_keep_alive: Default keep-alive duration
"""
self._endpoint = endpoint.rstrip("/")
self._model = model
self._default_keep_alive = default_keep_alive
self._session: Optional[aiohttp.ClientSession] = None
async def _get_session(self) -> aiohttp.ClientSession:
"""Get or create aiohttp session."""
if self._session is None or self._session.closed:
self._session = aiohttp.ClientSession(
timeout=aiohttp.ClientTimeout(total=60)
)
return self._session
async def close(self) -> None:
"""Close the HTTP session."""
if self._session and not self._session.closed:
await self._session.close()
# =========================================================================
# Health Check
# =========================================================================
def is_available(self) -> bool:
"""
Check if Ollama service is available (synchronous).
Returns:
True if Ollama is reachable
"""
import requests
try:
response = requests.get(f"{self._endpoint}/api/tags", timeout=5)
return response.status_code == 200
except Exception:
return False
async def is_available_async(self) -> bool:
"""
Check if Ollama service is available (async).
Returns:
True if Ollama is reachable
"""
try:
session = await self._get_session()
async with session.get(f"{self._endpoint}/api/tags") as response:
return response.status == 200
except Exception:
return False
# =========================================================================
# Model Management
# =========================================================================
async def load_model(self, keep_alive: Optional[str] = None) -> bool:
"""
Load the model into VRAM.
Uses a minimal generate request to trigger model loading.
Args:
keep_alive: How long to keep model loaded (e.g., "5m", "1h")
Returns:
True if model loaded successfully
"""
keep_alive = keep_alive or self._default_keep_alive
try:
session = await self._get_session()
# Send a minimal request to load the model
# For Qwen3 models, prepend /nothink to disable the thinking mode
prompt = "/nothink " if "qwen" in self._model.lower() else ""
payload = {
"model": self._model,
"prompt": prompt,
"keep_alive": keep_alive,
"stream": False,
"options": {
"temperature": 0.0, # Déterministe pour la classification
"top_k": 1 # Plus rapide pour les tâches de classification
}
}
logger.debug(f"Loading model {self._model} with keep_alive={keep_alive}")
async with session.post(
f"{self._endpoint}/api/generate",
json=payload
) as response:
if response.status == 200:
logger.info(f"Model {self._model} loaded successfully")
return True
else:
text = await response.text()
logger.error(f"Failed to load model: {response.status} - {text}")
return False
except asyncio.TimeoutError:
logger.error("Timeout loading model")
return False
except Exception as e:
logger.error(f"Error loading model: {e}")
return False
async def unload_model(self) -> bool:
"""
Unload the model from VRAM.
Sets keep_alive to 0 to trigger immediate unload.
Returns:
True if model unloaded successfully
"""
try:
session = await self._get_session()
# Send request with keep_alive=0 to unload
payload = {
"model": self._model,
"prompt": "",
"keep_alive": 0,
"stream": False
}
logger.debug(f"Unloading model {self._model}")
async with session.post(
f"{self._endpoint}/api/generate",
json=payload
) as response:
if response.status == 200:
logger.info(f"Model {self._model} unloaded successfully")
return True
else:
text = await response.text()
logger.error(f"Failed to unload model: {response.status} - {text}")
return False
except asyncio.TimeoutError:
logger.error("Timeout unloading model")
return False
except Exception as e:
logger.error(f"Error unloading model: {e}")
return False
async def is_model_loaded(self) -> bool:
"""
Check if the model is currently loaded in VRAM.
Returns:
True if model is loaded
"""
try:
session = await self._get_session()
async with session.get(f"{self._endpoint}/api/ps") as response:
if response.status == 200:
data = await response.json()
models = data.get("models", [])
for model_info in models:
if model_info.get("name", "").startswith(self._model.split(":")[0]):
return True
return False
else:
logger.warning(f"Failed to check loaded models: {response.status}")
return False
except Exception as e:
logger.error(f"Error checking loaded models: {e}")
return False
async def list_loaded_models(self) -> List[str]:
"""
List all currently loaded models.
Returns:
List of loaded model names
"""
try:
session = await self._get_session()
async with session.get(f"{self._endpoint}/api/ps") as response:
if response.status == 200:
data = await response.json()
models = data.get("models", [])
return [m.get("name", "") for m in models]
else:
return []
except Exception as e:
logger.error(f"Error listing loaded models: {e}")
return []
async def list_available_models(self) -> List[str]:
"""
List all available models (downloaded).
Returns:
List of available model names
"""
try:
session = await self._get_session()
async with session.get(f"{self._endpoint}/api/tags") as response:
if response.status == 200:
data = await response.json()
models = data.get("models", [])
return [m.get("name", "") for m in models]
else:
return []
except Exception as e:
logger.error(f"Error listing available models: {e}")
return []
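
To make the keep_alive contract concrete, a sketch of a full availability check, load, inspect, unload cycle against a local Ollama daemon; the endpoint and model name are the module defaults:

# Sketch only: explicit VLM load/unload cycle.
import asyncio

from core.gpu.ollama_manager import OllamaManager

async def demo() -> None:
    manager = OllamaManager(endpoint="http://localhost:11434", model="qwen3-vl:8b")
    if not await manager.is_available_async():
        await manager.close()
        print("Ollama is not reachable")
        return
    await manager.load_model(keep_alive="10m")  # warm the model into VRAM
    print(await manager.list_loaded_models())
    await manager.unload_model()  # keep_alive=0 evicts it immediately
    await manager.close()

asyncio.run(demo())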

core/gpu/vram_monitor.py · Normal file · 292 lines

@@ -0,0 +1,292 @@
"""
VRAM Monitor - Monitors GPU VRAM usage
Uses pynvml (NVIDIA Management Library) to query VRAM.
Falls back gracefully on systems without NVIDIA GPU.
"""
import logging
import subprocess
import threading
import time
from typing import Callable, List, Optional
logger = logging.getLogger(__name__)
# Try to import pynvml
try:
import pynvml
PYNVML_AVAILABLE = True
except ImportError:
PYNVML_AVAILABLE = False
logger.warning("pynvml not available, VRAM monitoring will use nvidia-smi fallback")
class VRAMInfo:
"""
Information about VRAM usage.
Note: mirrors the VRAMInfo dataclass exported from core.gpu
(gpu_resource_manager); both carry the same five fields.
"""
def __init__(
self,
total_mb: int,
used_mb: int,
free_mb: int,
gpu_name: str,
gpu_utilization_percent: int
):
self.total_mb = total_mb
self.used_mb = used_mb
self.free_mb = free_mb
self.gpu_name = gpu_name
self.gpu_utilization_percent = gpu_utilization_percent
def __repr__(self) -> str:
return (
f"VRAMInfo(used={self.used_mb}MB, free={self.free_mb}MB, "
f"total={self.total_mb}MB, gpu={self.gpu_name})"
)
class VRAMMonitor:
"""
Monitors GPU VRAM usage.
Uses pynvml for efficient queries, falls back to nvidia-smi.
Example:
>>> monitor = VRAMMonitor()
>>> info = monitor.get_vram_info()
>>> print(f"Free VRAM: {info.free_mb} MB")
"""
def __init__(self, gpu_index: int = 0, poll_interval_ms: int = 1000):
"""
Initialize VRAM monitor.
Args:
gpu_index: GPU index to monitor (default 0)
poll_interval_ms: Polling interval for continuous monitoring
"""
self._gpu_index = gpu_index
self._poll_interval_ms = poll_interval_ms
self._nvml_initialized = False
self._gpu_available = False
self._handle = None
# Monitoring state
self._monitoring = False
self._monitor_thread: Optional[threading.Thread] = None
self._callbacks: List[tuple] = [] # (callback, threshold_mb)
self._last_vram_mb = 0
self._initialize()
def _initialize(self) -> None:
"""Initialize NVML if available."""
if PYNVML_AVAILABLE:
try:
pynvml.nvmlInit()
self._nvml_initialized = True
device_count = pynvml.nvmlDeviceGetCount()
if device_count > self._gpu_index:
self._handle = pynvml.nvmlDeviceGetHandleByIndex(self._gpu_index)
self._gpu_available = True
name = pynvml.nvmlDeviceGetName(self._handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
logger.info(f"VRAM monitor initialized for GPU {self._gpu_index}: {name}")
else:
logger.warning(f"GPU index {self._gpu_index} not found (count={device_count})")
except Exception as e:
logger.warning(f"Failed to initialize pynvml: {e}")
self._nvml_initialized = False
# Try nvidia-smi fallback
if not self._gpu_available:
self._gpu_available = self._check_nvidia_smi()
def _check_nvidia_smi(self) -> bool:
"""Check if nvidia-smi is available."""
try:
result = subprocess.run(
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
capture_output=True,
text=True,
timeout=5
)
return result.returncode == 0
except Exception:
return False
def is_gpu_available(self) -> bool:
"""Check if GPU monitoring is available."""
return self._gpu_available
def get_vram_info(self) -> Optional[VRAMInfo]:
"""
Get current VRAM information.
Returns:
VRAMInfo or None if GPU not available
"""
if not self._gpu_available:
return None
if self._nvml_initialized and self._handle:
return self._get_vram_pynvml()
else:
return self._get_vram_nvidia_smi()
def _get_vram_pynvml(self) -> Optional[VRAMInfo]:
"""Get VRAM info using pynvml."""
try:
memory = pynvml.nvmlDeviceGetMemoryInfo(self._handle)
utilization = pynvml.nvmlDeviceGetUtilizationRates(self._handle)
name = pynvml.nvmlDeviceGetName(self._handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
return VRAMInfo(
total_mb=memory.total // (1024 * 1024),
used_mb=memory.used // (1024 * 1024),
free_mb=memory.free // (1024 * 1024),
gpu_name=name,
gpu_utilization_percent=utilization.gpu
)
except Exception as e:
logger.error(f"pynvml error: {e}")
return None
def _get_vram_nvidia_smi(self) -> Optional[VRAMInfo]:
"""Get VRAM info using nvidia-smi (fallback)."""
try:
result = subprocess.run(
[
"nvidia-smi",
"--query-gpu=name,memory.used,memory.total,utilization.gpu",
"--format=csv,noheader,nounits"
],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
return None
lines = result.stdout.strip().split("\n")
if self._gpu_index >= len(lines):
return None
parts = [p.strip() for p in lines[self._gpu_index].split(",")]
if len(parts) < 4:
return None
name = parts[0]
used_mb = int(parts[1])
total_mb = int(parts[2])
utilization = int(parts[3]) if parts[3].isdigit() else 0
return VRAMInfo(
total_mb=total_mb,
used_mb=used_mb,
free_mb=total_mb - used_mb,
gpu_name=name,
gpu_utilization_percent=utilization
)
except Exception as e:
logger.error(f"nvidia-smi error: {e}")
return None
def get_available_vram_mb(self) -> int:
"""Get available VRAM in MB."""
info = self.get_vram_info()
return info.free_mb if info else 0
# =========================================================================
# Continuous Monitoring
# =========================================================================
def start_monitoring(self) -> None:
"""Start continuous VRAM monitoring."""
if self._monitoring:
return
if not self._gpu_available:
logger.warning("Cannot start monitoring: GPU not available")
return
self._monitoring = True
self._monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
self._monitor_thread.start()
logger.info("VRAM monitoring started")
def stop_monitoring(self) -> None:
"""Stop continuous VRAM monitoring."""
self._monitoring = False
if self._monitor_thread:
self._monitor_thread.join(timeout=2)
self._monitor_thread = None
logger.info("VRAM monitoring stopped")
def _monitor_loop(self) -> None:
"""Monitoring loop running in background thread."""
while self._monitoring:
info = self.get_vram_info()
if info:
current_vram = info.used_mb
# Check callbacks
for callback, threshold_mb in self._callbacks:
if abs(current_vram - self._last_vram_mb) >= threshold_mb:
try:
callback(info)
except Exception as e:
logger.error(f"VRAM callback error: {e}")
self._last_vram_mb = current_vram
time.sleep(self._poll_interval_ms / 1000.0)
def on_vram_changed(
self,
callback: Callable[[VRAMInfo], None],
threshold_mb: int = 100
) -> None:
"""
Register callback for VRAM changes.
Args:
callback: Function to call when VRAM changes
threshold_mb: Minimum change in MB to trigger callback
"""
self._callbacks.append((callback, threshold_mb))
# =========================================================================
# Cleanup
# =========================================================================
def shutdown(self) -> None:
"""Shutdown the VRAM monitor."""
self.stop_monitoring()
if self._nvml_initialized:
try:
pynvml.nvmlShutdown()
except Exception:
pass
self._nvml_initialized = False
logger.info("VRAM monitor shutdown")
def __del__(self):
"""Cleanup on deletion."""
try:
self.shutdown()
except Exception:
pass
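
Finally, a sketch of threshold-based monitoring; the 500 MB threshold and the 10-second observation window are illustrative values, not module defaults:

# Sketch only: watch for VRAM swings, e.g. around a VLM load/unload.
import time

from core.gpu.vram_monitor import VRAMInfo, VRAMMonitor

def on_change(info: VRAMInfo) -> None:
    print(f"VRAM {info.used_mb}/{info.total_mb} MB on {info.gpu_name}")

monitor = VRAMMonitor(gpu_index=0, poll_interval_ms=1000)
if monitor.is_gpu_available():
    monitor.on_vram_changed(on_change, threshold_mb=500)
    monitor.start_monitoring()
    time.sleep(10)  # observe for a while
    monitor.shutdown()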