v1.0 - Stable release: multi-PC, UI-DETR-1 detection, 3 execution modes

- Frontend v4 reachable on the local network (192.168.1.40)
- Open ports: 3002 (frontend), 5001 (backend), 5004 (dashboard)
- Ollama running on GPU
- Interactive self-healing
- Confidence dashboard

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: Dom
Date: 2026-01-29 11:23:51 +01:00
parent 21bfa3b337
commit a27b74cf22
1595 changed files with 412691 additions and 400 deletions

core/gpu/__init__.py · Normal file · 40 lines

@@ -0,0 +1,40 @@
"""
GPU Resource Management Module for RPA Vision V3
This module provides dynamic GPU resource allocation between ML models:
- Ollama VLM (qwen3-vl:8b) for UI classification
- CLIP (ViT-B-32) for embedding matching
The GPUResourceManager optimizes VRAM usage by:
- Unloading VLM in autopilot mode
- Migrating CLIP to GPU when VRAM is available
- Managing idle timeouts for automatic resource cleanup
"""
from .gpu_resource_manager import (
GPUResourceManager,
ExecutionMode,
ModelState,
GPUResourceConfig,
GPUResourceStatus,
VRAMInfo,
ResourceChangedEvent,
get_gpu_resource_manager,
)
from .ollama_manager import OllamaManager
from .vram_monitor import VRAMMonitor
from .clip_manager import CLIPManager
__all__ = [
"GPUResourceManager",
"ExecutionMode",
"ModelState",
"GPUResourceConfig",
"GPUResourceStatus",
"VRAMInfo",
"ResourceChangedEvent",
"get_gpu_resource_manager",
"OllamaManager",
"VRAMMonitor",
"CLIPManager",
]
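
The package façade above is the only import surface the rest of the codebase needs. A minimal sketch of the intended call flow, assuming core.gpu is importable and its dependencies (torch, aiohttp) are installed:

# Sketch only: drive the public API exported above.
import asyncio

from core.gpu import ExecutionMode, get_gpu_resource_manager

async def main() -> None:
    manager = get_gpu_resource_manager()
    # Recording: CLIP moves to CPU, then the VLM is loaded.
    await manager.set_execution_mode(ExecutionMode.RECORDING)
    # Autopilot: the VLM is unloaded, CLIP migrates to GPU if VRAM allows.
    await manager.set_execution_mode(ExecutionMode.AUTOPILOT)
    print(manager.get_status())

asyncio.run(main())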

core/gpu/clip_manager.py · Normal file · 248 lines

@@ -0,0 +1,248 @@
"""
CLIP Manager - Manages CLIP model device migration
Handles:
- CPU/GPU device migration for CLIP model
- Pipeline reinitialization after device change
- Graceful fallback on migration failures
"""
import asyncio
import logging
from typing import Any, Optional
import torch
logger = logging.getLogger(__name__)
class CLIPManager:
"""
Manages CLIP model device migration between CPU and GPU.
Coordinates with the embedding pipeline to ensure consistent
device usage after migration.
Example:
>>> manager = CLIPManager()
>>> await manager.migrate_to_device("cuda")
>>> device = manager.get_current_device()
"""
def __init__(self, model_name: str = "ViT-B-32"):
"""
Initialize CLIPManager.
Args:
model_name: CLIP model variant to manage
"""
self._model_name = model_name
self._current_device = "cpu"
self._model: Optional[Any] = None
self._preprocess: Optional[Any] = None
self._initialized = False
# Check CUDA availability
self._cuda_available = torch.cuda.is_available()
if not self._cuda_available:
logger.warning("CUDA not available, CLIP will stay on CPU")
def get_current_device(self) -> str:
"""
Get the current device for CLIP model.
Returns:
"cpu" or "cuda"
"""
return self._current_device
def is_cuda_available(self) -> bool:
"""Check if CUDA is available for GPU migration."""
return self._cuda_available
async def migrate_to_device(self, device: str) -> bool:
"""
Migrate CLIP model to specified device.
Args:
device: Target device ("cpu" or "cuda")
Returns:
True if migration successful
"""
if device not in ["cpu", "cuda"]:
logger.error(f"Invalid device: {device}")
return False
if device == self._current_device:
logger.debug(f"CLIP already on {device}")
return True
if device == "cuda" and not self._cuda_available:
logger.warning("Cannot migrate to CUDA: not available")
return False
logger.info(f"Migrating CLIP from {self._current_device} to {device}")
try:
# Run migration in an executor so the event loop is not blocked
loop = asyncio.get_running_loop()
success = await loop.run_in_executor(
None,
self._do_migration,
device
)
if success:
self._current_device = device
logger.info(f"CLIP migrated to {device}")
return True
except Exception as e:
logger.error(f"CLIP migration failed: {e}")
return False
def _do_migration(self, device: str) -> bool:
"""
Perform the actual device migration (blocking).
Args:
device: Target device
Returns:
True if successful
"""
try:
# If model is loaded, move it
if self._model is not None:
self._model = self._model.to(device)
logger.debug(f"Moved existing model to {device}")
# Reinitialize pipeline with new device
self.reinitialize_pipeline(device)
return True
except Exception as e:
logger.error(f"Migration error: {e}")
return False
def reinitialize_pipeline(self, device: Optional[str] = None) -> None:
"""
Reinitialize the embedding pipeline with current/specified device.
Args:
device: Device to use (uses current if None)
"""
device = device or self._current_device
try:
# Try to notify FusionEngine about device change
self._notify_fusion_engine(device)
logger.debug(f"Pipeline reinitialized for {device}")
except Exception as e:
logger.warning(f"Pipeline reinitialization warning: {e}")
def _notify_fusion_engine(self, device: str) -> None:
"""
Notify FusionEngine about device change.
This allows the embedding system to update its device configuration.
"""
try:
# Soft dependency: the import only verifies that the embedding stack
# is present; no device update is pushed here yet. If the module is
# absent we simply continue on the current pipeline.
from core.embedding.fusion_engine import FusionEngine  # noqa: F401
except ImportError:
pass  # FusionEngine not available, that's OK
def get_model(self) -> Optional[Any]:
"""
Get the CLIP model instance.
Returns:
CLIP model or None if not loaded
"""
return self._model
def load_model(self) -> bool:
"""
Load the CLIP model on current device.
Returns:
True if loaded successfully
"""
try:
import open_clip
model, _, preprocess = open_clip.create_model_and_transforms(
self._model_name,
pretrained='openai',
device=self._current_device
)
self._model = model
self._preprocess = preprocess
self._initialized = True
logger.info(f"CLIP model {self._model_name} loaded on {self._current_device}")
return True
except Exception as e:
logger.error(f"Failed to load CLIP model: {e}")
return False
def unload_model(self) -> None:
"""Unload the CLIP model to free memory."""
if self._model is not None:
del self._model
self._model = None
self._preprocess = None
self._initialized = False
# Force garbage collection
import gc
gc.collect()
if self._cuda_available:
torch.cuda.empty_cache()
logger.info("CLIP model unloaded")
def encode_image(self, image) -> Optional[Any]:
"""
Encode an image using CLIP.
Args:
image: PIL Image or tensor
Returns:
Image embedding or None on error
"""
if not self._initialized or self._model is None:
if not self.load_model():
return None
try:
with torch.no_grad():
if self._preprocess:
image_tensor = self._preprocess(image).unsqueeze(0)
else:
image_tensor = image
image_tensor = image_tensor.to(self._current_device)
embedding = self._model.encode_image(image_tensor)
return embedding.cpu().numpy()
except Exception as e:
logger.error(f"Image encoding error: {e}")
return None
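
For reference, a sketch of a typical CLIPManager lifecycle. The image file name is hypothetical, and open_clip plus Pillow are assumed to be installed; encode_image lazy-loads the model on first use, as implemented above:

# Sketch only: migrate, encode, unload.
import asyncio

from PIL import Image

from core.gpu.clip_manager import CLIPManager

async def demo() -> None:
    manager = CLIPManager(model_name="ViT-B-32")
    if manager.is_cuda_available():
        await manager.migrate_to_device("cuda")
    # "button.png" is a hypothetical screenshot crop.
    embedding = manager.encode_image(Image.open("button.png"))
    if embedding is not None:
        print(embedding.shape)  # (1, 512) for ViT-B-32
    manager.unload_model()

asyncio.run(demo())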

core/gpu/gpu_resource_manager.py · Normal file · 614 lines

@@ -0,0 +1,614 @@
"""
GPU Resource Manager - Central orchestrator for GPU resource allocation
Manages dynamic allocation of GPU resources between:
- Ollama VLM (qwen3-vl:8b) - ~10.5 GB VRAM for UI classification
- CLIP (ViT-B-32) - ~500 MB VRAM for embedding matching
Optimizes VRAM usage based on execution mode:
- RECORDING: VLM loaded, CLIP on CPU
- AUTOPILOT: VLM unloaded, CLIP on GPU
- IDLE: No automatic changes
"""
import asyncio
import logging
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Callable, Dict, List, Optional
logger = logging.getLogger(__name__)
class ExecutionMode(str, Enum):
"""Execution modes for the RPA system."""
IDLE = "idle"
RECORDING = "recording"
AUTOPILOT = "autopilot"
class ModelState(str, Enum):
"""State of a model in the GPU resource manager."""
UNLOADED = "unloaded"
LOADING = "loading"
LOADED = "loaded"
UNLOADING = "unloading"
ERROR = "error"
@dataclass
class VRAMInfo:
"""Information about VRAM usage."""
total_mb: int
used_mb: int
free_mb: int
gpu_name: str
gpu_utilization_percent: int
@dataclass
class GPUResourceConfig:
"""Configuration for GPU resource management."""
ollama_endpoint: str = "http://localhost:11434"
vlm_model: str = "qwen3-vl:8b"
clip_model: str = "ViT-B-32"
idle_timeout_seconds: int = 300 # 5 minutes
vram_threshold_for_clip_gpu_mb: int = 1024 # 1 GB
max_load_retries: int = 3
load_timeout_seconds: int = 30
unload_timeout_seconds: int = 5
@dataclass
class GPUResourceStatus:
"""Current status of GPU resources."""
execution_mode: ExecutionMode
vlm_state: ModelState
vlm_model: str
clip_device: str
vram: Optional[VRAMInfo]
idle_timeout_seconds: int
last_vlm_request: Optional[datetime]
degraded_mode: bool
degraded_reason: Optional[str]
@dataclass
class ResourceChangedEvent:
"""Event emitted when GPU resources change."""
timestamp: datetime
event_type: str # "vram_changed", "model_loaded", "model_unloaded", "device_changed"
details: Dict[str, Any] = field(default_factory=dict)
class GPUResourceManager:
"""
Central manager for GPU resource allocation.
Singleton pattern ensures only one instance manages GPU resources.
Example:
>>> manager = get_gpu_resource_manager()
>>> await manager.set_execution_mode(ExecutionMode.AUTOPILOT)
>>> status = manager.get_status()
"""
_instance: Optional["GPUResourceManager"] = None
_lock = threading.Lock()
def __new__(cls, config: Optional[GPUResourceConfig] = None):
with cls._lock:
if cls._instance is None:
cls._instance = super().__new__(cls)
cls._instance._initialized = False
return cls._instance
def __init__(self, config: Optional[GPUResourceConfig] = None):
if self._initialized:
return
self._config = config or GPUResourceConfig()
self._execution_mode = ExecutionMode.IDLE
self._vlm_state = ModelState.UNLOADED
self._clip_device = "cpu"
self._last_vlm_request: Optional[datetime] = None
self._degraded_mode = False
self._degraded_reason: Optional[str] = None
# Managers (lazy initialized)
self._ollama_manager: Optional[Any] = None
self._vram_monitor: Optional[Any] = None
self._clip_manager: Optional[Any] = None
# Operation queue reserved for future sequential processing; today the
# lock below is the only serialization mechanism actually used
self._operation_queue: asyncio.Queue = asyncio.Queue()
self._operation_lock = asyncio.Lock()
# Event callbacks
self._on_resource_changed: List[Callable[[ResourceChangedEvent], None]] = []
self._on_mode_changed: List[Callable[[ExecutionMode], None]] = []
self._on_idle_unload: List[Callable[[], None]] = []
# Idle timeout management
self._idle_timer: Optional[threading.Timer] = None
self._idle_check_running = False
self._initialized = True
logger.info(f"GPUResourceManager initialized with config: {self._config}")
# =========================================================================
# Lazy initialization of managers
# =========================================================================
def _get_ollama_manager(self):
"""Lazy load OllamaManager."""
if self._ollama_manager is None:
from .ollama_manager import OllamaManager
self._ollama_manager = OllamaManager(
endpoint=self._config.ollama_endpoint,
model=self._config.vlm_model
)
return self._ollama_manager
def _get_vram_monitor(self):
"""Lazy load VRAMMonitor."""
if self._vram_monitor is None:
from .vram_monitor import VRAMMonitor
self._vram_monitor = VRAMMonitor()
return self._vram_monitor
def _get_clip_manager(self):
"""Lazy load CLIPManager."""
if self._clip_manager is None:
from .clip_manager import CLIPManager
self._clip_manager = CLIPManager(model_name=self._config.clip_model)
return self._clip_manager
# =========================================================================
# Mode Management
# =========================================================================
async def set_execution_mode(self, mode: ExecutionMode) -> None:
"""
Set the execution mode and adjust GPU resources accordingly.
Args:
mode: Target execution mode
"""
if mode == self._execution_mode:
logger.debug(f"Already in {mode.value} mode")
return
old_mode = self._execution_mode
logger.info(f"Transitioning from {old_mode.value} to {mode.value}")
# Note: ensure_vlm_loaded/ensure_vlm_unloaded acquire _operation_lock
# themselves; asyncio.Lock is not reentrant, so holding the lock around
# these calls would deadlock the transition.
if mode == ExecutionMode.AUTOPILOT:
# Unload VLM, then migrate CLIP to GPU
await self.ensure_vlm_unloaded()
await self._try_migrate_clip_to_gpu()
elif mode == ExecutionMode.RECORDING:
# Migrate CLIP to CPU first, then load VLM
await self._migrate_clip_to_cpu()
await self.ensure_vlm_loaded()
# IDLE mode: no automatic changes
self._execution_mode = mode
self._emit_mode_changed(mode)
logger.info(f"Mode transition complete: {mode.value}")
def get_execution_mode(self) -> ExecutionMode:
"""Get the current execution mode."""
return self._execution_mode
# =========================================================================
# VLM Management
# =========================================================================
async def ensure_vlm_loaded(self) -> bool:
"""
Ensure VLM is loaded and ready.
Returns:
True if VLM is loaded, False on failure
"""
if self._vlm_state == ModelState.LOADED:
self._update_vlm_request_time()
return True
if self._degraded_mode:
logger.warning("Cannot load VLM in degraded mode")
return False
async with self._operation_lock:
if self._vlm_state == ModelState.LOADED:
self._update_vlm_request_time()
return True
self._vlm_state = ModelState.LOADING
logger.info("Loading VLM model...")
ollama = self._get_ollama_manager()
retries = 0
while retries < self._config.max_load_retries:
try:
success = await asyncio.wait_for(
ollama.load_model(),
timeout=self._config.load_timeout_seconds
)
if success:
self._vlm_state = ModelState.LOADED
self._update_vlm_request_time()
self._start_idle_timer()
self._emit_resource_changed("model_loaded", {"model": self._config.vlm_model})
logger.info("VLM model loaded successfully")
return True
except asyncio.TimeoutError:
logger.warning(f"VLM load timeout (attempt {retries + 1})")
except Exception as e:
logger.error(f"VLM load error: {e}")
retries += 1
if retries < self._config.max_load_retries:
await asyncio.sleep(1)
self._vlm_state = ModelState.ERROR
self._set_degraded_mode(True, "VLM load failed after retries")
logger.error("Failed to load VLM after all retries")
return False
async def ensure_vlm_unloaded(self) -> bool:
"""
Ensure VLM is unloaded.
Returns:
True if VLM is unloaded, False on failure
"""
if self._vlm_state == ModelState.UNLOADED:
return True
async with self._operation_lock:
if self._vlm_state == ModelState.UNLOADED:
return True
self._stop_idle_timer()
self._vlm_state = ModelState.UNLOADING
logger.info("Unloading VLM model...")
# Get VRAM before unload for verification
vram_before = self._get_vram_usage_mb()
ollama = self._get_ollama_manager()
try:
success = await asyncio.wait_for(
ollama.unload_model(),
timeout=self._config.unload_timeout_seconds
)
if success:
self._vlm_state = ModelState.UNLOADED
# Verify VRAM decrease
await asyncio.sleep(0.5) # Wait for VRAM to settle
vram_after = self._get_vram_usage_mb()
vram_freed = vram_before - vram_after
self._emit_resource_changed("model_unloaded", {
"model": self._config.vlm_model,
"vram_freed_mb": vram_freed
})
logger.info(f"VLM model unloaded, freed {vram_freed} MB VRAM")
return True
except asyncio.TimeoutError:
logger.warning("VLM unload timeout")
except Exception as e:
logger.error(f"VLM unload error: {e}")
self._vlm_state = ModelState.ERROR
return False
def is_vlm_loaded(self) -> bool:
"""Check if VLM is currently loaded."""
return self._vlm_state == ModelState.LOADED
def get_vlm_state(self) -> ModelState:
"""Get the current VLM state."""
return self._vlm_state
# =========================================================================
# CLIP Management
# =========================================================================
def get_clip_device(self) -> str:
"""
Get the current CLIP device.
Returns:
"cpu" or "cuda"
"""
return self._clip_device
async def _try_migrate_clip_to_gpu(self) -> bool:
"""Try to migrate CLIP to GPU if VRAM is available."""
vram = self._get_vram_monitor().get_vram_info()
if vram is None:
logger.warning("Cannot get VRAM info, keeping CLIP on CPU")
return False
if vram.free_mb < self._config.vram_threshold_for_clip_gpu_mb:
logger.info(f"Insufficient VRAM ({vram.free_mb} MB), keeping CLIP on CPU")
return False
return await self.migrate_clip_to_gpu()
async def migrate_clip_to_gpu(self) -> bool:
"""
Migrate CLIP model to GPU.
Returns:
True if migration successful
"""
if self._clip_device == "cuda":
return True
try:
clip_manager = self._get_clip_manager()
success = await clip_manager.migrate_to_device("cuda")
if success:
self._clip_device = "cuda"
self._emit_resource_changed("device_changed", {
"model": "clip",
"device": "cuda"
})
logger.info("CLIP migrated to GPU")
return True
except Exception as e:
logger.error(f"CLIP GPU migration failed: {e}")
return False
async def _migrate_clip_to_cpu(self) -> bool:
"""Migrate CLIP model to CPU."""
if self._clip_device == "cpu":
return True
return await self.migrate_clip_to_cpu()
async def migrate_clip_to_cpu(self) -> bool:
"""
Migrate CLIP model to CPU.
Returns:
True if migration successful
"""
if self._clip_device == "cpu":
return True
try:
clip_manager = self._get_clip_manager()
success = await clip_manager.migrate_to_device("cpu")
if success:
self._clip_device = "cpu"
self._emit_resource_changed("device_changed", {
"model": "clip",
"device": "cpu"
})
logger.info("CLIP migrated to CPU")
return True
except Exception as e:
logger.error(f"CLIP CPU migration failed: {e}")
return False
# =========================================================================
# Monitoring
# =========================================================================
def get_status(self) -> GPUResourceStatus:
"""
Get the current GPU resource status.
Returns:
Complete status including VRAM, model states, and mode
"""
vram = self._get_vram_monitor().get_vram_info()
return GPUResourceStatus(
execution_mode=self._execution_mode,
vlm_state=self._vlm_state,
vlm_model=self._config.vlm_model,
clip_device=self._clip_device,
vram=vram,
idle_timeout_seconds=self._config.idle_timeout_seconds,
last_vlm_request=self._last_vlm_request,
degraded_mode=self._degraded_mode,
degraded_reason=self._degraded_reason
)
def get_vram_usage(self) -> Optional[VRAMInfo]:
"""Get current VRAM usage information."""
return self._get_vram_monitor().get_vram_info()
def _get_vram_usage_mb(self) -> int:
"""Get current VRAM usage in MB."""
vram = self._get_vram_monitor().get_vram_info()
return vram.used_mb if vram else 0
# =========================================================================
# Events
# =========================================================================
def on_resource_changed(self, callback: Callable[[ResourceChangedEvent], None]) -> None:
"""Register callback for resource change events."""
self._on_resource_changed.append(callback)
def on_mode_changed(self, callback: Callable[[ExecutionMode], None]) -> None:
"""Register callback for mode change events."""
self._on_mode_changed.append(callback)
def on_idle_unload(self, callback: Callable[[], None]) -> None:
"""Register callback for idle unload events."""
self._on_idle_unload.append(callback)
def _emit_resource_changed(self, event_type: str, details: Dict[str, Any]) -> None:
"""Emit a resource changed event."""
event = ResourceChangedEvent(
timestamp=datetime.now(),
event_type=event_type,
details=details
)
for callback in self._on_resource_changed:
try:
callback(event)
except Exception as e:
logger.error(f"Resource changed callback error: {e}")
def _emit_mode_changed(self, mode: ExecutionMode) -> None:
"""Emit a mode changed event."""
for callback in self._on_mode_changed:
try:
callback(mode)
except Exception as e:
logger.error(f"Mode changed callback error: {e}")
def _emit_idle_unload(self) -> None:
"""Emit an idle unload event."""
for callback in self._on_idle_unload:
try:
callback()
except Exception as e:
logger.error(f"Idle unload callback error: {e}")
# =========================================================================
# Idle Timeout Management
# =========================================================================
def _update_vlm_request_time(self) -> None:
"""Update the last VLM request timestamp."""
self._last_vlm_request = datetime.now()
self._restart_idle_timer()
def _start_idle_timer(self) -> None:
"""Start the idle timeout timer."""
self._stop_idle_timer()
self._idle_timer = threading.Timer(
self._config.idle_timeout_seconds,
self._on_idle_timeout
)
self._idle_timer.daemon = True
self._idle_timer.start()
def _restart_idle_timer(self) -> None:
"""Restart the idle timeout timer."""
if self._vlm_state == ModelState.LOADED:
self._start_idle_timer()
def _stop_idle_timer(self) -> None:
"""Stop the idle timeout timer."""
if self._idle_timer:
self._idle_timer.cancel()
self._idle_timer = None
def _on_idle_timeout(self) -> None:
"""Handle idle timeout - unload VLM."""
if self._vlm_state != ModelState.LOADED:
return
logger.info("Idle timeout reached, unloading VLM")
self._emit_idle_unload()
# Run unload in a new event loop (we're in a timer thread). Caveat:
# _operation_lock binds to the first loop that awaits it, so this path
# assumes the main loop is not holding the lock at timeout time.
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(self.ensure_vlm_unloaded())
loop.close()
except Exception as e:
logger.error(f"Idle unload failed: {e}")
# =========================================================================
# Degraded Mode
# =========================================================================
def _set_degraded_mode(self, degraded: bool, reason: Optional[str] = None) -> None:
"""Set degraded mode status."""
self._degraded_mode = degraded
self._degraded_reason = reason
if degraded:
logger.warning(f"Entering degraded mode: {reason}")
else:
logger.info("Exiting degraded mode")
def is_degraded(self) -> bool:
"""Check if operating in degraded mode."""
return self._degraded_mode
# =========================================================================
# Lifecycle
# =========================================================================
def shutdown(self) -> None:
"""Shutdown the GPU resource manager."""
logger.info("Shutting down GPUResourceManager")
self._stop_idle_timer()
# Unload VLM if loaded
if self._vlm_state == ModelState.LOADED:
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(self.ensure_vlm_unloaded())
loop.close()
except Exception as e:
logger.error(f"Shutdown unload failed: {e}")
logger.info("GPUResourceManager shutdown complete")
@classmethod
def reset_instance(cls) -> None:
"""Reset the singleton instance (for testing)."""
with cls._lock:
if cls._instance:
cls._instance.shutdown()
cls._instance = None
# =============================================================================
# Factory function
# =============================================================================
_manager_instance: Optional[GPUResourceManager] = None
def get_gpu_resource_manager(config: Optional[GPUResourceConfig] = None) -> GPUResourceManager:
"""
Get the GPU resource manager singleton.
Args:
config: Optional configuration (only used on first call)
Returns:
GPUResourceManager instance
"""
global _manager_instance
if _manager_instance is None:
_manager_instance = GPUResourceManager(config)
return _manager_instance
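
A sketch of how a caller might observe the transitions this manager emits, using only the hooks defined above; model names and VRAM thresholds come from the GPUResourceConfig defaults:

# Sketch only: subscribe to resource and mode events.
import asyncio

from core.gpu import (
    ExecutionMode,
    ResourceChangedEvent,
    get_gpu_resource_manager,
)

def log_change(event: ResourceChangedEvent) -> None:
    print(f"{event.timestamp:%H:%M:%S} {event.event_type}: {event.details}")

async def demo() -> None:
    manager = get_gpu_resource_manager()
    manager.on_resource_changed(log_change)
    manager.on_mode_changed(lambda mode: print(f"mode -> {mode.value}"))
    await manager.set_execution_mode(ExecutionMode.AUTOPILOT)

asyncio.run(demo())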

core/gpu/ollama_manager.py · Normal file · 265 lines

@@ -0,0 +1,265 @@
"""
Ollama Manager - Manages VLM model lifecycle via Ollama API
Handles:
- Loading/unloading models to/from VRAM
- Health checks and availability detection
- Keep-alive management for model persistence
"""
import asyncio
import logging
from typing import List, Optional
import aiohttp
logger = logging.getLogger(__name__)
class OllamaManager:
"""
Manages Ollama VLM model lifecycle.
Uses Ollama's REST API to control model loading/unloading.
Example:
>>> manager = OllamaManager()
>>> await manager.load_model()
>>> is_loaded = await manager.is_model_loaded()
>>> await manager.unload_model()
"""
def __init__(
self,
endpoint: str = "http://localhost:11434",
model: str = "qwen3-vl:8b",
default_keep_alive: str = "5m"
):
"""
Initialize OllamaManager.
Args:
endpoint: Ollama API endpoint
model: Model name to manage
default_keep_alive: Default keep-alive duration
"""
self._endpoint = endpoint.rstrip("/")
self._model = model
self._default_keep_alive = default_keep_alive
self._session: Optional[aiohttp.ClientSession] = None
async def _get_session(self) -> aiohttp.ClientSession:
"""Get or create aiohttp session."""
if self._session is None or self._session.closed:
self._session = aiohttp.ClientSession(
timeout=aiohttp.ClientTimeout(total=60)
)
return self._session
async def close(self) -> None:
"""Close the HTTP session."""
if self._session and not self._session.closed:
await self._session.close()
# =========================================================================
# Health Check
# =========================================================================
def is_available(self) -> bool:
"""
Check if Ollama service is available (synchronous).
Returns:
True if Ollama is reachable
"""
import requests
try:
response = requests.get(f"{self._endpoint}/api/tags", timeout=5)
return response.status_code == 200
except Exception:
return False
async def is_available_async(self) -> bool:
"""
Check if Ollama service is available (async).
Returns:
True if Ollama is reachable
"""
try:
session = await self._get_session()
async with session.get(f"{self._endpoint}/api/tags") as response:
return response.status == 200
except Exception:
return False
# =========================================================================
# Model Management
# =========================================================================
async def load_model(self, keep_alive: Optional[str] = None) -> bool:
"""
Load the model into VRAM.
Uses a minimal generate request to trigger model loading.
Args:
keep_alive: How long to keep model loaded (e.g., "5m", "1h")
Returns:
True if model loaded successfully
"""
keep_alive = keep_alive or self._default_keep_alive
try:
session = await self._get_session()
# Send a minimal request to load the model
# For Qwen3 models, prepend /nothink to disable the thinking mode
prompt = "/nothink " if "qwen" in self._model.lower() else ""
payload = {
"model": self._model,
"prompt": prompt,
"keep_alive": keep_alive,
"stream": False,
"options": {
"temperature": 0.0, # Déterministe pour la classification
"top_k": 1 # Plus rapide pour les tâches de classification
}
}
logger.debug(f"Loading model {self._model} with keep_alive={keep_alive}")
async with session.post(
f"{self._endpoint}/api/generate",
json=payload
) as response:
if response.status == 200:
logger.info(f"Model {self._model} loaded successfully")
return True
else:
text = await response.text()
logger.error(f"Failed to load model: {response.status} - {text}")
return False
except asyncio.TimeoutError:
logger.error("Timeout loading model")
return False
except Exception as e:
logger.error(f"Error loading model: {e}")
return False
async def unload_model(self) -> bool:
"""
Unload the model from VRAM.
Sets keep_alive to 0 to trigger immediate unload.
Returns:
True if model unloaded successfully
"""
try:
session = await self._get_session()
# Send request with keep_alive=0 to unload
payload = {
"model": self._model,
"prompt": "",
"keep_alive": 0,
"stream": False
}
logger.debug(f"Unloading model {self._model}")
async with session.post(
f"{self._endpoint}/api/generate",
json=payload
) as response:
if response.status == 200:
logger.info(f"Model {self._model} unloaded successfully")
return True
else:
text = await response.text()
logger.error(f"Failed to unload model: {response.status} - {text}")
return False
except asyncio.TimeoutError:
logger.error("Timeout unloading model")
return False
except Exception as e:
logger.error(f"Error unloading model: {e}")
return False
async def is_model_loaded(self) -> bool:
"""
Check if the model is currently loaded in VRAM.
Returns:
True if model is loaded
"""
try:
session = await self._get_session()
async with session.get(f"{self._endpoint}/api/ps") as response:
if response.status == 200:
data = await response.json()
models = data.get("models", [])
for model_info in models:
if model_info.get("name", "").startswith(self._model.split(":")[0]):
return True
return False
else:
logger.warning(f"Failed to check loaded models: {response.status}")
return False
except Exception as e:
logger.error(f"Error checking loaded models: {e}")
return False
async def list_loaded_models(self) -> List[str]:
"""
List all currently loaded models.
Returns:
List of loaded model names
"""
try:
session = await self._get_session()
async with session.get(f"{self._endpoint}/api/ps") as response:
if response.status == 200:
data = await response.json()
models = data.get("models", [])
return [m.get("name", "") for m in models]
else:
return []
except Exception as e:
logger.error(f"Error listing loaded models: {e}")
return []
async def list_available_models(self) -> List[str]:
"""
List all available models (downloaded).
Returns:
List of available model names
"""
try:
session = await self._get_session()
async with session.get(f"{self._endpoint}/api/tags") as response:
if response.status == 200:
data = await response.json()
models = data.get("models", [])
return [m.get("name", "") for m in models]
else:
return []
except Exception as e:
logger.error(f"Error listing available models: {e}")
return []
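
To make the keep_alive contract concrete, a sketch of a full availability check, load, inspect, unload cycle against a local Ollama daemon; the endpoint and model name are the module defaults:

# Sketch only: explicit VLM load/unload cycle.
import asyncio

from core.gpu.ollama_manager import OllamaManager

async def demo() -> None:
    manager = OllamaManager(endpoint="http://localhost:11434", model="qwen3-vl:8b")
    if not await manager.is_available_async():
        await manager.close()
        print("Ollama is not reachable")
        return
    await manager.load_model(keep_alive="10m")  # warm the model into VRAM
    print(await manager.list_loaded_models())
    await manager.unload_model()  # keep_alive=0 evicts it immediately
    await manager.close()

asyncio.run(demo())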

core/gpu/vram_monitor.py · Normal file · 292 lines

@@ -0,0 +1,292 @@
"""
VRAM Monitor - Monitors GPU VRAM usage
Uses pynvml (NVIDIA Management Library) to query VRAM.
Falls back gracefully on systems without NVIDIA GPU.
"""
import logging
import subprocess
import threading
import time
from typing import Callable, List, Optional
logger = logging.getLogger(__name__)
# Try to import pynvml
try:
import pynvml
PYNVML_AVAILABLE = True
except ImportError:
PYNVML_AVAILABLE = False
logger.warning("pynvml not available, VRAM monitoring will use nvidia-smi fallback")
class VRAMInfo:
"""
Information about VRAM usage.
Note: mirrors the VRAMInfo dataclass exported from core.gpu
(gpu_resource_manager); both carry the same five fields.
"""
def __init__(
self,
total_mb: int,
used_mb: int,
free_mb: int,
gpu_name: str,
gpu_utilization_percent: int
):
self.total_mb = total_mb
self.used_mb = used_mb
self.free_mb = free_mb
self.gpu_name = gpu_name
self.gpu_utilization_percent = gpu_utilization_percent
def __repr__(self) -> str:
return (
f"VRAMInfo(used={self.used_mb}MB, free={self.free_mb}MB, "
f"total={self.total_mb}MB, gpu={self.gpu_name})"
)
class VRAMMonitor:
"""
Monitors GPU VRAM usage.
Uses pynvml for efficient queries, falls back to nvidia-smi.
Example:
>>> monitor = VRAMMonitor()
>>> info = monitor.get_vram_info()
>>> print(f"Free VRAM: {info.free_mb} MB")
"""
def __init__(self, gpu_index: int = 0, poll_interval_ms: int = 1000):
"""
Initialize VRAM monitor.
Args:
gpu_index: GPU index to monitor (default 0)
poll_interval_ms: Polling interval for continuous monitoring
"""
self._gpu_index = gpu_index
self._poll_interval_ms = poll_interval_ms
self._nvml_initialized = False
self._gpu_available = False
self._handle = None
# Monitoring state
self._monitoring = False
self._monitor_thread: Optional[threading.Thread] = None
self._callbacks: List[tuple] = [] # (callback, threshold_mb)
self._last_vram_mb = 0
self._initialize()
def _initialize(self) -> None:
"""Initialize NVML if available."""
if PYNVML_AVAILABLE:
try:
pynvml.nvmlInit()
self._nvml_initialized = True
device_count = pynvml.nvmlDeviceGetCount()
if device_count > self._gpu_index:
self._handle = pynvml.nvmlDeviceGetHandleByIndex(self._gpu_index)
self._gpu_available = True
name = pynvml.nvmlDeviceGetName(self._handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
logger.info(f"VRAM monitor initialized for GPU {self._gpu_index}: {name}")
else:
logger.warning(f"GPU index {self._gpu_index} not found (count={device_count})")
except Exception as e:
logger.warning(f"Failed to initialize pynvml: {e}")
self._nvml_initialized = False
# Try nvidia-smi fallback
if not self._gpu_available:
self._gpu_available = self._check_nvidia_smi()
def _check_nvidia_smi(self) -> bool:
"""Check if nvidia-smi is available."""
try:
result = subprocess.run(
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
capture_output=True,
text=True,
timeout=5
)
return result.returncode == 0
except Exception:
return False
def is_gpu_available(self) -> bool:
"""Check if GPU monitoring is available."""
return self._gpu_available
def get_vram_info(self) -> Optional[VRAMInfo]:
"""
Get current VRAM information.
Returns:
VRAMInfo or None if GPU not available
"""
if not self._gpu_available:
return None
if self._nvml_initialized and self._handle:
return self._get_vram_pynvml()
else:
return self._get_vram_nvidia_smi()
def _get_vram_pynvml(self) -> Optional[VRAMInfo]:
"""Get VRAM info using pynvml."""
try:
memory = pynvml.nvmlDeviceGetMemoryInfo(self._handle)
utilization = pynvml.nvmlDeviceGetUtilizationRates(self._handle)
name = pynvml.nvmlDeviceGetName(self._handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
return VRAMInfo(
total_mb=memory.total // (1024 * 1024),
used_mb=memory.used // (1024 * 1024),
free_mb=memory.free // (1024 * 1024),
gpu_name=name,
gpu_utilization_percent=utilization.gpu
)
except Exception as e:
logger.error(f"pynvml error: {e}")
return None
def _get_vram_nvidia_smi(self) -> Optional[VRAMInfo]:
"""Get VRAM info using nvidia-smi (fallback)."""
try:
result = subprocess.run(
[
"nvidia-smi",
"--query-gpu=name,memory.used,memory.total,utilization.gpu",
"--format=csv,noheader,nounits"
],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
return None
lines = result.stdout.strip().split("\n")
if self._gpu_index >= len(lines):
return None
parts = [p.strip() for p in lines[self._gpu_index].split(",")]
if len(parts) < 4:
return None
name = parts[0]
used_mb = int(parts[1])
total_mb = int(parts[2])
utilization = int(parts[3]) if parts[3].isdigit() else 0
return VRAMInfo(
total_mb=total_mb,
used_mb=used_mb,
free_mb=total_mb - used_mb,
gpu_name=name,
gpu_utilization_percent=utilization
)
except Exception as e:
logger.error(f"nvidia-smi error: {e}")
return None
def get_available_vram_mb(self) -> int:
"""Get available VRAM in MB."""
info = self.get_vram_info()
return info.free_mb if info else 0
# =========================================================================
# Continuous Monitoring
# =========================================================================
def start_monitoring(self) -> None:
"""Start continuous VRAM monitoring."""
if self._monitoring:
return
if not self._gpu_available:
logger.warning("Cannot start monitoring: GPU not available")
return
self._monitoring = True
self._monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
self._monitor_thread.start()
logger.info("VRAM monitoring started")
def stop_monitoring(self) -> None:
"""Stop continuous VRAM monitoring."""
self._monitoring = False
if self._monitor_thread:
self._monitor_thread.join(timeout=2)
self._monitor_thread = None
logger.info("VRAM monitoring stopped")
def _monitor_loop(self) -> None:
"""Monitoring loop running in background thread."""
while self._monitoring:
info = self.get_vram_info()
if info:
current_vram = info.used_mb
# Check callbacks
for callback, threshold_mb in self._callbacks:
if abs(current_vram - self._last_vram_mb) >= threshold_mb:
try:
callback(info)
except Exception as e:
logger.error(f"VRAM callback error: {e}")
self._last_vram_mb = current_vram
time.sleep(self._poll_interval_ms / 1000.0)
def on_vram_changed(
self,
callback: Callable[[VRAMInfo], None],
threshold_mb: int = 100
) -> None:
"""
Register callback for VRAM changes.
Args:
callback: Function to call when VRAM changes
threshold_mb: Minimum change in MB to trigger callback
"""
self._callbacks.append((callback, threshold_mb))
# =========================================================================
# Cleanup
# =========================================================================
def shutdown(self) -> None:
"""Shutdown the VRAM monitor."""
self.stop_monitoring()
if self._nvml_initialized:
try:
pynvml.nvmlShutdown()
except Exception:
pass
self._nvml_initialized = False
logger.info("VRAM monitor shutdown")
def __del__(self):
"""Cleanup on deletion."""
try:
self.shutdown()
except Exception:
pass
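
Finally, a sketch of threshold-based monitoring; the 500 MB threshold and the 10-second observation window are illustrative values, not module defaults:

# Sketch only: watch for VRAM swings, e.g. around a VLM load/unload.
import time

from core.gpu.vram_monitor import VRAMInfo, VRAMMonitor

def on_change(info: VRAMInfo) -> None:
    print(f"VRAM {info.used_mb}/{info.total_mb} MB on {info.gpu_name}")

monitor = VRAMMonitor(gpu_index=0, poll_interval_ms=1000)
if monitor.is_gpu_available():
    monitor.on_vram_changed(on_change, threshold_mb=500)
    monitor.start_monitoring()
    time.sleep(10)  # observe for a while
    monitor.shutdown()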