chore: sauvegarde complète avant factorisation executor

Point de sauvegarde incluant les fichiers non committés des sessions précédentes (systemd, docs, agents, GPU manager).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-20 17:03:44 +02:00
parent 623be15bfe
commit 447fbb2c6e
1869 changed files with 791438 additions and 324 deletions
--- a/core/gpu/__init__.py
+++ b/core/gpu/__init__.py
@@ -2,7 +2,7 @@
 GPU Resource Management Module for RPA Vision V3

 This module provides dynamic GPU resource allocation between ML models:
-- Ollama VLM (qwen3-vl:8b) for UI classification
+- Ollama VLM (gemma4:e4b par défaut, configurable via RPA_VLM_MODEL) for UI classification
 - CLIP (ViT-B-32) for embedding matching

 The GPUResourceManager optimizes VRAM usage by:
--- a/core/gpu/gpu_resource_manager.py
+++ b/core/gpu/gpu_resource_manager.py
@@ -2,7 +2,7 @@
 GPU Resource Manager - Central orchestrator for GPU resource allocation

 Manages dynamic allocation of GPU resources between:
-- Ollama VLM (qwen3-vl:8b) - ~10.5 GB VRAM for UI classification
+- Ollama VLM (gemma4:e4b par défaut) - ~10 GB VRAM for UI classification
 - CLIP (ViT-B-32) - ~500 MB VRAM for embedding matching

 Optimizes VRAM usage based on execution mode:
@@ -12,13 +12,14 @@ Optimizes VRAM usage based on execution mode:
 """

 import asyncio
+import contextlib
 import logging
 import threading
 import time
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, Iterator, List, Optional

 logger = logging.getLogger(__name__)

@@ -53,7 +54,7 @@ class VRAMInfo:
 class GPUResourceConfig:
    """Configuration for GPU resource management."""
    ollama_endpoint: str = "http://localhost:11434"
-    vlm_model: str = "qwen3-vl:8b"
+    vlm_model: str = "gemma4:e4b"
    clip_model: str = "ViT-B-32"
    idle_timeout_seconds: int = 300  # 5 minutes
    vram_threshold_for_clip_gpu_mb: int = 1024  # 1 GB
@@ -126,6 +127,12 @@ class GPUResourceManager:
        # Operation queue for sequential processing
        self._operation_queue: asyncio.Queue = asyncio.Queue()
        self._operation_lock = asyncio.Lock()
+
+        # Lock d'inférence synchrone : sérialise les appels GPU concurrents
+        # (ScreenAnalyzer.analyze, UIDetector, CLIP.encode) entre
+        # ExecutionLoop et stream_processor pour éviter la saturation VRAM
+        # sur RTX 5070 (12 Go). Un seul analyze à la fois sur le GPU.
+        self._inference_lock = threading.Lock()
        
        # Event callbacks
        self._on_resource_changed: List[Callable[[ResourceChangedEvent], None]] = []
@@ -207,7 +214,45 @@ class GPUResourceManager:
    def get_execution_mode(self) -> ExecutionMode:
        """Get the current execution mode."""
        return self._execution_mode
-    
+
+    # =========================================================================
+    # Inference serialization (sync)
+    # =========================================================================
+
+    @contextlib.contextmanager
+    def acquire_inference(self, timeout: Optional[float] = None) -> Iterator[bool]:
+        """
+        Synchronous context manager that serializes GPU inference calls.
+
+        Callers that wrap inference (ScreenAnalyzer.analyze, UIDetector.detect,
+        CLIP.encode, ...) in this manager run one at a time, to avoid VRAM
+        saturation when ExecutionLoop and stream_processor call analyze()
+        concurrently (RTX 5070, 12 GB). On timeout the body still runs unlocked.
+
+        Args:
+            timeout: Max wait in seconds. None blocks until the lock is free.
+
+        Yields:
+            True if the lock was acquired, False if the wait timed out.
+
+        Example:
+            >>> with gpu_manager.acquire_inference(timeout=30.0) as acquired:
+            ...     if not acquired:
+            ...         logger.warning("GPU lock timeout")
+            ...     state = analyzer.analyze(path)
+        """
+        if timeout is None:
+            self._inference_lock.acquire()
+            acquired = True  # an untimed blocking acquire only returns once held
+        else:
+            acquired = self._inference_lock.acquire(timeout=timeout)
+
+        try:
+            yield acquired  # yielded unconditionally: caller decides what to do on False
+        finally:
+            if acquired:  # only release a lock this call actually took
+                self._inference_lock.release()
+
    # =========================================================================
    # VLM Management
    # =========================================================================
--- a/core/gpu/ollama_manager.py
+++ b/core/gpu/ollama_manager.py
@@ -32,7 +32,7 @@ class OllamaManager:
    def __init__(
        self,
        endpoint: str = "http://localhost:11434",
-        model: str = "qwen3-vl:8b",
+        model: str = "gemma4:e4b",
        default_keep_alive: str = "5m"
    ):
        """