feat(cognition): orchestrateur VRAM + VLM 7b par défaut

VRAMOrchestrator : bascule automatique entre les modes SHADOW et REPLAY.
- SHADOW : streaming server + agent_chat actifs
- REPLAY : VLM qwen2.5vl:7b chargé, services non-essentiels stoppés

vlm_reason_about_screen() appelle ensure_reasoning_ready() avant chaque raisonnement — libère la VRAM si nécessaire.

Benchmark : qwen2.5vl:7b en 10s (warm) vs 44s quand la VRAM est saturée.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-20 22:13:29 +02:00
parent cbe8dc95d2
commit 5da4581e76
2 changed files with 196 additions and 1 deletions
--- a/core/cognition/vram_orchestrator.py
+++ b/core/cognition/vram_orchestrator.py
@@ -0,0 +1,191 @@
 """
 Orchestrateur VRAM — gère le chargement/déchargement des modèles selon le mode.
 Deux modes :
 - SHADOW : streaming server + agent_chat actifs, VLM raisonnement déchargé
 - REPLAY : VLM raisonnement (qwen2.5vl:7b) chargé, services non-essentiels stoppés
 Bascule automatique ou manuelle selon le contexte.
 """
 import logging
 import os
 import subprocess
 import time
 from enum import Enum
 from typing import Optional
 # Module-level logger, named after this module.
 logger = logging.getLogger(__name__)
 # Base URL of the Ollama HTTP API; overridable through the environment.
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
 # Name of the model used for reasoning; overridable through the environment.
 REASONING_MODEL = os.getenv("RPA_REASONING_MODEL", "qwen2.5vl:7b")
 # Minimum free VRAM (in GB) required before loading the reasoning model.
 MIN_VRAM_FOR_REASONING = 5.0
 class VRAMMode(Enum):
    """Operating modes the VRAM orchestrator can switch between."""

    # Streaming server and agent_chat active; reasoning VLM unloaded.
    SHADOW = "shadow"

    # Reasoning VLM loaded; non-essential services stopped.
    REPLAY = "replay"
 class VRAMOrchestrator:
    """Arbitrate GPU memory between the SHADOW and REPLAY operating modes.

    REPLAY frees VRAM (stops agent_chat, restarts Ollama) and preloads the
    reasoning model; SHADOW restarts Ollama and relaunches the services that
    were stopped on the way into REPLAY.
    """

    def __init__(self):
        # None until the first completed switch: the actual state of the
        # machine is not probed at construction time.
        self._current_mode: Optional[VRAMMode] = None
        # (service_name, pid) pairs recorded by switch_to_replay() and
        # consumed by switch_to_shadow().
        self._stopped_services: list = []

    @staticmethod
    def _parse_vram_gb(raw: str) -> float:
        """Convert ``nvidia-smi`` CSV output (MiB values) to a total in GB.

        With ``--format=csv,noheader,nounits`` nvidia-smi prints one value per
        GPU, one per line. The values are summed so that multi-GPU hosts are
        parsed instead of failing; a single-GPU host yields the same figure
        as a plain ``float()`` of the output.

        Raises:
            ValueError: if the output is empty or a line is not numeric.
        """
        readings = [float(line) for line in raw.splitlines() if line.strip()]
        if not readings:
            raise ValueError("no VRAM reading in nvidia-smi output")
        return sum(readings) / 1024

    def _query_vram_gb(self, field: str) -> float:
        """Query one ``nvidia-smi --query-gpu`` field; return 0.0 on failure."""
        try:
            result = subprocess.run(
                ["nvidia-smi", f"--query-gpu={field}", "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5
            )
            return self._parse_vram_gb(result.stdout)
        except (OSError, subprocess.SubprocessError, ValueError) as e:
            # Missing binary, timeout or unparsable output: keep the historical
            # "0.0 means unknown" contract but leave a trace for debugging.
            logger.debug(f"Lecture VRAM ({field}) impossible: {e}")
            return 0.0

    def get_free_vram_gb(self) -> float:
        """Return the free VRAM in GB (0.0 if it cannot be read)."""
        return self._query_vram_gb("memory.free")

    def get_used_vram_gb(self) -> float:
        """Return the used VRAM in GB (0.0 if it cannot be read)."""
        return self._query_vram_gb("memory.used")

    def _stopped_service_names(self) -> list:
        """Return the distinct stopped service names, in first-seen order."""
        return list(dict.fromkeys(name for name, _pid in self._stopped_services))

    def _stop_agent_chat(self) -> None:
        """Send the default ``kill`` signal to every agent_chat process found.

        Each process that was successfully signalled is recorded in
        ``_stopped_services`` so switch_to_shadow() can relaunch the service.
        """
        try:
            result = subprocess.run(
                ["pgrep", "-f", "agent_chat"],
                capture_output=True, text=True, timeout=5
            )
        except (OSError, subprocess.SubprocessError) as e:
            logger.debug(f"Pas d'agent_chat à stopper: {e}")
            return
        # `pgrep -f` matches on the full command line, so it can report the
        # process hosting this orchestrator; never kill ourselves.
        own_pid = str(os.getpid())
        for pid in result.stdout.split():
            if pid == own_pid:
                continue
            try:
                subprocess.run(["kill", pid], timeout=5, check=True)
            except (OSError, subprocess.SubprocessError) as e:
                # Not recorded: a service we failed to stop must not be
                # relaunched later as if we had stopped it.
                logger.warning(f"Impossible de stopper agent_chat (PID {pid}): {e}")
                continue
            self._stopped_services.append(("agent_chat", pid))
            logger.info(f"agent_chat stoppé (PID {pid})")

    def _restart_ollama(self) -> bool:
        """Restart the Ollama systemd unit; return True on success."""
        try:
            subprocess.run(["sudo", "systemctl", "restart", "ollama"],
                         timeout=10, check=True)
        except (OSError, subprocess.SubprocessError) as e:
            logger.warning(f"Impossible de redémarrer Ollama: {e}")
            return False
        # Short pause after the restart before the service is used again.
        time.sleep(2)
        return True

    def _preload_reasoning_model(self) -> None:
        """Ask Ollama for a one-token generation so the model gets loaded.

        Best effort: every failure is logged and swallowed.
        """
        try:
            import requests
            logger.info(f"Préchargement {REASONING_MODEL}...")
            resp = requests.post(f"{OLLAMA_URL}/api/generate", json={
                "model": REASONING_MODEL,
                "prompt": "test",
                "stream": False,
                "options": {"num_predict": 1}
            }, timeout=60)
            if resp.status_code == 200:
                logger.info(f"{REASONING_MODEL} chargé en VRAM")
            else:
                logger.warning(
                    f"Préchargement de {REASONING_MODEL} refusé (HTTP {resp.status_code})"
                )
            free_after = self.get_free_vram_gb()
            logger.info(f"VRAM libre après chargement: {free_after:.1f} Go")
        except Exception as e:
            # Deliberately broad: covers a missing `requests` package as well
            # as any network error; preloading must never abort the switch.
            logger.warning(f"Préchargement échoué: {e}")

    def switch_to_replay(self) -> bool:
        """Switch to REPLAY mode, freeing VRAM for the reasoning VLM.

        1. Stop non-essential services (agent_chat).
        2. Restart Ollama so already-loaded models are released.
        3. Preload the reasoning model (best effort).

        Returns:
            True if already in REPLAY or once the mode has been set; False if
            free VRAM is still below MIN_VRAM_FOR_REASONING after the cleanup
            (the mode is left unchanged in that case).
        """
        if self._current_mode == VRAMMode.REPLAY:
            logger.info("Déjà en mode REPLAY")
            return True
        logger.info("Bascule en mode REPLAY...")
        self._stop_agent_chat()
        if self._restart_ollama():
            logger.info("Ollama redémarré")
        free = self.get_free_vram_gb()
        logger.info(f"VRAM libre: {free:.1f} Go")
        if free < MIN_VRAM_FOR_REASONING:
            logger.warning(f"VRAM insuffisante ({free:.1f} Go < {MIN_VRAM_FOR_REASONING} Go)")
            return False
        self._preload_reasoning_model()
        self._current_mode = VRAMMode.REPLAY
        return True

    def switch_to_shadow(self) -> bool:
        """Switch to SHADOW mode and relaunch the observation services.

        1. Restart Ollama (unloads the reasoning VLM).
        2. Relaunch the services stopped by switch_to_replay().

        Returns:
            Always True; restart and relaunch failures are only logged.
        """
        if self._current_mode == VRAMMode.SHADOW:
            logger.info("Déjà en mode SHADOW")
            return True
        logger.info("Bascule en mode SHADOW...")
        self._restart_ollama()
        # One relaunch per service, even if several of its PIDs were killed.
        for service_name in self._stopped_service_names():
            if service_name != "agent_chat":
                continue
            try:
                # NOTE(review): the working directory is a hard-coded
                # developer path; confirm it is valid on every deployment.
                subprocess.Popen(
                    ["python3", "-m", "agent_chat.app"],
                    cwd="/home/dom/ai/rpa_vision_v3",
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL
                )
            except (OSError, ValueError) as e:
                logger.warning(f"Impossible de relancer {service_name}: {e}")
            else:
                logger.info(f"{service_name} relancé")
        self._stopped_services.clear()
        self._current_mode = VRAMMode.SHADOW
        return True

    def ensure_reasoning_ready(self) -> bool:
        """Make sure there is room for the reasoning VLM, switching if needed.

        Returns True when enough VRAM is already free, otherwise the result
        of switch_to_replay().
        """
        # NOTE(review): an unreadable VRAM value is reported as 0.0 and
        # therefore also triggers a switch attempt.
        if self.get_free_vram_gb() >= MIN_VRAM_FOR_REASONING:
            return True
        return self.switch_to_replay()

    @property
    def current_mode(self) -> Optional[str]:
        """Current mode as its string value, or None before any switch."""
        return self._current_mode.value if self._current_mode else None

    def status(self) -> dict:
        """Return a snapshot of the mode, VRAM usage and stopped services."""
        return {
            "mode": self.current_mode,
            "vram_free_gb": round(self.get_free_vram_gb(), 1),
            "vram_used_gb": round(self.get_used_vram_gb(), 1),
            "reasoning_model": REASONING_MODEL,
            "stopped_services": [s[0] for s in self._stopped_services],
        }
 # Process-wide singleton instance, created on first use.
 _orchestrator: Optional[VRAMOrchestrator] = None
 def get_orchestrator() -> VRAMOrchestrator:
    """Return the shared VRAMOrchestrator, creating it on the first call."""
    global _orchestrator
    instance = _orchestrator
    if instance is None:
        instance = VRAMOrchestrator()
        _orchestrator = instance
    return instance
--- a/core/execution/input_handler.py
+++ b/core/execution/input_handler.py
@@ -286,8 +286,12 @@ Si tu vois un dialogue ou une popup, indique quel bouton cliquer.
 Si l'écran est normal sans action nécessaire, réponds action="nothing".
 Réponds UNIQUEMENT le JSON, pas d'explication."""
        from core.cognition.vram_orchestrator import get_orchestrator
        orch = get_orchestrator()
        orch.ensure_reasoning_ready()
        ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
-        model = os.environ.get("RPA_REASONING_MODEL", os.environ.get("RPA_VLM_MODEL", "qwen2.5vl:3b"))
+        model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")
        response = requests.post(
            f"{ollama_url}/api/generate",