diff --git a/core/cognition/vram_orchestrator.py b/core/cognition/vram_orchestrator.py
new file mode 100644
index 000000000..5e5d5962c
--- /dev/null
+++ b/core/cognition/vram_orchestrator.py
@@ -0,0 +1,191 @@
+"""
+VRAM orchestrator: loads and unloads models depending on the active mode.
+
+Two modes:
+- SHADOW: streaming server + agent_chat running, reasoning VLM unloaded
+- REPLAY: reasoning VLM (qwen2.5vl:7b) loaded, non-essential services stopped
+
+Switching can be automatic or manual, depending on the context.
+"""
+
+import logging
+import os
+import subprocess
+import time
+from enum import Enum
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
+REASONING_MODEL = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")
+MIN_VRAM_FOR_REASONING = 5.0  # minimum free VRAM (GB) needed to load the reasoning model
+
+
+class VRAMMode(Enum):
+    SHADOW = "shadow"
+    REPLAY = "replay"
+
+
+class VRAMOrchestrator:
+    """Manages VRAM to avoid conflicts between models."""
+
+    def __init__(self):
+        self._current_mode: Optional[VRAMMode] = None
+        self._stopped_services: list = []
+
+    def get_free_vram_gb(self) -> float:
+        """Return the free VRAM in GB (first GPU)."""
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits"],
+                capture_output=True, text=True, timeout=5
+            )
+            return float(result.stdout.strip().splitlines()[0]) / 1024
+        except Exception:
+            return 0.0
+
+    def get_used_vram_gb(self) -> float:
+        """Return the used VRAM in GB (first GPU)."""
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
+                capture_output=True, text=True, timeout=5
+            )
+            return float(result.stdout.strip().splitlines()[0]) / 1024
+        except Exception:
+            return 0.0
+
+    def switch_to_replay(self) -> bool:
+        """Switch to replay mode: free VRAM for the reasoning VLM.
+
+        1. Stop non-essential services (agent_chat)
+        2. Restart Ollama to unload the currently loaded models
+        3. Preload the reasoning model
+        """
+        if self._current_mode == VRAMMode.REPLAY:
+            logger.info("Already in REPLAY mode")
+            return True
+
+        logger.info("Switching to REPLAY mode...")
+
+        # Stop agent_chat if it is running
+        try:
+            result = subprocess.run(
+                ["pgrep", "-f", "agent_chat"],
+                capture_output=True, text=True, timeout=5
+            )
+            pids = result.stdout.strip().split('\n')
+            for pid in pids:
+                if pid.strip():
+                    subprocess.run(["kill", pid.strip()], timeout=5)
+                    self._stopped_services.append(("agent_chat", pid.strip()))
+                    logger.info(f"agent_chat stopped (PID {pid.strip()})")
+        except Exception as e:
+            logger.debug(f"No agent_chat to stop: {e}")
+
+        # Restart Ollama to release the loaded models
+        try:
+            subprocess.run(["sudo", "systemctl", "restart", "ollama"],
+                           timeout=10, check=True)
+            time.sleep(2)
+            logger.info("Ollama restarted")
+        except Exception as e:
+            logger.warning(f"Could not restart Ollama: {e}")
+
+        # Check the available VRAM
+        free = self.get_free_vram_gb()
+        logger.info(f"Free VRAM: {free:.1f} GB")
+
+        if free < MIN_VRAM_FOR_REASONING:
+            logger.warning(f"Insufficient VRAM ({free:.1f} GB < {MIN_VRAM_FOR_REASONING} GB)")
+            return False
+
+        # Preload the reasoning model
+        try:
+            import requests
+            logger.info(f"Preloading {REASONING_MODEL}...")
+            resp = requests.post(f"{OLLAMA_URL}/api/generate", json={
+                "model": REASONING_MODEL,
+                "prompt": "test",
+                "stream": False,
+                "options": {"num_predict": 1}
+            }, timeout=60)
+            if resp.status_code == 200:
+                logger.info(f"{REASONING_MODEL} loaded into VRAM")
+                free_after = self.get_free_vram_gb()
+                logger.info(f"Free VRAM after load: {free_after:.1f} GB")
+        except Exception as e:
+            logger.warning(f"Preload failed: {e}")
+
+        self._current_mode = VRAMMode.REPLAY
+        return True
+
+    def switch_to_shadow(self) -> bool:
+        """Switch to shadow mode: bring the observation services back up.
+
+        1. Restart Ollama (unloads the reasoning VLM)
+        2. Restart the previously stopped services
+        """
+        if self._current_mode == VRAMMode.SHADOW:
+            logger.info("Already in SHADOW mode")
+            return True
+
+        logger.info("Switching to SHADOW mode...")
+
+        # Restart Ollama
+        try:
+            subprocess.run(["sudo", "systemctl", "restart", "ollama"],
+                           timeout=10, check=True)
+            time.sleep(2)
+        except Exception as e:
+            logger.warning(f"Could not restart Ollama: {e}")
+
+        # Restart the stopped services
+        for service_name, _pid in self._stopped_services:
+            try:
+                if service_name == "agent_chat":
+                    subprocess.Popen(
+                        ["python3", "-m", "agent_chat.app"],
+                        cwd="/home/dom/ai/rpa_vision_v3",
+                        stdout=subprocess.DEVNULL,
+                        stderr=subprocess.DEVNULL
+                    )
+                    logger.info(f"{service_name} restarted")
+            except Exception as e:
+                logger.warning(f"Could not restart {service_name}: {e}")
+
+        self._stopped_services.clear()
+        self._current_mode = VRAMMode.SHADOW
+        return True
+
+    def ensure_reasoning_ready(self) -> bool:
+        """Check that the reasoning VLM can be loaded; switch to REPLAY if needed."""
+        free = self.get_free_vram_gb()
+        if free >= MIN_VRAM_FOR_REASONING:
+            return True
+        return self.switch_to_replay()
+
+    @property
+    def current_mode(self) -> Optional[str]:
+        return self._current_mode.value if self._current_mode else None
+
+    def status(self) -> dict:
+        return {
+            "mode": self.current_mode,
+            "vram_free_gb": round(self.get_free_vram_gb(), 1),
+            "vram_used_gb": round(self.get_used_vram_gb(), 1),
+            "reasoning_model": REASONING_MODEL,
+            "stopped_services": [s[0] for s in self._stopped_services],
+        }
+
+
+# Singleton
+_orchestrator: Optional[VRAMOrchestrator] = None
+
+
+def get_orchestrator() -> VRAMOrchestrator:
+    global _orchestrator
+    if _orchestrator is None:
+        _orchestrator = VRAMOrchestrator()
+    return _orchestrator
diff --git a/core/execution/input_handler.py b/core/execution/input_handler.py
index 82cc8b375..bc0816295 100644
--- a/core/execution/input_handler.py
+++ b/core/execution/input_handler.py
@@ -286,8 +286,12 @@
 Si tu vois un dialogue ou une popup, indique quel bouton cliquer.
 Si l'écran est normal sans action nécessaire, réponds action="nothing".
 Réponds UNIQUEMENT le JSON, pas d'explication."""

+    from core.cognition.vram_orchestrator import get_orchestrator
+    orch = get_orchestrator()
+    orch.ensure_reasoning_ready()
+
     ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
-    model = os.environ.get("RPA_REASONING_MODEL", os.environ.get("RPA_VLM_MODEL", "qwen2.5vl:3b"))
+    model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")
     response = requests.post(
         f"{ollama_url}/api/generate",
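Usage note: a minimal sketch of how a replay-side caller might drive the orchestrator introduced above. It only uses the API defined in this diff (get_orchestrator, switch_to_replay, status, switch_to_shadow); the surrounding replay logic is hypothetical and shown purely for illustration.

    # Illustrative sketch only, not part of this diff.
    from core.cognition.vram_orchestrator import get_orchestrator

    orch = get_orchestrator()
    if orch.switch_to_replay():
        # Reasoning VLM (qwen2.5vl:7b) is preloaded; run the replay/reasoning work here.
        print(orch.status())  # e.g. {"mode": "replay", "vram_free_gb": ..., ...}
    orch.switch_to_shadow()   # unload the reasoning VLM and bring agent_chat back up

Since switch_to_replay restarts Ollama via `sudo systemctl`, the calling environment needs passwordless sudo for that unit; otherwise the restart step fails with a warning and the switch continues on a best-effort basis.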