diff --git a/agent_v0/server_v1/resolve_engine.py b/agent_v0/server_v1/resolve_engine.py
index e0dd73c66..12872debf 100644
--- a/agent_v0/server_v1/resolve_engine.py
+++ b/agent_v0/server_v1/resolve_engine.py
@@ -1648,6 +1648,15 @@ def _resolve_by_ocr_text(
                     reco_arch='crnn_vgg16_bn',
                     pretrained=True,
                 )
+                # Configurable device with a VRAM guard (the VLM runs on a remote DGX).
+                # cuda if local VRAM is free, cpu otherwise — never hardcode cuda.
+                try:
+                    from core.gpu.device_policy import resolve_device
+                    if resolve_device("auto") == "cuda":
+                        _V4_OCR_PREDICTOR = _V4_OCR_PREDICTOR.cuda()
+                        logger.info("docTR V4 OCR chargé sur cuda")
+                except Exception as e:
+                    logger.debug("docTR V4 OCR reste sur CPU (%s)", e)
 
             doc = DocumentFile.from_images([screenshot_path])
             result = _V4_OCR_PREDICTOR(doc)
diff --git a/core/detection/som_engine.py b/core/detection/som_engine.py
index 683be768a..d8be8ce97 100644
--- a/core/detection/som_engine.py
+++ b/core/detection/som_engine.py
@@ -89,8 +89,11 @@ class SomResult:
 class SomEngine:
     """Moteur Set-of-Mark : YOLO + docTR + annotation."""
 
-    def __init__(self, device: str = "cuda"):
-        self._device = device
+    def __init__(self, device: str = "auto"):
+        # Configurable resolution with a VRAM guard (see core/gpu/device_policy).
+        # "auto" → cuda if enough VRAM is free (VLM on a remote DGX), else cpu.
+        from core.gpu.device_policy import resolve_device
+        self._device = resolve_device(device)
         self._yolo = None
         self._ocr = None
         self._loaded = False
@@ -300,8 +303,12 @@ _shared_engine: Optional[SomEngine] = None
 _shared_lock = __import__("threading").Lock()
 
 
-def get_shared_engine(device: str = "cpu") -> Optional[SomEngine]:
-    """Singleton SomEngine partagé entre tous les modules."""
+def get_shared_engine(device: str = "auto") -> Optional[SomEngine]:
+    """SomEngine singleton shared across all modules.
+
+    device="auto" (default) delegates to core.gpu.device_policy.resolve_device:
+    cuda if local VRAM is free, cpu otherwise. Passing "cpu" forces the CPU.
+    """
     global _shared_engine
     if _shared_engine is None:
         with _shared_lock:
diff --git a/core/gpu/device_policy.py b/core/gpu/device_policy.py
new file mode 100644
index 000000000..b356c1d83
--- /dev/null
+++ b/core/gpu/device_policy.py
@@ -0,0 +1,149 @@
+"""Configurable device resolution (auto/cuda/cpu) with a VRAM guard.
+
+Lets the CPU-by-default stages of the vision cascade (docTR OCR,
+EasyOCR, YOLO/SoM) move to the local GPU **when VRAM is free**, WITHOUT ever
+hardcoding cuda. The VRAM anti-contention policy (everything on CPU) dated from
+a time when the VLMs ran on the local RTX 5070; they now run
+on a remote DGX (SSH tunnel `:11434`), freeing ~9 GB locally.
+
+Guard logic inspired by `core/embedding/clip_embedder.py` (lines
+~65-82): `torch.cuda.is_available()` + `torch.cuda.mem_get_info()`.
+
+Constraints:
+  - NEVER hardcode cuda;
+  - no network calls;
+  - import-safe: no model loading and no GPU allocation at import time;
+  - clean CPU fallback everywhere (never crash when there is no GPU).
+
+Global override: environment variable `RPA_VISION_DEVICE` ∈ {cpu, cuda, auto}.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from typing import Optional
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+_GB = 1024 ** 3
+
+# Values accepted for the `requested` argument and for the env override.
+_VALID = {"cpu", "cuda", "auto"}
+
+# Default guard thresholds (GB).
+DEFAULT_MIN_FREE_GB = 2.0  # minimum free VRAM required to allow cuda
+DEFAULT_MAX_TOTAL_GB = 6.0  # cap on VRAM already in use (total - free) to allow cuda
+
+
+def _env_override() -> Optional[str]:
+    """Read the `RPA_VISION_DEVICE` override if it is set and valid.
+
+    Returns None when absent or invalid (the caller then uses `requested`).
+    """
+    raw = os.getenv("RPA_VISION_DEVICE", "").strip().lower()
+    if not raw:
+        return None
+    if raw in _VALID:
+        return raw
+    logger.warning(
+        "RPA_VISION_DEVICE='%s' invalide (attendu cpu/cuda/auto) — ignoré",
+        raw,
+    )
+    return None
+
+
+def _cuda_available() -> bool:
+    """`torch.cuda.is_available()` guarded against any driver exception."""
+    try:
+        return bool(torch.cuda.is_available())
+    except Exception as e:  # pragma: no cover - driver dependent
+        logger.debug("torch.cuda.is_available a levé : %s — CPU", e)
+        return False
+
+
+def _free_total_gb() -> Optional[tuple[float, float]]:
+    """(free, total) VRAM in GB via mem_get_info, or None if unavailable."""
+    try:
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        return free_bytes / _GB, total_bytes / _GB
+    except Exception as e:  # pragma: no cover - driver dependent
+        logger.debug("torch.cuda.mem_get_info a levé : %s", e)
+        return None
+
+
+def resolve_device(
+    requested: str = "auto",
+    min_free_gb: float = DEFAULT_MIN_FREE_GB,
+    max_total_gb: float = DEFAULT_MAX_TOTAL_GB,
+) -> str:
+    """Resolve the effective device ("cuda" or "cpu") under the VRAM policy.
+
+    Args:
+        requested: "cpu", "cuda" or "auto" (default). The env override
+            `RPA_VISION_DEVICE` takes precedence over it when set and valid.
+        min_free_gb: minimum free VRAM (GB) required to allow cuda in auto mode.
+        max_total_gb: cap on VRAM in use (GB). If current usage
+            (used = total - free) exceeds this cap, stay on CPU. Guards
+            against saturation when other processes already occupy the GPU.
+
+    Returns:
+        "cuda" or "cpu"; "cpu" whenever cuda or VRAM info is unavailable.
+
+    Policy:
+        - "cpu"  → "cpu";
+        - "cuda" → "cuda" if cuda is available, else "cpu" (fallback is logged);
+        - "auto" → "cuda" if cuda is available AND free ≥ min_free_gb AND
+          used ≤ max_total_gb, else "cpu".
+    """
+    effective = _env_override() or (requested or "auto").strip().lower()
+    if effective not in _VALID:
+        logger.warning(
+            "device demandé '%s' invalide (attendu cpu/cuda/auto) — auto",
+            effective,
+        )
+        effective = "auto"
+
+    if effective == "cpu":
+        return "cpu"
+
+    if not _cuda_available():
+        if effective == "cuda":
+            logger.info("device=cuda demandé mais CUDA indisponible — fallback CPU")
+        return "cpu"
+
+    if effective == "cuda":
+        # Explicit request: honoured without applying the VRAM guard
+        # (the caller takes responsibility). CUDA is available → cuda.
+        return "cuda"
+
+    # effective == "auto": apply the VRAM guard.
+    mem = _free_total_gb()
+    if mem is None:
+        logger.info("auto: mem_get_info indisponible — CPU par prudence")
+        return "cpu"
+
+    free_gb, total_gb = mem
+    used_gb = total_gb - free_gb
+
+    if free_gb < min_free_gb:
+        logger.info(
+            "auto: VRAM libre %.1f Go < seuil %.1f Go — CPU",
+            free_gb, min_free_gb,
+        )
+        return "cpu"
+
+    if used_gb > max_total_gb:
+        logger.info(
+            "auto: usage VRAM %.1f Go > plafond %.1f Go — CPU",
+            used_gb, max_total_gb,
+        )
+        return "cpu"
+
+    logger.info(
+        "auto: VRAM libre %.1f Go (usage %.1f/%.1f Go) — CUDA",
+        free_gb, used_gb, total_gb,
+    )
+    return "cuda"
diff --git a/core/llm/ocr_extractor.py b/core/llm/ocr_extractor.py
index e1f43ae5f..e8fbd0f08 100644
--- a/core/llm/ocr_extractor.py
+++ b/core/llm/ocr_extractor.py
@@ -25,14 +25,24 @@ _easyocr_reader = None
 def easyocr_gpu_enabled(default: bool = False) -> bool:
     """Return whether EasyOCR may allocate GPU memory.
 
-    The replay server shares the GPU with Ollama. Defaulting EasyOCR to CPU
-    keeps VRAM available for the VLM; set RPA_EASYOCR_GPU=1 only for a measured
-    OCR benchmark or a runtime that has spare VRAM.
+    Priority:
+    1. RPA_EASYOCR_GPU set: 1/true/yes/on → GPU, any other value → CPU (legacy).
+    2. Otherwise, delegate to core.gpu.device_policy.resolve_device("auto"):
+       GPU allowed only if local VRAM is free (the VLMs now run
+       on a remote DGX, ~9 GB free locally). Built-in VRAM
+       guard; clean CPU fallback when there is no GPU.
+
+    `default` is only used if the resolution fails (safety net).
     """
     raw = os.getenv("RPA_EASYOCR_GPU", "")
-    if not raw:
+    if raw:
+        return raw.strip().lower() in {"1", "true", "yes", "on"}
+    try:
+        from core.gpu.device_policy import resolve_device
+        return resolve_device("auto") == "cuda"
+    except Exception as e:  # pragma: no cover - cautious fallback
+        logger.debug("easyocr_gpu_enabled: resolve_device a échoué (%s)", e)
         return default
-    return raw.strip().lower() in {"1", "true", "yes", "on"}
 
 
 def _get_reader():
diff --git a/tests/unit/test_device_policy.py b/tests/unit/test_device_policy.py
new file mode 100644
index 000000000..f3e58282d
--- /dev/null
+++ b/tests/unit/test_device_policy.py
@@ -0,0 +1,153 @@
+"""TDD tests for configurable device resolution (auto/cuda/cpu).
+
+Goal: move OCR/YOLO to the local GPU when VRAM is free, WITHOUT
+hardcoding cuda, with a VRAM guard and a clean CPU fallback.
+
+Tests mock `torch.cuda.is_available` and/or `torch.cuda.mem_get_info`
+so they do NOT depend on the real GPU of the CI/dev machine.
+"""
+
+from __future__ import annotations
+
+import importlib
+from unittest import mock
+
+import pytest
+
+from core.gpu import device_policy
+
+
+GB = 1024 ** 3
+
+
+def _mock_cuda(available: bool, free_gb: float = 0.0, total_gb: float = 12.0):
+    """Build a consistent torch.cuda mock context.
+
+    free_gb / total_gb are given in GB; mem_get_info returns bytes.
+    """
+    free_bytes = int(free_gb * GB)
+    total_bytes = int(total_gb * GB)
+    return mock.patch.multiple(
+        device_policy.torch.cuda,
+        is_available=mock.Mock(return_value=available),
+        mem_get_info=mock.Mock(return_value=(free_bytes, total_bytes)),
+    )
+
+
+# ── requested="cpu" ─────────────────────────────────────────────────────────
+
+def test_resolve_cpu_explicit_returns_cpu(monkeypatch):
+    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
+    with _mock_cuda(available=True, free_gb=12.0):
+        assert device_policy.resolve_device("cpu") == "cpu"
+
+
+# ── requested="cuda" ────────────────────────────────────────────────────────
+
+def test_resolve_cuda_falls_back_to_cpu_when_unavailable(monkeypatch):
+    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
+    with _mock_cuda(available=False):
+        assert device_policy.resolve_device("cuda") == "cpu"
+
+
+def test_resolve_cuda_returns_cuda_when_available(monkeypatch):
+    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
+    with _mock_cuda(available=True, free_gb=8.0):
+        assert device_policy.resolve_device("cuda") == "cuda"
+
+
+# ── requested="auto" (default) ──────────────────────────────────────────────
+
+def test_resolve_auto_cuda_when_vram_sufficient(monkeypatch):
+    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
+    with _mock_cuda(available=True, free_gb=8.0):
+        assert device_policy.resolve_device("auto", min_free_gb=2.0) == "cuda"
+
+
+def test_resolve_auto_cpu_when_vram_insufficient(monkeypatch):
+    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
+    with _mock_cuda(available=True, free_gb=1.0):
+        assert device_policy.resolve_device("auto", min_free_gb=2.0) == "cpu"
+
+
+def test_resolve_auto_cpu_when_cuda_unavailable(monkeypatch):
+    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
+    with _mock_cuda(available=False):
+        assert device_policy.resolve_device("auto") == "cpu"
+
+
+def test_resolve_default_is_auto(monkeypatch):
+    """With no argument, the default is 'auto'."""
+    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
+    with _mock_cuda(available=True, free_gb=8.0):
+        assert device_policy.resolve_device() == "cuda"
+
+
+# ── guard: total usage must not exceed the cap ──────────────────────────────
+
+def test_resolve_auto_cpu_when_switch_would_exceed_total_cap(monkeypatch):
+    """If current usage exceeds the total-usage cap (6 GB by
+    default), stay on CPU even when free VRAM exceeds min_free_gb.
+
+    total=12, free=4 → used=8 > cap 6 → CPU.
+    """
+    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
+    with _mock_cuda(available=True, free_gb=4.0, total_gb=12.0):
+        assert device_policy.resolve_device("auto", min_free_gb=2.0,
+                                            max_total_gb=6.0) == "cpu"
+
+
+def test_resolve_auto_cuda_when_under_total_cap(monkeypatch):
+    """total=12, free=11 → used=1 < cap 6 and free 11 ≥ min 2 → CUDA."""
+    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
+    with _mock_cuda(available=True, free_gb=11.0, total_gb=12.0):
+        assert device_policy.resolve_device("auto", min_free_gb=2.0,
+                                            max_total_gb=6.0) == "cuda"
+
+
+# ── env override RPA_VISION_DEVICE ──────────────────────────────────────────
+
+def test_env_override_cpu_forces_cpu_even_in_auto(monkeypatch):
+    monkeypatch.setenv("RPA_VISION_DEVICE", "cpu")
+    with _mock_cuda(available=True, free_gb=12.0):
+        assert device_policy.resolve_device("auto") == "cpu"
+        assert device_policy.resolve_device("cuda") == "cpu"
+
+
+def test_env_override_cuda_takes_precedence_over_requested_cpu(monkeypatch):
+    """The env override takes precedence over the requested argument."""
+    monkeypatch.setenv("RPA_VISION_DEVICE", "cuda")
+    with _mock_cuda(available=True, free_gb=8.0):
+        assert device_policy.resolve_device("cpu") == "cuda"
+
+
+def test_env_override_cuda_still_falls_back_when_unavailable(monkeypatch):
+    monkeypatch.setenv("RPA_VISION_DEVICE", "cuda")
+    with _mock_cuda(available=False):
+        assert device_policy.resolve_device("auto") == "cpu"
+
+
+def test_env_override_invalid_value_ignored(monkeypatch):
+    """An invalid env value is ignored (falls back to requested)."""
+    monkeypatch.setenv("RPA_VISION_DEVICE", "banana")
+    with _mock_cuda(available=False):
+        assert device_policy.resolve_device("auto") == "cpu"
+
+
+# ── robustness: no crash if torch.cuda raises ───────────────────────────────
+
+def test_resolve_auto_cpu_on_torch_exception(monkeypatch):
+    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
+    with mock.patch.object(
+        device_policy.torch.cuda, "is_available",
+        side_effect=RuntimeError("driver boom"),
+    ):
+        assert device_policy.resolve_device("auto") == "cpu"
+
+
+# ── import-safe: no side effects at import time ─────────────────────────────
+
+def test_module_import_is_safe():
+    """Reloading the module must not trigger any model/GPU loading."""
+    importlib.reload(device_policy)
+    assert hasattr(device_policy, "resolve_device")