feat(p1g): device policy GPU/CPU paramétrable pour la cascade vision
resolve_device(auto/cuda/cpu) avec garde-fou VRAM et fallback CPU propre.
Bascule EasyOCR/SoM/docTR sur GPU si VRAM libre, rollback env sans toucher au code.
- core/gpu/device_policy.py (nouveau) : resolve_device + garde-fou VRAM (max_total_gb)
- core/detection/som_engine.py, core/llm/ocr_extractor.py,
agent_v0/server_v1/resolve_engine.py : câblage device auto (35 lignes)
- tests/unit/test_device_policy.py : 15 tests (verts venv réel)
Rollback sans toucher au code : RPA_VISION_DEVICE=cpu (force CPU global) / RPA_EASYOCR_GPU=0.
Bench GPU réel (latence) + activation large après verdict Qwen. QG Qwen déjà validé sur le patch.
Mergé depuis worktree agent-a4f390f410e00ad7c (base 5b2afa362), 3 fichiers cibles non modifiés
dans le principal (zéro écrasement), dry-run apply propre.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1648,6 +1648,15 @@ def _resolve_by_ocr_text(
|
||||
reco_arch='crnn_vgg16_bn',
|
||||
pretrained=True,
|
||||
)
|
||||
# Device paramétrable avec garde-fou VRAM (VLM sur DGX distant).
|
||||
# cuda si VRAM locale libre, cpu sinon — jamais de hardcode cuda.
|
||||
try:
|
||||
from core.gpu.device_policy import resolve_device
|
||||
if resolve_device("auto") == "cuda":
|
||||
_V4_OCR_PREDICTOR = _V4_OCR_PREDICTOR.cuda()
|
||||
logger.info("docTR V4 OCR chargé sur cuda")
|
||||
except Exception as e:
|
||||
logger.debug("docTR V4 OCR reste sur CPU (%s)", e)
|
||||
|
||||
doc = DocumentFile.from_images([screenshot_path])
|
||||
result = _V4_OCR_PREDICTOR(doc)
|
||||
|
||||
@@ -89,8 +89,11 @@ class SomResult:
|
||||
class SomEngine:
|
||||
"""Moteur Set-of-Mark : YOLO + docTR + annotation."""
|
||||
|
||||
def __init__(self, device: str = "cuda"):
|
||||
self._device = device
|
||||
def __init__(self, device: str = "auto"):
|
||||
# Résolution paramétrable avec garde-fou VRAM (cf. core/gpu/device_policy).
|
||||
# "auto" → cuda si VRAM libre suffisante (VLM sur DGX distant), sinon cpu.
|
||||
from core.gpu.device_policy import resolve_device
|
||||
self._device = resolve_device(device)
|
||||
self._yolo = None
|
||||
self._ocr = None
|
||||
self._loaded = False
|
||||
@@ -300,8 +303,12 @@ _shared_engine: Optional[SomEngine] = None
|
||||
_shared_lock = __import__("threading").Lock()
|
||||
|
||||
|
||||
def get_shared_engine(device: str = "cpu") -> Optional[SomEngine]:
|
||||
"""Singleton SomEngine partagé entre tous les modules."""
|
||||
def get_shared_engine(device: str = "auto") -> Optional[SomEngine]:
|
||||
"""Singleton SomEngine partagé entre tous les modules.
|
||||
|
||||
device="auto" (défaut) délègue à core.gpu.device_policy.resolve_device :
|
||||
cuda si la VRAM locale est libre, cpu sinon. Passer "cpu" force le CPU.
|
||||
"""
|
||||
global _shared_engine
|
||||
if _shared_engine is None:
|
||||
with _shared_lock:
|
||||
|
||||
149
core/gpu/device_policy.py
Normal file
149
core/gpu/device_policy.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""Résolution de device paramétrable (auto/cuda/cpu) avec garde-fou VRAM.
|
||||
|
||||
Permet de basculer les étages CPU-par-défaut de la cascade vision (OCR docTR,
|
||||
EasyOCR, YOLO/SoM) vers le GPU local **quand la VRAM est libre**, SANS jamais
|
||||
hardcoder cuda. La politique anti-concurrence VRAM (tout sur CPU) datait d'une
|
||||
époque où les VLM tournaient sur la RTX 5070 locale ; ils tournent désormais
|
||||
sur un DGX distant (tunnel SSH `:11434`), libérant ~9 Go localement.
|
||||
|
||||
Logique de garde-fou inspirée de `core/embedding/clip_embedder.py` (lignes
|
||||
~65-82) : `torch.cuda.is_available()` + `torch.cuda.mem_get_info()`.
|
||||
|
||||
Contraintes :
|
||||
- JAMAIS de hardcode cuda ;
|
||||
- aucun appel réseau ;
|
||||
- import-safe : aucun chargement de modèle, aucune allocation GPU à l'import ;
|
||||
- fallback CPU propre partout (jamais de crash si pas de GPU).
|
||||
|
||||
Override global : variable d'environnement `RPA_VISION_DEVICE` ∈ {cpu, cuda, auto}.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_GB = 1024 ** 3
|
||||
|
||||
# Valeurs reconnues pour l'argument `requested` et l'override env.
|
||||
_VALID = {"cpu", "cuda", "auto"}
|
||||
|
||||
# Garde-fous par défaut (Go).
|
||||
DEFAULT_MIN_FREE_GB = 2.0 # VRAM libre minimale pour autoriser cuda
|
||||
DEFAULT_MAX_TOTAL_GB = 6.0 # plafond d'usage VRAM total après bascule
|
||||
|
||||
|
||||
def _env_override() -> Optional[str]:
|
||||
"""Lit l'override `RPA_VISION_DEVICE` s'il est présent et valide.
|
||||
|
||||
Retourne None si absent ou invalide (on retombe alors sur `requested`).
|
||||
"""
|
||||
raw = os.getenv("RPA_VISION_DEVICE", "").strip().lower()
|
||||
if not raw:
|
||||
return None
|
||||
if raw in _VALID:
|
||||
return raw
|
||||
logger.warning(
|
||||
"RPA_VISION_DEVICE='%s' invalide (attendu cpu/cuda/auto) — ignoré",
|
||||
raw,
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
def _cuda_available() -> bool:
|
||||
"""`torch.cuda.is_available()` protégé contre toute exception driver."""
|
||||
try:
|
||||
return bool(torch.cuda.is_available())
|
||||
except Exception as e: # pragma: no cover - dépend du driver
|
||||
logger.debug("torch.cuda.is_available a levé : %s — CPU", e)
|
||||
return False
|
||||
|
||||
|
||||
def _free_total_gb() -> Optional[tuple[float, float]]:
    """Return `(free, total)` VRAM in GiB, or None when it cannot be read.

    The byte counts come from `torch.cuda.mem_get_info()` and are divided by
    `_GB`. Any exception raised while querying or converting is logged at
    debug level and turned into None.
    """
    try:
        mem_info = torch.cuda.mem_get_info()
        free_gb, total_gb = (amount / _GB for amount in mem_info)
    except Exception as exc:  # pragma: no cover - driver-dependent
        logger.debug("torch.cuda.mem_get_info a levé : %s", exc)
        return None
    return free_gb, total_gb
|
||||
|
||||
|
||||
def resolve_device(
    requested: str = "auto",
    min_free_gb: float = DEFAULT_MIN_FREE_GB,
    max_total_gb: float = DEFAULT_MAX_TOTAL_GB,
) -> str:
    """Resolve the effective device ("cuda" or "cpu") under the VRAM policy.

    Args:
        requested: "cpu", "cuda" or "auto" (default). A valid
            `RPA_VISION_DEVICE` environment override takes precedence over
            this argument. An unrecognized value is logged and treated as
            "auto".
        min_free_gb: minimum free VRAM (GiB) required to pick cuda in
            "auto" mode.
        max_total_gb: cap (GiB) on the VRAM already in use
            (`total - free`) at the time of the call; above it, "auto"
            stays on CPU.

    Returns:
        "cuda" or "cpu".

    Policy:
        - "cpu"  -> "cpu";
        - "cuda" -> "cuda" when CUDA is available (the VRAM guard is NOT
          applied to an explicit request), otherwise "cpu" with a log line;
        - "auto" -> "cuda" only when CUDA is available, VRAM info can be
          read, free >= min_free_gb and used <= max_total_gb; otherwise
          "cpu".
    """
    override = _env_override()
    mode = override if override else (requested or "auto").strip().lower()
    if mode not in _VALID:
        logger.warning(
            "device demandé '%s' invalide (attendu cpu/cuda/auto) — auto",
            mode,
        )
        mode = "auto"

    if mode == "cpu":
        return "cpu"

    explicit_cuda = mode == "cuda"

    if not _cuda_available():
        if explicit_cuda:
            logger.info("device=cuda demandé mais CUDA indisponible — fallback CPU")
        return "cpu"

    if explicit_cuda:
        # Explicit request: honoured without the VRAM guard; the caller
        # takes responsibility for memory pressure.
        return "cuda"

    # From here on the mode is "auto": apply the VRAM guard.
    vram = _free_total_gb()
    if vram is None:
        logger.info("auto: mem_get_info indisponible — CPU par prudence")
        return "cpu"

    free_gb, total_gb = vram
    used_gb = total_gb - free_gb

    if free_gb < min_free_gb:
        logger.info(
            "auto: VRAM libre %.1f Go < seuil %.1f Go — CPU",
            free_gb, min_free_gb,
        )
        return "cpu"

    if used_gb > max_total_gb:
        logger.info(
            "auto: usage VRAM %.1f Go > plafond %.1f Go — CPU",
            used_gb, max_total_gb,
        )
        return "cpu"

    logger.info(
        "auto: VRAM libre %.1f Go (usage %.1f/%.1f Go) — CUDA",
        free_gb, used_gb, total_gb,
    )
    return "cuda"
|
||||
@@ -25,14 +25,24 @@ _easyocr_reader = None
|
||||
def easyocr_gpu_enabled(default: bool = False) -> bool:
    """Return whether EasyOCR may allocate GPU memory.

    Resolution order:
      1. `RPA_EASYOCR_GPU` set to a non-empty value: the decision is forced
         (legacy behaviour). "1", "true", "yes" and "on" (case-insensitive,
         surrounding whitespace ignored) enable the GPU; any other
         non-empty value disables it.
      2. Otherwise, delegate to
         `core.gpu.device_policy.resolve_device("auto")`: the GPU is
         enabled only when that call resolves to "cuda".

    Args:
        default: value returned only when importing or calling the device
            policy raises (safety fallback).

    Returns:
        True when EasyOCR may use the GPU, False otherwise.
    """
    raw = os.getenv("RPA_EASYOCR_GPU", "")
    if raw:
        return raw.strip().lower() in {"1", "true", "yes", "on"}
    try:
        # Imported lazily so this module does not depend on the device
        # policy (and its torch import) at import time.
        from core.gpu.device_policy import resolve_device

        return resolve_device("auto") == "cuda"
    except Exception as e:  # pragma: no cover - cautious fallback
        logger.debug("easyocr_gpu_enabled: resolve_device a échoué (%s)", e)
        return default
|
||||
|
||||
|
||||
def _get_reader():
|
||||
|
||||
153
tests/unit/test_device_policy.py
Normal file
153
tests/unit/test_device_policy.py
Normal file
@@ -0,0 +1,153 @@
|
||||
"""Tests TDD pour la résolution de device paramétrable (auto/cuda/cpu).
|
||||
|
||||
Objectif : basculer OCR/YOLO sur GPU local quand la VRAM est libre, SANS
|
||||
hardcoder cuda, avec garde-fou VRAM et fallback CPU propre.
|
||||
|
||||
Tous les tests mockent `torch.cuda.is_available` et `torch.cuda.mem_get_info`
|
||||
pour ne PAS dépendre du GPU réel de la machine de CI/dev.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
|
||||
from core.gpu import device_policy
|
||||
|
||||
|
||||
GB = 1024 ** 3
|
||||
|
||||
|
||||
def _mock_cuda(available: bool, free_gb: float = 0.0, total_gb: float = 12.0):
    """Return a patcher that fakes `torch.cuda` probing for device_policy.

    `is_available` reports `available`; `mem_get_info` reports
    `(free, total)` converted from GiB to integer bytes.
    """
    mem_info = (int(free_gb * GB), int(total_gb * GB))
    return mock.patch.multiple(
        device_policy.torch.cuda,
        is_available=mock.Mock(return_value=available),
        mem_get_info=mock.Mock(return_value=mem_info),
    )
|
||||
|
||||
|
||||
# ── requested="cpu" ─────────────────────────────────────────────────────────
|
||||
|
||||
def test_resolve_cpu_explicit_returns_cpu(monkeypatch):
    """An explicit "cpu" request stays on CPU even with a free GPU."""
    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
    with _mock_cuda(available=True, free_gb=12.0):
        resolved = device_policy.resolve_device("cpu")
    assert resolved == "cpu"
|
||||
|
||||
|
||||
# ── requested="cuda" ────────────────────────────────────────────────────────
|
||||
|
||||
def test_resolve_cuda_falls_back_to_cpu_when_unavailable(monkeypatch):
    """Requesting "cuda" without CUDA available degrades to "cpu"."""
    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
    with _mock_cuda(available=False):
        resolved = device_policy.resolve_device("cuda")
    assert resolved == "cpu"
|
||||
|
||||
|
||||
def test_resolve_cuda_returns_cuda_when_available(monkeypatch):
    """An explicit "cuda" request is honoured whenever CUDA is available.

    The second case pins the bypass of the VRAM guard: with free VRAM below
    `min_free_gb` and usage above `max_total_gb`, "auto" would pick CPU but
    an explicit "cuda" must still resolve to "cuda".
    """
    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
    with _mock_cuda(available=True, free_gb=8.0):
        assert device_policy.resolve_device("cuda") == "cuda"
    with _mock_cuda(available=True, free_gb=0.5, total_gb=12.0):
        assert device_policy.resolve_device(
            "cuda", min_free_gb=2.0, max_total_gb=6.0
        ) == "cuda"
|
||||
|
||||
|
||||
# ── requested="auto" (défaut) ───────────────────────────────────────────────
|
||||
|
||||
def test_resolve_auto_cuda_when_vram_sufficient(monkeypatch):
    """"auto" picks cuda when free VRAM (8) clears the minimum (2)."""
    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
    with _mock_cuda(available=True, free_gb=8.0):
        resolved = device_policy.resolve_device("auto", min_free_gb=2.0)
    assert resolved == "cuda"
|
||||
|
||||
|
||||
def test_resolve_auto_cpu_when_vram_insufficient(monkeypatch):
    """"auto" stays on CPU when free VRAM (1) is below the minimum (2)."""
    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
    with _mock_cuda(available=True, free_gb=1.0):
        resolved = device_policy.resolve_device("auto", min_free_gb=2.0)
    assert resolved == "cpu"
|
||||
|
||||
|
||||
def test_resolve_auto_cpu_when_cuda_unavailable(monkeypatch):
    """"auto" resolves to CPU when CUDA is reported unavailable."""
    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
    with _mock_cuda(available=False):
        resolved = device_policy.resolve_device("auto")
    assert resolved == "cpu"
|
||||
|
||||
|
||||
def test_resolve_default_is_auto(monkeypatch):
    """Called with no argument, resolve_device behaves as "auto"."""
    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
    with _mock_cuda(available=True, free_gb=8.0):
        resolved = device_policy.resolve_device()
    assert resolved == "cuda"
|
||||
|
||||
|
||||
# ── garde-fou : usage total ne doit pas dépasser le plafond ─────────────────
|
||||
|
||||
def test_resolve_auto_cpu_when_switch_would_exceed_total_cap(monkeypatch):
    """"auto" stays on CPU when VRAM usage is above the total cap.

    total=12, free=4 -> used=8 > cap 6 -> CPU, even though the free VRAM
    is above min_free_gb.
    """
    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
    with _mock_cuda(available=True, free_gb=4.0, total_gb=12.0):
        resolved = device_policy.resolve_device(
            "auto", min_free_gb=2.0, max_total_gb=6.0
        )
    assert resolved == "cpu"
|
||||
|
||||
|
||||
def test_resolve_auto_cuda_when_under_total_cap(monkeypatch):
    """total=12, free=11 -> used=1 < cap 6 and free 11 >= min 2 -> CUDA."""
    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
    with _mock_cuda(available=True, free_gb=11.0, total_gb=12.0):
        resolved = device_policy.resolve_device(
            "auto", min_free_gb=2.0, max_total_gb=6.0
        )
    assert resolved == "cuda"
|
||||
|
||||
|
||||
# ── override env RPA_VISION_DEVICE ──────────────────────────────────────────
|
||||
|
||||
def test_env_override_cpu_forces_cpu_even_in_auto(monkeypatch):
    """RPA_VISION_DEVICE=cpu wins over both "auto" and "cuda" requests."""
    monkeypatch.setenv("RPA_VISION_DEVICE", "cpu")
    with _mock_cuda(available=True, free_gb=12.0):
        outcomes = [
            device_policy.resolve_device(request) for request in ("auto", "cuda")
        ]
    assert outcomes == ["cpu", "cpu"]
|
||||
|
||||
|
||||
def test_env_override_cuda_takes_precedence_over_requested_cpu(monkeypatch):
    """The environment override takes precedence over `requested`."""
    monkeypatch.setenv("RPA_VISION_DEVICE", "cuda")
    with _mock_cuda(available=True, free_gb=8.0):
        resolved = device_policy.resolve_device("cpu")
    assert resolved == "cuda"
|
||||
|
||||
|
||||
def test_env_override_cuda_still_falls_back_when_unavailable(monkeypatch):
    """An env override of "cuda" still degrades to CPU without CUDA."""
    monkeypatch.setenv("RPA_VISION_DEVICE", "cuda")
    with _mock_cuda(available=False):
        resolved = device_policy.resolve_device("auto")
    assert resolved == "cpu"
|
||||
|
||||
|
||||
def test_env_override_invalid_value_ignored(monkeypatch):
    """An invalid env value is ignored and `requested` decides.

    CUDA is mocked as available so the outcome actually depends on
    `requested`: if the invalid override were applied (or treated as
    "auto"), the "cpu" request below would not resolve to "cpu".
    """
    monkeypatch.setenv("RPA_VISION_DEVICE", "banana")
    with _mock_cuda(available=True, free_gb=8.0):
        assert device_policy.resolve_device("cpu") == "cpu"
        assert device_policy.resolve_device("cuda") == "cuda"
    # Original scenario kept: no CUDA -> CPU regardless of the env value.
    with _mock_cuda(available=False):
        assert device_policy.resolve_device("auto") == "cpu"
|
||||
|
||||
|
||||
# ── robustesse : pas de crash si torch.cuda lève ───────────────────────────
|
||||
|
||||
def test_resolve_auto_cpu_on_torch_exception(monkeypatch):
    """A torch.cuda probe that raises must yield "cpu", never a crash.

    Covers both probes used by the policy: `is_available` raising, and
    `mem_get_info` raising while CUDA is reported available.
    """
    monkeypatch.delenv("RPA_VISION_DEVICE", raising=False)
    with mock.patch.object(
        device_policy.torch.cuda, "is_available",
        side_effect=RuntimeError("driver boom"),
    ):
        assert device_policy.resolve_device("auto") == "cpu"

    with mock.patch.multiple(
        device_policy.torch.cuda,
        is_available=mock.Mock(return_value=True),
        mem_get_info=mock.Mock(side_effect=RuntimeError("driver boom")),
    ):
        assert device_policy.resolve_device("auto") == "cpu"
|
||||
|
||||
|
||||
# ── import-safe : pas d'effet de bord à l'import ───────────────────────────
|
||||
|
||||
def test_module_import_is_safe():
    """Reloading the module must not probe CUDA or lose its public API.

    Both torch.cuda probes used by the policy are replaced by mocks during
    the reload; neither may be called by module-level code.
    """
    with mock.patch.object(device_policy.torch.cuda, "is_available") as is_available:
        with mock.patch.object(device_policy.torch.cuda, "mem_get_info") as mem_get_info:
            reloaded = importlib.reload(device_policy)
    is_available.assert_not_called()
    mem_get_info.assert_not_called()
    assert hasattr(reloaded, "resolve_device")
|
||||
Reference in New Issue
Block a user