feat(p1z): centralize V4 reasoning model resolution (DGX-safe)

Remplace le default runtime dangereux `qwen2.5vl:7b` (absent du tunnel DGX -> 404) des chemins V4/reasoning par un helper central get_reasoning_model().

- core/detection/vlm_config.py : + get_reasoning_model() + DEFAULT_REASONING_MODEL (qwen2.5vl:7b-rpa). Ordre : RPA_REASONING_MODEL -> RPA_VLM_MODEL/VLM_MODEL -> default DGX-safe. Pas d'appel réseau (lazy, safe à l'import).
- core/execution/input_handler.py, observe_reason_act.py (x3), core/cognition/vram_orchestrator.py : migration des 5 call-sites.
- tests/unit/test_reasoning_model.py : 8 tests (default DGX-safe, ordre de résolution, non-régression wiring des 3 modules V4).

Hors scope (signalé lot P1.w) : DEFAULT_VLM_MODEL=gemma4:latest reste fallback de get_vlm_model(). Client gelé non touché.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 16:23:10 +02:00
parent 4dc7d840d6
commit 806cc04b82
5 changed files with 130 additions and 6 deletions
--- a/core/cognition/vram_orchestrator.py
+++ b/core/cognition/vram_orchestrator.py
@@ -3,7 +3,7 @@ Orchestrateur VRAM — gère le chargement/déchargement des modèles selon le m

 Deux modes :
 - SHADOW : streaming server + agent_chat actifs, VLM raisonnement déchargé
- REPLAY : VLM raisonnement (qwen2.5vl:7b) chargé, services non-essentiels stoppés
+- REPLAY : VLM raisonnement (cf. get_reasoning_model) chargé, services non-essentiels stoppés

 Bascule automatique ou manuelle selon le contexte.
 """
@@ -15,10 +15,12 @@ import time
 from enum import Enum
 from typing import Optional

+from core.detection.vlm_config import get_reasoning_model
+
 logger = logging.getLogger(__name__)

 OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
-REASONING_MODEL = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")
+REASONING_MODEL = get_reasoning_model()
 MIN_VRAM_FOR_REASONING = 5.0  # Go minimum pour charger le modèle de raisonnement


--- a/core/detection/vlm_config.py
+++ b/core/detection/vlm_config.py
@@ -261,6 +261,42 @@ def get_bbox_grounding_model() -> str:
    )


+# ────────────────────────────────────────────────────────────────────────────
+# P1.z (2026-06-04) : résolution centralisée du modèle V4/reasoning, DGX-safe
+# ────────────────────────────────────────────────────────────────────────────
+
+# Modèle de raisonnement V4/ORA par défaut — DGX-safe.
+# Les chemins reasoning (ORALoop, détection dialogue/popup, vram_orchestrator)
+# font du VLM généraliste sur screenshot (JSON action/decision), pas du grounding
+# bbox. Le default est aligné sur le modèle présent sur le tunnel DGX
+# (qwen2.5vl:7b-rpa), PAS sur `qwen2.5vl:7b` brut qui est absent du DGX → 404.
+DEFAULT_REASONING_MODEL = "qwen2.5vl:7b-rpa"
+
+
+def get_reasoning_model() -> str:
+    """Retourne le modèle pour les chemins V4/reasoning (ORALoop, détection
+    dialogue/popup, orchestration VRAM).
+
+    Distinct du grounding (get_grounding_profile / get_bbox_grounding_model) :
+    ici on raisonne en langage naturel + JSON sur un screenshot, pas de
+    coordonnées. Pas d'appel réseau (résolution lazy, safe à l'import).
+
+    Ordre de résolution :
+      1. RPA_REASONING_MODEL (dédié, prioritaire)
+      2. RPA_VLM_MODEL / VLM_MODEL (hérite de la config VLM existante)
+      3. DEFAULT_REASONING_MODEL (qwen2.5vl:7b-rpa, présent sur DGX)
+
+    Returns:
+        Nom du modèle de raisonnement (ex: "qwen2.5vl:7b-rpa").
+    """
+    return (
+        os.environ.get("RPA_REASONING_MODEL")
+        or os.environ.get("RPA_VLM_MODEL")
+        or os.environ.get("VLM_MODEL")
+        or DEFAULT_REASONING_MODEL
+    )
+
+
 def needs_think_false(model_name: str) -> bool:
    """Détermine si un modèle nécessite think=false dans le payload.

--- a/core/execution/input_handler.py
+++ b/core/execution/input_handler.py
@@ -14,6 +14,8 @@ import shutil
 import time
 from typing import Any, Dict, List, Optional

+from core.detection.vlm_config import get_reasoning_model
+
 logger = logging.getLogger(__name__)

 try:
@@ -291,7 +293,7 @@ Si l'écran est normal sans action nécessaire, réponds action="nothing".
 Réponds UNIQUEMENT le JSON, pas d'explication."""

        ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
-        model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")
+        model = get_reasoning_model()

        response = requests.post(
            f"{ollama_url}/api/generate",
--- a/core/execution/observe_reason_act.py
+++ b/core/execution/observe_reason_act.py
@@ -21,6 +21,8 @@ import re
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional

+from core.detection.vlm_config import get_reasoning_model
+
 logger = logging.getLogger(__name__)

 # Import du contexte cognitif (mémoire de travail)
@@ -407,7 +409,7 @@ Règles:

            # --- Appel VLM (Ollama) ---
            ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
-            model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")
+            model = get_reasoning_model()

            print(f"🧠 [ORA/reason_instruction] Appel VLM {model}...")

@@ -1207,7 +1209,7 @@ Règles:
            image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

            ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
-            model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")
+            model = get_reasoning_model()

            resp = requests.post(f"{ollama_url}/api/generate", json={
                "model": model,
@@ -1963,7 +1965,7 @@ Règles:
            )

            ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
-            model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")
+            model = get_reasoning_model()

            response = requests.post(
                f"{ollama_url}/api/generate",