chore(dgx): snapshot consolidation WIP pour transfert poc DGX

Regroupe le WIP non committé requis pour le clone/runtime DGX (Option A) :

- api_stream.py : préflight replay + smoke santé modèles + handler 403 WP-B
- de-hardcode VLM : vlm_config, gpu/*, vram_orchestrator, ollama_manager
- stream_processor, semantic_matcher, agent_chat (app/planner/intent)
- workflows.db (acquis ; le transfert artifacts le mettra à jour + rewrite chemins)
- docs : plans DGX, benchmarks VLM/grounders, recherche SOTA, coordination 8 juin

Snapshot destiné à la branche poc-dgx poussée sur Gitea pour cloner le DGX.
Scan anti-secret : clean. graphify (repo embarqué) exclu.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 16:33:58 +02:00
parent f18de016d7
commit 6d34b3cb68
204 changed files with 15744 additions and 47 deletions
--- a/core/cognition/vram_orchestrator.py
+++ b/core/cognition/vram_orchestrator.py
@@ -6,6 +6,16 @@ Deux modes :
 - REPLAY : VLM raisonnement (cf. get_reasoning_model) chargé, services non-essentiels stoppés

 Bascule automatique ou manuelle selon le contexte.
+
+⚠️ LIMITE POST-DGX (2026-06-05) — DETTE CONNUE :
+Cet orchestrateur a été conçu pour un Ollama **local** : le `sudo systemctl
+restart ollama` (switch_to_replay / switch_to_shadow) et `nvidia-smi`
+(get_free_vram_gb / get_used_vram_gb) ne ciblent que la machine locale.
+Or Ollama tourne désormais sur le **DGX via tunnel SSH** (OLLAMA_URL pointe
+vers le tunnel). Dans ce cas le restart local est **inopérant** : il ne purge PAS
+la VRAM des VLM distants et nvidia-smi mesure le GPU local, pas celui du DGX.
+À rendre conditionnel (tunnel distant vs Ollama local) avant tout usage en
+mode DGX — logique runtime inchangée ici (correction = décision Dom).
 """

 import logging
--- a/core/detection/vlm_config.py
+++ b/core/detection/vlm_config.py
@@ -161,6 +161,10 @@ def is_thinking_model(model_name: str) -> bool:
 # Profil grounding par défaut — qwen3.5:9b avec ctx 4096 et prefill JSON.
 # Cohérent avec décision Codex après revue Gemini : empêcher rechauffe
 # qwen2.5vl en ctx 8192 et garantir un chemin grounding reproductible.
+# ⚠️ DETTE (2026-06-05) : qwen3.5:9b est ABSENT de l'endpoint Ollama/DGX → le
+# chemin grounding JSON retombe en pratique sur DEFAULT_GROUNDING_FALLBACK
+# (qwen2.5vl:7b-rpa). Ce chemin JSON est donc peu/pas exercé au runtime DGX.
+# À pull sur le DGX OU nettoyer (aligner sur le fallback) — décision Dom.
 DEFAULT_GROUNDING_MODEL = "qwen3.5:9b"
 DEFAULT_GROUNDING_CTX = 4096
 DEFAULT_GROUNDING_PREFILL = '{"x_pct":'
--- a/core/gpu/__init__.py
+++ b/core/gpu/__init__.py
@@ -2,7 +2,7 @@
 GPU Resource Management Module for RPA Vision V3

 This module provides dynamic GPU resource allocation between ML models:
- Ollama VLM (gemma4:e4b par défaut, configurable via RPA_VLM_MODEL) for UI classification
+- Ollama VLM (modèle central configurable via RPA_VLM_MODEL) for UI classification
 - CLIP (ViT-B-32) for embedding matching

 The GPUResourceManager optimizes VRAM usage by:
--- a/core/gpu/gpu_resource_manager.py
+++ b/core/gpu/gpu_resource_manager.py
@@ -2,7 +2,7 @@
 GPU Resource Manager - Central orchestrator for GPU resource allocation

 Manages dynamic allocation of GPU resources between:
- Ollama VLM (gemma4:e4b par défaut) - ~10 GB VRAM for UI classification
+- Ollama VLM (modèle reasoning/VLM central) - ~10 GB VRAM for UI classification
 - CLIP (ViT-B-32) - ~500 MB VRAM for embedding matching

 Optimizes VRAM usage based on execution mode:
@@ -21,6 +21,8 @@ from datetime import datetime
 from enum import Enum
 from typing import Any, Callable, Dict, Iterator, List, Optional

+from core.detection.vlm_config import get_reasoning_model
+
 logger = logging.getLogger(__name__)


@@ -54,7 +56,7 @@ class VRAMInfo:
 class GPUResourceConfig:
    """Configuration for GPU resource management."""
    ollama_endpoint: str = "http://localhost:11434"
-    vlm_model: str = "gemma4:e4b"
+    vlm_model: str = field(default_factory=get_reasoning_model)
    clip_model: str = "ViT-B-32"
    idle_timeout_seconds: int = 300  # 5 minutes
    vram_threshold_for_clip_gpu_mb: int = 1024  # 1 GB
--- a/core/gpu/ollama_manager.py
+++ b/core/gpu/ollama_manager.py
@@ -13,6 +13,8 @@ from typing import List, Optional

 import aiohttp

+from core.detection.vlm_config import get_reasoning_model
+
 logger = logging.getLogger(__name__)


@@ -32,7 +34,7 @@ class OllamaManager:
    def __init__(
        self,
        endpoint: str = "http://localhost:11434",
-        model: str = "gemma4:e4b",
+        model: Optional[str] = None,
        default_keep_alive: str = "5m"
    ):
        """
@@ -44,7 +46,7 @@ class OllamaManager:
            default_keep_alive: Default keep-alive duration
        """
        self._endpoint = endpoint.rstrip("/")
-        self._model = model
+        self._model = model or get_reasoning_model()
        self._default_keep_alive = default_keep_alive
        self._session: Optional[aiohttp.ClientSession] = None
    
--- a/core/workflow/semantic_matcher.py
+++ b/core/workflow/semantic_matcher.py
@@ -20,6 +20,8 @@ from dataclasses import dataclass
 from pathlib import Path
 import json

+from core.detection.vlm_config import get_reasoning_model
+
 logger = logging.getLogger(__name__)

 # Répertoires par défaut à scanner pour les workflows
@@ -31,10 +33,72 @@ DEFAULT_WORKFLOW_DIRS = [

 # Configuration Ollama par défaut
 DEFAULT_OLLAMA_ENDPOINT = "http://localhost:11434"
-DEFAULT_OLLAMA_MODEL = "qwen2.5:7b"
 DEFAULT_LLM_TIMEOUT = 10  # secondes


+def _default_ollama_model() -> str:
+    return get_reasoning_model()
+
+
+DEFAULT_OLLAMA_MODEL = _default_ollama_model()
+
+_WORKFLOW_TEXT_KEYS = {
+    "action_type",
+    "description",
+    "expected_window_title",
+    "label",
+    "name",
+    "required_texts",
+    "required_window_title",
+    "tags",
+    "target_text",
+    "text",
+    "title_contains",
+    "title_pattern",
+    "type",
+    "value",
+    "vlm_description",
+    "window_title",
+}
+
+_WORKFLOW_TEXT_SKIP_KEYS = {
+    "_prototype_vector",
+    "bbox",
+    "bounding_box",
+    "embedding",
+    "position",
+    "position_x",
+    "position_y",
+    "vector",
+}
+
+_TOKEN_SYNONYMS = {
+    "blocnotes": ("bloc", "notes"),
+    "blocnote": ("bloc", "notes"),
+    "notepad": ("bloc", "notes"),
+    "sauvegarde": ("enregistrer",),
+    "sauvegarder": ("enregistrer",),
+    "sauvegardes": ("enregistrer",),
+    "save": ("enregistrer",),
+    "saved": ("enregistrer",),
+    "saving": ("enregistrer",),
+    "enregistre": ("enregistrer",),
+    "enregistres": ("enregistrer",),
+    "enregistrez": ("enregistrer",),
+}
+
+_IMPORTANT_ACTION_TOKENS = {
+    "annuler",
+    "dialogue",
+    "ecraser",
+    "enregistrer",
+    "fichier",
+    "ouvrir",
+    "popup",
+    "remplacer",
+}
+
+
@dataclass
 class WorkflowMatch:
    """Résultat d'un matching de workflow."""
@@ -88,7 +152,7 @@ class SemanticMatcher:
        workflows_dir: Union[str, List[str], None] = None,
        use_embeddings: bool = True,
        use_llm: bool = True,
-        llm_model: str = DEFAULT_OLLAMA_MODEL,
+        llm_model: Optional[str] = None,
        llm_endpoint: str = DEFAULT_OLLAMA_ENDPOINT,
        llm_timeout: int = DEFAULT_LLM_TIMEOUT,
        auto_reload_interval: int = 60,
@@ -101,7 +165,7 @@ class SemanticMatcher:
                           Peut être un str (un seul répertoire) ou une liste.
            use_embeddings: Utiliser les embeddings pour le matching (compatibilité)
            use_llm: Activer le matching sémantique via Ollama LLM
-            llm_model: Modèle Ollama à utiliser (défaut: qwen2.5:7b)
+            llm_model: Modèle Ollama à utiliser (défaut: modèle reasoning central)
            llm_endpoint: Endpoint Ollama (défaut: http://localhost:11434)
            llm_timeout: Timeout pour les appels LLM en secondes
            auto_reload_interval: Intervalle en secondes pour vérifier les nouveaux workflows (0 = désactivé)
@@ -121,7 +185,7 @@ class SemanticMatcher:

        self.use_embeddings = use_embeddings
        self.use_llm = use_llm
-        self.llm_model = llm_model
+        self.llm_model = llm_model or _default_ollama_model()
        self.llm_endpoint = llm_endpoint
        self.llm_timeout = llm_timeout

@@ -181,7 +245,11 @@ class SemanticMatcher:
            Nombre de workflows chargés
        """
        count = 0
-        for workflow_path in workflows_dir.glob("*.json"):
+        workflow_paths = sorted(
+            workflows_dir.rglob("*.json"),
+            key=lambda p: (len(p.relative_to(workflows_dir).parts), str(p)),
+        )
+        for workflow_path in workflow_paths:
            try:
                with open(workflow_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
@@ -298,8 +366,46 @@ class SemanticMatcher:
                action_type = action.get("type", "")
                keywords.add(action_type)

+        # Workflows appris: les signaux utiles vivent souvent dans les nodes,
+        # conditions et templates, pas seulement dans les tags/edges.
+        for value in self._iter_workflow_text_values(workflow_data):
+            keywords.update(self._tokenize(value))
+
        return list(keywords)

+    def _iter_workflow_text_values(
+        self,
+        value: Any,
+        parent_key: str = "",
+    ) -> List[str]:
+        """Extraire les textes courts utiles au matching depuis un workflow.
+
+        On évite les champs volumineux ou numériques (embeddings, bbox), mais on
+        garde les titres de fenêtres, labels, valeurs et descriptions d'actions.
+        """
+        texts: List[str] = []
+
+        if isinstance(value, dict):
+            for key, child in value.items():
+                key_lower = str(key).lower()
+                if key_lower in _WORKFLOW_TEXT_SKIP_KEYS:
+                    continue
+                if key_lower in _WORKFLOW_TEXT_KEYS and isinstance(child, str):
+                    texts.append(child)
+                elif isinstance(child, (dict, list)):
+                    texts.extend(self._iter_workflow_text_values(child, key_lower))
+            return texts
+
+        if isinstance(value, list):
+            for item in value:
+                if isinstance(item, str):
+                    if parent_key in _WORKFLOW_TEXT_KEYS:
+                        texts.append(item)
+                elif isinstance(item, (dict, list)):
+                    texts.extend(self._iter_workflow_text_values(item, parent_key))
+
+        return texts
+
    def _tokenize(self, text: str) -> List[str]:
        """Tokeniser un texte en mots-clés."""
        # Normaliser
@@ -319,7 +425,17 @@ class SemanticMatcher:
            'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been'
        }

-        return [w for w in words if len(w) > 2 and w not in stop_words]
+        tokens: List[str] = []
+        for word in words:
+            if len(word) <= 2 or word in stop_words:
+                continue
+            replacement = _TOKEN_SYNONYMS.get(word)
+            if replacement:
+                tokens.extend(replacement)
+            else:
+                tokens.append(word)
+
+        return tokens

    # =========================================================================
    # Matching LLM (Ollama)
@@ -654,6 +770,11 @@ Réponds UNIQUEMENT au format JSON, sans texte avant ni après:
            if intersection:
                reasons.append(f"keywords:{','.join(intersection)}")

+            important = intersection & _IMPORTANT_ACTION_TOKENS
+            if important:
+                score += 0.2
+                reasons.append(f"action_tokens:{','.join(sorted(important))}")
+
        # 4. Matching de la description
        if metadata.description:
            desc_tokens = set(self._tokenize(metadata.description))