chore(dgx): snapshot consolidation WIP pour transfert poc DGX
Regroupe le WIP non committé requis pour le clone/runtime DGX (Option A) : - api_stream.py : préflight replay + smoke santé modèles + handler 403 WP-B - de-hardcode VLM : vlm_config, gpu/*, vram_orchestrator, ollama_manager - stream_processor, semantic_matcher, agent_chat (app/planner/intent) - workflows.db (acquis ; le transfert artifacts le mettra à jour + rewrite chemins) - docs : plans DGX, benchmarks VLM/grounders, recherche SOTA, coordination 8 juin Snapshot destiné à la branche poc-dgx poussée sur Gitea pour cloner le DGX. Scan anti-secret : clean. graphify (repo embarqué) exclu. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,16 @@ Deux modes :
|
||||
- REPLAY : VLM raisonnement (cf. get_reasoning_model) chargé, services non-essentiels stoppés
|
||||
|
||||
Bascule automatique ou manuelle selon le contexte.
|
||||
|
||||
⚠️ LIMITE POST-DGX (2026-06-05) — DETTE CONNUE :
|
||||
Cet orchestrateur a été conçu pour un Ollama **local** : le `sudo systemctl
|
||||
restart ollama` (switch_to_replay / switch_to_shadow) et `nvidia-smi`
|
||||
(get_free_vram_gb / get_used_vram_gb) ne ciblent que la machine locale.
|
||||
Or Ollama tourne désormais sur le **DGX via tunnel SSH** (OLLAMA_URL pointe
|
||||
le tunnel). Dans ce cas le restart local est **inopérant** : il ne purge PAS
|
||||
la VRAM des VLM distants et nvidia-smi mesure le GPU local, pas celui du DGX.
|
||||
À rendre conditionnel (tunnel distant vs Ollama local) avant tout usage en
|
||||
mode DGX — logique runtime inchangée ici (correction = décision Dom).
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
@@ -161,6 +161,10 @@ def is_thinking_model(model_name: str) -> bool:
|
||||
# Profil grounding par défaut — qwen3.5:9b avec ctx 4096 et prefill JSON.
|
||||
# Cohérent avec décision Codex après revue Gemini : empêcher rechauffe
|
||||
# qwen2.5vl en ctx 8192 et garantir un chemin grounding reproductible.
|
||||
# ⚠️ DETTE (2026-06-05) : qwen3.5:9b est ABSENT du endpoint Ollama/DGX → le
|
||||
# chemin grounding JSON retombe en pratique sur DEFAULT_GROUNDING_FALLBACK
|
||||
# (qwen2.5vl:7b-rpa). Ce chemin JSON est donc peu/pas exercé au runtime DGX.
|
||||
# À pull sur le DGX OU nettoyer (aligner sur le fallback) — décision Dom.
|
||||
DEFAULT_GROUNDING_MODEL = "qwen3.5:9b"
|
||||
DEFAULT_GROUNDING_CTX = 4096
|
||||
DEFAULT_GROUNDING_PREFILL = '{"x_pct":'
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
GPU Resource Management Module for RPA Vision V3
|
||||
|
||||
This module provides dynamic GPU resource allocation between ML models:
|
||||
- Ollama VLM (gemma4:e4b par défaut, configurable via RPA_VLM_MODEL) for UI classification
|
||||
- Ollama VLM (modèle central configurable via RPA_VLM_MODEL) for UI classification
|
||||
- CLIP (ViT-B-32) for embedding matching
|
||||
|
||||
The GPUResourceManager optimizes VRAM usage by:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
GPU Resource Manager - Central orchestrator for GPU resource allocation
|
||||
|
||||
Manages dynamic allocation of GPU resources between:
|
||||
- Ollama VLM (gemma4:e4b par défaut) - ~10 GB VRAM for UI classification
|
||||
- Ollama VLM (modèle reasoning/VLM central) - ~10 GB VRAM for UI classification
|
||||
- CLIP (ViT-B-32) - ~500 MB VRAM for embedding matching
|
||||
|
||||
Optimizes VRAM usage based on execution mode:
|
||||
@@ -21,6 +21,8 @@ from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any, Callable, Dict, Iterator, List, Optional
|
||||
|
||||
from core.detection.vlm_config import get_reasoning_model
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -54,7 +56,7 @@ class VRAMInfo:
|
||||
class GPUResourceConfig:
|
||||
"""Configuration for GPU resource management."""
|
||||
ollama_endpoint: str = "http://localhost:11434"
|
||||
vlm_model: str = "gemma4:e4b"
|
||||
vlm_model: str = field(default_factory=get_reasoning_model)
|
||||
clip_model: str = "ViT-B-32"
|
||||
idle_timeout_seconds: int = 300 # 5 minutes
|
||||
vram_threshold_for_clip_gpu_mb: int = 1024 # 1 GB
|
||||
|
||||
@@ -13,6 +13,8 @@ from typing import List, Optional
|
||||
|
||||
import aiohttp
|
||||
|
||||
from core.detection.vlm_config import get_reasoning_model
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -32,7 +34,7 @@ class OllamaManager:
|
||||
def __init__(
|
||||
self,
|
||||
endpoint: str = "http://localhost:11434",
|
||||
model: str = "gemma4:e4b",
|
||||
model: Optional[str] = None,
|
||||
default_keep_alive: str = "5m"
|
||||
):
|
||||
"""
|
||||
@@ -44,7 +46,7 @@ class OllamaManager:
|
||||
default_keep_alive: Default keep-alive duration
|
||||
"""
|
||||
self._endpoint = endpoint.rstrip("/")
|
||||
self._model = model
|
||||
self._model = model or get_reasoning_model()
|
||||
self._default_keep_alive = default_keep_alive
|
||||
self._session: Optional[aiohttp.ClientSession] = None
|
||||
|
||||
|
||||
@@ -20,6 +20,8 @@ from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
import json
|
||||
|
||||
from core.detection.vlm_config import get_reasoning_model
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Répertoires par défaut à scanner pour les workflows
|
||||
@@ -31,10 +33,72 @@ DEFAULT_WORKFLOW_DIRS = [
|
||||
|
||||
# Configuration Ollama par défaut
|
||||
DEFAULT_OLLAMA_ENDPOINT = "http://localhost:11434"
|
||||
DEFAULT_OLLAMA_MODEL = "qwen2.5:7b"
|
||||
DEFAULT_LLM_TIMEOUT = 10 # secondes
|
||||
|
||||
|
||||
def _default_ollama_model() -> str:
    """Return the default Ollama model name.

    The value is obtained by calling ``get_reasoning_model`` each time this
    function is invoked.
    """
    reasoning_model = get_reasoning_model()
    return reasoning_model
|
||||
|
||||
|
||||
DEFAULT_OLLAMA_MODEL = _default_ollama_model()
|
||||
|
||||
# Lower-cased workflow keys whose string values (or string items of list
# values) are collected by SemanticMatcher._iter_workflow_text_values and
# then tokenized into matching keywords.
_WORKFLOW_TEXT_KEYS = {
|
||||
"action_type",
|
||||
"description",
|
||||
"expected_window_title",
|
||||
"label",
|
||||
"name",
|
||||
"required_texts",
|
||||
"required_window_title",
|
||||
"tags",
|
||||
"target_text",
|
||||
"text",
|
||||
"title_contains",
|
||||
"title_pattern",
|
||||
"type",
|
||||
"value",
|
||||
"vlm_description",
|
||||
"window_title",
|
||||
|
||||
}
|
||||
|
||||
# Lower-cased workflow keys that SemanticMatcher._iter_workflow_text_values
# skips entirely (the value is neither collected nor descended into).
_WORKFLOW_TEXT_SKIP_KEYS = {
|
||||
"_prototype_vector",
|
||||
"bbox",
|
||||
"bounding_box",
|
||||
"embedding",
|
||||
"position",
|
||||
"position_x",
|
||||
"position_y",
|
||||
"vector",
|
||||
|
||||
}
|
||||
|
||||
# Token -> tuple of replacement tokens. During tokenization a word found here
# is replaced by every token of its tuple instead of being kept as-is.
_TOKEN_SYNONYMS = {
|
||||
"blocnotes": ("bloc", "notes"),
|
||||
"blocnote": ("bloc", "notes"),
|
||||
"notepad": ("bloc", "notes"),
|
||||
"sauvegarde": ("enregistrer",),
|
||||
"sauvegarder": ("enregistrer",),
|
||||
"sauvegardes": ("enregistrer",),
|
||||
"save": ("enregistrer",),
|
||||
"saved": ("enregistrer",),
|
||||
"saving": ("enregistrer",),
|
||||
"enregistre": ("enregistrer",),
|
||||
"enregistres": ("enregistrer",),
|
||||
"enregistrez": ("enregistrer",),
|
||||
|
||||
}
|
||||
|
||||
# Tokens treated as strong action signals: when any of them appears in the
# matched-keyword intersection, the scoring code adds a 0.2 bonus and records
# an "action_tokens:" reason.
_IMPORTANT_ACTION_TOKENS = {
|
||||
"annuler",
|
||||
"dialogue",
|
||||
"ecraser",
|
||||
"enregistrer",
|
||||
"fichier",
|
||||
"ouvrir",
|
||||
"popup",
|
||||
"remplacer",
|
||||
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class WorkflowMatch:
|
||||
"""Résultat d'un matching de workflow."""
|
||||
@@ -88,7 +152,7 @@ class SemanticMatcher:
|
||||
workflows_dir: Union[str, List[str], None] = None,
|
||||
use_embeddings: bool = True,
|
||||
use_llm: bool = True,
|
||||
llm_model: str = DEFAULT_OLLAMA_MODEL,
|
||||
llm_model: Optional[str] = None,
|
||||
llm_endpoint: str = DEFAULT_OLLAMA_ENDPOINT,
|
||||
llm_timeout: int = DEFAULT_LLM_TIMEOUT,
|
||||
auto_reload_interval: int = 60,
|
||||
@@ -101,7 +165,7 @@ class SemanticMatcher:
|
||||
Peut être un str (un seul répertoire) ou une liste.
|
||||
use_embeddings: Utiliser les embeddings pour le matching (compatibilité)
|
||||
use_llm: Activer le matching sémantique via Ollama LLM
|
||||
llm_model: Modèle Ollama à utiliser (défaut: qwen2.5:7b)
|
||||
llm_model: Modèle Ollama à utiliser (défaut: modèle reasoning central)
|
||||
llm_endpoint: Endpoint Ollama (défaut: http://localhost:11434)
|
||||
llm_timeout: Timeout pour les appels LLM en secondes
|
||||
auto_reload_interval: Intervalle en secondes pour vérifier les nouveaux workflows (0 = désactivé)
|
||||
@@ -121,7 +185,7 @@ class SemanticMatcher:
|
||||
|
||||
self.use_embeddings = use_embeddings
|
||||
self.use_llm = use_llm
|
||||
self.llm_model = llm_model
|
||||
self.llm_model = llm_model or _default_ollama_model()
|
||||
self.llm_endpoint = llm_endpoint
|
||||
self.llm_timeout = llm_timeout
|
||||
|
||||
@@ -181,7 +245,11 @@ class SemanticMatcher:
|
||||
Nombre de workflows chargés
|
||||
"""
|
||||
count = 0
|
||||
for workflow_path in workflows_dir.glob("*.json"):
|
||||
workflow_paths = sorted(
|
||||
workflows_dir.rglob("*.json"),
|
||||
key=lambda p: (len(p.relative_to(workflows_dir).parts), str(p)),
|
||||
)
|
||||
for workflow_path in workflow_paths:
|
||||
try:
|
||||
with open(workflow_path, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
@@ -298,8 +366,46 @@ class SemanticMatcher:
|
||||
action_type = action.get("type", "")
|
||||
keywords.add(action_type)
|
||||
|
||||
# Workflows appris: les signaux utiles vivent souvent dans les nodes,
|
||||
# conditions et templates, pas seulement dans les tags/edges.
|
||||
for value in self._iter_workflow_text_values(workflow_data):
|
||||
keywords.update(self._tokenize(value))
|
||||
|
||||
return list(keywords)
|
||||
|
||||
def _iter_workflow_text_values(
    self,
    value: Any,
    parent_key: str = "",
) -> List[str]:
    """Collect the short text values of a workflow that are useful for matching.

    Recursively walks nested dicts and lists. Dict keys are compared in
    lower case: keys listed in ``_WORKFLOW_TEXT_SKIP_KEYS`` are ignored
    together with their value (e.g. embeddings, bounding boxes), string
    values are kept only when their key is in ``_WORKFLOW_TEXT_KEYS``
    (window titles, labels, values, action descriptions, ...), and nested
    containers are explored further.

    Args:
        value: Workflow data, or any nested fragment of it.
        parent_key: Lower-cased key under which ``value`` was found; it
            decides whether plain strings inside a list are kept.

    Returns:
        The collected strings, in traversal order. Anything that is neither
        a dict nor a list yields an empty list.
    """
    collected: List[str] = []

    if isinstance(value, list):
        for item in value:
            if isinstance(item, (dict, list)):
                # Items of a list inherit the key of the list itself.
                collected += self._iter_workflow_text_values(item, parent_key)
            elif isinstance(item, str) and parent_key in _WORKFLOW_TEXT_KEYS:
                collected.append(item)
        return collected

    if not isinstance(value, dict):
        return collected

    for raw_key, child in value.items():
        normalized_key = str(raw_key).lower()
        if normalized_key in _WORKFLOW_TEXT_SKIP_KEYS:
            continue
        if isinstance(child, str):
            if normalized_key in _WORKFLOW_TEXT_KEYS:
                collected.append(child)
        elif isinstance(child, (dict, list)):
            collected += self._iter_workflow_text_values(child, normalized_key)

    return collected
|
||||
|
||||
def _tokenize(self, text: str) -> List[str]:
|
||||
"""Tokeniser un texte en mots-clés."""
|
||||
# Normaliser
|
||||
@@ -319,7 +425,17 @@ class SemanticMatcher:
|
||||
'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been'
|
||||
}
|
||||
|
||||
return [w for w in words if len(w) > 2 and w not in stop_words]
|
||||
tokens: List[str] = []
|
||||
for word in words:
|
||||
if len(word) <= 2 or word in stop_words:
|
||||
continue
|
||||
replacement = _TOKEN_SYNONYMS.get(word)
|
||||
if replacement:
|
||||
tokens.extend(replacement)
|
||||
else:
|
||||
tokens.append(word)
|
||||
|
||||
return tokens
|
||||
|
||||
# =========================================================================
|
||||
# Matching LLM (Ollama)
|
||||
@@ -654,6 +770,11 @@ Réponds UNIQUEMENT au format JSON, sans texte avant ni après:
|
||||
if intersection:
|
||||
reasons.append(f"keywords:{','.join(intersection)}")
|
||||
|
||||
important = intersection & _IMPORTANT_ACTION_TOKENS
|
||||
if important:
|
||||
score += 0.2
|
||||
reasons.append(f"action_tokens:{','.join(sorted(important))}")
|
||||
|
||||
# 4. Matching de la description
|
||||
if metadata.description:
|
||||
desc_tokens = set(self._tokenize(metadata.description))
|
||||
|
||||
Reference in New Issue
Block a user