feat: replay visuel Windows opérationnel — template matching + VWB complet

- Bouton "Windows" dans VWB pour exécuter sur le PC distant
- Template matching OpenCV multi-scale pour localiser les ancres visuelles
- Proxy VWB→streaming server avec chargement ancre (thumb, pas full)
- Fix executor Windows : mss lazy, result reporting, debug prints
- Fix poll replay permanent (sans session active)
- Mapping types VWB→executor (click_anchor→click, type_text→type)
- CORS streaming server, capture Windows dans VWB
- Dédup heartbeats côté client (hash perceptuel)
- Mode cloud VLM configurable via RPA_VLM_MODEL
- Fix resolve_target : pas de ScreenAnalyzer fallback (trop lent)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-17 18:56:44 +01:00
parent dd149c1cbb
commit 371db69543
7 changed files with 361 additions and 15 deletions
--- a/core/detection/ollama_client.py
+++ b/core/detection/ollama_client.py
@@ -321,10 +321,21 @@ Respond with just the role name, nothing else."""
                "confidence": 0.3, "success": True,
            }

-        prompt = """Classify this UI element. Reply with ONLY a JSON object.
+        # Le system prompt contraint le thinking de qwen3-vl et réduit
+        # drastiquement le nombre de tokens gaspillés en réflexion interne.
+        # Sans system prompt, le modèle pense 500-800 tokens et épuise le budget.
+        # Avec, il ne pense que 100-400 tokens et produit du JSON fiable.
+        system_prompt = "You are a JSON-only UI classifier. No thinking. No explanation. Output raw JSON only."
+
+        prompt = """Classify this UI element. Reply with ONLY a JSON object, nothing else.
+
 Types: button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item
 Roles: primary_action, cancel, submit, form_input, search_field, navigation, settings, close, delete, edit, save
-Example: {"type": "button", "role": "submit", "text": "OK"}
+
+Example 1: {"type": "button", "role": "submit", "text": "OK"}
+Example 2: {"type": "text_input", "role": "form_input", "text": ""}
+Example 3: {"type": "icon", "role": "close", "text": "X"}
+
 Your answer:"""

        # Retry une fois si réponse vide
@@ -332,8 +343,9 @@ Your answer:"""
            result = self.generate(
                prompt,
                image=element_image,
+                system_prompt=system_prompt,
                temperature=0.1,
-                max_tokens=200,
+                max_tokens=300,
                force_json=False
            )

--- a/core/detection/ui_detector.py
+++ b/core/detection/ui_detector.py
@@ -13,6 +13,7 @@ from typing import List, Dict, Optional, Any, Tuple
 from pathlib import Path
 from dataclasses import dataclass
 import logging
+import os
 import numpy as np
 from PIL import Image
 import cv2
@@ -67,12 +68,10 @@ class BoundingBox:
@dataclass
 class DetectionConfig:
    """Configuration de la détection UI hybride"""
-    # VLM
-    # Modèles recommandés:
-    # - "qwen2.5vl:3b" (léger, tient en GPU 12GB avec split partiel)
-    # - "qwen2.5vl:7b" (meilleur mais 13GB mémoire, CPU-only sur RTX 5070)
-    # - "qwen3-vl:8b" (plus gros, supporté mais plus d'erreurs JSON)
-    vlm_model: str = "qwen3-vl:8b"
+    # VLM — modèle configurable via variable d'environnement RPA_VLM_MODEL
+    # Production (local) : "qwen3-vl:8b" — GPU local, pas de réseau
+    # Tests (cloud) : "qwen3-vl:235b-cloud" — pas de GPU, plus lent mais libère la VRAM
+    vlm_model: str = os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b")
    vlm_endpoint: str = "http://localhost:11434"
    use_vlm_classification: bool = True  # Utiliser VLM pour classifier