feat: replay visuel Windows opérationnel — template matching + VWB complet

- Bouton "Windows" dans VWB pour exécuter sur le PC distant
- Template matching OpenCV multi-scale pour localiser les ancres visuelles
- Proxy VWB→streaming server avec chargement ancre (thumb, pas full)
- Fix executor Windows : mss lazy, result reporting, debug prints
- Fix poll replay permanent (sans session active)
- Mapping types VWB→executor (click_anchor→click, type_text→type)
- CORS streaming server, capture Windows dans VWB
- Dédup heartbeats côté client (hash perceptuel)
- Mode cloud VLM configurable via RPA_VLM_MODEL
- Fix resolve_target : pas de ScreenAnalyzer fallback (trop lent)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-17 18:56:44 +01:00
parent dd149c1cbb
commit 371db69543
7 changed files with 361 additions and 15 deletions
--- a/core/detection/ollama_client.py
+++ b/core/detection/ollama_client.py
@@ -321,10 +321,21 @@ Respond with just the role name, nothing else."""
                "confidence": 0.3, "success": True,
            }
-        prompt = """Classify this UI element. Reply with ONLY a JSON object.
+        # Le system prompt contraint le thinking de qwen3-vl et réduit
        # drastiquement le nombre de tokens gaspillés en réflexion interne.
        # Sans system prompt, le modèle pense 500-800 tokens et épuise le budget.
        # Avec, il ne pense que 100-400 tokens et produit du JSON fiable.
        system_prompt = "You are a JSON-only UI classifier. No thinking. No explanation. Output raw JSON only."
        prompt = """Classify this UI element. Reply with ONLY a JSON object, nothing else.
 Types: button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item
 Roles: primary_action, cancel, submit, form_input, search_field, navigation, settings, close, delete, edit, save
-Example: {"type": "button", "role": "submit", "text": "OK"}
+
 Example 1: {"type": "button", "role": "submit", "text": "OK"}
 Example 2: {"type": "text_input", "role": "form_input", "text": ""}
 Example 3: {"type": "icon", "role": "close", "text": "X"}
 Your answer:"""
        # Retry une fois si réponse vide
@@ -332,8 +343,9 @@ Your answer:"""
            result = self.generate(
                prompt,
                image=element_image,
                system_prompt=system_prompt,
                temperature=0.1,
-                max_tokens=200,
+                max_tokens=300,
                force_json=False
            )
--- a/core/detection/ui_detector.py
+++ b/core/detection/ui_detector.py
@@ -13,6 +13,7 @@ from typing import List, Dict, Optional, Any, Tuple
 from pathlib import Path
 from dataclasses import dataclass
 import logging
 import os
 import numpy as np
 from PIL import Image
 import cv2
@@ -67,12 +68,10 @@ class BoundingBox:
@dataclass
 class DetectionConfig:
    """Configuration de la détection UI hybride"""
-    # VLM
+    # VLM — modèle configurable via variable d'environnement RPA_VLM_MODEL
-    # Modèles recommandés:
+    # Production (local) : "qwen3-vl:8b" — GPU local, pas de réseau
-    # - "qwen2.5vl:3b" (léger, tient en GPU 12GB avec split partiel)
+    # Tests (cloud) : "qwen3-vl:235b-cloud" — pas de GPU, plus lent mais libère la VRAM
-    # - "qwen2.5vl:7b" (meilleur mais 13GB mémoire, CPU-only sur RTX 5070)
+    vlm_model: str = os.environ.get("RPA_VLM_MODEL", "qwen3-vl:8b")
    # - "qwen3-vl:8b" (plus gros, supporté mais plus d'erreurs JSON)
    vlm_model: str = "qwen3-vl:8b"
    vlm_endpoint: str = "http://localhost:11434"
    use_vlm_classification: bool = True  # Utiliser VLM pour classifier
--- a/visual_workflow_builder/backend/api/screen_capture.py
+++ b/visual_workflow_builder/backend/api/screen_capture.py
@@ -120,6 +120,60 @@ def capture_screen():
        }), 500
@screen_capture_bp.route('/capture-windows', methods=['POST'])
@cross_origin()
def capture_windows():
    """Return the most recent screenshot stored for the Windows PC.

    Screenshots are looked up under
    ``<project_root>/data/training/live_sessions/sess_*/shots``. The three
    most recent session folders (by session name, descending) are scanned
    in order; the first one containing a matching PNG wins, and the PNG
    with the newest modification time in that folder is returned.

    Per the original note, the Agent V1 client is expected to upload a
    heartbeat screenshot roughly every 5 s (not verifiable from this
    function).

    Returns:
        A JSON response with ``image`` (base64 PNG), ``width``, ``height``,
        ``format``, ``source``, ``file`` and ``session``; a 404 JSON error
        when no session or no screenshot exists; a 500 JSON error when the
        image cannot be read or encoded.
    """
    from pathlib import Path

    # The project root is four directory levels above this file.
    project_root = Path(os.path.abspath(__file__)).parents[3]
    live_dir = project_root / "data" / "training" / "live_sessions"

    # Most recent session first (session folder names sort chronologically
    # -- presumably they embed a timestamp; verify against the agent).
    sessions = sorted(
        live_dir.glob("sess_*/shots"),
        key=lambda p: p.parent.name,
        reverse=True,
    )
    if not sessions:
        return jsonify({'error': 'Aucune session Windows trouvée'}), 404

    # Only whole-screen captures are eligible; other PNGs (crops) are skipped.
    wanted_markers = ("full", "heartbeat", "focus")
    latest_shot = None
    for session_shots in sessions[:3]:
        shots = [
            shot for shot in session_shots.glob("*.png")
            if any(marker in shot.name for marker in wanted_markers)
        ]
        if shots:
            latest_shot = max(shots, key=lambda p: p.stat().st_mtime)
            break

    if not latest_shot:
        return jsonify({'error': 'Aucun screenshot Windows disponible'}), 404

    try:
        from PIL import Image

        # The context manager guarantees the underlying file is released
        # even if encoding fails.
        with Image.open(latest_shot) as img:
            width, height = img.size
            buf = io.BytesIO()
            img.save(buf, format='PNG')

        img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
        return jsonify({
            'image': img_base64,
            'width': width,
            'height': height,
            'format': 'png',
            'source': 'windows',
            'file': str(latest_shot.name),
            'session': latest_shot.parent.parent.name,
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@screen_capture_bp.route('/detect-elements', methods=['POST'])
@cross_origin()
 def detect_elements():
--- a/visual_workflow_builder/backend/api_v3/dag_execute.py
+++ b/visual_workflow_builder/backend/api_v3/dag_execute.py
@@ -775,3 +775,198 @@ def upload_excel():
        'filename': file.filename,
        'suggested_table': suggested,
    })
 # ---------------------------------------------------------------------------
 # Exécution sur Windows — proxy vers le streaming server (port 5005)
 # ---------------------------------------------------------------------------
 def _load_anchor_image_b64(anchor_id: str) -> Optional[str]:
     """Load the image of a visual anchor and return it base64-encoded.

     Three locations are tried, in this order:

     1. ``data/anchors/{id}_thumb.png`` -- new V3 format; the thumbnail is
        the crop of the anchor itself, not the full screenshot.
     2. ``data/anchor_images/{id}/original.png`` -- legacy format.
     3. The ``image_path`` column of the ``visual_anchors`` table in
        ``instance/workflows.db``.

     Args:
         anchor_id: Identifier of the anchor. It originates from the HTTP
             request payload, so it is validated before being used in a
             file-system path.

     Returns:
         The base64 text of the image file, or ``None`` when the id is
         invalid or no readable image is found.
     """
     import base64 as b64
     import sqlite3
     from contextlib import closing

     # Refuse ids that could escape the anchor directories ("../", absolute
     # paths, embedded NUL) since the id comes from client-supplied JSON.
     if (
         not isinstance(anchor_id, str)
         or anchor_id in ('', '.', '..')
         or any(ch in anchor_id for ch in ('/', '\\', '\x00'))
     ):
         logger.warning("Identifiant d'ancre invalide : %r", anchor_id)
         return None

     def _read_b64(path):
         """Return the content of *path* as base64 text."""
         with open(path, 'rb') as fh:
             return b64.b64encode(fh.read()).decode('utf-8')

     backend_dir = Path(__file__).resolve().parent.parent

     # 1 and 2: on-disk locations, new format first.
     disk_candidates = (
         backend_dir / 'data' / 'anchors' / f'{anchor_id}_thumb.png',
         backend_dir / 'data' / 'anchor_images' / anchor_id / 'original.png',
     )
     for candidate in disk_candidates:
         if not candidate.exists():
             continue
         try:
             return _read_b64(candidate)
         except Exception as e:
             # Unreadable file: log and fall through to the next location.
             logger.error("Erreur lecture ancre %s : %s", candidate, e)

     # 3: path recorded in the database.
     try:
         db_path = backend_dir / 'instance' / 'workflows.db'
         # closing() releases the connection even when the query raises.
         with closing(sqlite3.connect(str(db_path))) as conn:
             row = conn.execute(
                 "SELECT image_path FROM visual_anchors WHERE id=?",
                 (anchor_id,),
             ).fetchone()
         if row and row[0] and Path(row[0]).exists():
             return _read_b64(row[0])
     except Exception as e:
         logger.error("Erreur lecture ancre BDD %s : %s", anchor_id, e)

     logger.warning("Image ancre introuvable pour %s", anchor_id)
     return None
 def _load_anchor_metadata(anchor_id: str) -> Optional[Dict]:
     """Load the metadata of a visual anchor (bounding box, screen size).

     Two sources are tried, in this order:

     1. ``data/anchor_images/{id}/metadata.json`` (legacy format); its
        parsed content is returned unchanged.
     2. The ``visual_anchors`` table of ``instance/workflows.db``; the row
        is converted to ``{'bounding_box': {...}, 'original_size': {...}}``
        with the screen size defaulting to 1920x1080 when the stored value
        is missing or zero.

     Args:
         anchor_id: Identifier of the anchor. It originates from the HTTP
             request payload, so it is validated before being used in a
             file-system path.

     Returns:
         The metadata dictionary, or ``None`` when the id is invalid or
         nothing could be loaded. Failures are logged, never raised: the
         metadata is optional for the caller.
     """
     import sqlite3
     from contextlib import closing

     # Refuse ids that could escape the anchor directory ("../", absolute
     # paths, embedded NUL) since the id comes from client-supplied JSON.
     if (
         not isinstance(anchor_id, str)
         or anchor_id in ('', '.', '..')
         or any(ch in anchor_id for ch in ('/', '\\', '\x00'))
     ):
         logger.warning("Identifiant d'ancre invalide : %r", anchor_id)
         return None

     backend_dir = Path(__file__).resolve().parent.parent

     # 1. Legacy format: metadata.json next to the anchor image.
     meta_path = backend_dir / 'data' / 'anchor_images' / anchor_id / 'metadata.json'
     if meta_path.exists():
         try:
             with open(meta_path, 'r', encoding='utf-8') as f:
                 return json.load(f)
         except Exception as e:
             # Corrupt or unreadable file: report it, then try the database.
             logger.warning("Erreur lecture métadonnées ancre %s : %s", meta_path, e)

     # 2. visual_anchors table.
     try:
         db_path = backend_dir / 'instance' / 'workflows.db'
         # closing() releases the connection even when the query raises.
         with closing(sqlite3.connect(str(db_path))) as conn:
             row = conn.execute(
                 "SELECT bbox_x, bbox_y, bbox_width, bbox_height, screen_width, screen_height "
                 "FROM visual_anchors WHERE id=?", (anchor_id,)
             ).fetchone()
         if row:
             x, y, width, height, screen_w, screen_h = row
             return {
                 'bounding_box': {'x': x, 'y': y, 'width': width, 'height': height},
                 'original_size': {'width': screen_w or 1920, 'height': screen_h or 1080},
             }
     except Exception as e:
         logger.warning("Erreur lecture métadonnées ancre BDD %s : %s", anchor_id, e)

     return None
# VWB action type -> action type understood by the Windows executor.
# Types absent from this table are forwarded unchanged.
_WIN_TYPE_MAP = {
    'click_anchor': 'click',
    'double_click_anchor': 'click',
    'right_click_anchor': 'click',
    'type_text': 'type',
    'type_secret': 'type',
    'keyboard_shortcut': 'key_combo',
    'hotkey': 'key_combo',
    'scroll_to_anchor': 'scroll',
    'wait_for_anchor': 'wait',
    'visual_condition': 'wait',
}

# Anchor-based click types: the only ones that get visual_mode injected.
_WIN_ANCHOR_CLICK_TYPES = frozenset(
    {'click_anchor', 'double_click_anchor', 'right_click_anchor'}
)

# Executor ``button`` value for the non-default click variants.
_WIN_CLICK_BUTTONS = {
    'double_click_anchor': 'double',
    'right_click_anchor': 'right',
}


def _prepare_windows_action(action):
    """Rewrite one VWB action dict, in place, into the executor format.

    Maps the action type, attaches the anchor image (``visual_mode`` and
    ``target_spec``) for anchor-based clicks, and lifts ``text`` / ``keys``
    out of ``parameters`` to the top level of the action.
    """
    vwb_type = action.get('type', '')
    # ``parameters`` may be absent, null, or malformed in the payload.
    params = action.get('parameters')
    if not isinstance(params, dict):
        params = {}

    action['type'] = _WIN_TYPE_MAP.get(vwb_type, vwb_type)

    # Anchor-based clicks: ship the anchor image so the executor can
    # locate the target by template matching.
    if vwb_type in _WIN_ANCHOR_CLICK_TYPES:
        anchor_id = action.get('anchor_id')
        if anchor_id:
            anchor_b64 = _load_anchor_image_b64(anchor_id)
            if anchor_b64:
                target_spec = {
                    'anchor_image_base64': anchor_b64,
                    'anchor_id': anchor_id,
                }
                # Metadata is optional: it only adds the reference
                # bounding box and the original screen size.
                anchor_meta = _load_anchor_metadata(anchor_id)
                if anchor_meta:
                    target_spec['anchor_bbox'] = anchor_meta.get('bounding_box', {})
                    target_spec['original_size'] = anchor_meta.get('original_size', {})
                action['visual_mode'] = True
                action['target_spec'] = target_spec
                logger.info(
                    "Action %s : ancre '%s' chargée (%d Ko), visual_mode activé",
                    action.get('action_id', '?'),
                    anchor_id,
                    len(anchor_b64) // 1024,
                )
            else:
                # Without an image the action is forwarded without
                # visual_mode ("blind mode").
                logger.warning(
                    "Action %s : ancre '%s' introuvable, fallback blind mode",
                    action.get('action_id', '?'),
                    anchor_id,
                )
        if vwb_type in _WIN_CLICK_BUTTONS:
            action['button'] = _WIN_CLICK_BUTTONS[vwb_type]

    # type_text / type_secret: expose the text at the top level.
    # Per the original note, no positioning click is added here; the
    # executor is expected to click only when coordinates are provided,
    # the focus being set by the preceding click_anchor action
    # (executor behaviour not visible from this file).
    if vwb_type in ('type_text', 'type_secret') and 'text' in params:
        action['text'] = params['text']

    # keyboard_shortcut / hotkey: expose the key list at the top level.
    if vwb_type in ('keyboard_shortcut', 'hotkey') and 'keys' in params:
        action['keys'] = params['keys']


@api_v3_bp.route('/execute-windows', methods=['POST'])
def execute_windows():
    """Proxy workflow actions to the streaming server for Windows execution.

    Per the original note, the browser cannot reach port 5005 directly
    (CORS / network), so the VWB backend acts as a proxy. Each entry of
    ``actions`` in the JSON body is converted to the executor format
    before the whole payload is POSTed to the streaming server; for
    anchor-based clicks the anchor image is loaded from disk and embedded
    as base64 in ``target_spec`` so the executor can resolve the position
    by template matching.

    Returns:
        The streaming server's JSON body and status code on success;
        400 when the body is empty or ``actions`` is not a list of
        objects; 503 when the streaming server is unreachable; 504 when
        it does not answer within the timeout; 500 on any other error.
    """
    import requests as req

    data = request.get_json()
    if not data:
        return jsonify({'error': 'Aucune donnée'}), 400

    if isinstance(data, dict) and 'actions' in data:
        actions = data['actions']
        # Reject malformed payloads with a clear 400 instead of crashing.
        if not isinstance(actions, list) or not all(isinstance(a, dict) for a in actions):
            return jsonify({'error': "Le champ 'actions' doit être une liste d'objets"}), 400
        for action in actions:
            _prepare_windows_action(action)

    try:
        resp = req.post(
            'http://localhost:5005/api/v1/traces/stream/replay/raw',
            json=data,
            timeout=30,  # generous: template matching can take a while
        )
        return jsonify(resp.json()), resp.status_code
    except req.ConnectionError:
        # Listed first: a connect timeout is also a ConnectionError and
        # means the server is unreachable.
        return jsonify({'error': 'Streaming server (port 5005) non disponible'}), 503
    except req.Timeout:
        return jsonify({'error': 'Streaming server (port 5005) : délai de réponse dépassé'}), 504
    except Exception as e:
        return jsonify({'error': str(e)}), 500
--- a/visual_workflow_builder/frontend_v4/src/components/CapturePanel.tsx
+++ b/visual_workflow_builder/frontend_v4/src/components/CapturePanel.tsx
@@ -150,6 +150,29 @@ export default function CapturePanel({
        <button onClick={onCapture} disabled={countdown !== null}>
          Capturer
        </button>
        <button
          onClick={async () => {
            // Fetch the latest Windows screenshot from the backend and
            // install it as the current capture.
            try {
              const resp = await fetch('/api/screen-capture/capture-windows', { method: 'POST' });
              const data = await resp.json();
              if (!resp.ok || !data?.image) {
                // The backend answers 404/500 with an `error` field; report
                // it through the catch below instead of silently ignoring it.
                throw new Error(data?.error || `HTTP ${resp.status}`);
              }
              // Shape the response like a regular capture object.
              const fakeCapture = {
                screenshot_base64: data.image,
                width: data.width,
                height: data.height,
                source: 'windows',
              };
              setCurrentCapture(fakeCapture as any);
            } catch (err) {
              console.error('Capture Windows échouée:', err);
            }
          }}
          title="Capture le dernier écran du PC Windows"
          style={{ fontSize: '12px' }}
        >
          🖥️ Windows
        </button>
        <select value={timerSeconds} onChange={(e) => setTimerSeconds(Number(e.target.value))}>
          <option value="0">Immédiat</option>
          <option value="3">3 sec</option>
--- a/visual_workflow_builder/frontend_v4/src/components/ExecutionControls.tsx
+++ b/visual_workflow_builder/frontend_v4/src/components/ExecutionControls.tsx
@@ -1,3 +1,4 @@
 // @ts-nocheck
 import type { Execution } from '../types';
 interface Props {
@@ -9,12 +10,73 @@ interface Props {
 export default function ExecutionControls({ execution, onStart, onStop }: Props) {
  const isRunning = execution?.status === 'running' || execution?.status === 'paused';
  // Send the steps of the active workflow to the backend proxy
  // (/api/v3/execute-windows), which forwards them to the streaming server
  // for execution on the Windows PC. The outcome is reported via alert().
  const handleExecuteWindows = async () => {
    try {
      // Read the active workflow from the session state.
      const stateResp = await fetch('/api/v3/session/state');
      if (!stateResp.ok) {
        alert(`Erreur: état de session indisponible (HTTP ${stateResp.status})`);
        return;
      }
      const state = await stateResp.json();
      let workflowId = state?.session?.active_workflow_id;
      let steps = state?.workflow?.steps || [];

      // No active workflow: fall back to the first one available.
      if (!steps.length && state?.workflows_list?.length) {
        const firstWf = state.workflows_list[0];
        workflowId = firstWf.id;
        const wfResp = await fetch(`/api/v3/workflow/${firstWf.id}`);
        if (wfResp.ok) {
          const wfData = await wfResp.json();
          steps = wfData?.steps || wfData?.workflow?.steps || [];
        }
      }

      if (!steps.length) {
        alert('Aucune étape dans le workflow. Sélectionnez un workflow d\'abord.');
        return;
      }

      // Goes through the Vite proxy (/api -> port 5002).
      const resp = await fetch('/api/v3/execute-windows', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          workflow_id: workflowId,
          session_id: `replay_${Date.now()}`,
          actions: steps.map((step: any, i: number) => ({
            action_id: step.id || `action_${i}`,
            type: step.action_type,
            parameters: step.parameters || {},
            anchor_id: step.anchor_id || null,
            order: i,
          })),
        }),
      });

      // Parse defensively: a proxy failure can return a non-JSON error page,
      // which would otherwise surface as an opaque SyntaxError.
      const raw = await resp.text();
      let result: any = null;
      try {
        result = JSON.parse(raw);
      } catch (_parseErr) {
        result = { error: `HTTP ${resp.status}` };
      }

      if (result?.replay_id) {
        alert(`Replay lancé sur Windows ! ID: ${result.replay_id}`);
      } else {
        alert(`Erreur: ${result?.error || JSON.stringify(result)}`);
      }
    } catch (err) {
      alert(`Erreur connexion streaming server: ${err}`);
    }
  };
  return (
    <div className="execution-controls">
      {!isRunning ? (
        <div style={{ display: 'flex', gap: '4px' }}>
          <button className="btn-start" onClick={onStart}>
            ▶️ Exécuter
          </button>
          <button
            className="btn-start"
            onClick={handleExecuteWindows}
            style={{ background: '#0078d4', fontSize: '12px' }}
            title="Envoyer les actions au PC Windows via le streaming server"
          >
            🖥️ Windows
          </button>
        </div>
      ) : (
        <>
          <div className="exec-status">
--- a/visual_workflow_builder/frontend_v4/src/services/uiDetection.ts
+++ b/visual_workflow_builder/frontend_v4/src/services/uiDetection.ts
@@ -2,8 +2,8 @@
 * Service de détection UI (UI-DETR-1)
 */
-// VWB backend (port 5002) — contient le screen capturer et la détection UI
+// Via le proxy Vite (/api → port 5002) — fonctionne depuis n'importe quel navigateur
-const API_BASE = `http://${window.location.hostname}:5002`;
+const API_BASE = '';
 export interface UIElement {
  id: number;
@@ -52,7 +52,8 @@ export async function detectUIElements(
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
-      image_base64: imageBase64,
+      // Enlever le préfixe data:image/...;base64, si présent
      image_base64: imageBase64.replace(/^data:image\/[^;]+;base64,/, ''),
      threshold: options.threshold ?? 0.35,
      annotate: options.annotate ?? false,
      show_confidence: options.showConfidence ?? false,