feat(grounding): pipeline centralisé + serveur UI-TARS transformers + nettoyage code mort
Architecture grounding complète :
- core/grounding/server.py : serveur FastAPI (port 8200) avec UI-TARS-1.5-7B en 4-bit NF4
Process séparé avec son propre contexte CUDA (résout le crash Flask/CUDA)
- core/grounding/pipeline.py : orchestrateur cascade template→OCR→UI-TARS→static
- core/grounding/template_matcher.py : TemplateMatcher centralisé (remplace 5 copies)
- core/grounding/ui_tars_grounder.py : client HTTP vers le serveur de grounding
- core/grounding/target.py : GroundingTarget + GroundingResult
ORA modifié :
- _act_click() : capture unique de l'écran envoyée au serveur de grounding
- Pre-check VLM skippé pour ui_tars (redondant, et Ollama n'a plus de VRAM)
- verify_level='none' par défaut (vérification titre OCR prévue en Phase 2)
- Détection réponses négatives UI-TARS ("I don't see it" → fallback OCR)
Nettoyage :
- 9 fichiers morts archivés dans _archive/ (~6300 lignes supprimées)
- 21 tests ajoutés pour TemplateMatcher
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
218
tools/benchmark_grounding.py
Normal file
218
tools/benchmark_grounding.py
Normal file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Benchmark complet des méthodes de grounding visuel.
|
||||
À lancer avec la VM Windows visible à l'écran, bureau avec dossier Demo.
|
||||
|
||||
Usage:
|
||||
cd ~/ai/rpa_vision_v3
|
||||
.venv/bin/python3 tools/benchmark_grounding.py
|
||||
"""
|
||||
import mss, io, base64, requests, time, re, cv2, numpy as np, os, glob, json
|
||||
from PIL import Image
|
||||
|
||||
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
|
||||
ANCHOR_DIR = 'visual_workflow_builder/backend/data/anchors'
|
||||
|
||||
|
||||
def capture_screen():
    """Grab the full virtual desktop and return it as a PIL RGB image."""
    with mss.mss() as sct:
        # monitors[0] is the bounding box covering every attached monitor.
        raw = sct.grab(sct.monitors[0])
        return Image.frombytes('RGB', raw.size, raw.rgb)
|
||||
|
||||
|
||||
def screen_to_b64(screen):
    """Encode a PIL image as a base64 JPEG string (quality 70 keeps the payload small)."""
    stream = io.BytesIO()
    screen.save(stream, format='JPEG', quality=70)
    encoded = base64.b64encode(stream.getvalue())
    return encoded.decode()
|
||||
|
||||
|
||||
def parse_coords(text, screen_w, screen_h):
    """Extract an (x, y) pixel coordinate pair from a VLM answer.

    Tries, in order: the UI-TARS ``start_box`` syntax, a ``(x, y)`` pair,
    then a ``[x, y]`` pair.  Values <= 1.0 are treated as 0-1 normalised,
    values <= 1000 as 0-1000 normalised, anything larger as raw pixels.
    Returns None when no pattern matches.
    """
    patterns = (
        r"start_box='?\<?\|?box_start\|?\>?\((\d+),(\d+)\)",
        r'\((\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\)',
        r'\[(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\]',
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if not match:
            continue
        vx, vy = float(match.group(1)), float(match.group(2))
        if vx <= 1.0 and vy <= 1.0:
            # 0-1 normalised coordinates.
            return int(vx * screen_w), int(vy * screen_h)
        if vx <= 1000 and vy <= 1000:
            # 0-1000 normalised (UI-TARS / Qwen grounding convention).
            return int(vx * screen_w / 1000), int(vy * screen_h / 1000)
        # Already absolute pixels.
        return int(vx), int(vy)
    return None
|
||||
|
||||
|
||||
def test_vlm(model, prompt, b64, screen_w, screen_h):
    """Ask one Ollama VLM where to click and time the round trip.

    Returns ``(elapsed_seconds, coords_or_None, short_text)`` where the last
    element is the truncated model answer or an error description.
    """
    started = time.time()
    try:
        payload = {
            'model': model, 'prompt': prompt, 'images': [b64],
            'stream': False, 'options': {'temperature': 0.0, 'num_predict': 50}
        }
        resp = requests.post(f'{OLLAMA_URL}/api/generate', json=payload, timeout=60)
        elapsed = time.time() - started
        if resp.status_code != 200:
            return elapsed, None, f"HTTP {resp.status_code}"
        answer = resp.json().get('response', '').strip()
        return elapsed, parse_coords(answer, screen_w, screen_h), answer[:120]
    except Exception as e:
        # Report transport/timeout failures as text instead of crashing the benchmark.
        return time.time() - started, None, str(e)[:80]
|
||||
|
||||
|
||||
def test_template(screen_gray, anchor_path):
    """Single-scale template match of an anchor image against the grayscale screen.

    Returns a result dict (method/time_ms/score/pos, pos being the anchor
    centre), or None when the anchor cannot be read or is at least as large
    as the screen in either dimension.
    """
    tmpl = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if tmpl is None:
        return None
    th, tw = tmpl.shape[:2]
    if th >= screen_gray.shape[0] or tw >= screen_gray.shape[1]:
        return None
    started = time.time()
    scores = cv2.matchTemplate(screen_gray, tmpl, cv2.TM_CCOEFF_NORMED)
    _, best_score, _, best_loc = cv2.minMaxLoc(scores)
    elapsed_ms = (time.time() - started) * 1000
    return {
        'method': 'template', 'time_ms': elapsed_ms,
        'score': best_score, 'pos': (best_loc[0] + tw//2, best_loc[1] + th//2)
    }
|
||||
|
||||
|
||||
def test_template_multiscale(screen_gray, anchor_path, scales=(0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3)):
    """Template matching at several anchor scales, keeping the best score.

    Returns a result dict including the winning scale, or None when the
    anchor cannot be read or no scaled version fits inside the screen.
    """
    tmpl = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if tmpl is None:
        return None
    base_h, base_w = tmpl.shape[:2]
    started = time.time()
    best_score, best_loc, best_scale = 0, None, 1.0
    for factor in scales:
        scaled = cv2.resize(tmpl, None, fx=factor, fy=factor)
        sh, sw = scaled.shape[:2]
        if sh >= screen_gray.shape[0] or sw >= screen_gray.shape[1]:
            # Scaled anchor no longer fits on screen at this factor.
            continue
        scores = cv2.matchTemplate(screen_gray, scaled, cv2.TM_CCOEFF_NORMED)
        _, score, _, loc = cv2.minMaxLoc(scores)
        if score > best_score:
            best_score, best_loc, best_scale = score, loc, factor
    elapsed_ms = (time.time() - started) * 1000
    if best_loc is None:
        return None
    h, w = int(base_h * best_scale), int(base_w * best_scale)
    return {
        'method': 'template_multiscale', 'time_ms': elapsed_ms,
        'score': best_score, 'pos': (best_loc[0] + w//2, best_loc[1] + h//2),
        'scale': best_scale
    }
|
||||
|
||||
|
||||
def test_orb(screen_gray, anchor_path, max_distance=50):
    """ORB feature matching of the anchor against the screen.

    Returns a dict with the good-match count and the median screen position
    of those matches (pos is None below 4 good matches), or None when the
    anchor image cannot be read.
    """
    tmpl = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if tmpl is None:
        return None
    started = time.time()
    detector = cv2.ORB_create(nfeatures=1000)
    kp_a, des_a = detector.detectAndCompute(tmpl, None)
    kp_s, des_s = detector.detectAndCompute(screen_gray, None)
    if des_a is None or des_s is None or len(des_a) < 2 or len(des_s) < 2:
        return {'method': 'ORB', 'time_ms': (time.time()-started)*1000, 'matches': 0, 'pos': None}
    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    raw_matches = matcher.match(des_a, des_s)
    good = sorted((m for m in raw_matches if m.distance < max_distance), key=lambda m: m.distance)
    elapsed_ms = (time.time() - started) * 1000
    position = None
    if len(good) >= 4:
        # Median of the matched screen keypoints is robust to a few outliers.
        pts = np.float32([kp_s[m.trainIdx].pt for m in good])
        position = (int(np.median(pts[:, 0])), int(np.median(pts[:, 1])))
    return {'method': 'ORB', 'time_ms': elapsed_ms, 'matches': len(good), 'pos': position}
|
||||
|
||||
|
||||
def test_akaze(screen_gray, anchor_path, max_distance=80):
    """AKAZE feature matching of the anchor against the screen.

    Same contract as test_orb: dict with good-match count and median screen
    position (pos is None below 4 good matches), or None when the anchor
    image cannot be read.
    """
    tmpl = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if tmpl is None:
        return None
    started = time.time()
    detector = cv2.AKAZE_create()
    kp_a, des_a = detector.detectAndCompute(tmpl, None)
    kp_s, des_s = detector.detectAndCompute(screen_gray, None)
    if des_a is None or des_s is None or len(des_a) < 2 or len(des_s) < 2:
        return {'method': 'AKAZE', 'time_ms': (time.time()-started)*1000, 'matches': 0, 'pos': None}
    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    raw_matches = matcher.match(des_a, des_s)
    good = sorted((m for m in raw_matches if m.distance < max_distance), key=lambda m: m.distance)
    elapsed_ms = (time.time() - started) * 1000
    position = None
    if len(good) >= 4:
        # Median of the matched screen keypoints is robust to a few outliers.
        pts = np.float32([kp_s[m.trainIdx].pt for m in good])
        position = (int(np.median(pts[:, 0])), int(np.median(pts[:, 1])))
    return {'method': 'AKAZE', 'time_ms': elapsed_ms, 'matches': len(good), 'pos': position}
|
||||
|
||||
|
||||
def main():
    """Run the full grounding benchmark: VLM queries, then OpenCV matchers.

    Expects the Windows VM to be visible on screen with the Demo folder on
    the desktop (see module docstring).  Prints results to stdout.
    """
    print("="*70)
    print("BENCHMARK GROUNDING — Léa RPA Vision")
    print("="*70)

    # Single screen capture reused by every method below.
    screen = capture_screen()
    screen_w, screen_h = screen.size
    b64 = screen_to_b64(screen)
    screen_cv = cv2.cvtColor(np.array(screen), cv2.COLOR_RGB2BGR)
    screen_gray = cv2.cvtColor(screen_cv, cv2.COLOR_BGR2GRAY)
    print(f"Écran: {screen_w}x{screen_h}\n")

    # ── VLM grounding ──
    print("─── VLM GROUNDING (cible: 'Demo folder') ───")
    vlm_tests = [
        ("qwen3-vl:8b", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
        ("qwen2.5vl:7b", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
        ("moondream:latest", 'Where is the "Demo" folder icon? Give coordinates as (x, y) in pixels.'),
        ("gemma4:latest", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
    ]
    for model, prompt in vlm_tests:
        elapsed, coords, text = test_vlm(model, prompt, b64, screen_w, screen_h)
        coord_str = f"({coords[0]:4d}, {coords[1]:4d})" if coords else " — "
        print(f" {model:35s} {elapsed:5.1f}s {coord_str} {text[:60]}")

    # ── OpenCV ──
    print(f"\n─── OPENCV (ancres de {ANCHOR_DIR}) ───")
    thumbs = sorted(glob.glob(f'{ANCHOR_DIR}/*_thumb.png'))[:5]

    for thumb_path in thumbs:
        name = os.path.basename(thumb_path).replace('_thumb.png', '')[:30]
        # Read the anchor ONCE.  The previous code read it twice (a colour
        # read for the None check, a grayscale read for the shape) and could
        # raise AttributeError if only the grayscale read failed.
        anchor_img = cv2.imread(thumb_path, cv2.IMREAD_GRAYSCALE)
        ah, aw = anchor_img.shape[:2] if anchor_img is not None else (0, 0)
        print(f"\n Ancre: {name} ({aw}x{ah})")

        r = test_template(screen_gray, thumb_path)
        if r:
            print(f" Template: {r['time_ms']:6.1f}ms score={r['score']:.3f} pos={r['pos']}")

        r = test_template_multiscale(screen_gray, thumb_path)
        if r:
            print(f" Template multi-s: {r['time_ms']:6.1f}ms score={r['score']:.3f} pos={r['pos']} scale={r['scale']}")

        r = test_orb(screen_gray, thumb_path)
        if r:
            print(f" ORB: {r['time_ms']:6.1f}ms matches={r['matches']:3d} pos={r['pos']}")

        r = test_akaze(screen_gray, thumb_path)
        if r:
            print(f" AKAZE: {r['time_ms']:6.1f}ms matches={r['matches']:3d} pos={r['pos']}")

    # ── Résumé ──
    print(f"\n{'='*70}")
    print("RÉSUMÉ")
    print("="*70)
    print("""
Pipeline recommandé (du plus rapide au plus lent) :
 1. Template matching classique ~20-50ms (score > 0.75 = direct)
 2. Template multi-scale ~80-150ms (robuste aux changements de taille)
 3. OCR (docTR) ~500-1000ms (texte uniquement)
 4. Static fallback ~0ms (coordonnées d'origine)

Note : les feature matchers (ORB/AKAZE) ne sont pas adaptés aux petites
ancres UI (< 200x200px) — trop peu de keypoints distinctifs.
""")
|
||||
|
||||
|
||||
# Script entry point: run the benchmark only when executed directly.
if __name__ == '__main__':
    main()
|
||||
39
tools/start_grounding_server.sh
Executable file
39
tools/start_grounding_server.sh
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# Launch the UI-TARS grounding server (port 8200).
#
# The server loads UI-TARS-1.5-7B in 4-bit NF4 inside its own Python
# process with a clean CUDA context.  The VWB Flask backend and the ORA
# loop call this server over HTTP.
#
# Usage:
#   ./tools/start_grounding_server.sh        # foreground
#   ./tools/start_grounding_server.sh --bg   # background (log in /tmp)

set -e

cd /home/dom/ai/rpa_vision_v3

VENV=".venv/bin/python3"
LOG="/tmp/grounding_server.log"

# Fail fast when the project virtualenv is missing.
if [ ! -f "$VENV" ]; then
    echo "ERREUR: venv non trouve a $VENV"
    exit 1
fi

echo "=== Serveur de Grounding UI-TARS ==="
echo "Port: 8200"
echo "Modele: ByteDance-Seed/UI-TARS-1.5-7B (4-bit NF4)"
echo ""

if [ "$1" = "--bg" ]; then
    echo "Lancement en arriere-plan (logs dans $LOG)"
    # Detach with nohup; keep the PID so the server can be stopped later.
    nohup $VENV -m core.grounding.server > "$LOG" 2>&1 &
    PID=$!
    echo "PID: $PID"
    echo "$PID" > /tmp/grounding_server.pid
    echo "Verifier: curl http://localhost:8200/health"
    echo "Logs: tail -f $LOG"
else
    $VENV -m core.grounding.server
fi
|
||||
Reference in New Issue
Block a user