feat(grounding): pipeline centralisé + serveur UI-TARS transformers + nettoyage code mort
Architecture grounding complète :
- core/grounding/server.py : serveur FastAPI (port 8200) avec UI-TARS-1.5-7B en 4-bit NF4
Process séparé avec son propre contexte CUDA (résout le crash Flask/CUDA)
- core/grounding/pipeline.py : orchestrateur cascade template→OCR→UI-TARS→static
- core/grounding/template_matcher.py : TemplateMatcher centralisé (remplace 5 copies)
- core/grounding/ui_tars_grounder.py : client HTTP vers le serveur de grounding
- core/grounding/target.py : GroundingTarget + GroundingResult
ORA modifié :
- _act_click() : capture unique de l'écran envoyée au serveur de grounding
- Pre-check VLM skippé pour ui_tars (redondant, et Ollama n'a plus de VRAM)
- verify_level='none' par défaut (vérification titre OCR prévue en Phase 2)
- Détection réponses négatives UI-TARS ("I don't see it" → fallback OCR)
Nettoyage :
- 9 fichiers morts archivés dans _archive/ (~6300 lignes supprimées)
- 21 tests ajoutés pour TemplateMatcher
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
218
tools/benchmark_grounding.py
Normal file
218
tools/benchmark_grounding.py
Normal file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Benchmark complet des méthodes de grounding visuel.
|
||||
À lancer avec la VM Windows visible à l'écran, bureau avec dossier Demo.
|
||||
|
||||
Usage:
|
||||
cd ~/ai/rpa_vision_v3
|
||||
.venv/bin/python3 tools/benchmark_grounding.py
|
||||
"""
|
||||
import mss, io, base64, requests, time, re, cv2, numpy as np, os, glob, json
|
||||
from PIL import Image
|
||||
|
||||
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
|
||||
ANCHOR_DIR = 'visual_workflow_builder/backend/data/anchors'
|
||||
|
||||
|
||||
def capture_screen():
    """Grab the full virtual desktop and return it as a PIL RGB image."""
    with mss.mss() as sct:
        # monitors[0] is the bounding box covering every attached monitor.
        raw = sct.grab(sct.monitors[0])
        return Image.frombytes('RGB', raw.size, raw.rgb)
|
||||
|
||||
|
||||
def screen_to_b64(screen):
    """Encode a PIL image as a base64 JPEG string (quality 70 keeps the payload small)."""
    stream = io.BytesIO()
    screen.save(stream, format='JPEG', quality=70)
    encoded = base64.b64encode(stream.getvalue())
    return encoded.decode()
|
||||
|
||||
|
||||
def parse_coords(text, screen_w, screen_h):
    """Extract an (x, y) pixel coordinate pair from a VLM answer.

    Tries, in order: the UI-TARS ``start_box`` syntax, a ``(x, y)`` pair,
    then a ``[x, y]`` pair.  Values <= 1.0 are treated as 0-1 normalised,
    values <= 1000 as 0-1000 normalised, anything larger as raw pixels.
    Returns None when no pattern matches.
    """
    patterns = (
        r"start_box='?\<?\|?box_start\|?\>?\((\d+),(\d+)\)",
        r'\((\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\)',
        r'\[(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\]',
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if not match:
            continue
        vx, vy = float(match.group(1)), float(match.group(2))
        if vx <= 1.0 and vy <= 1.0:
            # 0-1 normalised coordinates.
            return int(vx * screen_w), int(vy * screen_h)
        if vx <= 1000 and vy <= 1000:
            # 0-1000 normalised (UI-TARS / Qwen grounding convention).
            return int(vx * screen_w / 1000), int(vy * screen_h / 1000)
        # Already absolute pixels.
        return int(vx), int(vy)
    return None
|
||||
|
||||
|
||||
def test_vlm(model, prompt, b64, screen_w, screen_h):
    """Ask one Ollama VLM where to click and time the round trip.

    Returns ``(elapsed_seconds, coords_or_None, short_text)`` where the last
    element is the truncated model answer or an error description.
    """
    started = time.time()
    try:
        payload = {
            'model': model, 'prompt': prompt, 'images': [b64],
            'stream': False, 'options': {'temperature': 0.0, 'num_predict': 50}
        }
        resp = requests.post(f'{OLLAMA_URL}/api/generate', json=payload, timeout=60)
        elapsed = time.time() - started
        if resp.status_code != 200:
            return elapsed, None, f"HTTP {resp.status_code}"
        answer = resp.json().get('response', '').strip()
        return elapsed, parse_coords(answer, screen_w, screen_h), answer[:120]
    except Exception as e:
        # Report transport/timeout failures as text instead of crashing the benchmark.
        return time.time() - started, None, str(e)[:80]
|
||||
|
||||
|
||||
def test_template(screen_gray, anchor_path):
    """Single-scale template match of an anchor image against the grayscale screen.

    Returns a result dict (method/time_ms/score/pos, pos being the anchor
    centre), or None when the anchor cannot be read or is at least as large
    as the screen in either dimension.
    """
    tmpl = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if tmpl is None:
        return None
    th, tw = tmpl.shape[:2]
    if th >= screen_gray.shape[0] or tw >= screen_gray.shape[1]:
        return None
    started = time.time()
    scores = cv2.matchTemplate(screen_gray, tmpl, cv2.TM_CCOEFF_NORMED)
    _, best_score, _, best_loc = cv2.minMaxLoc(scores)
    elapsed_ms = (time.time() - started) * 1000
    return {
        'method': 'template', 'time_ms': elapsed_ms,
        'score': best_score, 'pos': (best_loc[0] + tw//2, best_loc[1] + th//2)
    }
|
||||
|
||||
|
||||
def test_template_multiscale(screen_gray, anchor_path, scales=(0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3)):
    """Template matching at several anchor scales, keeping the best score.

    Returns a result dict including the winning scale, or None when the
    anchor cannot be read or no scaled version fits inside the screen.
    """
    tmpl = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if tmpl is None:
        return None
    base_h, base_w = tmpl.shape[:2]
    started = time.time()
    best_score, best_loc, best_scale = 0, None, 1.0
    for factor in scales:
        scaled = cv2.resize(tmpl, None, fx=factor, fy=factor)
        sh, sw = scaled.shape[:2]
        if sh >= screen_gray.shape[0] or sw >= screen_gray.shape[1]:
            # Scaled anchor no longer fits on screen at this factor.
            continue
        scores = cv2.matchTemplate(screen_gray, scaled, cv2.TM_CCOEFF_NORMED)
        _, score, _, loc = cv2.minMaxLoc(scores)
        if score > best_score:
            best_score, best_loc, best_scale = score, loc, factor
    elapsed_ms = (time.time() - started) * 1000
    if best_loc is None:
        return None
    h, w = int(base_h * best_scale), int(base_w * best_scale)
    return {
        'method': 'template_multiscale', 'time_ms': elapsed_ms,
        'score': best_score, 'pos': (best_loc[0] + w//2, best_loc[1] + h//2),
        'scale': best_scale
    }
|
||||
|
||||
|
||||
def test_orb(screen_gray, anchor_path, max_distance=50):
    """ORB feature matching of the anchor against the screen.

    Returns a dict with the good-match count and the median screen position
    of those matches (pos is None below 4 good matches), or None when the
    anchor image cannot be read.
    """
    tmpl = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if tmpl is None:
        return None
    started = time.time()
    detector = cv2.ORB_create(nfeatures=1000)
    kp_a, des_a = detector.detectAndCompute(tmpl, None)
    kp_s, des_s = detector.detectAndCompute(screen_gray, None)
    if des_a is None or des_s is None or len(des_a) < 2 or len(des_s) < 2:
        return {'method': 'ORB', 'time_ms': (time.time()-started)*1000, 'matches': 0, 'pos': None}
    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    raw_matches = matcher.match(des_a, des_s)
    good = sorted((m for m in raw_matches if m.distance < max_distance), key=lambda m: m.distance)
    elapsed_ms = (time.time() - started) * 1000
    position = None
    if len(good) >= 4:
        # Median of the matched screen keypoints is robust to a few outliers.
        pts = np.float32([kp_s[m.trainIdx].pt for m in good])
        position = (int(np.median(pts[:, 0])), int(np.median(pts[:, 1])))
    return {'method': 'ORB', 'time_ms': elapsed_ms, 'matches': len(good), 'pos': position}
|
||||
|
||||
|
||||
def test_akaze(screen_gray, anchor_path, max_distance=80):
    """AKAZE feature matching of the anchor against the screen.

    Same contract as test_orb: dict with good-match count and median screen
    position (pos is None below 4 good matches), or None when the anchor
    image cannot be read.
    """
    tmpl = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if tmpl is None:
        return None
    started = time.time()
    detector = cv2.AKAZE_create()
    kp_a, des_a = detector.detectAndCompute(tmpl, None)
    kp_s, des_s = detector.detectAndCompute(screen_gray, None)
    if des_a is None or des_s is None or len(des_a) < 2 or len(des_s) < 2:
        return {'method': 'AKAZE', 'time_ms': (time.time()-started)*1000, 'matches': 0, 'pos': None}
    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    raw_matches = matcher.match(des_a, des_s)
    good = sorted((m for m in raw_matches if m.distance < max_distance), key=lambda m: m.distance)
    elapsed_ms = (time.time() - started) * 1000
    position = None
    if len(good) >= 4:
        # Median of the matched screen keypoints is robust to a few outliers.
        pts = np.float32([kp_s[m.trainIdx].pt for m in good])
        position = (int(np.median(pts[:, 0])), int(np.median(pts[:, 1])))
    return {'method': 'AKAZE', 'time_ms': elapsed_ms, 'matches': len(good), 'pos': position}
|
||||
|
||||
|
||||
def main():
    """Run the full grounding benchmark: VLM queries, then OpenCV matchers.

    Expects the Windows VM to be visible on screen with the Demo folder on
    the desktop (see module docstring).  Prints results to stdout.
    """
    print("="*70)
    print("BENCHMARK GROUNDING — Léa RPA Vision")
    print("="*70)

    # Single screen capture reused by every method below.
    screen = capture_screen()
    screen_w, screen_h = screen.size
    b64 = screen_to_b64(screen)
    screen_cv = cv2.cvtColor(np.array(screen), cv2.COLOR_RGB2BGR)
    screen_gray = cv2.cvtColor(screen_cv, cv2.COLOR_BGR2GRAY)
    print(f"Écran: {screen_w}x{screen_h}\n")

    # ── VLM grounding ──
    print("─── VLM GROUNDING (cible: 'Demo folder') ───")
    vlm_tests = [
        ("qwen3-vl:8b", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
        ("qwen2.5vl:7b", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
        ("moondream:latest", 'Where is the "Demo" folder icon? Give coordinates as (x, y) in pixels.'),
        ("gemma4:latest", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
    ]
    for model, prompt in vlm_tests:
        elapsed, coords, text = test_vlm(model, prompt, b64, screen_w, screen_h)
        coord_str = f"({coords[0]:4d}, {coords[1]:4d})" if coords else " — "
        print(f" {model:35s} {elapsed:5.1f}s {coord_str} {text[:60]}")

    # ── OpenCV ──
    print(f"\n─── OPENCV (ancres de {ANCHOR_DIR}) ───")
    thumbs = sorted(glob.glob(f'{ANCHOR_DIR}/*_thumb.png'))[:5]

    for thumb_path in thumbs:
        name = os.path.basename(thumb_path).replace('_thumb.png', '')[:30]
        # Read the anchor ONCE.  The previous code read it twice (a colour
        # read for the None check, a grayscale read for the shape) and could
        # raise AttributeError if only the grayscale read failed.
        anchor_img = cv2.imread(thumb_path, cv2.IMREAD_GRAYSCALE)
        ah, aw = anchor_img.shape[:2] if anchor_img is not None else (0, 0)
        print(f"\n Ancre: {name} ({aw}x{ah})")

        r = test_template(screen_gray, thumb_path)
        if r:
            print(f" Template: {r['time_ms']:6.1f}ms score={r['score']:.3f} pos={r['pos']}")

        r = test_template_multiscale(screen_gray, thumb_path)
        if r:
            print(f" Template multi-s: {r['time_ms']:6.1f}ms score={r['score']:.3f} pos={r['pos']} scale={r['scale']}")

        r = test_orb(screen_gray, thumb_path)
        if r:
            print(f" ORB: {r['time_ms']:6.1f}ms matches={r['matches']:3d} pos={r['pos']}")

        r = test_akaze(screen_gray, thumb_path)
        if r:
            print(f" AKAZE: {r['time_ms']:6.1f}ms matches={r['matches']:3d} pos={r['pos']}")

    # ── Résumé ──
    print(f"\n{'='*70}")
    print("RÉSUMÉ")
    print("="*70)
    print("""
Pipeline recommandé (du plus rapide au plus lent) :
 1. Template matching classique ~20-50ms (score > 0.75 = direct)
 2. Template multi-scale ~80-150ms (robuste aux changements de taille)
 3. OCR (docTR) ~500-1000ms (texte uniquement)
 4. Static fallback ~0ms (coordonnées d'origine)

Note : les feature matchers (ORB/AKAZE) ne sont pas adaptés aux petites
ancres UI (< 200x200px) — trop peu de keypoints distinctifs.
""")
|
||||
|
||||
|
||||
# Script entry point: run the benchmark only when executed directly.
if __name__ == '__main__':
    main()
|
||||
39
tools/start_grounding_server.sh
Executable file
39
tools/start_grounding_server.sh
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# Launch the UI-TARS grounding server (port 8200).
#
# The server loads UI-TARS-1.5-7B in 4-bit NF4 inside its own Python
# process with a clean CUDA context.  The VWB Flask backend and the ORA
# loop call this server over HTTP.
#
# Usage:
#   ./tools/start_grounding_server.sh        # foreground
#   ./tools/start_grounding_server.sh --bg   # background (log in /tmp)

set -e

cd /home/dom/ai/rpa_vision_v3

VENV=".venv/bin/python3"
LOG="/tmp/grounding_server.log"

# Fail fast when the project virtualenv is missing.
if [ ! -f "$VENV" ]; then
    echo "ERREUR: venv non trouve a $VENV"
    exit 1
fi

echo "=== Serveur de Grounding UI-TARS ==="
echo "Port: 8200"
echo "Modele: ByteDance-Seed/UI-TARS-1.5-7B (4-bit NF4)"
echo ""

if [ "$1" = "--bg" ]; then
    echo "Lancement en arriere-plan (logs dans $LOG)"
    # Detach with nohup; keep the PID so the server can be stopped later.
    nohup $VENV -m core.grounding.server > "$LOG" 2>&1 &
    PID=$!
    echo "PID: $PID"
    echo "$PID" > /tmp/grounding_server.pid
    echo "Verifier: curl http://localhost:8200/health"
    echo "Logs: tail -f $LOG"
else
    $VENV -m core.grounding.server
fi
|
||||
Reference in New Issue
Block a user