Files
rpa_vision_v3/tools/benchmark_grounding.py
Dom 9da589c8c2 feat(grounding): centralized pipeline + transformers UI-TARS server + dead code cleanup
Complete grounding architecture:
- core/grounding/server.py: FastAPI server (port 8200) running UI-TARS-1.5-7B in 4-bit NF4
  Separate process with its own CUDA context (fixes the Flask/CUDA crash)
- core/grounding/pipeline.py: cascade orchestrator template→OCR→UI-TARS→static (see the sketch after this list)
- core/grounding/template_matcher.py: centralized TemplateMatcher (replaces 5 copies)
- core/grounding/ui_tars_grounder.py: HTTP client to the grounding server
- core/grounding/target.py: GroundingTarget + GroundingResult
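
A minimal sketch of what the cascade orchestrator could look like; the GroundingResult fields, the Stage signature and the toy stages are assumptions for illustration, not the actual pipeline.py API:

from dataclasses import dataclass
from typing import Callable, Iterable, Optional

@dataclass
class GroundingResult:
    x: int
    y: int
    method: str        # which stage produced the hit
    confidence: float

# A stage takes (target description, screenshot) and returns a hit or None.
Stage = Callable[[str, object], Optional[GroundingResult]]

def ground(target: str, screen: object, stages: Iterable[Stage]) -> Optional[GroundingResult]:
    """Run the stages cheapest-first and return the first hit (None if all miss)."""
    for stage in stages:
        hit = stage(target, screen)
        if hit is not None:
            return hit
    return None

if __name__ == "__main__":
    # Toy stand-ins for template -> OCR -> UI-TARS -> static fallback.
    miss = lambda target, screen: None
    static = lambda target, screen: GroundingResult(640, 360, "static", 0.0)
    print(ground("Demo folder", None, [miss, miss, miss, static]))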

ORA changes:
- _act_click(): single screen capture sent to the grounding server
- VLM pre-check skipped for ui_tars (redundant, and Ollama has no VRAM left)
- verify_level='none' by default (OCR title verification planned for Phase 2)
- Detection of negative UI-TARS answers ("I don't see it" → OCR fallback), see the sketch below
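
A minimal sketch of such a negative-answer check; the phrase list and function name are illustrative, not the actual ORA code:

import re

# Phrases that suggest UI-TARS could not locate the element (illustrative list).
NEGATIVE_ANSWER = re.compile(
    r"i (do not|don't|can't|cannot) (see|find|locate)|not (visible|present|found)",
    re.IGNORECASE,
)

def is_negative_answer(text: str) -> bool:
    """True when the grounding model says it cannot see the target."""
    return bool(NEGATIVE_ANSWER.search(text))

if __name__ == "__main__":
    print(is_negative_answer("I don't see it"))                 # True  -> fall back to OCR
    print(is_negative_answer("click(start_box='(512,300)')"))   # False -> use the coordinates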

Cleanup:
- 9 dead files archived in _archive/ (~6300 lines removed)
- 21 tests added for TemplateMatcher

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 17:48:18 +02:00

219 lines
8.6 KiB
Python

#!/usr/bin/env python3
"""
Benchmark complet des méthodes de grounding visuel.
À lancer avec la VM Windows visible à l'écran, bureau avec dossier Demo.
Usage:
cd ~/ai/rpa_vision_v3
.venv/bin/python3 tools/benchmark_grounding.py
"""
import mss, io, base64, requests, time, re, cv2, numpy as np, os, glob, json
from PIL import Image
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
ANCHOR_DIR = 'visual_workflow_builder/backend/data/anchors'
def capture_screen():
    with mss.mss() as sct:
        grab = sct.grab(sct.monitors[0])
        screen = Image.frombytes('RGB', grab.size, grab.rgb)
    return screen
def screen_to_b64(screen):
    buf = io.BytesIO()
    screen.save(buf, format='JPEG', quality=70)
    return base64.b64encode(buf.getvalue()).decode()
def parse_coords(text, screen_w, screen_h):
    for pat in [
        r"start_box='?\<?\|?box_start\|?\>?\((\d+),(\d+)\)",
        r'\((\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\)',
        r'\[(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\]',
    ]:
        m = re.search(pat, text)
        if m:
            rx, ry = float(m.group(1)), float(m.group(2))
            if rx <= 1.0 and ry <= 1.0:
                # Coordinates normalized to 0-1
                return int(rx * screen_w), int(ry * screen_h)
            elif rx <= 1000 and ry <= 1000:
                # Coordinates normalized to 0-1000 (as requested in the VLM prompts)
                return int(rx * screen_w / 1000), int(ry * screen_h / 1000)
            # Otherwise assume raw pixel coordinates
            return int(rx), int(ry)
    return None
def test_vlm(model, prompt, b64, screen_w, screen_h):
    t0 = time.time()
    try:
        resp = requests.post(f'{OLLAMA_URL}/api/generate', json={
            'model': model, 'prompt': prompt, 'images': [b64],
            'stream': False, 'options': {'temperature': 0.0, 'num_predict': 50}
        }, timeout=60)
        elapsed = time.time() - t0
        if resp.status_code != 200:
            return elapsed, None, f"HTTP {resp.status_code}"
        text = resp.json().get('response', '').strip()
        coords = parse_coords(text, screen_w, screen_h)
        return elapsed, coords, text[:120]
    except Exception as e:
        return time.time() - t0, None, str(e)[:80]
def test_template(screen_gray, anchor_path):
    anchor = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if anchor is None:
        return None
    ah, aw = anchor.shape[:2]
    if ah >= screen_gray.shape[0] or aw >= screen_gray.shape[1]:
        return None
    t0 = time.time()
    result = cv2.matchTemplate(screen_gray, anchor, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, max_loc = cv2.minMaxLoc(result)
    elapsed = (time.time() - t0) * 1000
    return {
        'method': 'template', 'time_ms': elapsed,
        'score': max_val, 'pos': (max_loc[0] + aw//2, max_loc[1] + ah//2)
    }
def test_template_multiscale(screen_gray, anchor_path, scales=(0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3)):
    anchor = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if anchor is None:
        return None
    ah, aw = anchor.shape[:2]
    t0 = time.time()
    best_val, best_loc, best_scale = 0, None, 1.0
    for s in scales:
        resized = cv2.resize(anchor, None, fx=s, fy=s)
        rh, rw = resized.shape[:2]
        if rh >= screen_gray.shape[0] or rw >= screen_gray.shape[1]:
            continue
        res = cv2.matchTemplate(screen_gray, resized, cv2.TM_CCOEFF_NORMED)
        _, mv, _, ml = cv2.minMaxLoc(res)
        if mv > best_val:
            best_val, best_loc, best_scale = mv, ml, s
    elapsed = (time.time() - t0) * 1000
    if best_loc is None:
        return None
    rh, rw = int(ah * best_scale), int(aw * best_scale)
    return {
        'method': 'template_multiscale', 'time_ms': elapsed,
        'score': best_val, 'pos': (best_loc[0] + rw//2, best_loc[1] + rh//2),
        'scale': best_scale
    }
def test_orb(screen_gray, anchor_path, max_distance=50):
    anchor = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if anchor is None:
        return None
    t0 = time.time()
    orb = cv2.ORB_create(nfeatures=1000)
    kp1, des1 = orb.detectAndCompute(anchor, None)
    kp2, des2 = orb.detectAndCompute(screen_gray, None)
    if des1 is None or des2 is None or len(des1) < 2 or len(des2) < 2:
        return {'method': 'ORB', 'time_ms': (time.time()-t0)*1000, 'matches': 0, 'pos': None}
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(des1, des2)
    good = sorted([m for m in matches if m.distance < max_distance], key=lambda m: m.distance)
    elapsed = (time.time() - t0) * 1000
    pos = None
    if len(good) >= 4:
        pts = np.float32([kp2[m.trainIdx].pt for m in good])
        pos = (int(np.median(pts[:, 0])), int(np.median(pts[:, 1])))
    return {'method': 'ORB', 'time_ms': elapsed, 'matches': len(good), 'pos': pos}
def test_akaze(screen_gray, anchor_path, max_distance=80):
    anchor = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if anchor is None:
        return None
    t0 = time.time()
    akaze = cv2.AKAZE_create()
    kp1, des1 = akaze.detectAndCompute(anchor, None)
    kp2, des2 = akaze.detectAndCompute(screen_gray, None)
    if des1 is None or des2 is None or len(des1) < 2 or len(des2) < 2:
        return {'method': 'AKAZE', 'time_ms': (time.time()-t0)*1000, 'matches': 0, 'pos': None}
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(des1, des2)
    good = sorted([m for m in matches if m.distance < max_distance], key=lambda m: m.distance)
    elapsed = (time.time() - t0) * 1000
    pos = None
    if len(good) >= 4:
        pts = np.float32([kp2[m.trainIdx].pt for m in good])
        pos = (int(np.median(pts[:, 0])), int(np.median(pts[:, 1])))
    return {'method': 'AKAZE', 'time_ms': elapsed, 'matches': len(good), 'pos': pos}
def main():
    print("="*70)
    print("GROUNDING BENCHMARK — Léa RPA Vision")
    print("="*70)
    screen = capture_screen()
    screen_w, screen_h = screen.size
    b64 = screen_to_b64(screen)
    screen_cv = cv2.cvtColor(np.array(screen), cv2.COLOR_RGB2BGR)
    screen_gray = cv2.cvtColor(screen_cv, cv2.COLOR_BGR2GRAY)
    print(f"Screen: {screen_w}x{screen_h}\n")
    # ── VLM grounding ──
    print("─── VLM GROUNDING (target: 'Demo folder') ───")
    vlm_tests = [
        ("qwen3-vl:8b", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
        ("qwen2.5vl:7b", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
        ("moondream:latest", 'Where is the "Demo" folder icon? Give coordinates as (x, y) in pixels.'),
        ("gemma4:latest", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
    ]
    for model, prompt in vlm_tests:
        elapsed, coords, text = test_vlm(model, prompt, b64, screen_w, screen_h)
        coord_str = f"({coords[0]:4d}, {coords[1]:4d})" if coords else ""
        print(f" {model:35s} {elapsed:5.1f}s {coord_str} {text[:60]}")
    # ── OpenCV ──
    print(f"\n─── OPENCV (anchors from {ANCHOR_DIR}) ───")
    thumbs = sorted(glob.glob(f'{ANCHOR_DIR}/*_thumb.png'))[:5]
    full_imgs = sorted(glob.glob(f'{ANCHOR_DIR}/*_full.png'))[:5]  # full-size anchors, currently unused
    for thumb_path in thumbs:
        name = os.path.basename(thumb_path).replace('_thumb.png', '')[:30]
        thumb = cv2.imread(thumb_path, cv2.IMREAD_GRAYSCALE)
        ah, aw = thumb.shape[:2] if thumb is not None else (0, 0)
        print(f"\n Anchor: {name} ({aw}x{ah})")
        r = test_template(screen_gray, thumb_path)
        if r:
            print(f" Template: {r['time_ms']:6.1f}ms score={r['score']:.3f} pos={r['pos']}")
        r = test_template_multiscale(screen_gray, thumb_path)
        if r:
            print(f" Template multi-s: {r['time_ms']:6.1f}ms score={r['score']:.3f} pos={r['pos']} scale={r['scale']}")
        r = test_orb(screen_gray, thumb_path)
        if r:
            print(f" ORB: {r['time_ms']:6.1f}ms matches={r['matches']:3d} pos={r['pos']}")
        r = test_akaze(screen_gray, thumb_path)
        if r:
            print(f" AKAZE: {r['time_ms']:6.1f}ms matches={r['matches']:3d} pos={r['pos']}")
    # ── Summary ──
    print(f"\n{'='*70}")
    print("SUMMARY")
    print("="*70)
    print("""
Recommended pipeline (fastest to slowest):
  1. Classic template matching   ~20-50ms    (score > 0.75 = direct hit)
  2. Multi-scale template        ~80-150ms   (robust to size changes)
  3. OCR (docTR)                 ~500-1000ms (text only)
  4. Static fallback             ~0ms        (original coordinates)
Note: feature matchers (ORB/AKAZE) are not suited to small UI anchors
(< 200x200 px): too few distinctive keypoints.
""")


if __name__ == '__main__':
    main()
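
The OCR stage listed in the summary is not exercised by this benchmark. Below is a hedged sketch of how a docTR text lookup could be timed alongside the other methods; the helper name, the substring-matching heuristic, and its integration point are assumptions, not project code:

import time
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

def test_ocr_doctr(image_path, target, model=None):
    """Time a docTR pass and return the pixel center of the first word containing `target`."""
    model = model or ocr_predictor(pretrained=True)   # model load dominates the first call
    t0 = time.time()
    page = model(DocumentFile.from_images(image_path)).pages[0]
    h, w = page.dimensions                            # (height, width) in pixels
    pos = None
    for block in page.blocks:
        for line in block.lines:
            for word in line.words:
                if pos is None and target.lower() in word.value.lower():
                    (x0, y0), (x1, y1) = word.geometry    # relative [0, 1] box
                    pos = (int((x0 + x1) / 2 * w), int((y0 + y1) / 2 * h))
    return {'method': 'OCR (docTR)', 'time_ms': (time.time() - t0) * 1000, 'pos': pos}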