#!/usr/bin/env python3
"""
Full benchmark of visual grounding methods.

Run with the Windows VM visible on screen and the desktop showing the Demo folder.

Usage:
    cd ~/ai/rpa_vision_v3
    .venv/bin/python3 tools/benchmark_grounding.py
"""
import base64
import glob
import io
import os
import re
import time

import cv2
import mss
import numpy as np
import requests
from PIL import Image

OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
ANCHOR_DIR = 'visual_workflow_builder/backend/data/anchors'


def capture_screen():
    """Grab the full virtual screen (monitor 0 in mss spans all monitors)."""
    with mss.mss() as sct:
        grab = sct.grab(sct.monitors[0])
        return Image.frombytes('RGB', grab.size, grab.rgb)


def screen_to_b64(screen):
    """Encode a PIL image as a base64 JPEG for the Ollama API."""
    buf = io.BytesIO()
    screen.save(buf, format='JPEG', quality=70)
    return base64.b64encode(buf.getvalue()).decode()


def parse_coords(text, screen_w, screen_h):
    """Extract (x, y) pixel coordinates from a VLM answer.

    Handles three conventions: Qwen-style click(start_box="(x,y)"),
    bare "(x, y)" tuples, and "[x, y]" lists. Values <= 1.0 are treated
    as 0-1 normalized, values <= 1000 as 0-1000 normalized, anything
    larger as raw pixels.
    """
    for pat in [
        r'start_box=["\']?\((\d+),\s*(\d+)\)',
        r'\((\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\)',
        r'\[(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\]',
    ]:
        m = re.search(pat, text)
        if m:
            rx, ry = float(m.group(1)), float(m.group(2))
            if rx <= 1.0 and ry <= 1.0:
                return int(rx * screen_w), int(ry * screen_h)
            elif rx <= 1000 and ry <= 1000:
                return int(rx * screen_w / 1000), int(ry * screen_h / 1000)
            return int(rx), int(ry)
    return None
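
# Quick self-check of the conventions above, with illustrative inputs (not
# taken from a real model run) on a 1920x1080 screen:
#   parse_coords('click(start_box="(500,500)")', 1920, 1080)  -> (960, 540)
#   parse_coords('The icon is at (0.5, 0.5).', 1920, 1080)    -> (960, 540)
#   parse_coords('[1200, 800]', 1920, 1080)                   -> (1200, 800), raw pixels
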
def test_vlm(model, prompt, b64, screen_w, screen_h):
    """Ask a vision-language model for the target coordinates via Ollama."""
    t0 = time.time()
    try:
        resp = requests.post(f'{OLLAMA_URL}/api/generate', json={
            'model': model,
            'prompt': prompt,
            'images': [b64],
            'stream': False,
            'options': {'temperature': 0.0, 'num_predict': 50}
        }, timeout=60)
        elapsed = time.time() - t0
        if resp.status_code != 200:
            return elapsed, None, f"HTTP {resp.status_code}"
        text = resp.json().get('response', '').strip()
        coords = parse_coords(text, screen_w, screen_h)
        return elapsed, coords, text[:120]
    except Exception as e:
        return time.time() - t0, None, str(e)[:80]


def test_template(screen_gray, anchor_path):
    """Classic single-scale template matching (TM_CCOEFF_NORMED)."""
    anchor = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if anchor is None:
        return None
    ah, aw = anchor.shape[:2]
    if ah >= screen_gray.shape[0] or aw >= screen_gray.shape[1]:
        return None
    t0 = time.time()
    result = cv2.matchTemplate(screen_gray, anchor, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, max_loc = cv2.minMaxLoc(result)
    elapsed = (time.time() - t0) * 1000
    return {
        'method': 'template',
        'time_ms': elapsed,
        'score': max_val,
        'pos': (max_loc[0] + aw // 2, max_loc[1] + ah // 2),
    }


def test_template_multiscale(screen_gray, anchor_path,
                             scales=(0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3)):
    """Template matching with the anchor resized over a range of scales."""
    anchor = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if anchor is None:
        return None
    ah, aw = anchor.shape[:2]
    t0 = time.time()
    best_val, best_loc, best_scale = 0, None, 1.0
    for s in scales:
        resized = cv2.resize(anchor, None, fx=s, fy=s)
        rh, rw = resized.shape[:2]
        if rh >= screen_gray.shape[0] or rw >= screen_gray.shape[1]:
            continue
        res = cv2.matchTemplate(screen_gray, resized, cv2.TM_CCOEFF_NORMED)
        _, mv, _, ml = cv2.minMaxLoc(res)
        if mv > best_val:
            best_val, best_loc, best_scale = mv, ml, s
    elapsed = (time.time() - t0) * 1000
    if best_loc is None:
        return None
    rh, rw = int(ah * best_scale), int(aw * best_scale)
    return {
        'method': 'template_multiscale',
        'time_ms': elapsed,
        'score': best_val,
        'pos': (best_loc[0] + rw // 2, best_loc[1] + rh // 2),
        'scale': best_scale,
    }


def _feature_match(screen_gray, anchor_path, detector, name, max_distance):
    """Shared ORB/AKAZE logic: detect keypoints in anchor and screen,
    brute-force match binary descriptors, and take the median of the matched
    screen points as the estimated position."""
    anchor = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if anchor is None:
        return None
    t0 = time.time()
    kp1, des1 = detector.detectAndCompute(anchor, None)
    kp2, des2 = detector.detectAndCompute(screen_gray, None)
    if des1 is None or des2 is None or len(des1) < 2 or len(des2) < 2:
        return {'method': name, 'time_ms': (time.time() - t0) * 1000,
                'matches': 0, 'pos': None}
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(des1, des2)
    good = sorted([m for m in matches if m.distance < max_distance],
                  key=lambda m: m.distance)
    elapsed = (time.time() - t0) * 1000
    pos = None
    if len(good) >= 4:
        pts = np.float32([kp2[m.trainIdx].pt for m in good])
        pos = (int(np.median(pts[:, 0])), int(np.median(pts[:, 1])))
    return {'method': name, 'time_ms': elapsed, 'matches': len(good), 'pos': pos}


def test_orb(screen_gray, anchor_path, max_distance=50):
    return _feature_match(screen_gray, anchor_path,
                          cv2.ORB_create(nfeatures=1000), 'ORB', max_distance)


def test_akaze(screen_gray, anchor_path, max_distance=80):
    return _feature_match(screen_gray, anchor_path,
                          cv2.AKAZE_create(), 'AKAZE', max_distance)
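
# The summary printed by main() recommends a cascade: try the cheapest matcher
# first and fall through to slower ones. A minimal sketch of that cascade,
# built only from the functions above. find_anchor and the multiscale
# threshold are this sketch's own choices (the 0.75 template threshold comes
# from the summary); the OCR stage of the recommended pipeline is omitted
# because docTR is not imported in this benchmark.
def find_anchor(screen_gray, anchor_path, fallback_pos=None,
                template_threshold=0.75, multiscale_threshold=0.75):
    """Return (x, y) for an anchor, cascading classic template matching ->
    multi-scale template matching -> static fallback coordinates."""
    r = test_template(screen_gray, anchor_path)
    if r and r['score'] > template_threshold:
        return r['pos']
    r = test_template_multiscale(screen_gray, anchor_path)
    if r and r['score'] > multiscale_threshold:
        return r['pos']
    return fallback_pos
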
def main():
    print("=" * 70)
    print("GROUNDING BENCHMARK — Léa RPA Vision")
    print("=" * 70)
    screen = capture_screen()
    screen_w, screen_h = screen.size
    b64 = screen_to_b64(screen)
    screen_cv = cv2.cvtColor(np.array(screen), cv2.COLOR_RGB2BGR)
    screen_gray = cv2.cvtColor(screen_cv, cv2.COLOR_BGR2GRAY)
    print(f"Screen: {screen_w}x{screen_h}\n")

    # ── VLM grounding ──
    print("─── VLM GROUNDING (target: 'Demo folder') ───")
    grounding_prompt = ('Click on "Demo folder". Return the action in format: '
                        'click(start_box="(x,y)") with coordinates normalized 0-1000.')
    vlm_tests = [
        ("qwen3-vl:8b", grounding_prompt),
        ("qwen2.5vl:7b", grounding_prompt),
        ("moondream:latest",
         'Where is the "Demo" folder icon? Give coordinates as (x, y) in pixels.'),
        ("gemma4:latest", grounding_prompt),
    ]
    for model, prompt in vlm_tests:
        elapsed, coords, text = test_vlm(model, prompt, b64, screen_w, screen_h)
        coord_str = f"({coords[0]:4d}, {coords[1]:4d})" if coords else " — "
        print(f"  {model:35s} {elapsed:5.1f}s  {coord_str}  {text[:60]}")

    # ── OpenCV ──
    print(f"\n─── OPENCV (anchors from {ANCHOR_DIR}) ───")
    thumbs = sorted(glob.glob(f'{ANCHOR_DIR}/*_thumb.png'))[:5]
    for thumb_path in thumbs:
        name = os.path.basename(thumb_path).replace('_thumb.png', '')[:30]
        thumb = cv2.imread(thumb_path, cv2.IMREAD_GRAYSCALE)
        ah, aw = thumb.shape[:2] if thumb is not None else (0, 0)
        print(f"\n  Anchor: {name} ({aw}x{ah})")
        r = test_template(screen_gray, thumb_path)
        if r:
            print(f"    Template:         {r['time_ms']:6.1f}ms  score={r['score']:.3f}  pos={r['pos']}")
        r = test_template_multiscale(screen_gray, thumb_path)
        if r:
            print(f"    Template multi-s: {r['time_ms']:6.1f}ms  score={r['score']:.3f}  pos={r['pos']}  scale={r['scale']}")
        r = test_orb(screen_gray, thumb_path)
        if r:
            print(f"    ORB:              {r['time_ms']:6.1f}ms  matches={r['matches']:3d}  pos={r['pos']}")
        r = test_akaze(screen_gray, thumb_path)
        if r:
            print(f"    AKAZE:            {r['time_ms']:6.1f}ms  matches={r['matches']:3d}  pos={r['pos']}")

    # ── Summary ──
    print(f"\n{'=' * 70}")
    print("SUMMARY")
    print("=" * 70)
    print("""
  Recommended pipeline (fastest to slowest):
  1. Classic template matching   ~20-50ms     (score > 0.75 = use directly)
  2. Multi-scale template        ~80-150ms    (robust to size changes)
  3. OCR (docTR)                 ~500-1000ms  (text only)
  4. Static fallback             ~0ms         (original coordinates)

  Note: feature matchers (ORB/AKAZE) are not suited to small UI anchors
  (< 200x200px) — too few distinctive keypoints.
""")


if __name__ == '__main__':
    main()
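
# Hypothetical one-off use of the find_anchor cascade from a Python shell;
# the module path and anchor filename below are examples, adjust them to the
# real layout of this repo:
#   >>> from benchmark_grounding import capture_screen, find_anchor, ANCHOR_DIR
#   >>> import cv2, numpy as np
#   >>> gray = cv2.cvtColor(np.array(capture_screen()), cv2.COLOR_RGB2GRAY)
#   >>> find_anchor(gray, f'{ANCHOR_DIR}/demo_thumb.png', fallback_pos=(100, 100))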