Files
rpa_vision_v3/tools/benchmark_grounding.py
Dom 9da589c8c2 feat(grounding): centralized pipeline + transformers UI-TARS server + dead code cleanup
Complete grounding architecture:
- core/grounding/server.py: FastAPI server (port 8200) running UI-TARS-1.5-7B in 4-bit NF4
  Separate process with its own CUDA context (fixes the Flask/CUDA crash)
- core/grounding/pipeline.py: cascade orchestrator template→OCR→UI-TARS→static (see the sketch after this list)
- core/grounding/template_matcher.py: centralized TemplateMatcher (replaces 5 copies)
- core/grounding/ui_tars_grounder.py: HTTP client to the grounding server
- core/grounding/target.py: GroundingTarget + GroundingResult
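
A minimal sketch of what the cascade orchestrator could look like; the GroundingResult fields, the Stage signature and the toy stages are assumptions for illustration, not the actual pipeline.py API:

from dataclasses import dataclass
from typing import Callable, Iterable, Optional

@dataclass
class GroundingResult:
    x: int
    y: int
    method: str        # which stage produced the hit
    confidence: float

# A stage takes (target description, screenshot) and returns a hit or None.
Stage = Callable[[str, object], Optional[GroundingResult]]

def ground(target: str, screen: object, stages: Iterable[Stage]) -> Optional[GroundingResult]:
    """Run the stages cheapest-first and return the first hit (None if all miss)."""
    for stage in stages:
        hit = stage(target, screen)
        if hit is not None:
            return hit
    return None

if __name__ == "__main__":
    # Toy stand-ins for template -> OCR -> UI-TARS -> static fallback.
    miss = lambda target, screen: None
    static = lambda target, screen: GroundingResult(640, 360, "static", 0.0)
    print(ground("Demo folder", None, [miss, miss, miss, static]))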

ORA changes:
- _act_click(): single screen capture sent to the grounding server
- VLM pre-check skipped for ui_tars (redundant, and Ollama has no VRAM left)
- verify_level='none' by default (OCR title verification planned for Phase 2)
- Detection of negative UI-TARS answers ("I don't see it" → OCR fallback), see the sketch below
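
A minimal sketch of such a negative-answer check; the phrase list and function name are illustrative, not the actual ORA code:

import re

# Phrases that suggest UI-TARS could not locate the element (illustrative list).
NEGATIVE_ANSWER = re.compile(
    r"i (do not|don't|can't|cannot) (see|find|locate)|not (visible|present|found)",
    re.IGNORECASE,
)

def is_negative_answer(text: str) -> bool:
    """True when the grounding model says it cannot see the target."""
    return bool(NEGATIVE_ANSWER.search(text))

if __name__ == "__main__":
    print(is_negative_answer("I don't see it"))                 # True  -> fall back to OCR
    print(is_negative_answer("click(start_box='(512,300)')"))   # False -> use the coordinates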

Cleanup:
- 9 dead files archived in _archive/ (~6300 lines removed)
- 21 tests added for TemplateMatcher

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 17:48:18 +02:00

219 lines
8.6 KiB
Python

#!/usr/bin/env python3
"""
Benchmark complet des méthodes de grounding visuel.
À lancer avec la VM Windows visible à l'écran, bureau avec dossier Demo.
Usage:
cd ~/ai/rpa_vision_v3
.venv/bin/python3 tools/benchmark_grounding.py
"""
import mss, io, base64, requests, time, re, cv2, numpy as np, os, glob, json
from PIL import Image
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
ANCHOR_DIR = 'visual_workflow_builder/backend/data/anchors'
def capture_screen():
    with mss.mss() as sct:
        grab = sct.grab(sct.monitors[0])
        screen = Image.frombytes('RGB', grab.size, grab.rgb)
    return screen
def screen_to_b64(screen):
    buf = io.BytesIO()
    screen.save(buf, format='JPEG', quality=70)
    return base64.b64encode(buf.getvalue()).decode()
def parse_coords(text, screen_w, screen_h):
    for pat in [
        r"start_box='?\<?\|?box_start\|?\>?\((\d+),(\d+)\)",
        r'\((\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\)',
        r'\[(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\]',
    ]:
        m = re.search(pat, text)
        if m:
            rx, ry = float(m.group(1)), float(m.group(2))
            if rx <= 1.0 and ry <= 1.0:
                # Coordinates normalized to 0-1
                return int(rx * screen_w), int(ry * screen_h)
            elif rx <= 1000 and ry <= 1000:
                # Coordinates normalized to 0-1000 (as requested in the VLM prompts)
                return int(rx * screen_w / 1000), int(ry * screen_h / 1000)
            # Otherwise assume raw pixel coordinates
            return int(rx), int(ry)
    return None
def test_vlm(model, prompt, b64, screen_w, screen_h):
    t0 = time.time()
    try:
        resp = requests.post(f'{OLLAMA_URL}/api/generate', json={
            'model': model, 'prompt': prompt, 'images': [b64],
            'stream': False, 'options': {'temperature': 0.0, 'num_predict': 50}
        }, timeout=60)
        elapsed = time.time() - t0
        if resp.status_code != 200:
            return elapsed, None, f"HTTP {resp.status_code}"
        text = resp.json().get('response', '').strip()
        coords = parse_coords(text, screen_w, screen_h)
        return elapsed, coords, text[:120]
    except Exception as e:
        return time.time() - t0, None, str(e)[:80]
def test_template(screen_gray, anchor_path):
    anchor = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if anchor is None:
        return None
    ah, aw = anchor.shape[:2]
    if ah >= screen_gray.shape[0] or aw >= screen_gray.shape[1]:
        return None
    t0 = time.time()
    result = cv2.matchTemplate(screen_gray, anchor, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, max_loc = cv2.minMaxLoc(result)
    elapsed = (time.time() - t0) * 1000
    return {
        'method': 'template', 'time_ms': elapsed,
        'score': max_val, 'pos': (max_loc[0] + aw//2, max_loc[1] + ah//2)
    }
def test_template_multiscale(screen_gray, anchor_path, scales=(0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3)):
    anchor = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if anchor is None:
        return None
    ah, aw = anchor.shape[:2]
    t0 = time.time()
    best_val, best_loc, best_scale = 0, None, 1.0
    for s in scales:
        resized = cv2.resize(anchor, None, fx=s, fy=s)
        rh, rw = resized.shape[:2]
        if rh >= screen_gray.shape[0] or rw >= screen_gray.shape[1]:
            continue
        res = cv2.matchTemplate(screen_gray, resized, cv2.TM_CCOEFF_NORMED)
        _, mv, _, ml = cv2.minMaxLoc(res)
        if mv > best_val:
            best_val, best_loc, best_scale = mv, ml, s
    elapsed = (time.time() - t0) * 1000
    if best_loc is None:
        return None
    rh, rw = int(ah * best_scale), int(aw * best_scale)
    return {
        'method': 'template_multiscale', 'time_ms': elapsed,
        'score': best_val, 'pos': (best_loc[0] + rw//2, best_loc[1] + rh//2),
        'scale': best_scale
    }
def test_orb(screen_gray, anchor_path, max_distance=50):
    anchor = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if anchor is None:
        return None
    t0 = time.time()
    orb = cv2.ORB_create(nfeatures=1000)
    kp1, des1 = orb.detectAndCompute(anchor, None)
    kp2, des2 = orb.detectAndCompute(screen_gray, None)
    if des1 is None or des2 is None or len(des1) < 2 or len(des2) < 2:
        return {'method': 'ORB', 'time_ms': (time.time()-t0)*1000, 'matches': 0, 'pos': None}
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(des1, des2)
    good = sorted([m for m in matches if m.distance < max_distance], key=lambda m: m.distance)
    elapsed = (time.time() - t0) * 1000
    pos = None
    if len(good) >= 4:
        pts = np.float32([kp2[m.trainIdx].pt for m in good])
        pos = (int(np.median(pts[:, 0])), int(np.median(pts[:, 1])))
    return {'method': 'ORB', 'time_ms': elapsed, 'matches': len(good), 'pos': pos}
def test_akaze(screen_gray, anchor_path, max_distance=80):
    anchor = cv2.imread(anchor_path, cv2.IMREAD_GRAYSCALE)
    if anchor is None:
        return None
    t0 = time.time()
    akaze = cv2.AKAZE_create()
    kp1, des1 = akaze.detectAndCompute(anchor, None)
    kp2, des2 = akaze.detectAndCompute(screen_gray, None)
    if des1 is None or des2 is None or len(des1) < 2 or len(des2) < 2:
        return {'method': 'AKAZE', 'time_ms': (time.time()-t0)*1000, 'matches': 0, 'pos': None}
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(des1, des2)
    good = sorted([m for m in matches if m.distance < max_distance], key=lambda m: m.distance)
    elapsed = (time.time() - t0) * 1000
    pos = None
    if len(good) >= 4:
        pts = np.float32([kp2[m.trainIdx].pt for m in good])
        pos = (int(np.median(pts[:, 0])), int(np.median(pts[:, 1])))
    return {'method': 'AKAZE', 'time_ms': elapsed, 'matches': len(good), 'pos': pos}
def main():
    print("="*70)
    print("GROUNDING BENCHMARK — Léa RPA Vision")
    print("="*70)
    screen = capture_screen()
    screen_w, screen_h = screen.size
    b64 = screen_to_b64(screen)
    screen_cv = cv2.cvtColor(np.array(screen), cv2.COLOR_RGB2BGR)
    screen_gray = cv2.cvtColor(screen_cv, cv2.COLOR_BGR2GRAY)
    print(f"Screen: {screen_w}x{screen_h}\n")
    # ── VLM grounding ──
    print("─── VLM GROUNDING (target: 'Demo folder') ───")
    vlm_tests = [
        ("qwen3-vl:8b", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
        ("qwen2.5vl:7b", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
        ("moondream:latest", 'Where is the "Demo" folder icon? Give coordinates as (x, y) in pixels.'),
        ("gemma4:latest", 'Click on "Demo folder". Return the action in format: click(start_box="(x,y)") with coordinates normalized 0-1000.'),
    ]
    for model, prompt in vlm_tests:
        elapsed, coords, text = test_vlm(model, prompt, b64, screen_w, screen_h)
        coord_str = f"({coords[0]:4d}, {coords[1]:4d})" if coords else ""
        print(f" {model:35s} {elapsed:5.1f}s {coord_str} {text[:60]}")
    # ── OpenCV ──
    print(f"\n─── OPENCV (anchors from {ANCHOR_DIR}) ───")
    thumbs = sorted(glob.glob(f'{ANCHOR_DIR}/*_thumb.png'))[:5]
    full_imgs = sorted(glob.glob(f'{ANCHOR_DIR}/*_full.png'))[:5]  # full-size anchors, currently unused
    for thumb_path in thumbs:
        name = os.path.basename(thumb_path).replace('_thumb.png', '')[:30]
        thumb = cv2.imread(thumb_path, cv2.IMREAD_GRAYSCALE)
        ah, aw = thumb.shape[:2] if thumb is not None else (0, 0)
        print(f"\n Anchor: {name} ({aw}x{ah})")
        r = test_template(screen_gray, thumb_path)
        if r:
            print(f" Template: {r['time_ms']:6.1f}ms score={r['score']:.3f} pos={r['pos']}")
        r = test_template_multiscale(screen_gray, thumb_path)
        if r:
            print(f" Template multi-s: {r['time_ms']:6.1f}ms score={r['score']:.3f} pos={r['pos']} scale={r['scale']}")
        r = test_orb(screen_gray, thumb_path)
        if r:
            print(f" ORB: {r['time_ms']:6.1f}ms matches={r['matches']:3d} pos={r['pos']}")
        r = test_akaze(screen_gray, thumb_path)
        if r:
            print(f" AKAZE: {r['time_ms']:6.1f}ms matches={r['matches']:3d} pos={r['pos']}")
    # ── Summary ──
    print(f"\n{'='*70}")
    print("SUMMARY")
    print("="*70)
    print("""
Recommended pipeline (fastest to slowest):
  1. Classic template matching   ~20-50ms    (score > 0.75 = direct hit)
  2. Multi-scale template        ~80-150ms   (robust to size changes)
  3. OCR (docTR)                 ~500-1000ms (text only)
  4. Static fallback             ~0ms        (original coordinates)
Note: feature matchers (ORB/AKAZE) are not suited to small UI anchors
(< 200x200 px): too few distinctive keypoints.
""")


if __name__ == '__main__':
    main()
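
The OCR stage listed in the summary is not exercised by this benchmark. Below is a hedged sketch of how a docTR text lookup could be timed alongside the other methods; the helper name, the substring-matching heuristic, and its integration point are assumptions, not project code:

import time
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

def test_ocr_doctr(image_path, target, model=None):
    """Time a docTR pass and return the pixel center of the first word containing `target`."""
    model = model or ocr_predictor(pretrained=True)   # model load dominates the first call
    t0 = time.time()
    page = model(DocumentFile.from_images(image_path)).pages[0]
    h, w = page.dimensions                            # (height, width) in pixels
    pos = None
    for block in page.blocks:
        for line in block.lines:
            for word in line.words:
                if pos is None and target.lower() in word.value.lower():
                    (x0, y0), (x1, y1) = word.geometry    # relative [0, 1] box
                    pos = (int((x0 + x1) / 2 * w), int((y0 + y1) / 2 * h))
    return {'method': 'OCR (docTR)', 'time_ms': (time.time() - t0) * 1000, 'pos': pos}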