#!/usr/bin/env python3
r"""Multi-model grounding evaluation with a prompt + parser adapted per model.

Each model family has its own native coordinate convention (checked 2026-06):
  - qwen2.5-vl : absolute pixels (bbox_2d / point in px of the image sent)
  - qwen3-vl   : normalised 0-1000 (resolution independent)
  - gemma 3    : no native grounding -> a best-effort 0-1 answer is requested
Every output is converted back to (x_pct, y_pct) in [0, 1] so that a single
scorer can be shared by all models.

Measured: grounding (correct / dangerous click), per-call latency, abstentions.

Usage:
  venv_v3/bin/python3 tools/grounding_eval_multi.py \
      --cases benchmarks/computer_use/cases/leabench_easily_clean_2026-06-12.jsonl \
      --models gemma4:26b qwen2.5vl:7b-rpa qwen3-vl:8b \
      --endpoint http://127.0.0.1:11434 --engine ollama \
      --out benchmarks/computer_use/predictions/easily_multi
"""
import argparse
import base64
import io
import json
import math
import os
import re
import statistics
import time

import requests
from PIL import Image

# System prompt sent to every model (runtime text, intentionally in French).
INSTR = ("Tu localises une cible sur une capture d'écran d'interface. "
         "Si la cible n'est pas clairement visible, réponds par une abstention.")

# Keys under which a model may return a point when it ignores the {"x","y"}
# format; checked in this order, first truthy value wins.
_POINT_KEYS = ("point", "point_2d", "bbox_2d", "click", "coordinate")


def profile(model: str) -> str:
    """Map a model name to its coordinate-convention profile.

    Returns "qwen3" (0-1000 normalised), "qwen25" (absolute pixels) or
    "gemma", which is also the generic default.
    """
    m = model.lower()
    if "qwen3" in m:  # also covers "qwen3.5"
        return "qwen3"
    if "qwen2" in m or "qwenvl" in m:  # "qwen2" also covers "qwen2.5"
        return "qwen25"
    return "gemma"


def build_prompt(prof, case, W, H):
    """Build the user prompt for one case.

    The prompt is universal: every model is asked for 0-1 fractions (the most
    robust choice under resizing); the parser recovers if a model answers in
    its native format anyway. ``prof``, ``W`` and ``H`` are accepted for
    interface symmetry with ``parse_pred`` but are not used here.
    """
    q = case["task"]["question"]
    tgt = case["task"]["target_text"]
    win = case["task"].get("current_window", "")
    ctx = f"Fenêtre: {win}. Cible: « {tgt} ». {q}\n"
    fmt = ("Donne le point de clic en FRACTIONS de l'image : x et y entre 0.0 et 1.0 "
           "(0,0 = coin haut-gauche, 1,1 = coin bas-droite). "
           'Réponds UNIQUEMENT par un JSON {"x":0.xx,"y":0.xx} '
           'ou {"abstain":true} si la cible n\'est pas clairement visible.')
    return ctx + fmt


def _rescale(v, dim, prof):
    """Convert one raw coordinate to a 0-1 fraction of ``dim``.

    Values already in [0, 1] are kept. Otherwise the profile decides between
    the 0-1000 convention and pixels, with generic fallbacks for other models.
    """
    if 0 <= v <= 1.0:
        return v  # already 0-1 (instruction followed)
    if v <= 1000 and prof == "qwen3":
        return v / 1000.0  # qwen3 native 0-1000
    if v > 1.0 and prof == "qwen25":
        return v / dim  # qwen2.5 native pixels (of the image sent)
    if v <= 1000:
        return v / 1000.0  # generic 0-1000 fallback
    return v / dim  # pixel fallback


def parse_pred(prof, text, W, H):
    """Parse a model answer into ``(decision, x_pct, y_pct)``.

    ``decision`` is "click", "abstain" or "parse_error"; the coordinates are
    fractions in [0, 1] rounded to 4 decimals, or ``None`` when there is no
    click. 0-1 answers are accepted as-is; anything else is disambiguated
    according to ``prof`` (safety net for models that ignored the instruction
    and answered in their native format). ``W`` and ``H`` are the size of the
    image actually sent, used for pixel answers.
    """
    if not text:
        return "parse_error", None, None
    if re.search(r'"?abstain"?\s*:\s*true', text, re.I):
        return "abstain", None, None

    j = None
    found = re.search(r"\{.*\}", text, re.S)
    if found:
        try:
            j = json.loads(found.group(0))
        except ValueError:  # includes json.JSONDecodeError
            j = None
    if not isinstance(j, dict):
        # No usable JSON object: fall back to the first two numbers in the text.
        nums = re.findall(r"-?\d+\.?\d*", text)
        if len(nums) < 2:
            return "parse_error", None, None
        j = {"x": nums[0], "y": nums[1]}

    try:
        if "x" in j and "y" in j:
            x, y = float(j["x"]), float(j["y"])
        else:
            key, pt = next(((k, j[k]) for k in _POINT_KEYS if j.get(k)),
                           (None, None))
            if not isinstance(pt, (list, tuple)) or len(pt) < 2:
                return "parse_error", None, None
            if key == "bbox_2d" and len(pt) >= 4:
                # A box corner usually lies outside the target: click its centre.
                x = (float(pt[0]) + float(pt[2])) / 2.0
                y = (float(pt[1]) + float(pt[3])) / 2.0
            else:
                x, y = float(pt[0]), float(pt[1])
    except (TypeError, ValueError):
        # Non-numeric coordinates (null, text, nested objects) are a parsing
        # failure of the answer, not a transport error.
        return "parse_error", None, None

    xp, yp = _rescale(x, W, prof), _rescale(y, H, prof)
    if not (0 <= xp <= 1 and 0 <= yp <= 1):
        return "parse_error", None, None
    return "click", round(xp, 4), round(yp, 4)


def img_b64(path, max_edge=1280):
    """Load an image, downscale it so its longest edge is at most ``max_edge``,
    and encode it as base64 JPEG.

    Returns ``(b64, W0, H0, sent_size)`` where ``W0``/``H0`` are the original
    dimensions and ``sent_size`` is the ``(width, height)`` actually encoded.
    """
    with Image.open(path) as src:  # close the file handle once decoded
        im = src.convert("RGB")
    W0, H0 = im.size
    if max(im.size) > max_edge:
        r = max_edge / max(im.size)
        # max(1, ...) keeps very elongated images from collapsing to 0 px.
        im = im.resize((max(1, int(im.width * r)), max(1, int(im.height * r))),
                       Image.LANCZOS)
    buf = io.BytesIO()
    im.save(buf, format="JPEG", quality=90)
    return base64.b64encode(buf.getvalue()).decode(), W0, H0, im.size


def call_ollama(endpoint, model, prompt, b64, timeout):
    """Query an Ollama ``/api/chat`` endpoint; return ``(content, latency_s)``.

    Raises ``requests.HTTPError`` on a non-2xx response.
    """
    payload = {"model": model, "stream": False, "format": "json",
               "think": False,  # disable reasoning (grounding: direct answer)
               "messages": [{"role": "system", "content": INSTR},
                            {"role": "user", "content": prompt, "images": [b64]}],
               "options": {"temperature": 0.0}}
    t0 = time.perf_counter()  # monotonic clock: immune to system clock changes
    r = requests.post(f"{endpoint.rstrip('/')}/api/chat", json=payload,
                      timeout=timeout)
    dt = time.perf_counter() - t0
    r.raise_for_status()
    message = r.json().get("message") or {}
    return message.get("content") or "", dt


def call_vllm(endpoint, model, prompt, b64, timeout):
    """Query an OpenAI-compatible API (vLLM) with the image as a base64 data
    URI; return ``(content, latency_s)``.

    Raises ``requests.HTTPError`` on a non-2xx response.
    """
    payload = {"model": model, "temperature": 0.0, "max_tokens": 256,
               "chat_template_kwargs": {"enable_thinking": False},  # no reasoning
               "messages": [{"role": "system", "content": INSTR},
                            {"role": "user", "content": [
                                {"type": "text", "text": prompt},
                                {"type": "image_url", "image_url": {
                                    "url": f"data:image/jpeg;base64,{b64}"}}]}]}
    t0 = time.perf_counter()
    r = requests.post(f"{endpoint.rstrip('/')}/v1/chat/completions", json=payload,
                      timeout=timeout)
    dt = time.perf_counter() - t0
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"], dt


def call_model(engine, endpoint, model, prompt, b64, timeout):
    """Dispatch to the vLLM or Ollama client according to ``engine``."""
    if engine == "vllm":
        return call_vllm(endpoint, model, prompt, b64, timeout)
    return call_ollama(endpoint, model, prompt, b64, timeout)


def score(case, decision, xp, yp):
    """Score one prediction against the expected click region.

    Returns ``(status, correct, dangerous)``. Any non-click decision is
    reported as "abstain": neither correct nor dangerous (on a case where a
    click was expected this is a non-dangerous miss). A click is correct when
    it lies within ``radius_pct`` of the region centre, dangerous otherwise.
    """
    reg = case["expectation"]["click_region"]
    if decision != "click":
        return "abstain", False, False
    d = math.hypot(xp - reg["x_pct"], yp - reg["y_pct"])
    if d <= reg["radius_pct"]:
        return "in_region", True, False
    return "outside_region", False, True


def main():
    """CLI entry point: run every model on every case, write one JSONL of
    predictions per model plus a JSON summary, and print a summary table."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--cases", required=True)
    ap.add_argument("--models", nargs="+", required=True)
    ap.add_argument("--endpoint", default="http://127.0.0.1:11434")
    ap.add_argument("--engine", default="ollama", choices=["ollama", "vllm"])
    ap.add_argument("--timeout", type=int, default=120)
    ap.add_argument("--out", required=True)
    args = ap.parse_args()

    with open(args.cases, encoding="utf-8") as f:
        cases = [json.loads(line) for line in f if line.strip()]
    if not cases:
        ap.error(f"no cases found in {args.cases}")
    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
    summary = []

    for model in args.models:
        prof = profile(model)
        rows = []
        print(f"\n===== {model} (profil={prof}) =====", flush=True)
        for c in cases:
            dt = None
            # Deliberately broad: one failing case (unreadable screenshot,
            # HTTP error, timeout, malformed response) must not abort the run.
            try:
                b64, _w0, _h0, sent = img_b64(c["screenshot_path"])
                Ws, Hs = sent  # size actually sent (for the pixel fallback)
                prompt = build_prompt(prof, c, Ws, Hs)
                text, dt = call_model(args.engine, args.endpoint, model, prompt,
                                      b64, args.timeout)
                dec, xp, yp = parse_pred(prof, text, Ws, Hs)
            except Exception as e:
                text, dec, xp, yp = f"ERR:{e}", "error", None, None
            status, ok, dang = score(c, dec, xp, yp)
            rows.append({"case_id": c["case_id"], "model": model, "profile": prof,
                         "decision": dec, "x_pct": xp, "y_pct": yp,
                         "latency_s": round(dt, 2) if dt is not None else None,
                         "status": status, "correct": ok, "dangerous": dang,
                         "target": c["task"]["target_text"],
                         # raw answer or error text, kept for debugging
                         "raw": text})
            latency = f"{round(dt, 1)}s" if dt is not None else "-"
            print(f" {c['case_id'][:34]:34s} {dec:11s} {status:14s} "
                  f"{latency:>6} {c['task']['target_text'][:18]!r}", flush=True)

        pred_path = f"{args.out}_{model.replace(':','_').replace('/','_')}.jsonl"
        # ensure_ascii=False emits non-ASCII text, so the encoding must be explicit.
        with open(pred_path, "w", encoding="utf-8") as f:
            for r in rows:
                f.write(json.dumps(r, ensure_ascii=False) + "\n")

        n = len(rows)
        correct = sum(r["correct"] for r in rows)
        dang = sum(r["dangerous"] for r in rows)
        abst = sum(1 for r in rows
                   if r["decision"] in ("abstain", "parse_error", "error"))
        lats = [r["latency_s"] for r in rows if r["latency_s"] is not None]
        summary.append({"model": model, "profile": prof, "n": n,
                        "accuracy": round(correct / n, 3), "correct": correct,
                        "dangerous": dang, "abstain_or_err": abst,
                        "latency_med": round(statistics.median(lats), 1) if lats else None,
                        "latency_max": round(max(lats), 1) if lats else None})

    print("\n\n========== SYNTHÈSE GROUNDING (Easily réel) ==========")
    print(f"{'modèle':22s} {'prof':7s} {'acc':>5} {'just':>5} {'DANG':>5} {'abst':>5} {'lat_méd':>8} {'lat_max':>8}")
    for s in summary:
        print(f"{s['model']:22s} {s['profile']:7s} {s['accuracy']:>5} "
              f"{s['correct']:>5} {s['dangerous']:>5} {s['abstain_or_err']:>5} "
              f"{str(s['latency_med'])+'s':>8} {str(s['latency_max'])+'s':>8}")
    with open(f"{args.out}_summary.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    main()