#!/usr/bin/env python3
"""E2E grounding through the REAL resolution path `_resolve_by_grounding`.

Unlike `grounding_eval_multi.py` (which makes its own direct vLLM call with
its own prompt/parser, i.e. a parallel "unit-mocked" path), this harness
exercises the actual production path:
`agent_v0.server_v1.resolve_engine._resolve_by_grounding` in
`RPA_GROUNDING_ENGINE=qwen3vl_vllm` mode.

Goal: confirm that the bench's 0.933 (model + engine + prompt + parse + think
tuple) reproduces when it is the production code that builds the prompt,
encodes the image and parses the response -- not a separate bench script.

Scoring is identical to the original bench (Euclidean distance to the human
click_region).

Usage (env + vLLM tunnel on 8001 required):
    RPA_GROUNDING_ENGINE=qwen3vl_vllm .venv/bin/python3 \
        tools/grounding_e2e_resolve_engine.py \
        --cases benchmarks/computer_use/cases/leabench_easily_clean_v2.jsonl
"""
import argparse
import json
import logging
import math
import statistics
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

ScoreResult = Tuple[str, bool, bool, Optional[float], Optional[float]]


def score(case: Dict[str, Any], resolved: Optional[Dict[str, Any]]) -> ScoreResult:
    """Score one resolution against the case's expected click region.

    Rule (stated by the original author to match
    `grounding_eval_multi.score`): a click within the radius is correct, a
    click outside the radius is dangerous, an abstention is a non-dangerous
    miss.

    Args:
        case: Benchmark case; reads `case["expectation"]["click_region"]`
            with keys `x_pct`, `y_pct` and `radius_pct`.
        resolved: Dict returned by `_resolve_by_grounding`, or None/empty for
            an abstention. A result lacking either coordinate is also treated
            as an abstention.

    Returns:
        `(status, correct, dangerous, x_pct, y_pct)` where status is one of
        `"abstain"`, `"in_region"` or `"outside_region"`; the coordinates are
        floats, or None on abstention.
    """
    region = case["expectation"]["click_region"]
    # Both coordinates are required: a half-specified point is not a usable
    # click, so it is an abstention rather than a KeyError/TypeError.
    if (
        not resolved
        or resolved.get("x_pct") is None
        or resolved.get("y_pct") is None
    ):
        return "abstain", False, False, None, None

    xp, yp = float(resolved["x_pct"]), float(resolved["y_pct"])
    distance = math.hypot(xp - region["x_pct"], yp - region["y_pct"])
    if distance <= region["radius_pct"]:
        return "in_region", True, False, xp, yp
    return "outside_region", False, True, xp, yp


def _load_cases(path: str) -> List[Dict[str, Any]]:
    """Read a JSONL case file, skipping blank lines, and close it afterwards."""
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _write_rows(path: Path, rows: List[Dict[str, Any]]) -> None:
    """Write one JSON object per line; UTF-8 because ensure_ascii is off."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in rows)


def _print_summary(rows: List[Dict[str, Any]], out: str) -> None:
    """Print aggregate accuracy, danger/abstention counts and median latency."""
    n = len(rows)
    correct = sum(r["correct"] for r in rows)
    dang = sum(r["dangerous"] for r in rows)
    abst = sum(1 for r in rows if r["status"] == "abstain")
    # Zero latencies are excluded, as in the original computation.
    lats = [r["latency_s"] for r in rows if r["latency_s"]]
    # median_high == sorted(lats)[len(lats) // 2]: always an observed value.
    lat_med = statistics.median_high(lats) if lats else None
    # An empty case file must not crash the summary with a division by zero.
    accuracy = f"{correct / n:.3f}" if n else "n/a"

    print("\n========== SYNTHÈSE E2E (vrai chemin resolve_engine) ==========")
    print(f" n={n} accuracy={accuracy} justes={correct} "
          f"DANGEREUX={dang} abstentions={abst} lat_méd={lat_med}s")
    print(f" prédictions → {out}")


def main() -> None:
    """Run every case through `_resolve_by_grounding`, score it and report.

    Reads the JSONL file given by `--cases`, writes one prediction row per
    case to `--out` (JSONL), and prints a per-case line plus a summary.
    Exceptions raised by `_resolve_by_grounding` are recorded in the row's
    `error` field and scored as abstentions; an unreadable screenshot still
    aborts the run, since that is a dataset problem, not a model result.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--cases", required=True)
    ap.add_argument("--out", default="benchmarks/computer_use/predictions/easily_e2e_resolve_engine.jsonl")
    args = ap.parse_args()

    # Deferred so that `--help` and importing `score` do not need PIL or the
    # server stack. Relative order (imports, then logging config) is kept.
    from PIL import Image
    from agent_v0.server_v1.resolve_engine import _resolve_by_grounding

    logging.basicConfig(level=logging.WARNING)

    cases = _load_cases(args.cases)

    # Create the output directory up front: failing here is cheap, failing
    # after the whole (slow) run would lose every result.
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    rows = []
    print(f"\n===== E2E _resolve_by_grounding (qwen3vl_vllm) — {len(cases)} cas =====", flush=True)
    for c in cases:
        screenshot = c["screenshot_path"]
        target = c["task"]["target_text"]
        with Image.open(screenshot) as img:
            W, H = img.size
        target_spec = {"by_text": target}

        # perf_counter is monotonic, so latency is immune to clock changes.
        t0 = time.perf_counter()
        try:
            resolved = _resolve_by_grounding(screenshot, target_spec, W, H)
            err = None
        except Exception as e:  # noqa: BLE001 - one bad case must not end the run
            resolved, err = None, f"{type(e).__name__}: {e}"
        dt = time.perf_counter() - t0

        status, ok, dang, xp, yp = score(c, resolved)
        method = resolved.get("method") if resolved else None
        rows.append({
            "case_id": c["case_id"],
            "target": target,
            "status": status,
            "correct": ok,
            "dangerous": dang,
            "x_pct": xp,
            "y_pct": yp,
            "method": method,
            "latency_s": round(dt, 2),
            "error": err,
        })

        flag = "OK " if ok else ("DANGER" if dang else "abst")
        detail = f"({xp},{yp})" if xp is not None else (err or "-")
        print(f" {c['case_id'][:30]:30s} {flag:6s} {status:14s} {dt:5.1f}s "
              f"{target[:20]:20s} "
              f"{detail}", flush=True)

    _write_rows(out_path, rows)
    _print_summary(rows, args.out)


if __name__ == "__main__":
    main()