feat(tools): add 7 wired+bench utility scripts (A+B classification)
- A (wired, imports project modules): e2e_map_roles, anonymize_demo, grounding_e2e_resolve_engine - B (orphan projection, standalone benches): enrichment_eval_multi, extract_easily_bench_cases, extract_record_bench_cases, grounding_eval_multi
This commit is contained in:
100
tools/grounding_e2e_resolve_engine.py
Normal file
100
tools/grounding_e2e_resolve_engine.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E2E grounding via le VRAI chemin de résolution `_resolve_by_grounding`.
|
||||
|
||||
Contrairement à `grounding_eval_multi.py` (qui fait son propre appel vLLM direct
|
||||
avec son prompt/parser = chemin parallèle « unit-mocké »), ce harness exerce le
|
||||
chemin de production réel : `agent_v0.server_v1.resolve_engine._resolve_by_grounding`
|
||||
en mode `RPA_GROUNDING_ENGINE=qwen3vl_vllm`.
|
||||
|
||||
But : confirmer que le 0.933 du bench (tuple modèle+moteur+prompt+parse+think) se
|
||||
reproduit quand c'est le code de prod qui construit le prompt, encode l'image et
|
||||
parse la réponse — pas un script de bench séparé.
|
||||
|
||||
Scoring identique au bench original (distance euclidienne au click_region humain).
|
||||
|
||||
Usage (env + tunnel vLLM 8001 requis) :
|
||||
RPA_GROUNDING_ENGINE=qwen3vl_vllm .venv/bin/python3 \
|
||||
tools/grounding_e2e_resolve_engine.py \
|
||||
--cases benchmarks/computer_use/cases/leabench_easily_clean_v2.jsonl
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import time
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from agent_v0.server_v1.resolve_engine import _resolve_by_grounding
|
||||
|
||||
# Root logger at WARNING: INFO/DEBUG records emitted by imported modules are
# not displayed while the benchmark runs.
logging.basicConfig(level=logging.WARNING)
|
||||
|
||||
|
||||
def score(case, resolved):
    """Score one grounding prediction against the human click region.

    The prediction is compared to the expected click point using the
    Euclidean distance between (x_pct, y_pct) pairs. The original author's
    note states this rule is the same as ``grounding_eval_multi.score``
    (not verifiable from this file).

    Args:
        case: Benchmark case dict. Reads
            ``case["expectation"]["click_region"]`` and its ``x_pct``,
            ``y_pct`` and ``radius_pct`` entries.
        resolved: Dict returned by ``_resolve_by_grounding``, or ``None``
            (or any falsy value) when the resolver abstained.

    Returns:
        A ``(status, correct, dangerous, x_pct, y_pct)`` tuple:

        * ``("abstain", False, False, None, None)`` when there is no
          usable prediction (a miss, but not a dangerous one);
        * ``("in_region", True, False, x, y)`` when the click falls within
          ``radius_pct`` of the expected point (boundary included);
        * ``("outside_region", False, True, x, y)`` otherwise.
    """
    region = case["expectation"]["click_region"]

    # A prediction missing EITHER coordinate is unusable: treat it as an
    # abstention instead of crashing on float(None) / a missing key.
    if (
        not resolved
        or resolved.get("x_pct") is None
        or resolved.get("y_pct") is None
    ):
        return "abstain", False, False, None, None

    xp, yp = float(resolved["x_pct"]), float(resolved["y_pct"])
    distance = math.hypot(xp - region["x_pct"], yp - region["y_pct"])
    if distance <= region["radius_pct"]:
        return "in_region", True, False, xp, yp
    return "outside_region", False, True, xp, yp
|
||||
|
||||
|
||||
def main():
    """Run every benchmark case through ``_resolve_by_grounding`` and report.

    Command-line arguments:
        --cases: path to a JSONL file, one case per line (required). Each
            case is read for ``case_id``, ``screenshot_path``,
            ``task.target_text`` and ``expectation.click_region``.
        --out: path of the JSONL predictions file to write.

    For each case the screenshot size is read, the resolver is called with a
    ``{"by_text": ...}`` target spec, and the result is scored with
    ``score``. Any exception raised by the resolver is recorded in the row's
    ``error`` field and scored as an abstention. One line per case is
    printed, all rows are written to ``--out``, and a summary is printed.
    """
    import os  # local import: only needed to prepare the output directory

    ap = argparse.ArgumentParser()
    ap.add_argument("--cases", required=True)
    ap.add_argument("--out", default="benchmarks/computer_use/predictions/easily_e2e_resolve_engine.jsonl")
    args = ap.parse_args()

    # Close the cases file deterministically and tolerate blank lines
    # (e.g. a trailing newline), which json.loads would reject.
    with open(args.cases, encoding="utf-8") as f:
        cases = [json.loads(line) for line in f if line.strip()]

    # Create the output directory up front so a bad --out path fails before
    # the model calls are made, not after the results have been computed.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    rows = []
    print(f"\n===== E2E _resolve_by_grounding (qwen3vl_vllm) — {len(cases)} cas =====", flush=True)
    for c in cases:
        # Only the dimensions are needed; release the file handle right away.
        # Kept outside the try block: an unreadable screenshot is a dataset
        # problem and still aborts the run, as before.
        with Image.open(c["screenshot_path"]) as img:
            W, H = img.size
        target_spec = {"by_text": c["task"]["target_text"]}
        t0 = time.time()
        try:
            resolved = _resolve_by_grounding(c["screenshot_path"], target_spec, W, H)
            err = None
        except Exception as e:  # noqa: BLE001
            # Best effort: a resolver failure is recorded on the row and
            # scored as an abstention so the remaining cases still run.
            resolved, err = None, f"{type(e).__name__}: {e}"
        dt = time.time() - t0
        status, ok, dang, xp, yp = score(c, resolved)
        method = resolved.get("method") if resolved else None
        rows.append({
            "case_id": c["case_id"], "target": c["task"]["target_text"],
            "status": status, "correct": ok, "dangerous": dang,
            "x_pct": xp, "y_pct": yp, "method": method,
            "latency_s": round(dt, 2), "error": err,
        })
        flag = "OK    " if ok else ("DANGER" if dang else "abst")
        print(f"  {c['case_id'][:30]:30s} {flag:6s} {status:14s} {dt:5.1f}s  "
              f"{(c['task']['target_text'][:20]):20s} "
              f"{('('+str(xp)+','+str(yp)+')') if xp is not None else (err or '-')}",
              flush=True)

    # ensure_ascii=False emits raw non-ASCII characters, so pin the encoding
    # instead of relying on the locale default.
    with open(args.out, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    n = len(rows)
    correct = sum(r["correct"] for r in rows)
    dang = sum(r["dangerous"] for r in rows)
    abst = sum(1 for r in rows if r["status"] == "abstain")
    # Latencies that round to 0.0 are excluded, as in the original script.
    lats = [r["latency_s"] for r in rows if r["latency_s"]]
    # Element at index len // 2 of the sorted list (upper median when even).
    lat_med = sorted(lats)[len(lats) // 2] if lats else None
    # An empty case file must not crash the summary with a division by zero.
    accuracy = f"{correct/n:.3f}" if n else "n/a"
    print("\n========== SYNTHÈSE E2E (vrai chemin resolve_engine) ==========")
    print(f"  n={n}  accuracy={accuracy}  justes={correct}  "
          f"DANGEREUX={dang}  abstentions={abst}  lat_méd={lat_med}s")
    print(f"  prédictions → {args.out}")
|
||||
|
||||
|
||||
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user