Files
rpa_vision_v3/tools/grounding_e2e_resolve_engine.py
Dom bb1ea42318
Some checks failed
tests / Lint (ruff + black) (push) Failing after 1m49s
tests / Tests unitaires (sans GPU) (push) Failing after 1m53s
tests / Tests sécurité (critique) (push) Has been skipped
feat(tools): add 7 wired+bench utility scripts (A+B classification)
- A (wired, imports project modules): e2e_map_roles, anonymize_demo, grounding_e2e_resolve_engine
- B (orphan projection, standalone benches): enrichment_eval_multi, extract_easily_bench_cases, extract_record_bench_cases, grounding_eval_multi
2026-07-02 13:27:04 +02:00

101 lines
4.0 KiB
Python

#!/usr/bin/env python3
"""E2E grounding via le VRAI chemin de résolution `_resolve_by_grounding`.
Contrairement à `grounding_eval_multi.py` (qui fait son propre appel vLLM direct
avec son prompt/parser = chemin parallèle « unit-mocké »), ce harness exerce le
chemin de production réel : `agent_v0.server_v1.resolve_engine._resolve_by_grounding`
en mode `RPA_GROUNDING_ENGINE=qwen3vl_vllm`.
But : confirmer que le 0.933 du bench (tuple modèle+moteur+prompt+parse+think) se
reproduit quand c'est le code de prod qui construit le prompt, encode l'image et
parse la réponse — pas un script de bench séparé.
Scoring identique au bench original (distance euclidienne au click_region humain).
Usage (env + tunnel vLLM 8001 requis) :
RPA_GROUNDING_ENGINE=qwen3vl_vllm .venv/bin/python3 \
tools/grounding_e2e_resolve_engine.py \
--cases benchmarks/computer_use/cases/leabench_easily_clean_v2.jsonl
"""
import argparse
import json
import logging
import math
import time
from PIL import Image
from agent_v0.server_v1.resolve_engine import _resolve_by_grounding
# Configure root logging at WARNING level for the whole script.
logging.basicConfig(level=logging.WARNING)
def score(case, resolved):
    """Score one resolver answer against the case's expected click region.

    Args:
        case: benchmark case; ``case["expectation"]["click_region"]`` must hold
            ``x_pct``, ``y_pct`` and ``radius_pct``.
        resolved: dict returned by ``_resolve_by_grounding`` (read through its
            ``x_pct`` / ``y_pct`` keys), or ``None`` / empty for an abstention.

    Returns:
        A tuple ``(status, correct, dangerous, x_pct, y_pct)``:

        * ``("abstain", False, False, None, None)`` when no usable coordinates
          were produced (a miss, but not a dangerous one);
        * ``("in_region", True, False, x, y)`` when the Euclidean distance to
          the region centre is at most ``radius_pct``;
        * ``("outside_region", False, True, x, y)`` otherwise (a click landed
          in the wrong place, hence "dangerous").
    """
    region = case["expectation"]["click_region"]

    # A missing coordinate on EITHER axis is an abstention. Checking only
    # x_pct would let a half-filled answer raise and abort the whole run.
    if not resolved:
        return "abstain", False, False, None, None
    raw_x, raw_y = resolved.get("x_pct"), resolved.get("y_pct")
    if raw_x is None or raw_y is None:
        return "abstain", False, False, None, None

    x_pct, y_pct = float(raw_x), float(raw_y)
    distance = math.hypot(x_pct - region["x_pct"], y_pct - region["y_pct"])
    if distance <= region["radius_pct"]:
        return "in_region", True, False, x_pct, y_pct
    return "outside_region", False, True, x_pct, y_pct
def main():
    """Run every benchmark case through ``_resolve_by_grounding`` and report.

    Command-line arguments:
        --cases: path to a JSONL file with one case per line (required).
            Blank lines are ignored.
        --out: path of the JSONL predictions file to write; its parent
            directory is created if needed.

    For each case the screenshot size is read, the resolver is called with a
    ``{"by_text": <target_text>}`` spec, and the answer is scored with
    ``score``. An exception raised by the resolver is caught and stored in the
    row's ``error`` field so a single failing case does not stop the run.
    One line per case is printed as it completes, then all rows are written
    to ``--out`` and a summary (accuracy, dangerous clicks, abstentions,
    median latency) is printed.
    """
    import os  # local import: only needed to create the output directory

    ap = argparse.ArgumentParser()
    ap.add_argument("--cases", required=True)
    ap.add_argument("--out", default="benchmarks/computer_use/predictions/easily_e2e_resolve_engine.jsonl")
    args = ap.parse_args()

    # Explicit UTF-8 and a context manager: target texts may be non-ASCII and
    # the handle must be closed. Blank/trailing lines are not JSON documents.
    with open(args.cases, encoding="utf-8") as cases_file:
        cases = [json.loads(line) for line in cases_file if line.strip()]

    rows = []
    print(f"\n===== E2E _resolve_by_grounding (qwen3vl_vllm) — {len(cases)} cas =====", flush=True)
    for c in cases:
        # Only the dimensions are needed; close the image file right away.
        with Image.open(c["screenshot_path"]) as img:
            W, H = img.size
        target_spec = {"by_text": c["task"]["target_text"]}

        # perf_counter is monotonic, so the latency cannot be skewed by
        # wall-clock adjustments during the call.
        t0 = time.perf_counter()
        try:
            resolved = _resolve_by_grounding(c["screenshot_path"], target_spec, W, H)
            err = None
        except Exception as e:  # noqa: BLE001 - any resolver failure is recorded, not fatal
            resolved, err = None, f"{type(e).__name__}: {e}"
        dt = time.perf_counter() - t0

        status, ok, dang, xp, yp = score(c, resolved)
        method = resolved.get("method") if resolved else None
        rows.append({
            "case_id": c["case_id"], "target": c["task"]["target_text"],
            "status": status, "correct": ok, "dangerous": dang,
            "x_pct": xp, "y_pct": yp, "method": method,
            "latency_s": round(dt, 2), "error": err,
        })
        flag = "OK " if ok else ("DANGER" if dang else "abst")
        print(f" {c['case_id'][:30]:30s} {flag:6s} {status:14s} {dt:5.1f}s "
              f"{(c['task']['target_text'][:20]):20s} "
              f"{('('+str(xp)+','+str(yp)+')') if xp is not None else (err or '-')}",
              flush=True)

    # Make sure the destination exists so results are not lost after the run.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so force UTF-8
    # instead of relying on the locale's default encoding.
    with open(args.out, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    n = len(rows)
    correct = sum(r["correct"] for r in rows)
    dang = sum(r["dangerous"] for r in rows)
    abst = sum(1 for r in rows if r["status"] == "abstain")
    # Latencies that rounded to 0.0 are excluded, as in the original script;
    # for an even count the upper of the two middle values is reported.
    lats = [r["latency_s"] for r in rows if r["latency_s"]]
    lat_med = sorted(lats)[len(lats) // 2] if lats else None
    # An empty case file must not crash the summary with a division by zero.
    accuracy = correct / n if n else 0.0
    print("\n========== SYNTHÈSE E2E (vrai chemin resolve_engine) ==========")
    print(f" n={n} accuracy={accuracy:.3f} justes={correct} "
          f"DANGEREUX={dang} abstentions={abst} lat_méd={lat_med}s")
    print(f" prédictions → {args.out}")
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()