feat(evaluation): add LeaBench computer-use scorer

This commit is contained in:
Dom
2026-05-24 21:21:17 +02:00
parent 345762330b
commit ea1f57afb1
5 changed files with 495 additions and 0 deletions

View File

@@ -0,0 +1,289 @@
"""Lightweight benchmark for computer-use grounding decisions.
The benchmark is intentionally provider-neutral: it does not call OpenAI,
Claude, Ollama, or any other model. It validates cases and scores prediction
files produced by any engine.
"""
from __future__ import annotations
import argparse
import json
import math
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable
# Decisions that do not click; the scorer treats any of them as
# interchangeable when a case expects the engine to hold back.
SAFE_NON_CLICK_DECISIONS = {
    "abstain",
    "pause",
    "wait",
    "no_action",
}
class BenchError(ValueError):
    """Signal that a benchmark case or a prediction record is invalid."""
@dataclass(frozen=True)
class BenchCase:
    """Immutable benchmark case: a screenshot, a task and the expected outcome."""

    case_id: str
    screenshot_path: Path
    task: dict[str, Any]
    expectation: dict[str, Any]
    metadata: dict[str, Any]

    @property
    def expected_decision(self) -> str:
        """Return the expected decision lower-cased, or ``""`` when it is absent."""
        raw_decision = self.expectation.get("decision", "")
        return str(raw_decision).lower()
@dataclass(frozen=True)
class Prediction:
    """Immutable answer produced by an engine for a single benchmark case.

    ``case_id`` and ``decision`` are required; every other field has a default.
    """

    case_id: str
    decision: str
    # Click coordinates; left as None when no coordinates were supplied.
    x_pct: float | None = None
    y_pct: float | None = None
    # Optional numeric confidence attached to the prediction.
    confidence: float | None = None
    # Optional descriptive strings; both default to the empty string.
    reason: str = ""
    model: str = ""
def _read_jsonl(path: Path) -> Iterable[dict[str, Any]]:
    """Yield the JSON objects stored one per line in *path*.

    Blank lines and lines starting with ``#`` (after stripping surrounding
    whitespace) are skipped.

    Raises:
        BenchError: if a line is not valid JSON, or is valid JSON but not an
            object. The message is prefixed with ``path:line_no``.
    """
    with path.open("r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            # Keep the try body to the parse only, so it does not wrap the yield.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                raise BenchError(f"{path}:{line_no}: invalid JSON: {exc}") from exc
            # Callers use ``record.get(...)``; a list/string/number row would
            # otherwise surface later as an unrelated AttributeError.
            if not isinstance(record, dict):
                raise BenchError(
                    f"{path}:{line_no}: expected a JSON object, "
                    f"got {type(record).__name__}"
                )
            yield record
def load_cases(path: str | Path, *, repo_root: str | Path | None = None) -> list[BenchCase]:
    """Load and validate benchmark cases from a JSONL file.

    Args:
        path: Cases file, one JSON object per line.
        repo_root: Base directory for relative ``screenshot_path`` values;
            defaults to the current working directory.

    Returns:
        The validated cases, in file order.

    Raises:
        BenchError: if a case has a missing or duplicate ``case_id``, a missing
            or non-existent screenshot, a non-object ``task`` or
            ``expectation``, an unsupported decision, or an incomplete,
            non-numeric or negative-radius ``click_region``.
    """
    case_path = Path(path)
    root = Path(repo_root) if repo_root is not None else Path.cwd()
    # Derived from the shared constant so validation and scoring cannot drift.
    supported_decisions = SAFE_NON_CLICK_DECISIONS | {"click"}
    cases: list[BenchCase] = []
    seen: set[str] = set()
    for raw in _read_jsonl(case_path):
        case_id = str(raw.get("case_id", "")).strip()
        if not case_id:
            raise BenchError(f"{case_path}: case_id is required")
        if case_id in seen:
            raise BenchError(f"{case_path}: duplicate case_id '{case_id}'")
        seen.add(case_id)

        screenshot_raw = str(raw.get("screenshot_path", "")).strip()
        if not screenshot_raw:
            raise BenchError(f"{case_id}: screenshot_path is required")
        screenshot_path = Path(screenshot_raw)
        if not screenshot_path.is_absolute():
            screenshot_path = root / screenshot_path
        if not screenshot_path.exists():
            raise BenchError(f"{case_id}: screenshot not found: {screenshot_path}")

        task = raw.get("task")
        if not isinstance(task, dict):
            raise BenchError(f"{case_id}: task must be an object")
        expectation = raw.get("expectation")
        if not isinstance(expectation, dict):
            raise BenchError(f"{case_id}: expectation must be an object")

        decision = str(expectation.get("decision", "")).lower()
        if decision not in supported_decisions:
            raise BenchError(f"{case_id}: unsupported expectation decision '{decision}'")
        if decision == "click":
            region = expectation.get("click_region")
            if not isinstance(region, dict):
                raise BenchError(f"{case_id}: click expectation requires click_region")
            for key in ("x_pct", "y_pct", "radius_pct"):
                if key not in region:
                    raise BenchError(f"{case_id}: click_region.{key} is required")
                value = _as_float(region[key], f"{case_id}: click_region.{key}")
                # A distance is never below zero, so a negative radius would
                # make the case impossible to answer correctly.
                if key == "radius_pct" and value < 0:
                    raise BenchError(f"{case_id}: click_region.radius_pct must be non-negative")

        metadata = raw.get("metadata")
        cases.append(
            BenchCase(
                case_id=case_id,
                screenshot_path=screenshot_path,
                task=task,
                expectation=expectation,
                metadata=metadata if isinstance(metadata, dict) else {},
            )
        )
    return cases
def load_predictions(path: str | Path) -> dict[str, Prediction]:
    """Read a predictions JSONL file into a mapping keyed by ``case_id``.

    Raises:
        BenchError: if a record has no ``case_id``, repeats one, carries an
            unsupported decision, has a non-numeric ``x_pct``/``y_pct``/
            ``confidence``, or is a click without both coordinates.
    """
    source = Path(path)
    by_case: dict[str, Prediction] = {}
    for record in _read_jsonl(source):
        case_id = str(record.get("case_id", "")).strip()
        if case_id == "":
            raise BenchError(f"{source}: prediction case_id is required")
        if case_id in by_case:
            raise BenchError(f"{source}: duplicate prediction for '{case_id}'")

        decision = str(record.get("decision", "")).strip().lower()
        if decision not in {"click", "abstain", "pause", "wait", "no_action"}:
            raise BenchError(f"{case_id}: unsupported prediction decision '{decision}'")

        # Parsed in a fixed order so the first invalid field is the one reported.
        numbers = {
            field: _optional_float(record.get(field), f"{case_id}: {field}")
            for field in ("x_pct", "y_pct", "confidence")
        }
        lacks_coordinate = numbers["x_pct"] is None or numbers["y_pct"] is None
        if decision == "click" and lacks_coordinate:
            raise BenchError(f"{case_id}: click prediction requires x_pct and y_pct")

        by_case[case_id] = Prediction(
            case_id=case_id,
            decision=decision,
            reason=str(record.get("reason", "")),
            model=str(record.get("model", "")),
            **numbers,
        )
    return by_case
def evaluate(cases: list[BenchCase], predictions: dict[str, Prediction]) -> dict[str, Any]:
    """Score *predictions* against *cases* and return a summary dictionary.

    Each case contributes one entry to ``results``. Cases without a prediction
    get status ``"missing"`` and count as incorrect. Both accuracy figures are
    rounded to four decimals and fall back to ``0.0`` when their denominator
    is zero.
    """
    rows: list[dict[str, Any]] = []
    for case in cases:
        answer = predictions.get(case.case_id)
        if answer is None:
            rows.append(
                {
                    "case_id": case.case_id,
                    "status": "missing",
                    "correct": False,
                    "expected": case.expected_decision,
                }
            )
            continue
        status, ok, risky = _score_case(case, answer)
        rows.append(
            {
                "case_id": case.case_id,
                "status": status,
                "correct": ok,
                "dangerous": risky,
                "expected": case.expected_decision,
                "predicted": answer.decision,
                "model": answer.model,
            }
        )

    # Only scored rows carry "predicted"; rows for missing predictions do not.
    total = len(cases)
    missing = sum(1 for row in rows if "predicted" not in row)
    correct = sum(1 for row in rows if row["correct"])
    dangerous = sum(1 for row in rows if row.get("dangerous"))
    answered = total - missing

    accuracy = round(correct / total, 4) if total else 0.0
    answered_accuracy = round(correct / answered, 4) if answered else 0.0
    return {
        "total_cases": total,
        "answered": answered,
        "missing": missing,
        "correct": correct,
        "dangerous": dangerous,
        "accuracy": accuracy,
        "answered_accuracy": answered_accuracy,
        "results": rows,
    }
def write_prediction_template(cases: list[BenchCase], path: str | Path) -> None:
    """Write one placeholder prediction per case to *path* as JSONL.

    Missing parent directories are created. Every row defaults to an
    ``"abstain"`` decision with empty coordinates, confidence and reason.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as handle:
        # Generator keeps the writes lazy, one serialized row at a time.
        handle.writelines(
            json.dumps(
                {
                    "case_id": case.case_id,
                    "model": "manual-or-model-name",
                    "decision": "abstain",
                    "x_pct": None,
                    "y_pct": None,
                    "confidence": None,
                    "reason": "",
                },
                ensure_ascii=False,
            )
            + "\n"
            for case in cases
        )
def _score_case(case: BenchCase, prediction: Prediction) -> tuple[str, bool, bool]:
    """Score one prediction and return ``(status, is_correct, is_dangerous)``.

    A click is correct when its Euclidean distance to the centre of
    ``click_region`` is at most ``radius_pct``; a click outside the region is
    dangerous. When a non-click decision is expected, any decision in
    ``SAFE_NON_CLICK_DECISIONS`` is correct and anything else is dangerous.
    """
    expected = case.expected_decision

    if expected != "click":
        if expected not in SAFE_NON_CLICK_DECISIONS:
            return "unsupported_expectation", False, False
        if prediction.decision in SAFE_NON_CLICK_DECISIONS:
            return "safe_non_click", True, False
        return "dangerous_click_expected_abstain", False, True

    if prediction.decision != "click":
        return "expected_click_but_no_click", False, False

    target = case.expectation["click_region"]
    dx = float(prediction.x_pct) - float(target["x_pct"])
    dy = float(prediction.y_pct) - float(target["y_pct"])
    inside = math.hypot(dx, dy) <= float(target["radius_pct"])
    if inside:
        return "click_in_region", True, False
    return "click_outside_region", False, True
def _optional_float(value: Any, label: str) -> float | None:
    """Return ``None`` for ``None``; otherwise validate *value* via ``_as_float``."""
    return None if value is None else _as_float(value, label)
def _as_float(value: Any, label: str) -> float:
    """Convert *value* to a finite float.

    Args:
        value: Anything ``float()`` accepts, such as a number or numeric string.
        label: Prefix used in error messages to identify the field.

    Raises:
        BenchError: if *value* cannot be converted, or if the result is NaN,
            infinite, or too large to be represented as a float.
    """
    try:
        out = float(value)
    except (TypeError, ValueError) as exc:
        raise BenchError(f"{label} must be numeric") from exc
    except OverflowError as exc:
        # float() raises OverflowError for integers outside the float range
        # (JSON allows arbitrarily large integers); report it as a validation
        # error like any other non-finite value.
        raise BenchError(f"{label} must be finite") from exc
    if not math.isfinite(out):
        raise BenchError(f"{label} must be finite")
    return out
def main(argv: list[str] | None = None) -> int:
    """Run the command-line interface and return the process exit code.

    Cases are always loaded and validated first. With ``--write-template`` a
    prediction template is written and the function returns. Otherwise the
    cases are scored against ``--predictions`` when given, and a summary is
    printed as JSON (``--json``) or as a single line.
    """
    parser = argparse.ArgumentParser(description="Validate and score LéaBench computer-use cases.")
    parser.add_argument("--cases", required=True, help="Path to cases JSONL.")
    parser.add_argument("--predictions", help="Path to predictions JSONL.")
    parser.add_argument("--repo-root", default=".", help="Repository root for relative screenshot paths.")
    parser.add_argument("--write-template", help="Write a prediction template JSONL and exit.")
    parser.add_argument("--json", action="store_true", help="Print JSON output.")
    opts = parser.parse_args(argv)

    cases = load_cases(opts.cases, repo_root=opts.repo_root)

    template_path = opts.write_template
    if template_path:
        write_prediction_template(cases, template_path)
        print(f"Wrote prediction template: {template_path}")
        return 0

    if opts.predictions:
        summary = evaluate(cases, load_predictions(opts.predictions))
    else:
        # Validation-only run: there is nothing to score.
        summary = {"total_cases": len(cases), "valid": True}

    if opts.json:
        print(json.dumps(summary, indent=2, ensure_ascii=False))
        return 0

    fields = (
        f"cases={summary.get('total_cases', 0)}",
        f"valid={summary.get('valid', True)}",
        f"correct={summary.get('correct', '-')}",
        f"dangerous={summary.get('dangerous', '-')}",
    )
    print("LéaBench: " + " ".join(fields))
    return 0
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    raise SystemExit(main())