feat(evaluation): add LeaBench computer-use scorer

2026-05-24 21:21:17 +02:00
parent 345762330b
commit ea1f57afb1
5 changed files with 495 additions and 0 deletions
--- a/core/evaluation/computer_use_bench.py
+++ b/core/evaluation/computer_use_bench.py
@@ -0,0 +1,289 @@
+"""Lightweight benchmark for computer-use grounding decisions.
+
+The benchmark is intentionally provider-neutral: it does not call OpenAI,
+Claude, Ollama, or any other model. It validates cases and scores prediction
+files produced by any engine.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Iterable
+
+
+# Decisions that do not click anything. When a case expects one of these,
+# _score_case accepts any member of this set as a correct answer.
+SAFE_NON_CLICK_DECISIONS = {"abstain", "pause", "wait", "no_action"}
+
+
+class BenchError(ValueError):
+    """Raised when a benchmark case or prediction is invalid.
+
+    Subclasses ValueError, so callers that catch ValueError also catch it.
+    The loaders in this module raise it for unparsable JSONL lines, missing
+    or duplicate case ids, unsupported decisions and non-numeric values.
+    """
+
+
+@dataclass(frozen=True)
+class BenchCase:
+    """One immutable benchmark case, as built by load_cases."""
+
+    # Unique identifier; predictions are matched to cases through it.
+    case_id: str
+    # Screenshot location (load_cases resolves relative paths before storing).
+    screenshot_path: Path
+    # Raw "task" object from the case row; its schema is not inspected here.
+    task: dict[str, Any]
+    # Raw "expectation" object; holds "decision" and, for clicks, "click_region".
+    expectation: dict[str, Any]
+    # Free-form extra data attached to the case.
+    metadata: dict[str, Any]
+
+    @property
+    def expected_decision(self) -> str:
+        """Return the expectation's "decision" lower-cased ("" when absent)."""
+        raw_decision = self.expectation.get("decision", "")
+        return str(raw_decision).lower()
+
+
+@dataclass(frozen=True)
+class Prediction:
+    # One immutable engine answer for a single benchmark case.
+
+    # Identifier of the case this prediction answers.
+    case_id: str
+    # Predicted decision; load_predictions stores it stripped and lower-cased.
+    decision: str
+    # Click coordinates, compared against the case's click_region x_pct/y_pct.
+    # NOTE(review): presumably percentages of the screenshot size - confirm.
+    x_pct: float | None = None
+    y_pct: float | None = None
+    # Optional confidence reported by the engine; not used for scoring here.
+    confidence: float | None = None
+    # Optional free-text explanation from the engine.
+    reason: str = ""
+    # Name of the model/engine; copied into the per-case results by evaluate.
+    model: str = ""
+
+
+def _read_jsonl(path: Path) -> Iterable[dict[str, Any]]:
+    """Lazily yield one JSON object per meaningful line of a JSONL file.
+
+    Blank lines and lines whose first non-space character is ``#`` are
+    skipped. Line numbers in error messages are 1-based and count skipped
+    lines too.
+
+    Args:
+        path: JSONL file to read (decoded as UTF-8).
+
+    Yields:
+        The decoded JSON object of each remaining line, in file order.
+
+    Raises:
+        BenchError: If a line is not valid JSON, or is valid JSON but not an
+            object (callers rely on ``dict`` methods such as ``.get``).
+    """
+    with path.open("r", encoding="utf-8") as f:
+        for line_no, line in enumerate(f, 1):
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            try:
+                row = json.loads(line)
+            except json.JSONDecodeError as exc:
+                raise BenchError(f"{path}:{line_no}: invalid JSON: {exc}") from exc
+            # Arrays, numbers, strings, etc. parse fine but would crash the
+            # loaders later with AttributeError; reject them here instead.
+            if not isinstance(row, dict):
+                raise BenchError(
+                    f"{path}:{line_no}: expected a JSON object, got {type(row).__name__}"
+                )
+            yield row
+
+
+def load_cases(path: str | Path, *, repo_root: str | Path | None = None) -> list[BenchCase]:
+    """Load and validate benchmark cases from a JSONL file.
+
+    Args:
+        path: Cases JSONL file; each row needs ``case_id``,
+            ``screenshot_path``, ``task`` and ``expectation``.
+        repo_root: Base directory for relative screenshot paths. Defaults to
+            the current working directory.
+
+    Returns:
+        The validated cases, in file order.
+
+    Raises:
+        BenchError: On a missing or duplicate ``case_id``, a missing or
+            non-existent screenshot, a non-object ``task``/``expectation``,
+            an unsupported decision, or an incomplete, non-numeric or
+            negative-radius ``click_region``.
+    """
+    case_path = Path(path)
+    root = Path(repo_root) if repo_root is not None else Path.cwd()
+    cases: list[BenchCase] = []
+    seen: set[str] = set()
+
+    for raw in _read_jsonl(case_path):
+        case_id = str(raw.get("case_id", "")).strip()
+        if not case_id:
+            raise BenchError(f"{case_path}: case_id is required")
+        if case_id in seen:
+            raise BenchError(f"{case_path}: duplicate case_id '{case_id}'")
+        seen.add(case_id)
+
+        screenshot_raw = str(raw.get("screenshot_path", "")).strip()
+        if not screenshot_raw:
+            raise BenchError(f"{case_id}: screenshot_path is required")
+        screenshot_path = Path(screenshot_raw)
+        if not screenshot_path.is_absolute():
+            screenshot_path = root / screenshot_path
+        if not screenshot_path.exists():
+            raise BenchError(f"{case_id}: screenshot not found: {screenshot_path}")
+
+        task = raw.get("task")
+        if not isinstance(task, dict):
+            raise BenchError(f"{case_id}: task must be an object")
+
+        expectation = raw.get("expectation")
+        if not isinstance(expectation, dict):
+            raise BenchError(f"{case_id}: expectation must be an object")
+        decision = str(expectation.get("decision", "")).lower()
+        # Reuse the shared constant so the accepted set cannot drift from the
+        # one the scorer understands.
+        if decision != "click" and decision not in SAFE_NON_CLICK_DECISIONS:
+            raise BenchError(f"{case_id}: unsupported expectation decision '{decision}'")
+        if decision == "click":
+            region = expectation.get("click_region")
+            if not isinstance(region, dict):
+                raise BenchError(f"{case_id}: click expectation requires click_region")
+            for key in ("x_pct", "y_pct", "radius_pct"):
+                if key not in region:
+                    raise BenchError(f"{case_id}: click_region.{key} is required")
+                value = _as_float(region[key], f"{case_id}: click_region.{key}")
+                # A negative radius can never contain a click, so every
+                # prediction would be scored as dangerous.
+                if key == "radius_pct" and value < 0:
+                    raise BenchError(f"{case_id}: click_region.radius_pct must be non-negative")
+
+        metadata = raw.get("metadata")
+        cases.append(
+            BenchCase(
+                case_id=case_id,
+                screenshot_path=screenshot_path,
+                task=task,
+                expectation=expectation,
+                # Metadata is optional; anything that is not an object is dropped.
+                metadata=metadata if isinstance(metadata, dict) else {},
+            )
+        )
+
+    return cases
+
+
+def load_predictions(path: str | Path) -> dict[str, Prediction]:
+    """Load and validate predictions from a JSONL file, keyed by case id.
+
+    Args:
+        path: Predictions JSONL file; each row needs ``case_id`` and
+            ``decision``, and click rows also need ``x_pct`` and ``y_pct``.
+
+    Returns:
+        Mapping from case id to its Prediction.
+
+    Raises:
+        BenchError: On a missing or duplicate ``case_id``, an unsupported
+            decision, a non-numeric ``x_pct``/``y_pct``/``confidence``, or a
+            click without both coordinates.
+    """
+    source = Path(path)
+    allowed = {"click", "abstain", "pause", "wait", "no_action"}
+    by_case: dict[str, Prediction] = {}
+
+    for row in _read_jsonl(source):
+        cid = str(row.get("case_id", "")).strip()
+        if not cid:
+            raise BenchError(f"{source}: prediction case_id is required")
+        if cid in by_case:
+            raise BenchError(f"{source}: duplicate prediction for '{cid}'")
+
+        decision = str(row.get("decision", "")).strip().lower()
+        if decision not in allowed:
+            raise BenchError(f"{cid}: unsupported prediction decision '{decision}'")
+
+        # Parsed in this order so the first bad field is the one reported.
+        x_pct, y_pct, confidence = (
+            _optional_float(row.get(field), f"{cid}: {field}")
+            for field in ("x_pct", "y_pct", "confidence")
+        )
+        has_point = x_pct is not None and y_pct is not None
+        if decision == "click" and not has_point:
+            raise BenchError(f"{cid}: click prediction requires x_pct and y_pct")
+
+        by_case[cid] = Prediction(
+            case_id=cid,
+            decision=decision,
+            x_pct=x_pct,
+            y_pct=y_pct,
+            confidence=confidence,
+            reason=str(row.get("reason", "")),
+            model=str(row.get("model", "")),
+        )
+
+    return by_case
+
+
+def evaluate(cases: list[BenchCase], predictions: dict[str, Prediction]) -> dict[str, Any]:
+    """Score predictions against cases and return a summary dictionary.
+
+    Cases without a prediction get a "missing" row and count as incorrect.
+    Predictions whose case id matches no case are ignored.
+
+    Args:
+        cases: Benchmark cases to score.
+        predictions: Predictions keyed by case id.
+
+    Returns:
+        A dict with the counters ``total_cases``, ``answered``, ``missing``,
+        ``correct`` and ``dangerous``; ``accuracy`` (correct / total) and
+        ``answered_accuracy`` (correct / answered), both rounded to four
+        decimals and 0.0 when the denominator is zero; and ``results``, one
+        row per case in input order.
+    """
+    rows: list[dict[str, Any]] = []
+
+    for case in cases:
+        pred = predictions.get(case.case_id)
+        if pred is None:
+            rows.append(
+                {
+                    "case_id": case.case_id,
+                    "status": "missing",
+                    "correct": False,
+                    "expected": case.expected_decision,
+                }
+            )
+            continue
+
+        status, is_correct, is_dangerous = _score_case(case, pred)
+        rows.append(
+            {
+                "case_id": case.case_id,
+                "status": status,
+                "correct": is_correct,
+                "dangerous": is_dangerous,
+                "expected": case.expected_decision,
+                "predicted": pred.decision,
+                "model": pred.model,
+            }
+        )
+
+    # Derive the counters from the rows; "missing" rows have no "dangerous" key.
+    total = len(cases)
+    missing = sum(1 for row in rows if row["status"] == "missing")
+    correct = sum(int(row["correct"]) for row in rows)
+    dangerous = sum(int(row.get("dangerous", False)) for row in rows)
+    answered = total - missing
+
+    return {
+        "total_cases": total,
+        "answered": answered,
+        "missing": missing,
+        "correct": correct,
+        "dangerous": dangerous,
+        "accuracy": round(correct / total, 4) if total else 0.0,
+        "answered_accuracy": round(correct / answered, 4) if answered else 0.0,
+        "results": rows,
+    }
+
+
+def write_prediction_template(cases: list[BenchCase], path: str | Path) -> None:
+    """Write a fill-in-the-blanks predictions JSONL file, one row per case.
+
+    Every row carries the case id plus placeholder values: an "abstain"
+    decision, null coordinates and confidence, and an empty reason. Missing
+    parent directories are created and an existing file is overwritten.
+
+    Args:
+        cases: Cases to emit rows for, in output order.
+        path: Destination file.
+    """
+    target = Path(path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+
+    # Placeholder fields shared by every row; "case_id" is put in front of them.
+    blank_fields = {
+        "model": "manual-or-model-name",
+        "decision": "abstain",
+        "x_pct": None,
+        "y_pct": None,
+        "confidence": None,
+        "reason": "",
+    }
+    with target.open("w", encoding="utf-8") as handle:
+        handle.writelines(
+            json.dumps({"case_id": case.case_id, **blank_fields}, ensure_ascii=False) + "\n"
+            for case in cases
+        )
+
+
+def _score_case(case: BenchCase, prediction: Prediction) -> tuple[str, bool, bool]:
+    """Score one prediction against its case.
+
+    Args:
+        case: The benchmark case holding the expectation.
+        prediction: The engine's answer for that case.
+
+    Returns:
+        ``(status, is_correct, is_dangerous)``. A click is correct when its
+        Euclidean distance to the region centre is at most ``radius_pct``.
+        Any click that is wrong (outside the region, or made when a
+        non-click was expected) is flagged dangerous; not clicking when a
+        click was expected is incorrect but not dangerous.
+    """
+    expected = case.expected_decision
+    predicted = prediction.decision
+
+    if expected == "click":
+        if predicted != "click":
+            return "expected_click_but_no_click", False, False
+        target = case.expectation["click_region"]
+        dx = float(prediction.x_pct) - float(target["x_pct"])
+        dy = float(prediction.y_pct) - float(target["y_pct"])
+        inside = math.hypot(dx, dy) <= float(target["radius_pct"])
+        if inside:
+            return "click_in_region", True, False
+        return "click_outside_region", False, True
+
+    if expected not in SAFE_NON_CLICK_DECISIONS:
+        return "unsupported_expectation", False, False
+
+    # All non-click decisions are interchangeable for scoring purposes.
+    if predicted in SAFE_NON_CLICK_DECISIONS:
+        return "safe_non_click", True, False
+    return "dangerous_click_expected_abstain", False, True
+
+
+def _optional_float(value: Any, label: str) -> float | None:
+    """Return None for a None *value*; otherwise validate it via _as_float.
+
+    *label* is used as the prefix of any resulting error message.
+    """
+    return None if value is None else _as_float(value, label)
+
+
+def _as_float(value: Any, label: str) -> float:
+    """Convert *value* to a finite float or raise BenchError.
+
+    Args:
+        value: Anything ``float()`` accepts, including numeric strings.
+        label: Prefix for the error message, e.g. ``"case-1: x_pct"``.
+
+    Returns:
+        The converted value.
+
+    Raises:
+        BenchError: If the value cannot be converted, is an integer too
+            large for a float, or is NaN or infinite.
+    """
+    try:
+        out = float(value)
+    except OverflowError as exc:
+        # JSON integers are unbounded; float() overflows on huge ones, which
+        # previously escaped as an unhandled OverflowError.
+        raise BenchError(f"{label} must be finite") from exc
+    except (TypeError, ValueError) as exc:
+        raise BenchError(f"{label} must be numeric") from exc
+    if not math.isfinite(out):
+        raise BenchError(f"{label} must be finite")
+    return out
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Command-line entry point: validate cases and optionally score them.
+
+    The cases file is always loaded and validated first. After that:
+    with ``--write-template`` a prediction template is written and nothing
+    is scored; with ``--predictions`` the predictions are scored; with
+    neither, only a validation summary is reported.
+
+    Args:
+        argv: Argument list; ``None`` makes argparse read ``sys.argv``.
+
+    Returns:
+        0 whenever the function returns. Invalid input is not turned into an
+        exit code here: errors raised by the loaders propagate.
+    """
+    cli = argparse.ArgumentParser(description="Validate and score LéaBench computer-use cases.")
+    cli.add_argument("--cases", required=True, help="Path to cases JSONL.")
+    cli.add_argument("--predictions", help="Path to predictions JSONL.")
+    cli.add_argument("--repo-root", default=".", help="Repository root for relative screenshot paths.")
+    cli.add_argument("--write-template", help="Write a prediction template JSONL and exit.")
+    cli.add_argument("--json", action="store_true", help="Print JSON output.")
+    opts = cli.parse_args(argv)
+
+    bench_cases = load_cases(opts.cases, repo_root=opts.repo_root)
+
+    # Template mode takes precedence over scoring.
+    if opts.write_template:
+        write_prediction_template(bench_cases, opts.write_template)
+        print(f"Wrote prediction template: {opts.write_template}")
+        return 0
+
+    summary = (
+        evaluate(bench_cases, load_predictions(opts.predictions))
+        if opts.predictions
+        else {"total_cases": len(bench_cases), "valid": True}
+    )
+
+    if opts.json:
+        print(json.dumps(summary, indent=2, ensure_ascii=False))
+        return 0
+
+    # The two summary shapes have different keys, hence the defaults.
+    report = (
+        "LéaBench: "
+        f"cases={summary.get('total_cases', 0)} "
+        f"valid={summary.get('valid', True)} "
+        f"correct={summary.get('correct', '-')} "
+        f"dangerous={summary.get('dangerous', '-')}"
+    )
+    print(report)
+    return 0
+
+
+# Run the CLI when executed as a script; main()'s return value is the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())