feat(evaluation): add LeaBench model prompt packs

2026-05-24 21:53:24 +02:00
parent ea1f57afb1
commit 054279feb4
3 changed files with 151 additions and 0 deletions
--- a/tests/unit/test_computer_use_bench.py
+++ b/tests/unit/test_computer_use_bench.py
@@ -3,9 +3,11 @@ from pathlib import Path

 from core.evaluation.computer_use_bench import (
    BenchError,
+    build_model_prompt,
    evaluate,
    load_cases,
    load_predictions,
+    write_model_prompt_pack,
    write_prediction_template,
 )

@@ -124,3 +126,37 @@ def test_write_prediction_template(tmp_path):
    rows = [json.loads(line) for line in template_path.read_text().splitlines()]
    assert [row["case_id"] for row in rows] == ["absent", "visible"]
    assert all(row["decision"] == "abstain" for row in rows)
+
+
+def test_build_model_prompt_uses_task_without_leaking_expectation(tmp_path):
+    """Prompt for the "visible" case carries the instruction but no expectation text."""
+    image_file = tmp_path / "screen.jpg"
+    image_file.write_bytes(b"fake image bytes")
+    jsonl_file = tmp_path / "cases.jsonl"
+    _write_jsonl(jsonl_file, _case_rows(image_file))
+    visible_case = load_cases(jsonl_file, repo_root=tmp_path)[1]
+    built = build_model_prompt(visible_case, repo_root=tmp_path)
+    dumped = json.dumps(built)
+
+    assert built["case_id"] == "visible"
+    assert built["screenshot_path"] == "screen.jpg"
+    assert "JSON STRICT" in built["system_prompt"]
+    assert built["output_schema"]["decision"] == "click|abstain|pause|wait|no_action"
+    assert "click save" in built["user_prompt"]["instruction"]
+    for forbidden in ("click_region", "expectation"):
+        assert forbidden not in dumped
+
+
+def test_write_model_prompt_pack(tmp_path):
+    """Pack rows keep case order, temperature 0.0 and a "benchmark only" safety note."""
+    image_file = tmp_path / "screen.jpg"
+    image_file.write_bytes(b"fake image bytes")
+    source_path = tmp_path / "cases.jsonl"
+    _write_jsonl(source_path, _case_rows(image_file))
+    pack_path = tmp_path / "prompts.jsonl"
+
+    write_model_prompt_pack(load_cases(source_path, repo_root=tmp_path), pack_path, repo_root=tmp_path)
+    written = [json.loads(raw) for raw in pack_path.read_text().splitlines()]
+    assert [entry["case_id"] for entry in written] == ["absent", "visible"]
+    assert all(entry["generation"]["temperature"] == 0.0 for entry in written)
+    assert all(entry["safety"]["runtime_control"].startswith("benchmark only") for entry in written)