feat(evaluation): add LeaBench model prompt packs

This commit is contained in:
Dom
2026-05-24 21:53:24 +02:00
parent ea1f57afb1
commit 054279feb4
3 changed files with 151 additions and 0 deletions

View File

@@ -3,9 +3,11 @@ from pathlib import Path
from core.evaluation.computer_use_bench import (
BenchError,
build_model_prompt,
evaluate,
load_cases,
load_predictions,
write_model_prompt_pack,
write_prediction_template,
)
@@ -124,3 +126,37 @@ def test_write_prediction_template(tmp_path):
rows = [json.loads(line) for line in template_path.read_text().splitlines()]
assert [row["case_id"] for row in rows] == ["absent", "visible"]
assert all(row["decision"] == "abstain" for row in rows)
def test_build_model_prompt_uses_task_without_leaking_expectation(tmp_path):
    """Check that a built model prompt exposes the task but not the answer.

    A placeholder screenshot and a cases file are created under ``tmp_path``,
    the case at index 1 is loaded, and the prompt built from it is inspected
    both field by field and in its JSON-serialized form.
    """
    image_file = tmp_path / "screen.jpg"
    image_file.write_bytes(b"fake image bytes")
    dataset_file = tmp_path / "cases.jsonl"
    _write_jsonl(dataset_file, _case_rows(image_file))

    loaded_cases = load_cases(dataset_file, repo_root=tmp_path)
    selected_case = loaded_cases[1]
    prompt = build_model_prompt(selected_case, repo_root=tmp_path)
    # Serialize up front so the leak checks below cover every nested field.
    serialized = json.dumps(prompt)

    # Fields the model is expected to receive.
    assert prompt["case_id"] == "visible"
    assert prompt["screenshot_path"] == "screen.jpg"
    assert "JSON STRICT" in prompt["system_prompt"]
    assert prompt["output_schema"]["decision"] == "click|abstain|pause|wait|no_action"
    assert "click save" in prompt["user_prompt"]["instruction"]

    # Names that must not appear anywhere in the serialized prompt.
    for forbidden_key in ("click_region", "expectation"):
        assert forbidden_key not in serialized
def test_write_model_prompt_pack(tmp_path):
    """Check the JSONL prompt pack written for a set of loaded cases.

    The pack is written to ``prompts.jsonl`` under ``tmp_path`` and read back
    line by line; the test verifies the case-id order and the per-row
    ``generation`` and ``safety`` settings.
    """
    image_file = tmp_path / "screen.jpg"
    image_file.write_bytes(b"fake image bytes")
    dataset_file = tmp_path / "cases.jsonl"
    pack_file = tmp_path / "prompts.jsonl"
    _write_jsonl(dataset_file, _case_rows(image_file))

    loaded_cases = load_cases(dataset_file, repo_root=tmp_path)
    write_model_prompt_pack(loaded_cases, pack_file, repo_root=tmp_path)

    # One JSON document per line of the written pack.
    records = list(map(json.loads, pack_file.read_text().splitlines()))

    assert [record["case_id"] for record in records] == ["absent", "visible"]
    # Every row must request a sampling temperature of 0.0.
    for record in records:
        assert record["generation"]["temperature"] == 0.0
    # Every row must carry the "benchmark only" runtime-control notice.
    for record in records:
        assert record["safety"]["runtime_control"].startswith("benchmark only")