feat(evaluation): add LeaBench model prompt packs
This commit is contained in:
@@ -3,9 +3,11 @@ from pathlib import Path
|
||||
|
||||
from core.evaluation.computer_use_bench import (
|
||||
BenchError,
|
||||
build_model_prompt,
|
||||
evaluate,
|
||||
load_cases,
|
||||
load_predictions,
|
||||
write_model_prompt_pack,
|
||||
write_prediction_template,
|
||||
)
|
||||
|
||||
@@ -124,3 +126,37 @@ def test_write_prediction_template(tmp_path):
|
||||
rows = [json.loads(line) for line in template_path.read_text().splitlines()]
|
||||
assert [row["case_id"] for row in rows] == ["absent", "visible"]
|
||||
assert all(row["decision"] == "abstain" for row in rows)
|
||||
|
||||
|
||||
def test_build_model_prompt_uses_task_without_leaking_expectation(tmp_path):
    """Check the prompt built for one case exposes the task but not the answer.

    A placeholder screenshot and a cases file are written under ``tmp_path``,
    the second loaded case is turned into a prompt, and the prompt is
    inspected both field-by-field and as serialized JSON text.
    """
    # Arrange: a fake screenshot plus a JSONL cases file that points at it.
    image_path = tmp_path / "screen.jpg"
    image_path.write_bytes(b"fake image bytes")
    cases_file = tmp_path / "cases.jsonl"
    _write_jsonl(cases_file, _case_rows(image_path))
    loaded_cases = load_cases(cases_file, repo_root=tmp_path)
    selected_case = loaded_cases[1]

    # Act: build the prompt and keep a JSON dump for substring checks.
    prompt = build_model_prompt(selected_case, repo_root=tmp_path)
    dumped = json.dumps(prompt)

    # Assert: identifying fields and instructions are present ...
    assert prompt["case_id"] == "visible"
    assert prompt["screenshot_path"] == "screen.jpg"
    assert "JSON STRICT" in prompt["system_prompt"]
    assert prompt["output_schema"]["decision"] == "click|abstain|pause|wait|no_action"
    assert "click save" in prompt["user_prompt"]["instruction"]

    # ... while neither of these key names appears anywhere in the dump.
    for forbidden_text in ("click_region", "expectation"):
        assert forbidden_text not in dumped
|
||||
|
||||
|
||||
def test_write_model_prompt_pack(tmp_path):
    """Check the prompt pack written for all loaded cases.

    The pack is written as one JSON object per line; the test verifies the
    order of case ids and the per-row generation and safety settings.
    """
    # Arrange: a fake screenshot, the cases file, and the output location.
    image_path = tmp_path / "screen.jpg"
    image_path.write_bytes(b"fake image bytes")
    cases_file = tmp_path / "cases.jsonl"
    pack_file = tmp_path / "prompts.jsonl"
    _write_jsonl(cases_file, _case_rows(image_path))

    # Act: load the cases and write the prompt pack.
    cases = load_cases(cases_file, repo_root=tmp_path)
    write_model_prompt_pack(cases, pack_file, repo_root=tmp_path)

    # Assert: parse each output line back into a dict and inspect it.
    rows = list(map(json.loads, pack_file.read_text().splitlines()))
    case_ids = [row["case_id"] for row in rows]
    assert case_ids == ["absent", "visible"]
    for row in rows:
        assert row["generation"]["temperature"] == 0.0
    for row in rows:
        assert row["safety"]["runtime_control"].startswith("benchmark only")
|
||||
|
||||
Reference in New Issue
Block a user