# feat(evaluation): add local Ollama LeaBench adapter
# New file: tests/unit/test_ollama_lea_bench_adapter.py (160 lines)
# @@ -0,0 +1,160 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from core.evaluation.computer_use_bench import load_cases, load_predictions
|
||||
from core.evaluation.ollama_lea_bench_adapter import (
|
||||
build_ollama_payload,
|
||||
extract_json_object,
|
||||
normalize_prediction,
|
||||
run_ollama_case,
|
||||
write_ollama_predictions,
|
||||
)
|
||||
|
||||
|
||||
class _FakeResponse:
    """Minimal stand-in for the response object returned by an injected ``post``.

    It exposes only what the tests in this module populate: a ``status_code``
    attribute and a ``json()`` method that wraps a canned reply string.
    """

    def __init__(self, status_code: int, content: str):
        self.status_code = status_code
        # Raw text of the simulated model reply; returned verbatim by json().
        self._content = content

    def json(self) -> dict:
        """Return the canned reply wrapped as ``{"message": {"content": ...}}``."""
        return {"message": {"content": self._content}}
|
||||
|
||||
|
||||
def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as JSON Lines, one JSON object per line.

    Missing parent directories are created first. The file is opened in
    ``"w"`` mode, so any existing content is replaced.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        # Each row is serialized independently and terminated by a newline.
        f.writelines(json.dumps(row) + "\n" for row in rows)
|
||||
|
||||
|
||||
def _write_image(path: Path) -> None:
    """Save a 32x24 all-white RGB image to ``path`` as a placeholder screenshot."""
    Image.new("RGB", (32, 24), color=(255, 255, 255)).save(path)
|
||||
|
||||
|
||||
def _case_rows(screenshot: Path) -> list[dict]:
    """Build the single benchmark case row used throughout this module.

    Args:
        screenshot: Path of the screenshot image; stored as a string under
            ``"screenshot_path"``.

    Returns:
        A one-element list holding the ``"visible"`` case: a ``task`` section
        describing the click to perform and an ``expectation`` section with
        the expected decision and click region.
    """
    return [
        {
            "case_id": "visible",
            "screenshot_path": str(screenshot),
            "task": {
                "intent": "click save",
                "target_text": "Enregistrer",
                "current_window": "Enregistrer sous",
                "expected_next_window": "Bloc-notes",
                "question": "Clique uniquement sur Enregistrer.",
            },
            "expectation": {
                "decision": "click",
                "click_region": {"x_pct": 0.5, "y_pct": 0.8, "radius_pct": 0.1},
            },
        }
    ]
|
||||
|
||||
|
||||
def _load_one_case(tmp_path: Path):
    """Materialize the fixture case on disk and load it back via ``load_cases``.

    Writes a placeholder screenshot and a one-row ``cases.jsonl`` under
    ``tmp_path``, then returns the first case that ``load_cases`` yields for
    that file, with ``tmp_path`` passed as ``repo_root``.
    """
    screenshot = tmp_path / "screen.png"
    _write_image(screenshot)
    cases_path = tmp_path / "cases.jsonl"
    _write_jsonl(cases_path, _case_rows(screenshot))
    return load_cases(cases_path, repo_root=tmp_path)[0]
|
||||
|
||||
|
||||
def test_build_payload_uses_image_without_leaking_expectation(tmp_path):
    """The payload carries the image and task context but no ground truth.

    Checks the model name, the image attached to the second message, the
    current-window hint in that message's text, and that neither
    ``click_region`` nor ``expectation`` appears in the serialized payload.
    """
    case = _load_one_case(tmp_path)

    payload = build_ollama_payload(case, model="qwen-test", image_b64="abc123")
    # Serialize once so the leak checks cover every nested field of the payload.
    serialized = json.dumps(payload)

    assert payload["model"] == "qwen-test"
    assert payload["messages"][1]["images"] == ["abc123"]
    assert "Expected current window: Enregistrer sous" in payload["messages"][1]["content"]
    assert "click_region" not in serialized
    assert "expectation" not in serialized
|
||||
|
||||
|
||||
def test_extract_json_object_accepts_fences_and_single_quotes():
    """``extract_json_object`` parses fenced JSON and single-quoted objects in text."""
    # Reply wrapped in a Markdown ```json fence.
    assert extract_json_object('```json\n{"decision":"abstain"}\n```') == {"decision": "abstain"}
    # Single-quoted object embedded between unrelated prefix and suffix text.
    assert extract_json_object("prefix {'decision':'wait'} suffix") == {"decision": "wait"}
|
||||
|
||||
|
||||
def test_normalize_prediction_accepts_valid_click(tmp_path):
    """An in-bounds click is kept and a string coordinate comes back as a float."""
    case = _load_one_case(tmp_path)

    prediction = normalize_prediction(
        case,
        # y_pct is deliberately a string; the assertion below expects the float 0.79.
        {"decision": "click", "x_pct": 0.51, "y_pct": "0.79", "confidence": 0.9, "reason": "ok"},
        model="qwen-test",
    )

    assert prediction["case_id"] == "visible"
    assert prediction["decision"] == "click"
    assert prediction["x_pct"] == 0.51
    assert prediction["y_pct"] == 0.79
|
||||
|
||||
|
||||
def test_normalize_prediction_forces_abstain_on_bad_click(tmp_path):
    """A click whose ``x_pct`` is 1.2 is turned into an abstain with a fixed reason."""
    case = _load_one_case(tmp_path)

    prediction = normalize_prediction(
        case,
        {"decision": "click", "x_pct": 1.2, "y_pct": 0.2, "confidence": 0.9},
        model="qwen-test",
    )

    assert prediction["decision"] == "abstain"
    assert prediction["reason"] == "coords_out_of_bounds"
|
||||
|
||||
|
||||
def test_run_ollama_case_uses_http_response_without_network(tmp_path):
    """``run_ollama_case`` builds its prediction from the injected ``post`` callable.

    The fake ``post`` checks the URL, timeout and encoded image it receives,
    then returns a canned abstain reply, so no network access is needed.
    """
    case = _load_one_case(tmp_path)

    # NOTE(review): the ``json`` parameter shadows the json module inside
    # fake_post. The name is kept as-is because the adapter may pass the
    # payload by keyword -- confirm before renaming.
    def fake_post(url, json, timeout):
        assert url == "http://ollama.test/api/chat"
        assert timeout == 7
        assert json["messages"][1]["images"] == ["fake-image"]
        return _FakeResponse(
            200,
            '{"decision":"abstain","x_pct":null,"y_pct":null,"confidence":0.8,"reason":"wrong_window"}',
        )

    prediction = run_ollama_case(
        case,
        model="qwen-test",
        endpoint="http://ollama.test",
        timeout=7,
        post=fake_post,
        # Stub encoder: skips reading the screenshot and returns a fixed token.
        image_encoder=lambda _path: "fake-image",
        retries=0,
    )

    assert prediction["model"] == "qwen-test"
    assert prediction["decision"] == "abstain"
    assert prediction["x_pct"] is None
|
||||
|
||||
|
||||
def test_write_ollama_predictions_outputs_valid_leabench_jsonl(tmp_path):
    """Predictions written by the adapter can be read back by ``load_predictions``.

    The fake ``post`` also checks that the request asks for ``"json"`` format
    and that the timeout it receives is 45 when the caller passes none.
    """
    case = _load_one_case(tmp_path)
    predictions_path = tmp_path / "predictions.jsonl"

    # NOTE(review): the ``json`` parameter shadows the json module inside
    # fake_post. The name is kept as-is because the adapter may pass the
    # payload by keyword -- confirm before renaming.
    def fake_post(_url, json, timeout):
        assert json["format"] == "json"
        assert timeout == 45
        return _FakeResponse(
            200,
            '{"decision":"click","x_pct":0.5,"y_pct":0.8,"confidence":0.92,"reason":"visible"}',
        )

    write_ollama_predictions(
        [case],
        predictions_path,
        model="qwen-test",
        endpoint="http://ollama.test",
        post=fake_post,
        # Stub encoder: skips reading the screenshot and returns a fixed token.
        image_encoder=lambda _path: "fake-image",
    )

    # Round-trip through the benchmark's own loader; results are keyed by case id.
    predictions = load_predictions(predictions_path)
    assert predictions["visible"].decision == "click"
    assert predictions["visible"].x_pct == 0.5
|
||||
Reference in New Issue
Block a user