feat(evaluation): add local Ollama LeaBench adapter

2026-05-24 21:58:06 +02:00
parent 6544ebe3f0
commit debd7b423c
4 changed files with 498 additions and 0 deletions
--- a/tests/unit/test_ollama_lea_bench_adapter.py
+++ b/tests/unit/test_ollama_lea_bench_adapter.py
@@ -0,0 +1,160 @@
+import json
+from pathlib import Path
+
+from PIL import Image
+
+from core.evaluation.computer_use_bench import load_cases, load_predictions
+from core.evaluation.ollama_lea_bench_adapter import (
+    build_ollama_payload,
+    extract_json_object,
+    normalize_prediction,
+    run_ollama_case,
+    write_ollama_predictions,
+)
+
+
+class _FakeResponse:  # Test double for an HTTP response: exposes status_code and json().
+    def __init__(self, status_code: int, content: str):
+        self.status_code = status_code
+        self._content = content  # text handed back as the chat message content
+
+    def json(self):  # Wraps the stored text as {"message": {"content": ...}}.
+        return {"message": {"content": self._content}}
+
+
+def _write_jsonl(path: Path, rows: list[dict]) -> None:  # Writes rows to path, one JSON object per line.
+    path.parent.mkdir(parents=True, exist_ok=True)  # create missing parent directories first
+    with path.open("w", encoding="utf-8") as f:
+        for row in rows:
+            f.write(json.dumps(row) + "\n")  # newline-terminated so each row is its own line
+
+
+def _write_image(path: Path) -> None:  # Saves a placeholder image at path for use as a screenshot.
+    Image.new("RGB", (32, 24), color=(255, 255, 255)).save(path)  # 32x24, all-white RGB
+
+
+def _case_rows(screenshot: Path) -> list[dict]:  # Returns a one-element list holding a single "visible" case row.
+    return [
+        {
+            "case_id": "visible",
+            "screenshot_path": str(screenshot),  # stored as a string so the row is JSON-serializable
+            "task": {
+                "intent": "click save",
+                "target_text": "Enregistrer",
+                "current_window": "Enregistrer sous",
+                "expected_next_window": "Bloc-notes",
+                "question": "Clique uniquement sur Enregistrer.",
+            },
+            "expectation": {  # expected decision and click region for this case
+                "decision": "click",
+                "click_region": {"x_pct": 0.5, "y_pct": 0.8, "radius_pct": 0.1},
+            },
+        }
+    ]
+
+
+def _load_one_case(tmp_path: Path):  # Builds the image + cases.jsonl fixture under tmp_path and loads it.
+    screenshot = tmp_path / "screen.png"
+    _write_image(screenshot)
+    cases_path = tmp_path / "cases.jsonl"
+    _write_jsonl(cases_path, _case_rows(screenshot))
+    return load_cases(cases_path, repo_root=tmp_path)[0]  # only one row is written, so take the first case
+
+
+def test_build_payload_uses_image_without_leaking_expectation(tmp_path):  # Payload has image + task text, no expectation data.
+    case = _load_one_case(tmp_path)
+
+    payload = build_ollama_payload(case, model="qwen-test", image_b64="abc123")
+    serialized = json.dumps(payload)  # flattened to text so key names can be searched at any depth
+
+    assert payload["model"] == "qwen-test"
+    assert payload["messages"][1]["images"] == ["abc123"]  # image_b64 is attached to the second message
+    assert "Expected current window: Enregistrer sous" in payload["messages"][1]["content"]
+    assert "click_region" not in serialized  # expectation fields must not appear anywhere in the payload
+    assert "expectation" not in serialized
+
+
+def test_extract_json_object_accepts_fences_and_single_quotes():  # Parser tolerates two non-strict reply shapes.
+    assert extract_json_object('```json\n{"decision":"abstain"}\n```') == {"decision": "abstain"}  # fenced JSON
+    assert extract_json_object("prefix {'decision':'wait'} suffix") == {"decision": "wait"}  # single quotes, extra text
+
+
+def test_normalize_prediction_accepts_valid_click(tmp_path):  # A click with valid coordinates is kept as a click.
+    case = _load_one_case(tmp_path)
+
+    prediction = normalize_prediction(
+        case,
+        {"decision": "click", "x_pct": 0.51, "y_pct": "0.79", "confidence": 0.9, "reason": "ok"},  # y_pct is a string
+        model="qwen-test",
+    )
+
+    assert prediction["case_id"] == "visible"  # case id comes from the case, not from the raw prediction dict
+    assert prediction["decision"] == "click"
+    assert prediction["x_pct"] == 0.51
+    assert prediction["y_pct"] == 0.79  # the string "0.79" must come back as the number 0.79
+
+
+def test_normalize_prediction_forces_abstain_on_bad_click(tmp_path):  # A click with bad coordinates becomes an abstain.
+    case = _load_one_case(tmp_path)
+
+    prediction = normalize_prediction(
+        case,
+        {"decision": "click", "x_pct": 1.2, "y_pct": 0.2, "confidence": 0.9},  # x_pct 1.2 should be rejected
+        model="qwen-test",
+    )
+
+    assert prediction["decision"] == "abstain"
+    assert prediction["reason"] == "coords_out_of_bounds"  # the rejection cause is reported in "reason"
+
+
+def test_run_ollama_case_uses_http_response_without_network(tmp_path):  # Runs one case through an injected fake POST.
+    case = _load_one_case(tmp_path)
+
+    def fake_post(url, json, timeout):  # NOTE(review): names presumably match the adapter's keyword call - confirm
+        assert url == "http://ollama.test/api/chat"  # endpoint passed below plus the /api/chat path
+        assert timeout == 7  # the timeout given to run_ollama_case is forwarded unchanged
+        assert json["messages"][1]["images"] == ["fake-image"]  # value returned by the injected image_encoder
+        return _FakeResponse(
+            200,
+            '{"decision":"abstain","x_pct":null,"y_pct":null,"confidence":0.8,"reason":"wrong_window"}',
+        )
+
+    prediction = run_ollama_case(
+        case,
+        model="qwen-test",
+        endpoint="http://ollama.test",
+        timeout=7,
+        post=fake_post,  # injected in place of a real HTTP client
+        image_encoder=lambda _path: "fake-image",  # ignores the path and returns a fixed string
+        retries=0,
+    )
+
+    assert prediction["model"] == "qwen-test"
+    assert prediction["decision"] == "abstain"
+    assert prediction["x_pct"] is None  # JSON null in the fake reply is expected back as None
+
+
+def test_write_ollama_predictions_outputs_valid_leabench_jsonl(tmp_path):  # Written file must load via load_predictions.
+    case = _load_one_case(tmp_path)
+    predictions_path = tmp_path / "predictions.jsonl"
+
+    def fake_post(_url, json, timeout):  # URL is not checked in this test
+        assert json["format"] == "json"  # the request payload sets format to "json"
+        assert timeout == 45  # no timeout is passed below; presumably the adapter default - confirm
+        return _FakeResponse(
+            200,
+            '{"decision":"click","x_pct":0.5,"y_pct":0.8,"confidence":0.92,"reason":"visible"}',
+        )
+
+    write_ollama_predictions(
+        [case],
+        predictions_path,
+        model="qwen-test",
+        endpoint="http://ollama.test",
+        post=fake_post,  # injected in place of a real HTTP client
+        image_encoder=lambda _path: "fake-image",  # ignores the path and returns a fixed string
+    )
+
+    predictions = load_predictions(predictions_path)  # indexed by case id in the assertions below
+    assert predictions["visible"].decision == "click"
+    assert predictions["visible"].x_pct == 0.5