import json
from pathlib import Path

from PIL import Image

from core.evaluation.computer_use_bench import load_cases, load_predictions
from core.evaluation.ollama_lea_bench_adapter import (
    build_ollama_payload,
    extract_json_object,
    normalize_prediction,
    run_ollama_case,
    write_ollama_predictions,
)


class _FakeResponse:
    """Minimal stand-in for an HTTP response object used by the fake ``post`` callables."""

    def __init__(self, status_code: int, content: str):
        self.status_code = status_code
        self._content = content

    def json(self):
        """Return the stored content wrapped as ``{"message": {"content": ...}}``."""
        message = {"content": self._content}
        return {"message": message}


def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as one JSON document per line, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in rows)


def _write_image(path: Path) -> None:
    """Save a 32x24 all-white RGB image at ``path``."""
    white = (255, 255, 255)
    blank = Image.new("RGB", (32, 24), color=white)
    blank.save(path)


def _case_rows(screenshot: Path) -> list[dict]:
    """Return a single-case fixture (id ``visible``) pointing at ``screenshot``."""
    task = {
        "intent": "click save",
        "target_text": "Enregistrer",
        "current_window": "Enregistrer sous",
        "expected_next_window": "Bloc-notes",
        "question": "Clique uniquement sur Enregistrer.",
    }
    expectation = {
        "decision": "click",
        "click_region": {"x_pct": 0.5, "y_pct": 0.8, "radius_pct": 0.1},
    }
    row = {
        "case_id": "visible",
        "screenshot_path": str(screenshot),
        "task": task,
        "expectation": expectation,
    }
    return [row]


def _load_one_case(tmp_path: Path):
    """Materialize the fixture image and cases file under ``tmp_path``; return the first loaded case."""
    image_file = tmp_path / "screen.png"
    _write_image(image_file)
    jsonl_file = tmp_path / "cases.jsonl"
    _write_jsonl(jsonl_file, _case_rows(image_file))
    loaded = load_cases(jsonl_file, repo_root=tmp_path)
    return loaded[0]


def test_build_payload_uses_image_without_leaking_expectation(tmp_path):
    """The payload carries model and image; its JSON dump must not mention the expectation."""
    case = _load_one_case(tmp_path)

    payload = build_ollama_payload(case, model="qwen-test", image_b64="abc123")
    dumped = json.dumps(payload)

    assert payload["model"] == "qwen-test"
    user_message = payload["messages"][1]
    assert user_message["images"] == ["abc123"]
    assert "Expected current window: Enregistrer sous" in user_message["content"]
    # Neither the expected click region nor the expectation key may appear anywhere.
    assert "click_region" not in dumped
    assert "expectation" not in dumped


def test_extract_json_object_accepts_fences_and_single_quotes():
    """Fenced JSON and single-quoted objects embedded in text are both parsed."""
    fenced = '```json\n{"decision":"abstain"}\n```'
    assert extract_json_object(fenced) == {"decision": "abstain"}

    single_quoted = "prefix {'decision':'wait'} suffix"
    assert extract_json_object(single_quoted) == {"decision": "wait"}


def test_normalize_prediction_accepts_valid_click(tmp_path):
    """An in-bounds click is kept as a click with numeric coordinates."""
    case = _load_one_case(tmp_path)
    # y_pct is deliberately supplied as a string; the test expects 0.79 back.
    raw = {
        "decision": "click",
        "x_pct": 0.51,
        "y_pct": "0.79",
        "confidence": 0.9,
        "reason": "ok",
    }

    prediction = normalize_prediction(case, raw, model="qwen-test")

    assert prediction["case_id"] == "visible"
    assert prediction["decision"] == "click"
    assert prediction["x_pct"] == 0.51
    assert prediction["y_pct"] == 0.79


def test_normalize_prediction_forces_abstain_on_bad_click(tmp_path):
    """A click with x_pct of 1.2 is turned into an abstain with a bounds reason."""
    case = _load_one_case(tmp_path)
    raw = {"decision": "click", "x_pct": 1.2, "y_pct": 0.2, "confidence": 0.9}

    prediction = normalize_prediction(case, raw, model="qwen-test")

    assert prediction["decision"] == "abstain"
    assert prediction["reason"] == "coords_out_of_bounds"


def test_run_ollama_case_uses_http_response_without_network(tmp_path):
    """``run_ollama_case`` goes through the injected ``post`` and image encoder."""
    case = _load_one_case(tmp_path)
    canned_reply = (
        '{"decision":"abstain","x_pct":null,"y_pct":null,"confidence":0.8,"reason":"wrong_window"}'
    )

    def fake_post(url, json, timeout):
        # Parameter names are kept as-is in case the adapter passes them by keyword.
        assert url == "http://ollama.test/api/chat"
        assert timeout == 7
        assert json["messages"][1]["images"] == ["fake-image"]
        return _FakeResponse(200, canned_reply)

    def fake_encoder(_path):
        return "fake-image"

    prediction = run_ollama_case(
        case,
        model="qwen-test",
        endpoint="http://ollama.test",
        timeout=7,
        post=fake_post,
        image_encoder=fake_encoder,
        retries=0,
    )

    assert prediction["model"] == "qwen-test"
    assert prediction["decision"] == "abstain"
    assert prediction["x_pct"] is None


def test_write_ollama_predictions_outputs_valid_leabench_jsonl(tmp_path):
    """Predictions written to disk can be read back with ``load_predictions``."""
    case = _load_one_case(tmp_path)
    out_file = tmp_path / "predictions.jsonl"
    canned_reply = (
        '{"decision":"click","x_pct":0.5,"y_pct":0.8,"confidence":0.92,"reason":"visible"}'
    )

    def fake_post(_url, json, timeout):
        assert json["format"] == "json"
        # No timeout is passed below, so 45 is presumably the adapter default.
        assert timeout == 45
        return _FakeResponse(200, canned_reply)

    write_ollama_predictions(
        [case],
        out_file,
        model="qwen-test",
        endpoint="http://ollama.test",
        post=fake_post,
        image_encoder=lambda _path: "fake-image",
    )

    loaded = load_predictions(out_file)
    visible = loaded["visible"]
    assert visible.decision == "click"
    assert visible.x_pct == 0.5