feat(evaluation): add local Ollama LeaBench adapter
This commit is contained in:
@@ -59,6 +59,16 @@ python3 tools/lea_bench.py \
|
||||
--json
|
||||
```
|
||||
|
||||
Produire des predictions avec Ollama local :
|
||||
|
||||
```bash
|
||||
python3 tools/lea_bench_ollama.py \
|
||||
--cases benchmarks/computer_use/cases/notepad_replay_failures_2026-05-24.jsonl \
|
||||
--repo-root . \
|
||||
--model qwen2.5vl:7b-rpa \
|
||||
--output benchmarks/computer_use/predictions/qwen25vl_notepad.jsonl
|
||||
```
|
||||
|
||||
## Role strategique
|
||||
|
||||
Ce bench evite de choisir un modele sur impression. On mesure :
|
||||
|
||||
313
core/evaluation/ollama_lea_bench_adapter.py
Normal file
313
core/evaluation/ollama_lea_bench_adapter.py
Normal file
@@ -0,0 +1,313 @@
|
||||
"""Ollama adapter that writes LeaBench-compatible prediction JSONL.
|
||||
|
||||
The adapter is deliberately outside Lea runtime. It only benchmarks a local
|
||||
vision model against static screenshots and never controls the desktop.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
from core.evaluation.computer_use_bench import (
|
||||
SAFE_NON_CLICK_DECISIONS,
|
||||
BenchCase,
|
||||
load_cases,
|
||||
)
|
||||
|
||||
|
||||
# Ollama model tag used when the caller does not pass one explicitly.
DEFAULT_MODEL = "qwen2.5vl:7b-rpa"
# Base URL of the Ollama HTTP server; run_ollama_case appends "/api/chat" to it.
DEFAULT_ENDPOINT = "http://localhost:11434"
# Decisions accepted from the model: "click" plus every entry of
# SAFE_NON_CLICK_DECISIONS imported from computer_use_bench.
VALID_DECISIONS = {"click", *SAFE_NON_CLICK_DECISIONS}
|
||||
|
||||
OLLAMA_SYSTEM_PROMPT = """You are a desktop UI safety judge. You receive ONE screenshot and a question
|
||||
about a target UI element. Your only job is to decide whether clicking is safe.
|
||||
|
||||
You MUST respond with a single valid JSON object, nothing else: no prose, no
|
||||
markdown fences, no thinking tags. The JSON schema is:
|
||||
|
||||
{
|
||||
"decision": "click" | "abstain" | "pause" | "wait" | "no_action",
|
||||
"x_pct": <float 0.0-1.0 OR null>,
|
||||
"y_pct": <float 0.0-1.0 OR null>,
|
||||
"confidence": <float 0.0-1.0>,
|
||||
"reason": "<short string, max 120 chars, English or French>"
|
||||
}
|
||||
|
||||
Rules:
|
||||
- "click" REQUIRES x_pct AND y_pct normalized 0..1, origin top-left.
|
||||
- If the requested target is not visible, in the wrong state, behind a modal,
|
||||
or ambiguous: choose "abstain" and set x_pct/y_pct to null.
|
||||
- "pause" or "wait" is only for transient loading, animation, or moving focus.
|
||||
- "no_action" means the requested intent is already satisfied.
|
||||
- Clicking a wrong element is dangerous. When in doubt, abstain.
|
||||
- If the foreground window does not match the expected current window, abstain.
|
||||
|
||||
Output ONLY the JSON object.
|
||||
"""
|
||||
|
||||
|
||||
# Transport used to send the chat request; run_ollama_case invokes it as
# post(url, json=payload, timeout=timeout). Defaults to requests.post.
HttpPost = Callable[..., Any]
# Callable turning a screenshot path into the base64 string placed in the
# request's "images" list.
ImageEncoder = Callable[[Path], str]
|
||||
|
||||
|
||||
def build_ollama_user_prompt(case: BenchCase) -> str:
    """Render the user message describing the task of one benchmark case.

    Only entries of ``case.task`` are read. Each one becomes a
    ``"<label>: <value>"`` line, with absent or ``None`` entries rendered as
    empty text, followed by a blank line and the reply instruction.
    """
    task_fields = case.task
    labelled_keys = (
        ("Intent", "intent"),
        ("Target text or label", "target_text"),
        ("Expected current window", "current_window"),
        ("Expected next window after click", "expected_next_window"),
        ("Question", "question"),
    )
    lines = [f"{label}: {_task_value(task_fields, key)}" for label, key in labelled_keys]
    lines.append("")
    lines.append("Reply with one JSON object as specified by the system prompt.")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def build_ollama_payload(
    case: BenchCase,
    *,
    model: str,
    image_b64: str,
    temperature: float = 0.1,
    num_ctx: int = 4096,
    num_predict: int = 200,
) -> dict[str, Any]:
    """Assemble the JSON body of a chat request for one benchmark case.

    Args:
        case: Benchmark case whose task is rendered into the user message.
        model: Model name placed in the request.
        image_b64: Base64-encoded screenshot attached to the user message.
        temperature: Sampling temperature placed in ``options``.
        num_ctx: Value placed in ``options["num_ctx"]``.
        num_predict: Value placed in ``options["num_predict"]``.

    Returns:
        A dict with a system message, a user message carrying the image,
        streaming and thinking disabled, ``format`` set to ``"json"`` and the
        sampling options (``top_k`` fixed at 1).
    """
    system_message = {"role": "system", "content": OLLAMA_SYSTEM_PROMPT.strip()}
    user_message = {
        "role": "user",
        "content": build_ollama_user_prompt(case),
        "images": [image_b64],
    }
    sampling_options = {
        "temperature": temperature,
        "top_k": 1,
        "num_predict": num_predict,
        "num_ctx": num_ctx,
    }
    return {
        "model": model,
        "messages": [system_message, user_message],
        "stream": False,
        "think": False,
        "format": "json",
        "options": sampling_options,
    }
|
||||
|
||||
|
||||
def encode_screenshot_base64(path: Path, *, max_long_edge: int = 1280) -> str:
    """Load a screenshot and return it as a base64-encoded JPEG string.

    The image is converted to RGB. When its longer side exceeds
    ``max_long_edge`` it is scaled down by the same factor on both axes so the
    longer side fits, then saved as JPEG (quality 90) into memory.

    Args:
        path: Location of the screenshot file.
        max_long_edge: Maximum size in pixels of the longer side after scaling.

    Returns:
        The JPEG bytes encoded as an ASCII base64 string.
    """
    with Image.open(path) as img:
        rgb = img.convert("RGB")
        width, height = rgb.size
        long_edge = max(width, height)
        if long_edge > max_long_edge:
            scale = max_long_edge / float(long_edge)
            # Clamp to 1 pixel: for very elongated images the short side would
            # otherwise truncate to 0, which is not a valid resize target.
            new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
            rgb = rgb.resize(new_size)

        buffer = io.BytesIO()
        rgb.save(buffer, format="JPEG", quality=90)
        return base64.b64encode(buffer.getvalue()).decode("ascii")
|
||||
|
||||
|
||||
def run_ollama_case(
    case: BenchCase,
    *,
    model: str = DEFAULT_MODEL,
    endpoint: str = DEFAULT_ENDPOINT,
    timeout: int = 45,
    post: HttpPost = requests.post,
    image_encoder: ImageEncoder = encode_screenshot_base64,
    retries: int = 1,
    retry_delay: float = 2.0,
) -> dict[str, Any]:
    """Query the model for one case and return a normalized prediction dict.

    The screenshot is encoded and the payload built once. The request is then
    attempted up to ``retries + 1`` times. A non-200 status or an exception
    raised while posting or parsing is recorded and retried after
    ``retry_delay`` seconds. A reply that is not parseable JSON is retried
    immediately with a reminder appended to the user message.

    Args:
        case: Benchmark case to evaluate.
        model: Model name sent in the request and stored in the prediction.
        endpoint: Base URL; ``/api/chat`` is appended.
        timeout: Timeout passed to ``post``.
        post: Transport callable, invoked as ``post(url, json=..., timeout=...)``.
        image_encoder: Callable turning the screenshot path into base64 text.
        retries: Number of additional attempts after the first one.
        retry_delay: Seconds to sleep before retrying after an HTTP or
            transport failure; values <= 0 disable the pause.

    Returns:
        The prediction produced by ``normalize_prediction``, or a safe
        "abstain" prediction whose reason starts with ``ollama_error:`` when
        every attempt failed.

    Note:
        Errors raised by ``image_encoder`` or while building the payload are
        not caught here and propagate to the caller.
    """
    image_b64 = image_encoder(case.screenshot_path)
    payload = build_ollama_payload(case, model=model, image_b64=image_b64)
    url = f"{endpoint.rstrip('/')}/api/chat"

    user_message = payload["messages"][1]
    base_prompt = user_message["content"]
    json_reminder = "\nYour previous answer was not valid JSON. Output JSON only."

    last_error = ""
    for attempt in range(retries + 1):
        try:
            response = post(url, json=payload, timeout=timeout)
            if getattr(response, "status_code", 0) != 200:
                last_error = f"HTTP {getattr(response, 'status_code', 'unknown')}"
            else:
                text = response.json().get("message", {}).get("content", "")
                parsed = extract_json_object(text)
                if parsed is None and attempt < retries:
                    # Rebuild from the pristine prompt so the reminder appears
                    # once, however many parse retries are needed.
                    user_message["content"] = base_prompt + json_reminder
                    continue
                return normalize_prediction(case, parsed, model=model, raw_text=text)
        except Exception as exc:  # pragma: no cover - exercised via fake response paths
            # Deliberate best-effort boundary: any transport or decoding
            # failure is recorded and ends up as a safe abstain.
            last_error = str(exc)

        if attempt < retries and retry_delay > 0:
            time.sleep(retry_delay)

    return _safe_abstain(case, model, f"ollama_error: {last_error[:80]}")
|
||||
|
||||
|
||||
def extract_json_object(text: str) -> dict[str, Any] | None:
    """Extract the first JSON object found in a model reply.

    Lines that start with a Markdown code fence are dropped first. Every
    candidate from ``_json_candidates`` is then parsed strictly and, if that
    fails, parsed again with single quotes replaced by double quotes.

    Args:
        text: Raw reply text from the model.

    Returns:
        The first candidate that decodes to a ``dict``, or ``None`` when no
        candidate does.
    """
    cleaned = text.strip()
    if "```" in cleaned:
        kept_lines = [
            line for line in cleaned.splitlines() if not line.strip().startswith("```")
        ]
        cleaned = "\n".join(kept_lines).strip()

    for candidate in _json_candidates(cleaned):
        # Second variant tolerates Python-style single-quoted pseudo-JSON.
        for variant in (candidate, candidate.replace("'", '"')):
            try:
                parsed = json.loads(variant)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed
            # Valid JSON but not an object (e.g. a list wrapping the object):
            # move on to the next candidate instead of giving up.
            break
    return None
|
||||
|
||||
|
||||
def normalize_prediction(
    case: BenchCase,
    data: dict[str, Any] | None,
    *,
    model: str,
    raw_text: str = "",
) -> dict[str, Any]:
    """Validate a parsed model reply and shape it as a prediction row.

    Args:
        case: Benchmark case the reply belongs to.
        data: Parsed reply, or ``None`` when parsing failed.
        model: Model name stored in the prediction.
        raw_text: Raw reply; its first 80 characters are reported when
            ``data`` is not a dict.

    Returns:
        A prediction dict. The result is a safe "abstain" when ``data`` is not
        a dict, the decision is not in ``VALID_DECISIONS``, or a "click" lacks
        usable coordinates inside the 0..1 range. Non-click decisions always
        carry ``None`` coordinates.
    """
    if not isinstance(data, dict):
        return _safe_abstain(case, model, f"parse_error: {raw_text[:80]}")

    decision = str(data.get("decision", "")).strip().lower()
    if decision not in VALID_DECISIONS:
        return _safe_abstain(case, model, f"invalid_decision: {decision[:40]}")

    confidence = _optional_float(data.get("confidence"))
    reason = str(data.get("reason", ""))[:160]

    x_pct = None
    y_pct = None
    if decision == "click":
        x_pct = _optional_float(data.get("x_pct"))
        y_pct = _optional_float(data.get("y_pct"))
        if x_pct is None or y_pct is None:
            return _safe_abstain(case, model, "click_without_coords")
        within_bounds = 0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0
        if not within_bounds:
            return _safe_abstain(case, model, "coords_out_of_bounds")

    return {
        "case_id": case.case_id,
        "model": model,
        "decision": decision,
        "x_pct": x_pct,
        "y_pct": y_pct,
        "confidence": confidence,
        "reason": reason,
    }
|
||||
|
||||
|
||||
def write_ollama_predictions(
    cases: list[BenchCase],
    output_path: str | Path,
    *,
    model: str = DEFAULT_MODEL,
    endpoint: str = DEFAULT_ENDPOINT,
    timeout: int = 45,
    post: HttpPost = requests.post,
    image_encoder: ImageEncoder = encode_screenshot_base64,
    retries: int = 1,
) -> None:
    """Run every case through the model and write one JSON line per prediction.

    The parent directory of ``output_path`` is created if needed and the file
    is opened in write mode, so any existing content is replaced.

    Args:
        cases: Benchmark cases to evaluate, in output order.
        output_path: Destination JSONL file.
        model: Model name forwarded to ``run_ollama_case``.
        endpoint: Endpoint forwarded to ``run_ollama_case``.
        timeout: Per-request timeout forwarded to ``run_ollama_case``.
        post: Transport callable forwarded to ``run_ollama_case``.
        image_encoder: Screenshot encoder forwarded to ``run_ollama_case``.
        retries: Number of additional attempts per case, forwarded to
            ``run_ollama_case``.
    """
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8") as f:
        for case in cases:
            prediction = run_ollama_case(
                case,
                model=model,
                endpoint=endpoint,
                timeout=timeout,
                post=post,
                image_encoder=image_encoder,
                retries=retries,
            )
            f.write(json.dumps(prediction, ensure_ascii=False) + "\n")
            # Flush after each case so finished rows reach the file right away.
            f.flush()
|
||||
|
||||
|
||||
def _safe_abstain(case: BenchCase, model: str, reason: str) -> dict[str, Any]:
    """Build the fallback prediction: abstain, no coordinates, zero confidence."""
    fallback: dict[str, Any] = {"case_id": case.case_id, "model": model}
    fallback["decision"] = "abstain"
    fallback["x_pct"] = None
    fallback["y_pct"] = None
    fallback["confidence"] = 0.0
    fallback["reason"] = reason
    return fallback
|
||||
|
||||
|
||||
def _json_candidates(text: str) -> list[str]:
    """List the strings worth parsing as JSON, most complete first.

    The full text comes first, followed by every brace-delimited span that
    contains no nested braces, in order of appearance.
    """
    flat_objects = re.findall(r"\{[^{}]+\}", text)
    return [text, *flat_objects]
|
||||
|
||||
|
||||
def _optional_float(value: Any) -> float | None:
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
out = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
if out != out or out in (float("inf"), float("-inf")):
|
||||
return None
|
||||
return out
|
||||
|
||||
|
||||
def _task_value(task: dict[str, Any], key: str) -> str:
    """Return ``task[key]`` as text, or an empty string when absent or ``None``."""
    raw = task.get(key)
    return "" if raw is None else str(raw)
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Parse CLI arguments, run the model over the cases and write predictions.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` once the predictions file has been written.
    """
    parser = argparse.ArgumentParser(description="Run local Ollama model on LeaBench cases.")
    option_table: tuple[tuple[str, dict[str, Any]], ...] = (
        ("--cases", {"required": True, "help": "Path to LeaBench cases JSONL."}),
        ("--output", {"required": True, "help": "Output predictions JSONL."}),
        ("--repo-root", {"default": ".", "help": "Repository root for relative screenshot paths."}),
        ("--endpoint", {"default": DEFAULT_ENDPOINT, "help": "Ollama endpoint."}),
        ("--model", {"default": DEFAULT_MODEL, "help": "Ollama model name."}),
        ("--timeout", {"type": int, "default": 45, "help": "Per-case timeout in seconds."}),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    options = parser.parse_args(argv)

    bench_cases = load_cases(options.cases, repo_root=options.repo_root)
    write_ollama_predictions(
        bench_cases,
        options.output,
        model=options.model,
        endpoint=options.endpoint,
        timeout=options.timeout,
    )
    print(f"Wrote Ollama predictions: {options.output}")
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main(sys.argv[1:]))
|
||||
160
tests/unit/test_ollama_lea_bench_adapter.py
Normal file
160
tests/unit/test_ollama_lea_bench_adapter.py
Normal file
@@ -0,0 +1,160 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from core.evaluation.computer_use_bench import load_cases, load_predictions
|
||||
from core.evaluation.ollama_lea_bench_adapter import (
|
||||
build_ollama_payload,
|
||||
extract_json_object,
|
||||
normalize_prediction,
|
||||
run_ollama_case,
|
||||
write_ollama_predictions,
|
||||
)
|
||||
|
||||
|
||||
class _FakeResponse:
    """Minimal stand-in for an HTTP response carrying a chat reply."""

    def __init__(self, status_code: int, content: str):
        self._content = content
        self.status_code = status_code

    def json(self):
        """Return a chat-shaped body wrapping the canned content."""
        message = {"content": self._content}
        return {"message": message}
|
||||
|
||||
|
||||
def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as JSON lines, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(row) + "\n" for row in rows)
|
||||
|
||||
|
||||
def _write_image(path: Path) -> None:
    """Save a 32x24 all-white RGB image at ``path``."""
    white = (255, 255, 255)
    blank = Image.new("RGB", (32, 24), color=white)
    blank.save(path)
|
||||
|
||||
|
||||
def _case_rows(screenshot: Path) -> list[dict]:
    """Return the single benchmark row used by these tests.

    The row expects a click inside a circular region and points at
    ``screenshot`` for its image.
    """
    task = {
        "intent": "click save",
        "target_text": "Enregistrer",
        "current_window": "Enregistrer sous",
        "expected_next_window": "Bloc-notes",
        "question": "Clique uniquement sur Enregistrer.",
    }
    expectation = {
        "decision": "click",
        "click_region": {"x_pct": 0.5, "y_pct": 0.8, "radius_pct": 0.1},
    }
    row = {
        "case_id": "visible",
        "screenshot_path": str(screenshot),
        "task": task,
        "expectation": expectation,
    }
    return [row]
|
||||
|
||||
|
||||
def _load_one_case(tmp_path: Path):
    """Create a screenshot and a one-row cases file, then load that case."""
    image_file = tmp_path / "screen.png"
    _write_image(image_file)
    jsonl_file = tmp_path / "cases.jsonl"
    _write_jsonl(jsonl_file, _case_rows(image_file))
    loaded = load_cases(jsonl_file, repo_root=tmp_path)
    return loaded[0]
|
||||
|
||||
|
||||
def test_build_payload_uses_image_without_leaking_expectation(tmp_path):
    """The payload carries the image and task text but nothing from the expectation."""
    payload = build_ollama_payload(
        _load_one_case(tmp_path), model="qwen-test", image_b64="abc123"
    )
    user_message = payload["messages"][1]
    dumped = json.dumps(payload)

    assert payload["model"] == "qwen-test"
    assert user_message["images"] == ["abc123"]
    assert "Expected current window: Enregistrer sous" in user_message["content"]
    # Ground-truth fields must never reach the model.
    for leaked_key in ("click_region", "expectation"):
        assert leaked_key not in dumped
|
||||
|
||||
|
||||
def test_extract_json_object_accepts_fences_and_single_quotes():
    """Fenced replies and single-quoted pseudo-JSON are both recovered."""
    fenced_reply = '```json\n{"decision":"abstain"}\n```'
    single_quoted_reply = "prefix {'decision':'wait'} suffix"

    assert extract_json_object(fenced_reply) == {"decision": "abstain"}
    assert extract_json_object(single_quoted_reply) == {"decision": "wait"}
|
||||
|
||||
|
||||
def test_normalize_prediction_accepts_valid_click(tmp_path):
    """A click with in-range coordinates is kept, numeric strings included."""
    reply = {
        "decision": "click",
        "x_pct": 0.51,
        "y_pct": "0.79",
        "confidence": 0.9,
        "reason": "ok",
    }

    prediction = normalize_prediction(_load_one_case(tmp_path), reply, model="qwen-test")

    assert prediction["case_id"] == "visible"
    assert prediction["decision"] == "click"
    assert prediction["x_pct"] == 0.51
    assert prediction["y_pct"] == 0.79
|
||||
|
||||
|
||||
def test_normalize_prediction_forces_abstain_on_bad_click(tmp_path):
    """A click whose x coordinate exceeds 1.0 is downgraded to abstain."""
    out_of_range_reply = {"decision": "click", "x_pct": 1.2, "y_pct": 0.2, "confidence": 0.9}

    prediction = normalize_prediction(
        _load_one_case(tmp_path), out_of_range_reply, model="qwen-test"
    )

    assert prediction["decision"] == "abstain"
    assert prediction["reason"] == "coords_out_of_bounds"
|
||||
|
||||
|
||||
def test_run_ollama_case_uses_http_response_without_network(tmp_path):
    """run_ollama_case goes through the injected transport and encoder only."""
    canned_reply = (
        '{"decision":"abstain","x_pct":null,"y_pct":null,"confidence":0.8,"reason":"wrong_window"}'
    )

    def fake_post(url, json, timeout):
        assert url == "http://ollama.test/api/chat"
        assert timeout == 7
        assert json["messages"][1]["images"] == ["fake-image"]
        return _FakeResponse(200, canned_reply)

    options = {
        "model": "qwen-test",
        "endpoint": "http://ollama.test",
        "timeout": 7,
        "post": fake_post,
        "image_encoder": lambda _path: "fake-image",
        "retries": 0,
    }
    prediction = run_ollama_case(_load_one_case(tmp_path), **options)

    assert prediction["model"] == "qwen-test"
    assert prediction["decision"] == "abstain"
    assert prediction["x_pct"] is None
|
||||
|
||||
|
||||
def test_write_ollama_predictions_outputs_valid_leabench_jsonl(tmp_path):
    """The written file can be read back by load_predictions."""
    bench_case = _load_one_case(tmp_path)
    target_file = tmp_path / "predictions.jsonl"
    canned_reply = (
        '{"decision":"click","x_pct":0.5,"y_pct":0.8,"confidence":0.92,"reason":"visible"}'
    )

    def fake_post(_url, json, timeout):
        assert json["format"] == "json"
        # No timeout is passed below, so the default must be used.
        assert timeout == 45
        return _FakeResponse(200, canned_reply)

    write_ollama_predictions(
        [bench_case],
        target_file,
        model="qwen-test",
        endpoint="http://ollama.test",
        post=fake_post,
        image_encoder=lambda _path: "fake-image",
    )

    loaded = load_predictions(target_file)
    visible = loaded["visible"]
    assert visible.decision == "click"
    assert visible.x_pct == 0.5
|
||||
15
tools/lea_bench_ollama.py
Normal file
15
tools/lea_bench_ollama.py
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env python3
"""CLI wrapper for the local Ollama LeaBench adapter."""

import sys
from pathlib import Path

# Two levels up from this file: the parent of the directory holding the script.
ROOT = Path(__file__).resolve().parents[1]
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    # Put ROOT first on the import path so the ``core`` import below resolves.
    sys.path.insert(0, _root_entry)

# Imported only after the path tweak above, hence not at the top of the file.
from core.evaluation.ollama_lea_bench_adapter import main  # noqa: E402


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user