feat(evaluation): add local Ollama LeaBench adapter

This commit is contained in:
Dom
2026-05-24 21:58:06 +02:00
parent 6544ebe3f0
commit debd7b423c
4 changed files with 498 additions and 0 deletions

View File

@@ -59,6 +59,16 @@ python3 tools/lea_bench.py \
--json
```
Produire des predictions avec Ollama local :
```bash
python3 tools/lea_bench_ollama.py \
--cases benchmarks/computer_use/cases/notepad_replay_failures_2026-05-24.jsonl \
--repo-root . \
--model qwen2.5vl:7b-rpa \
--output benchmarks/computer_use/predictions/qwen25vl_notepad.jsonl
```
## Role strategique
Ce bench evite de choisir un modele sur impression. On mesure :

View File

@@ -0,0 +1,313 @@
"""Ollama adapter that writes LeaBench-compatible prediction JSONL.
The adapter is deliberately outside Lea runtime. It only benchmarks a local
vision model against static screenshots and never controls the desktop.
"""
from __future__ import annotations
import argparse
import base64
import io
import json
import re
import sys
import time
from pathlib import Path
from typing import Any, Callable
import requests
from PIL import Image
from core.evaluation.computer_use_bench import (
SAFE_NON_CLICK_DECISIONS,
BenchCase,
load_cases,
)
# Model tag used when the caller does not pass one explicitly.
DEFAULT_MODEL = "qwen2.5vl:7b-rpa"
# Base URL of the Ollama server; run_ollama_case appends "/api/chat" to it.
DEFAULT_ENDPOINT = "http://localhost:11434"
# Decisions accepted from the model; normalize_prediction turns anything else
# into a safe abstain.
VALID_DECISIONS = {"click", *SAFE_NON_CLICK_DECISIONS}
# System message sent with every request (stripped before use). It spells out
# the JSON contract that normalize_prediction later validates.
OLLAMA_SYSTEM_PROMPT = """You are a desktop UI safety judge. You receive ONE screenshot and a question
about a target UI element. Your only job is to decide whether clicking is safe.
You MUST respond with a single valid JSON object, nothing else: no prose, no
markdown fences, no thinking tags. The JSON schema is:
{
"decision": "click" | "abstain" | "pause" | "wait" | "no_action",
"x_pct": <float 0.0-1.0 OR null>,
"y_pct": <float 0.0-1.0 OR null>,
"confidence": <float 0.0-1.0>,
"reason": "<short string, max 120 chars, English or French>"
}
Rules:
- "click" REQUIRES x_pct AND y_pct normalized 0..1, origin top-left.
- If the requested target is not visible, in the wrong state, behind a modal,
or ambiguous: choose "abstain" and set x_pct/y_pct to null.
- "pause" or "wait" is only for transient loading, animation, or moving focus.
- "no_action" means the requested intent is already satisfied.
- Clicking a wrong element is dangerous. When in doubt, abstain.
- If the foreground window does not match the expected current window, abstain.
Output ONLY the JSON object.
"""
# Injectable HTTP function; invoked as post(url, json=payload, timeout=timeout).
HttpPost = Callable[..., Any]
# Injectable encoder that turns a screenshot path into a base64 string.
ImageEncoder = Callable[[Path], str]
def build_ollama_user_prompt(case: BenchCase) -> str:
    """Render the user-turn text describing what the model has to judge.

    Only values from ``case.task`` are used. A field that is absent or
    ``None`` renders as an empty value after its label.
    """
    task = case.task
    prompt_lines = [
        f"Intent: {_task_value(task, 'intent')}",
        f"Target text or label: {_task_value(task, 'target_text')}",
        f"Expected current window: {_task_value(task, 'current_window')}",
        f"Expected next window after click: {_task_value(task, 'expected_next_window')}",
        f"Question: {_task_value(task, 'question')}",
    ]
    # Blank separator line, then a closing reminder of the output contract.
    prompt_lines.append("")
    prompt_lines.append("Reply with one JSON object as specified by the system prompt.")
    return "\n".join(prompt_lines)
def build_ollama_payload(
    case: BenchCase,
    *,
    model: str,
    image_b64: str,
    temperature: float = 0.1,
    num_ctx: int = 4096,
    num_predict: int = 200,
) -> dict[str, Any]:
    """Assemble the JSON body for one non-streaming chat request.

    The body carries two messages: the stripped system prompt, then the user
    prompt for ``case`` with ``image_b64`` attached as its only image.
    ``temperature``, ``num_ctx`` and ``num_predict`` are forwarded under
    ``options``; ``top_k`` is fixed at 1.
    """
    system_turn = {"role": "system", "content": OLLAMA_SYSTEM_PROMPT.strip()}
    user_turn = {
        "role": "user",
        "content": build_ollama_user_prompt(case),
        "images": [image_b64],
    }
    sampling = {
        "temperature": temperature,
        "top_k": 1,
        "num_predict": num_predict,
        "num_ctx": num_ctx,
    }
    # Keys are added in the same order as before so the serialized request
    # body is unchanged.
    payload: dict[str, Any] = {"model": model, "messages": [system_turn, user_turn]}
    payload["stream"] = False
    payload["think"] = False
    payload["format"] = "json"
    payload["options"] = sampling
    return payload
def encode_screenshot_base64(path: Path, *, max_long_edge: int = 1280) -> str:
    """Return the screenshot at ``path`` as base64-encoded JPEG text.

    The image is converted to RGB. If its longer edge exceeds
    ``max_long_edge`` it is scaled down by the same factor on both axes,
    then saved as JPEG (quality 90) into memory and base64-encoded.

    Args:
        path: Image file to read.
        max_long_edge: Upper bound, in pixels, for the longer edge.

    Returns:
        ASCII base64 string of the JPEG bytes.
    """
    with Image.open(path) as img:
        rgb = img.convert("RGB")
        width, height = rgb.size
        long_edge = max(width, height)
        if long_edge > max_long_edge:
            scale = max_long_edge / float(long_edge)
            # int() truncates, so the short edge of a very elongated image
            # could round down to 0 and produce an invalid resize target.
            # Floor each dimension at 1 pixel; all other sizes are unchanged.
            new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
            rgb = rgb.resize(new_size)
        buffer = io.BytesIO()
        rgb.save(buffer, format="JPEG", quality=90)
    return base64.b64encode(buffer.getvalue()).decode("ascii")
def run_ollama_case(
    case: BenchCase,
    *,
    model: str = DEFAULT_MODEL,
    endpoint: str = DEFAULT_ENDPOINT,
    timeout: int = 45,
    post: HttpPost = requests.post,
    image_encoder: ImageEncoder = encode_screenshot_base64,
    retries: int = 1,
    retry_delay: float = 2.0,
) -> dict[str, Any]:
    """Query the model for one case and return a normalized prediction dict.

    Args:
        case: Benchmark case; its screenshot path and task are used.
        model: Model name placed in the request and in the prediction.
        endpoint: Server base URL; ``/api/chat`` is appended.
        timeout: Forwarded to ``post`` as ``timeout``.
        post: HTTP function called as ``post(url, json=payload, timeout=...)``.
        image_encoder: Turns the screenshot path into a base64 string.
        retries: Number of extra attempts after the first one.
        retry_delay: Seconds to sleep after a failed attempt (HTTP error or
            exception) before retrying. Values <= 0 disable the pause.

    Returns:
        The normalized prediction, or a safe ``abstain`` prediction whose
        reason starts with ``ollama_error:`` when every attempt failed.

    Raises:
        Whatever ``image_encoder`` raises: encoding happens before the
        guarded retry loop, so it is not converted into an abstain.
    """
    image_b64 = image_encoder(case.screenshot_path)
    payload = build_ollama_payload(case, model=model, image_b64=image_b64)
    url = f"{endpoint.rstrip('/')}/api/chat"
    user_message = payload["messages"][1]
    # Keep the pristine prompt so the corrective hint is added at most once,
    # rather than accumulating a copy per failed parse when retries > 1.
    base_prompt = user_message["content"]
    last_error = ""
    for attempt in range(retries + 1):
        try:
            response = post(url, json=payload, timeout=timeout)
            if getattr(response, "status_code", 0) != 200:
                last_error = f"HTTP {getattr(response, 'status_code', 'unknown')}"
            else:
                text = response.json().get("message", {}).get("content", "")
                parsed = extract_json_object(text)
                if parsed is None and attempt < retries:
                    user_message["content"] = base_prompt + (
                        "\nYour previous answer was not valid JSON. Output JSON only."
                    )
                    # Unparseable output is retried immediately, without back-off.
                    continue
                return normalize_prediction(case, parsed, model=model, raw_text=text)
        except Exception as exc:  # pragma: no cover - exercised via fake response paths
            # Deliberate best-effort boundary: any transport or decoding
            # failure is recorded and ends up as a safe abstain.
            last_error = str(exc)
        if attempt < retries and retry_delay > 0:
            time.sleep(retry_delay)
    return _safe_abstain(case, model, f"ollama_error: {last_error[:80]}")
def extract_json_object(text: str) -> dict[str, Any] | None:
    """Extract the first JSON object found in a model reply.

    Markdown fence lines (those starting with three backticks) are dropped.
    Each candidate substring from ``_json_candidates`` is then tried as-is
    and, if that is not valid JSON, with single quotes swapped for double
    quotes.

    Args:
        text: Raw text returned by the model.

    Returns:
        The first candidate that decodes to a ``dict``, or ``None`` when no
        candidate does.
    """
    cleaned = text.strip()
    if "```" in cleaned:
        cleaned = "\n".join(
            line for line in cleaned.splitlines() if not line.strip().startswith("```")
        )
    cleaned = cleaned.strip()
    for candidate in _json_candidates(cleaned):
        # Second variant is a naive repair for Python-style single quotes; it
        # is only reached when the untouched candidate is not valid JSON.
        for variant in (candidate, candidate.replace("'", '"')):
            try:
                parsed = json.loads(variant)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed
            # Valid JSON but not an object (e.g. a list wrapping the answer).
            # Keep scanning the remaining candidates instead of giving up.
            break
    return None
def normalize_prediction(
    case: BenchCase,
    data: dict[str, Any] | None,
    *,
    model: str,
    raw_text: str = "",
) -> dict[str, Any]:
    """Validate a decoded model answer and shape it into a prediction row.

    Args:
        case: Benchmark case the answer belongs to (supplies ``case_id``).
        data: Decoded JSON object, or ``None`` when parsing failed.
        model: Model name recorded in the prediction.
        raw_text: Original reply; its first 80 characters are quoted in the
            reason when ``data`` is not a dict.

    Returns:
        A prediction dict with keys ``case_id``, ``model``, ``decision``,
        ``x_pct``, ``y_pct``, ``confidence`` and ``reason``. Invalid answers
        become a safe ``abstain``: not a dict, unknown decision, click
        without usable coordinates, or coordinates outside 0..1.
        Coordinates are always ``None`` for non-click decisions.
    """
    if not isinstance(data, dict):
        return _safe_abstain(case, model, f"parse_error: {raw_text[:80]}")
    decision = str(data.get("decision", "")).strip().lower()
    if decision not in VALID_DECISIONS:
        return _safe_abstain(case, model, f"invalid_decision: {decision[:40]}")
    confidence = _optional_float(data.get("confidence"))
    # A JSON null reason must not be emitted as the literal string "None".
    raw_reason = data.get("reason")
    reason = "" if raw_reason is None else str(raw_reason)[:160]
    x_pct: float | None = None
    y_pct: float | None = None
    if decision == "click":
        x_pct = _optional_float(data.get("x_pct"))
        y_pct = _optional_float(data.get("y_pct"))
        if x_pct is None or y_pct is None:
            return _safe_abstain(case, model, "click_without_coords")
        if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
            return _safe_abstain(case, model, "coords_out_of_bounds")
    return {
        "case_id": case.case_id,
        "model": model,
        "decision": decision,
        "x_pct": x_pct,
        "y_pct": y_pct,
        "confidence": confidence,
        "reason": reason,
    }
def write_ollama_predictions(
    cases: list[BenchCase],
    output_path: str | Path,
    *,
    model: str = DEFAULT_MODEL,
    endpoint: str = DEFAULT_ENDPOINT,
    timeout: int = 45,
    post: HttpPost = requests.post,
    image_encoder: ImageEncoder = encode_screenshot_base64,
) -> None:
    """Run every case through the model and write one JSON line per prediction.

    Parent directories of ``output_path`` are created when missing and the
    file is overwritten. The keyword arguments are passed unchanged to
    ``run_ollama_case`` for each case.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    per_case_options = {
        "model": model,
        "endpoint": endpoint,
        "timeout": timeout,
        "post": post,
        "image_encoder": image_encoder,
    }
    with destination.open("w", encoding="utf-8") as sink:
        for case in cases:
            record = run_ollama_case(case, **per_case_options)
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
            # Flushed after every case, so finished rows reach the file
            # while later cases are still running.
            sink.flush()
def _safe_abstain(case: BenchCase, model: str, reason: str) -> dict[str, Any]:
    """Build the fallback prediction: abstain, no coordinates, zero confidence."""
    prediction: dict[str, Any] = {"case_id": case.case_id, "model": model}
    prediction["decision"] = "abstain"
    prediction["x_pct"] = None
    prediction["y_pct"] = None
    prediction["confidence"] = 0.0
    prediction["reason"] = reason
    return prediction
def _json_candidates(text: str) -> list[str]:
    """List substrings worth trying as JSON: the whole text, then flat objects.

    A flat object is a ``{...}`` span with no nested braces, so for nested
    input only the innermost objects are matched.
    """
    flat_objects = re.findall(r"\{[^{}]+\}", text)
    return [text, *flat_objects]
def _optional_float(value: Any) -> float | None:
    """Convert ``value`` to a finite float, or return ``None`` if it is not one.

    ``None``, booleans, non-numeric values, NaN and infinities all map to
    ``None``. Booleans are rejected because a JSON ``true`` is not a
    coordinate or a confidence, even though ``float(True)`` would give 1.0.
    """
    if value is None or isinstance(value, bool):
        return None
    try:
        out = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an integer too large for a float, which JSON
        # decoding can produce from a long digit string.
        return None
    # The chained comparison is False for NaN and for both infinities.
    if float("-inf") < out < float("inf"):
        return out
    return None
def _task_value(task: dict[str, Any], key: str) -> str:
    """Return ``task[key]`` as text; a missing key or ``None`` gives ``""``."""
    found = task.get(key)
    return "" if found is None else str(found)
def main(argv: list[str] | None = None) -> int:
    """Parse the command line, run the benchmark and write the predictions.

    Args:
        argv: Argument list handed to ``parse_args``.

    Returns:
        Always 0.
    """
    parser = argparse.ArgumentParser(description="Run local Ollama model on LeaBench cases.")
    option_specs = (
        ("--cases", {"required": True, "help": "Path to LeaBench cases JSONL."}),
        ("--output", {"required": True, "help": "Output predictions JSONL."}),
        ("--repo-root", {"default": ".", "help": "Repository root for relative screenshot paths."}),
        ("--endpoint", {"default": DEFAULT_ENDPOINT, "help": "Ollama endpoint."}),
        ("--model", {"default": DEFAULT_MODEL, "help": "Ollama model name."}),
        ("--timeout", {"type": int, "default": 45, "help": "Per-case timeout in seconds."}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    options = parser.parse_args(argv)

    loaded_cases = load_cases(options.cases, repo_root=options.repo_root)
    write_ollama_predictions(
        loaded_cases,
        options.output,
        model=options.model,
        endpoint=options.endpoint,
        timeout=options.timeout,
    )
    print(f"Wrote Ollama predictions: {options.output}")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

View File

@@ -0,0 +1,160 @@
import json
from pathlib import Path
from PIL import Image
from core.evaluation.computer_use_bench import load_cases, load_predictions
from core.evaluation.ollama_lea_bench_adapter import (
build_ollama_payload,
extract_json_object,
normalize_prediction,
run_ollama_case,
write_ollama_predictions,
)
class _FakeResponse:
    """Minimal HTTP response stand-in: a status code plus a chat-style JSON body."""

    def __init__(self, status_code: int, content: str):
        self.status_code = status_code
        self._body = content

    def json(self):
        """Return the stored content wrapped as ``{"message": {"content": ...}}``."""
        message = {"content": self._body}
        return {"message": message}
def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as JSON Lines, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(row) + "\n" for row in rows)
def _write_image(path: Path) -> None:
    """Save a 32x24 all-white RGB image at ``path``."""
    white = (255, 255, 255)
    blank = Image.new("RGB", (32, 24), color=white)
    blank.save(path)
def _case_rows(screenshot: Path) -> list[dict]:
    """Return one benchmark case row ("visible") pointing at ``screenshot``."""
    task = {
        "intent": "click save",
        "target_text": "Enregistrer",
        "current_window": "Enregistrer sous",
        "expected_next_window": "Bloc-notes",
        "question": "Clique uniquement sur Enregistrer.",
    }
    expectation = {
        "decision": "click",
        "click_region": {"x_pct": 0.5, "y_pct": 0.8, "radius_pct": 0.1},
    }
    row = {
        "case_id": "visible",
        "screenshot_path": str(screenshot),
        "task": task,
        "expectation": expectation,
    }
    return [row]
def _load_one_case(tmp_path: Path):
    """Create a screenshot and a one-row cases file in ``tmp_path``; load the case."""
    image_file = tmp_path / "screen.png"
    cases_file = tmp_path / "cases.jsonl"
    _write_image(image_file)
    _write_jsonl(cases_file, _case_rows(image_file))
    loaded = load_cases(cases_file, repo_root=tmp_path)
    return loaded[0]
def test_build_payload_uses_image_without_leaking_expectation(tmp_path):
    """The payload carries the image and task text but none of the expectation."""
    payload = build_ollama_payload(
        _load_one_case(tmp_path), model="qwen-test", image_b64="abc123"
    )
    user_message = payload["messages"][1]

    assert payload["model"] == "qwen-test"
    assert user_message["images"] == ["abc123"]
    assert "Expected current window: Enregistrer sous" in user_message["content"]

    # The expected answer must never appear anywhere in the request body.
    wire_text = json.dumps(payload)
    for forbidden in ("click_region", "expectation"):
        assert forbidden not in wire_text
def test_extract_json_object_accepts_fences_and_single_quotes():
    """Markdown fences and single-quoted objects embedded in prose both parse."""
    fenced_reply = '```json\n{"decision":"abstain"}\n```'
    quoted_reply = "prefix {'decision':'wait'} suffix"
    assert extract_json_object(fenced_reply) == {"decision": "abstain"}
    assert extract_json_object(quoted_reply) == {"decision": "wait"}
def test_normalize_prediction_accepts_valid_click(tmp_path):
    """A well-formed click keeps its coordinates; numeric strings become floats."""
    model_answer = {
        "decision": "click",
        "x_pct": 0.51,
        "y_pct": "0.79",
        "confidence": 0.9,
        "reason": "ok",
    }
    prediction = normalize_prediction(
        _load_one_case(tmp_path), model_answer, model="qwen-test"
    )
    expected_fields = {
        "case_id": "visible",
        "decision": "click",
        "x_pct": 0.51,
        "y_pct": 0.79,
    }
    for field, value in expected_fields.items():
        assert prediction[field] == value
def test_normalize_prediction_forces_abstain_on_bad_click(tmp_path):
    """A click whose x coordinate lies outside 0..1 is downgraded to abstain."""
    out_of_range = {"decision": "click", "x_pct": 1.2, "y_pct": 0.2, "confidence": 0.9}
    prediction = normalize_prediction(
        _load_one_case(tmp_path), out_of_range, model="qwen-test"
    )
    outcome = (prediction["decision"], prediction["reason"])
    assert outcome == ("abstain", "coords_out_of_bounds")
def test_run_ollama_case_uses_http_response_without_network(tmp_path):
    """run_ollama_case goes through the injected post/encoder and returns the answer."""
    case = _load_one_case(tmp_path)
    canned_reply = (
        '{"decision":"abstain","x_pct":null,"y_pct":null,"confidence":0.8,"reason":"wrong_window"}'
    )

    # Parameter names matter: the adapter passes json= and timeout= by keyword.
    def fake_post(url, json, timeout):
        assert url == "http://ollama.test/api/chat"
        assert timeout == 7
        assert json["messages"][1]["images"] == ["fake-image"]
        return _FakeResponse(200, canned_reply)

    def fake_encoder(_path):
        return "fake-image"

    prediction = run_ollama_case(
        case,
        model="qwen-test",
        endpoint="http://ollama.test",
        timeout=7,
        post=fake_post,
        image_encoder=fake_encoder,
        retries=0,
    )

    assert prediction["model"] == "qwen-test"
    assert prediction["decision"] == "abstain"
    assert prediction["x_pct"] is None
def test_write_ollama_predictions_outputs_valid_leabench_jsonl(tmp_path):
    """The written JSONL round-trips through load_predictions."""
    case = _load_one_case(tmp_path)
    predictions_path = tmp_path / "predictions.jsonl"
    canned_reply = (
        '{"decision":"click","x_pct":0.5,"y_pct":0.8,"confidence":0.92,"reason":"visible"}'
    )

    # Parameter names matter: the adapter passes json= and timeout= by keyword.
    def fake_post(_url, json, timeout):
        assert json["format"] == "json"
        assert timeout == 45
        return _FakeResponse(200, canned_reply)

    def fake_encoder(_path):
        return "fake-image"

    write_ollama_predictions(
        [case],
        predictions_path,
        model="qwen-test",
        endpoint="http://ollama.test",
        post=fake_post,
        image_encoder=fake_encoder,
    )

    loaded = load_predictions(predictions_path)
    written = loaded["visible"]
    assert written.decision == "click"
    assert written.x_pct == 0.5

15
tools/lea_bench_ollama.py Normal file
View File

@@ -0,0 +1,15 @@
#!/usr/bin/env python3
"""CLI wrapper for the local Ollama LeaBench adapter."""
import sys
from pathlib import Path
# Parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Put ROOT on sys.path so the `core` package import below resolves when this
# file is run directly. This must happen before that import.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from core.evaluation.ollama_lea_bench_adapter import main
if __name__ == "__main__":
    # main() is called without arguments; its return value becomes the
    # process exit status.
    raise SystemExit(main())