feat(evaluation): add local Ollama LeaBench adapter

2026-05-24 21:58:06 +02:00
parent 6544ebe3f0
commit debd7b423c
4 changed files with 498 additions and 0 deletions
--- a/benchmarks/computer_use/README.md
+++ b/benchmarks/computer_use/README.md
@@ -59,6 +59,16 @@ python3 tools/lea_bench.py \
  --json
 ```

+Produire des predictions avec Ollama local :
+
+```bash
+python3 tools/lea_bench_ollama.py \
+  --cases benchmarks/computer_use/cases/notepad_replay_failures_2026-05-24.jsonl \
+  --repo-root . \
+  --model qwen2.5vl:7b-rpa \
+  --output benchmarks/computer_use/predictions/qwen25vl_notepad.jsonl
+```
+
 ## Role strategique

 Ce bench evite de choisir un modele sur impression. On mesure :
--- a/core/evaluation/ollama_lea_bench_adapter.py
+++ b/core/evaluation/ollama_lea_bench_adapter.py
@@ -0,0 +1,313 @@
+"""Ollama adapter that writes LeaBench-compatible prediction JSONL.
+
+The adapter is deliberately outside Lea runtime. It only benchmarks a local
+vision model against static screenshots and never controls the desktop.
+"""
+
+from __future__ import annotations
+
+import argparse
+import base64
+import io
+import json
+import re
+import sys
+import time
+from pathlib import Path
+from typing import Any, Callable
+
+import requests
+from PIL import Image
+
+from core.evaluation.computer_use_bench import (
+    SAFE_NON_CLICK_DECISIONS,
+    BenchCase,
+    load_cases,
+)
+
+
+DEFAULT_MODEL = "qwen2.5vl:7b-rpa"
+DEFAULT_ENDPOINT = "http://localhost:11434"
+VALID_DECISIONS = {"click", *SAFE_NON_CLICK_DECISIONS}
+
+OLLAMA_SYSTEM_PROMPT = """You are a desktop UI safety judge. You receive ONE screenshot and a question
+about a target UI element. Your only job is to decide whether clicking is safe.
+
+You MUST respond with a single valid JSON object, nothing else: no prose, no
+markdown fences, no thinking tags. The JSON schema is:
+
+{
+  "decision": "click" | "abstain" | "pause" | "wait" | "no_action",
+  "x_pct": <float 0.0-1.0 OR null>,
+  "y_pct": <float 0.0-1.0 OR null>,
+  "confidence": <float 0.0-1.0>,
+  "reason": "<short string, max 120 chars, English or French>"
+}
+
+Rules:
+- "click" REQUIRES x_pct AND y_pct normalized 0..1, origin top-left.
+- If the requested target is not visible, in the wrong state, behind a modal,
+  or ambiguous: choose "abstain" and set x_pct/y_pct to null.
+- "pause" or "wait" is only for transient loading, animation, or moving focus.
+- "no_action" means the requested intent is already satisfied.
+- Clicking a wrong element is dangerous. When in doubt, abstain.
+- If the foreground window does not match the expected current window, abstain.
+
+Output ONLY the JSON object.
+"""
+
+
+HttpPost = Callable[..., Any]
+ImageEncoder = Callable[[Path], str]
+
+
+def build_ollama_user_prompt(case: BenchCase) -> str:
+    task = case.task
+    return "\n".join(
+        [
+            f"Intent: {_task_value(task, 'intent')}",
+            f"Target text or label: {_task_value(task, 'target_text')}",
+            f"Expected current window: {_task_value(task, 'current_window')}",
+            f"Expected next window after click: {_task_value(task, 'expected_next_window')}",
+            f"Question: {_task_value(task, 'question')}",
+            "",
+            "Reply with one JSON object as specified by the system prompt.",
+        ]
+    )
+
+
+def build_ollama_payload(
+    case: BenchCase,
+    *,
+    model: str,
+    image_b64: str,
+    temperature: float = 0.1,
+    num_ctx: int = 4096,
+    num_predict: int = 200,
+) -> dict[str, Any]:
+    return {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": OLLAMA_SYSTEM_PROMPT.strip()},
+            {
+                "role": "user",
+                "content": build_ollama_user_prompt(case),
+                "images": [image_b64],
+            },
+        ],
+        "stream": False,
+        "think": False,
+        "format": "json",
+        "options": {
+            "temperature": temperature,
+            "top_k": 1,
+            "num_predict": num_predict,
+            "num_ctx": num_ctx,
+        },
+    }
+
+
+def encode_screenshot_base64(path: Path, *, max_long_edge: int = 1280) -> str:
+    with Image.open(path) as img:
+        rgb = img.convert("RGB")
+        width, height = rgb.size
+        long_edge = max(width, height)
+        if long_edge > max_long_edge:
+            scale = max_long_edge / float(long_edge)
+            rgb = rgb.resize((int(width * scale), int(height * scale)))
+
+        buffer = io.BytesIO()
+        rgb.save(buffer, format="JPEG", quality=90)
+        return base64.b64encode(buffer.getvalue()).decode("ascii")
+
+
+def run_ollama_case(
+    case: BenchCase,
+    *,
+    model: str = DEFAULT_MODEL,
+    endpoint: str = DEFAULT_ENDPOINT,
+    timeout: int = 45,
+    post: HttpPost = requests.post,
+    image_encoder: ImageEncoder = encode_screenshot_base64,
+    retries: int = 1,
+) -> dict[str, Any]:
+    image_b64 = image_encoder(case.screenshot_path)
+    payload = build_ollama_payload(case, model=model, image_b64=image_b64)
+    url = f"{endpoint.rstrip('/')}/api/chat"
+
+    last_error = ""
+    for attempt in range(retries + 1):
+        try:
+            response = post(url, json=payload, timeout=timeout)
+            if getattr(response, "status_code", 0) != 200:
+                last_error = f"HTTP {getattr(response, 'status_code', 'unknown')}"
+            else:
+                text = response.json().get("message", {}).get("content", "")
+                parsed = extract_json_object(text)
+                if parsed is None and attempt < retries:
+                    payload["messages"][1]["content"] += (
+                        "\nYour previous answer was not valid JSON. Output JSON only."
+                    )
+                    continue
+                return normalize_prediction(case, parsed, model=model, raw_text=text)
+        except Exception as exc:  # pragma: no cover - exercised via fake response paths
+            last_error = str(exc)
+
+        if attempt < retries:
+            time.sleep(2)
+
+    return _safe_abstain(case, model, f"ollama_error: {last_error[:80]}")
+
+
+def extract_json_object(text: str) -> dict[str, Any] | None:
+    cleaned = text.strip()
+    if "```" in cleaned:
+        cleaned = "\n".join(line for line in cleaned.splitlines() if not line.strip().startswith("```"))
+        cleaned = cleaned.strip()
+
+    for candidate in _json_candidates(cleaned):
+        try:
+            parsed = json.loads(candidate)
+            return parsed if isinstance(parsed, dict) else None
+        except json.JSONDecodeError:
+            fixed = candidate.replace("'", '"')
+            try:
+                parsed = json.loads(fixed)
+                return parsed if isinstance(parsed, dict) else None
+            except json.JSONDecodeError:
+                pass
+    return None
+
+
+def normalize_prediction(
+    case: BenchCase,
+    data: dict[str, Any] | None,
+    *,
+    model: str,
+    raw_text: str = "",
+) -> dict[str, Any]:
+    if not isinstance(data, dict):
+        return _safe_abstain(case, model, f"parse_error: {raw_text[:80]}")
+
+    decision = str(data.get("decision", "")).strip().lower()
+    if decision not in VALID_DECISIONS:
+        return _safe_abstain(case, model, f"invalid_decision: {decision[:40]}")
+
+    confidence = _optional_float(data.get("confidence"))
+    reason = str(data.get("reason", ""))[:160]
+
+    if decision == "click":
+        x_pct = _optional_float(data.get("x_pct"))
+        y_pct = _optional_float(data.get("y_pct"))
+        if x_pct is None or y_pct is None:
+            return _safe_abstain(case, model, "click_without_coords")
+        if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
+            return _safe_abstain(case, model, "coords_out_of_bounds")
+        return {
+            "case_id": case.case_id,
+            "model": model,
+            "decision": "click",
+            "x_pct": x_pct,
+            "y_pct": y_pct,
+            "confidence": confidence,
+            "reason": reason,
+        }
+
+    return {
+        "case_id": case.case_id,
+        "model": model,
+        "decision": decision,
+        "x_pct": None,
+        "y_pct": None,
+        "confidence": confidence,
+        "reason": reason,
+    }
+
+
+def write_ollama_predictions(
+    cases: list[BenchCase],
+    output_path: str | Path,
+    *,
+    model: str = DEFAULT_MODEL,
+    endpoint: str = DEFAULT_ENDPOINT,
+    timeout: int = 45,
+    post: HttpPost = requests.post,
+    image_encoder: ImageEncoder = encode_screenshot_base64,
+) -> None:
+    out = Path(output_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with out.open("w", encoding="utf-8") as f:
+        for case in cases:
+            prediction = run_ollama_case(
+                case,
+                model=model,
+                endpoint=endpoint,
+                timeout=timeout,
+                post=post,
+                image_encoder=image_encoder,
+            )
+            f.write(json.dumps(prediction, ensure_ascii=False) + "\n")
+            f.flush()
+
+
+def _safe_abstain(case: BenchCase, model: str, reason: str) -> dict[str, Any]:
+    return {
+        "case_id": case.case_id,
+        "model": model,
+        "decision": "abstain",
+        "x_pct": None,
+        "y_pct": None,
+        "confidence": 0.0,
+        "reason": reason,
+    }
+
+
+def _json_candidates(text: str) -> list[str]:
+    candidates = [text]
+    candidates.extend(match.group(0) for match in re.finditer(r"\{[^{}]+\}", text))
+    return candidates
+
+
+def _optional_float(value: Any) -> float | None:
+    if value is None:
+        return None
+    try:
+        out = float(value)
+    except (TypeError, ValueError):
+        return None
+    if out != out or out in (float("inf"), float("-inf")):
+        return None
+    return out
+
+
+def _task_value(task: dict[str, Any], key: str) -> str:
+    value = task.get(key)
+    if value is None:
+        return ""
+    return str(value)
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description="Run local Ollama model on LeaBench cases.")
+    parser.add_argument("--cases", required=True, help="Path to LeaBench cases JSONL.")
+    parser.add_argument("--output", required=True, help="Output predictions JSONL.")
+    parser.add_argument("--repo-root", default=".", help="Repository root for relative screenshot paths.")
+    parser.add_argument("--endpoint", default=DEFAULT_ENDPOINT, help="Ollama endpoint.")
+    parser.add_argument("--model", default=DEFAULT_MODEL, help="Ollama model name.")
+    parser.add_argument("--timeout", type=int, default=45, help="Per-case timeout in seconds.")
+    args = parser.parse_args(argv)
+
+    cases = load_cases(args.cases, repo_root=args.repo_root)
+    write_ollama_predictions(
+        cases,
+        args.output,
+        model=args.model,
+        endpoint=args.endpoint,
+        timeout=args.timeout,
+    )
+    print(f"Wrote Ollama predictions: {args.output}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
--- a/tests/unit/test_ollama_lea_bench_adapter.py
+++ b/tests/unit/test_ollama_lea_bench_adapter.py
@@ -0,0 +1,160 @@
+import json
+from pathlib import Path
+
+from PIL import Image
+
+from core.evaluation.computer_use_bench import load_cases, load_predictions
+from core.evaluation.ollama_lea_bench_adapter import (
+    build_ollama_payload,
+    extract_json_object,
+    normalize_prediction,
+    run_ollama_case,
+    write_ollama_predictions,
+)
+
+
+class _FakeResponse:
+    def __init__(self, status_code: int, content: str):
+        self.status_code = status_code
+        self._content = content
+
+    def json(self):
+        return {"message": {"content": self._content}}
+
+
+def _write_jsonl(path: Path, rows: list[dict]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as f:
+        for row in rows:
+            f.write(json.dumps(row) + "\n")
+
+
+def _write_image(path: Path) -> None:
+    Image.new("RGB", (32, 24), color=(255, 255, 255)).save(path)
+
+
+def _case_rows(screenshot: Path) -> list[dict]:
+    return [
+        {
+            "case_id": "visible",
+            "screenshot_path": str(screenshot),
+            "task": {
+                "intent": "click save",
+                "target_text": "Enregistrer",
+                "current_window": "Enregistrer sous",
+                "expected_next_window": "Bloc-notes",
+                "question": "Clique uniquement sur Enregistrer.",
+            },
+            "expectation": {
+                "decision": "click",
+                "click_region": {"x_pct": 0.5, "y_pct": 0.8, "radius_pct": 0.1},
+            },
+        }
+    ]
+
+
+def _load_one_case(tmp_path: Path):
+    screenshot = tmp_path / "screen.png"
+    _write_image(screenshot)
+    cases_path = tmp_path / "cases.jsonl"
+    _write_jsonl(cases_path, _case_rows(screenshot))
+    return load_cases(cases_path, repo_root=tmp_path)[0]
+
+
+def test_build_payload_uses_image_without_leaking_expectation(tmp_path):
+    case = _load_one_case(tmp_path)
+
+    payload = build_ollama_payload(case, model="qwen-test", image_b64="abc123")
+    serialized = json.dumps(payload)
+
+    assert payload["model"] == "qwen-test"
+    assert payload["messages"][1]["images"] == ["abc123"]
+    assert "Expected current window: Enregistrer sous" in payload["messages"][1]["content"]
+    assert "click_region" not in serialized
+    assert "expectation" not in serialized
+
+
+def test_extract_json_object_accepts_fences_and_single_quotes():
+    assert extract_json_object('```json\n{"decision":"abstain"}\n```') == {"decision": "abstain"}
+    assert extract_json_object("prefix {'decision':'wait'} suffix") == {"decision": "wait"}
+
+
+def test_normalize_prediction_accepts_valid_click(tmp_path):
+    case = _load_one_case(tmp_path)
+
+    prediction = normalize_prediction(
+        case,
+        {"decision": "click", "x_pct": 0.51, "y_pct": "0.79", "confidence": 0.9, "reason": "ok"},
+        model="qwen-test",
+    )
+
+    assert prediction["case_id"] == "visible"
+    assert prediction["decision"] == "click"
+    assert prediction["x_pct"] == 0.51
+    assert prediction["y_pct"] == 0.79
+
+
+def test_normalize_prediction_forces_abstain_on_bad_click(tmp_path):
+    case = _load_one_case(tmp_path)
+
+    prediction = normalize_prediction(
+        case,
+        {"decision": "click", "x_pct": 1.2, "y_pct": 0.2, "confidence": 0.9},
+        model="qwen-test",
+    )
+
+    assert prediction["decision"] == "abstain"
+    assert prediction["reason"] == "coords_out_of_bounds"
+
+
+def test_run_ollama_case_uses_http_response_without_network(tmp_path):
+    case = _load_one_case(tmp_path)
+
+    def fake_post(url, json, timeout):
+        assert url == "http://ollama.test/api/chat"
+        assert timeout == 7
+        assert json["messages"][1]["images"] == ["fake-image"]
+        return _FakeResponse(
+            200,
+            '{"decision":"abstain","x_pct":null,"y_pct":null,"confidence":0.8,"reason":"wrong_window"}',
+        )
+
+    prediction = run_ollama_case(
+        case,
+        model="qwen-test",
+        endpoint="http://ollama.test",
+        timeout=7,
+        post=fake_post,
+        image_encoder=lambda _path: "fake-image",
+        retries=0,
+    )
+
+    assert prediction["model"] == "qwen-test"
+    assert prediction["decision"] == "abstain"
+    assert prediction["x_pct"] is None
+
+
+def test_write_ollama_predictions_outputs_valid_leabench_jsonl(tmp_path):
+    case = _load_one_case(tmp_path)
+    predictions_path = tmp_path / "predictions.jsonl"
+
+    def fake_post(_url, json, timeout):
+        assert json["format"] == "json"
+        assert timeout == 45
+        return _FakeResponse(
+            200,
+            '{"decision":"click","x_pct":0.5,"y_pct":0.8,"confidence":0.92,"reason":"visible"}',
+        )
+
+    write_ollama_predictions(
+        [case],
+        predictions_path,
+        model="qwen-test",
+        endpoint="http://ollama.test",
+        post=fake_post,
+        image_encoder=lambda _path: "fake-image",
+    )
+
+    predictions = load_predictions(predictions_path)
+    assert predictions["visible"].decision == "click"
+    assert predictions["visible"].x_pct == 0.5
--- a/tools/lea_bench_ollama.py
+++ b/tools/lea_bench_ollama.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+"""CLI wrapper for the local Ollama LeaBench adapter."""
+
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+from core.evaluation.ollama_lea_bench_adapter import main
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())