From 0f122a512f6dd952e5057a6fc1076a57489e1102 Mon Sep 17 00:00:00 2001 From: Dom Date: Thu, 4 Jun 2026 16:49:53 +0200 Subject: [PATCH] feat(p1y-alpha): add OpenAI-compatible LeaBench adapter (benchmark only) Adapter de benchmark isole (hors runtime Lea) ciblant un serveur /v1/chat/completions a support vision (vLLM/SGLang/TGI), pour comparer plus tard a Ollama via LeaBench. Ne controle jamais le desktop. - core/evaluation/openai_compat_lea_bench_adapter.py : payload data-URL image_url, parsing choices[0].message.content. Reutilise par import la logique prompt/parse/normalisation de ollama_lea_bench_adapter (zero refactor). - tools/lea_bench_openai_compat.py : wrapper CLI (--base-url defaut :8001). - tests/unit/test_openai_compat_lea_bench_adapter.py : 6 tests mockes HTTP (data URL, pas de fuite expectation/click_region, prediction valide, abstain safe sur HTTP!=200 et reponse malformee, JSONL rechargeable). Aucun runtime Lea modifie. Aucun service lance. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../openai_compat_lea_bench_adapter.py | 191 ++++++++++++++++++ .../test_openai_compat_lea_bench_adapter.py | 163 +++++++++++++++ tools/lea_bench_openai_compat.py | 15 ++ 3 files changed, 369 insertions(+) create mode 100644 core/evaluation/openai_compat_lea_bench_adapter.py create mode 100644 tests/unit/test_openai_compat_lea_bench_adapter.py create mode 100644 tools/lea_bench_openai_compat.py diff --git a/core/evaluation/openai_compat_lea_bench_adapter.py b/core/evaluation/openai_compat_lea_bench_adapter.py new file mode 100644 index 000000000..6907b3851 --- /dev/null +++ b/core/evaluation/openai_compat_lea_bench_adapter.py @@ -0,0 +1,191 @@ +"""OpenAI-compatible adapter that writes LeaBench-compatible prediction JSONL. + +Benchmark only — strictly outside Lea runtime. It targets any server exposing +`POST /v1/chat/completions` with vision support (vLLM, SGLang, TGI, ...) and +never controls the desktop. 
+
+Reuses the prompt/parsing/normalisation logic of the Ollama adapter
+(`ollama_lea_bench_adapter`) to keep behaviour strictly aligned; only the
+payload format (data URL `image_url`) and the response parsing
+(`choices[0].message.content`) differ.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import time
+from pathlib import Path
+from typing import Any, Callable
+
+import requests
+
+from core.evaluation.computer_use_bench import BenchCase, load_cases
+from core.evaluation.ollama_lea_bench_adapter import (
+    OLLAMA_SYSTEM_PROMPT,
+    build_ollama_user_prompt,
+    encode_screenshot_base64,
+    extract_json_object,
+    normalize_prediction,
+    _safe_abstain,
+)
+
+
+DEFAULT_MODEL = "qwen3-vl:8b"
+DEFAULT_BASE_URL = "http://localhost:8001"
+
+HttpPost = Callable[..., Any]
+ImageEncoder = Callable[[Path], str]
+
+
+def build_openai_compat_payload(
+    case: BenchCase,
+    *,
+    model: str,
+    image_b64: str,
+    temperature: float = 0.1,
+    max_tokens: int = 200,
+    json_response_format: bool = True,
+) -> dict[str, Any]:
+    """Build a vision-capable `/v1/chat/completions` payload.
+
+    The image is sent as a data URL labelled JPEG (`data:image/jpeg;base64,...`),
+    the `image_url` part format used by OpenAI/vLLM/SGLang; `image_b64` is assumed
+    to be JPEG data (TODO confirm). Prompts come from the Ollama adapter.
+    """
+    payload: dict[str, Any] = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": OLLAMA_SYSTEM_PROMPT.strip()},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": build_ollama_user_prompt(case)},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
+                    },
+                ],
+            },
+        ],
+        "stream": False,
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+    }
+    if json_response_format:
+        # Supported by OpenAI, vLLM (>=0.4) and SGLang; per the original author,
+        # servers that do not know it silently ignore it (TODO confirm).
+        payload["response_format"] = {"type": "json_object"}
+    return payload
+
+
+def _extract_content(response_json: Any) -> str | None:
+    """Return `choices[0].message.content`, or None if the shape is unexpected."""
+    if not isinstance(response_json, dict):
+        return None
+    choices = response_json.get("choices")
+    if not isinstance(choices, list) or not choices:
+        return None
+    message = choices[0].get("message") if isinstance(choices[0], dict) else None
+    if not isinstance(message, dict):
+        return None
+    content = message.get("content")
+    return content if isinstance(content, str) else None
+
+
+def run_openai_compat_case(
+    case: BenchCase,
+    *,
+    model: str = DEFAULT_MODEL,
+    base_url: str = DEFAULT_BASE_URL,
+    timeout: int = 45,
+    post: HttpPost = requests.post,
+    image_encoder: ImageEncoder = encode_screenshot_base64,
+    retries: int = 1,
+) -> dict[str, Any]:
+    image_b64 = image_encoder(case.screenshot_path)
+    payload = build_openai_compat_payload(case, model=model, image_b64=image_b64)
+    url = f"{base_url.rstrip('/')}/v1/chat/completions"
+
+    last_error = ""
+    for attempt in range(retries + 1):  # first attempt plus up to `retries` retries
+        try:
+            response = post(url, json=payload, timeout=timeout)
+            if getattr(response, "status_code", 0) != 200:
+                last_error = f"HTTP {getattr(response, 'status_code', 'unknown')}"
+            else:
+                text = _extract_content(response.json())
+                if text is None:
+                    last_error = "missing_choices_content"
+                else:
+                    parsed = extract_json_object(text)
+                    if parsed is None and attempt < retries:
+                        # Retry, reminding the model of the JSON-only contract.
+                        text_msg = payload["messages"][1]["content"][0]
+                        text_msg["text"] += (
+                            "\nYour previous answer was not valid JSON. Output JSON only."
+                        )
+                        continue
+                    return normalize_prediction(case, parsed, model=model, raw_text=text)
+        except Exception as exc:  # pragma: no cover - exercised via fake response paths
+            last_error = str(exc)
+
+        if attempt < retries:
+            time.sleep(2)  # fixed 2 s pause before the next attempt
+
+    return _safe_abstain(case, model, f"openai_compat_error: {last_error[:80]}")
+
+
+def write_openai_compat_predictions(
+    cases: list[BenchCase],
+    output_path: str | Path,
+    *,
+    model: str = DEFAULT_MODEL,
+    base_url: str = DEFAULT_BASE_URL,
+    timeout: int = 45,
+    post: HttpPost = requests.post,
+    image_encoder: ImageEncoder = encode_screenshot_base64,
+) -> None:
+    out = Path(output_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with out.open("w", encoding="utf-8") as f:  # "w": an existing file is overwritten
+        for case in cases:
+            prediction = run_openai_compat_case(
+                case,
+                model=model,
+                base_url=base_url,
+                timeout=timeout,
+                post=post,
+                image_encoder=image_encoder,
+            )
+            f.write(json.dumps(prediction, ensure_ascii=False) + "\n")  # one JSON object per line
+            f.flush()
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        description="Run an OpenAI-compatible vision server on LeaBench cases."
+    )
+    parser.add_argument("--cases", required=True, help="Path to LeaBench cases JSONL.")
+    parser.add_argument("--output", required=True, help="Output predictions JSONL.")
+    parser.add_argument("--repo-root", default=".", help="Repository root for relative screenshot paths.")
+    parser.add_argument("--base-url", default=DEFAULT_BASE_URL, help="OpenAI-compatible base URL.")
+    parser.add_argument("--model", default=DEFAULT_MODEL, help="Model name served by the endpoint.")
+    parser.add_argument("--timeout", type=int, default=45, help="Per-case timeout in seconds.")
+    args = parser.parse_args(argv)  # argv=None makes argparse read sys.argv[1:]
+
+    cases = load_cases(args.cases, repo_root=args.repo_root)
+    write_openai_compat_predictions(
+        cases,
+        args.output,
+        model=args.model,
+        base_url=args.base_url,
+        timeout=args.timeout,
+    )
+    print(f"Wrote OpenAI-compatible predictions: {args.output}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
diff --git a/tests/unit/test_openai_compat_lea_bench_adapter.py b/tests/unit/test_openai_compat_lea_bench_adapter.py
new file mode 100644
index 000000000..53cb6d981
--- /dev/null
+++ b/tests/unit/test_openai_compat_lea_bench_adapter.py
@@ -0,0 +1,163 @@
+"""Tests P1.y-alpha — OpenAI-compatible LeaBench adapter (isolated benchmark).
+
+The module lives outside the Lea runtime: it benchmarks a vision model served
+at `/v1/chat/completions` (vLLM/SGLang/TGI) against static screenshots and
+never controls the desktop. HTTP-mocked tests only.
+"""
+
+import json
+from pathlib import Path
+
+from PIL import Image
+
+from core.evaluation.computer_use_bench import load_cases, load_predictions
+from core.evaluation.openai_compat_lea_bench_adapter import (
+    build_openai_compat_payload,
+    run_openai_compat_case,
+    write_openai_compat_predictions,
+)
+
+
+class _FakeResponse:
+    """Mimics an OpenAI-compatible `requests` response."""
+
+    def __init__(self, status_code: int, content: str = "", *, raw: dict | None = None):
+        self.status_code = status_code
+        self._content = content
+        self._raw = raw
+
+    def json(self):
+        if self._raw is not None:  # an explicit raw body takes precedence over `content`
+            return self._raw
+        return {"choices": [{"message": {"content": self._content}}]}
+
+
+def _write_jsonl(path: Path, rows: list[dict]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as f:
+        for row in rows:
+            f.write(json.dumps(row) + "\n")
+
+
+def _write_image(path: Path) -> None:
+    Image.new("RGB", (32, 24), color=(255, 255, 255)).save(path)  # small all-white image
+
+
+def _case_rows(screenshot: Path) -> list[dict]:
+    return [
+        {
+            "case_id": "visible",
+            "screenshot_path": str(screenshot),
+            "task": {
+                "intent": "click save",
+                "target_text": "Enregistrer",
+                "current_window": "Enregistrer sous",
+                "expected_next_window": "Bloc-notes",
+                "question": "Clique uniquement sur Enregistrer.",
+            },
+            "expectation": {
+                "decision": "click",
+                "click_region": {"x_pct": 0.5, "y_pct": 0.8, "radius_pct": 0.1},
+            },
+        }
+    ]
+
+
+def _load_one_case(tmp_path: Path):
+    screenshot = tmp_path / "screen.png"
+    _write_image(screenshot)
+    cases_path = tmp_path / "cases.jsonl"
+    _write_jsonl(cases_path, _case_rows(screenshot))
+    return load_cases(cases_path, repo_root=tmp_path)[0]
+
+
+def test_payload_embeds_image_as_data_url(tmp_path):
+    case = _load_one_case(tmp_path)
+
+    payload = build_openai_compat_payload(case, model="qwen-test", image_b64="abc123")
+
+    assert payload["model"] == "qwen-test"
+    user_msg = next(m for m in payload["messages"] if m["role"] == "user")
+    image_parts = [p for p in user_msg["content"] if p.get("type") == "image_url"]
+    assert image_parts, "le message user doit contenir une part image_url"
+    assert image_parts[0]["image_url"]["url"] == "data:image/jpeg;base64,abc123"
+
+
+def test_payload_does_not_leak_expectation(tmp_path):
+    case = _load_one_case(tmp_path)
+
+    payload = build_openai_compat_payload(case, model="qwen-test", image_b64="abc123")
+    serialized = json.dumps(payload)
+
+    assert "click_region" not in serialized
+    assert "expectation" not in serialized
+    assert "0.8" not in serialized  # the expected coordinate must not leak
+
+
+def test_valid_response_yields_valid_click_prediction(tmp_path):
+    case = _load_one_case(tmp_path)
+    content = json.dumps(
+        {"decision": "click", "x_pct": 0.5, "y_pct": 0.8, "confidence": 0.9, "reason": "ok"}
+    )
+
+    def fake_post(url, json=None, timeout=None):
+        assert url.endswith("/v1/chat/completions")
+        return _FakeResponse(200, content)
+
+    pred = run_openai_compat_case(
+        case, model="qwen-test", post=fake_post, image_encoder=lambda p: "abc123"
+    )
+
+    assert pred["case_id"] == "visible"
+    assert pred["model"] == "qwen-test"
+    assert pred["decision"] == "click"
+    assert pred["x_pct"] == 0.5 and pred["y_pct"] == 0.8
+
+
+def test_http_error_returns_safe_abstain(tmp_path):
+    case = _load_one_case(tmp_path)
+
+    def fake_post(url, json=None, timeout=None):
+        return _FakeResponse(500, "")
+
+    pred = run_openai_compat_case(  # retries=0: skip the retry and its real 2 s sleep
+        case, model="qwen-test", post=fake_post, image_encoder=lambda p: "abc123", retries=0
+    )
+
+    assert pred["decision"] == "abstain"
+    assert pred["x_pct"] is None and pred["y_pct"] is None
+    assert pred["confidence"] == 0.0
+
+
+def test_malformed_response_returns_safe_abstain(tmp_path):
+    case = _load_one_case(tmp_path)
+
+    def fake_post(url, json=None, timeout=None):
+        return _FakeResponse(200, raw={"unexpected": "shape"})
+
+    pred = run_openai_compat_case(  # retries=0: skip the retry and its real 2 s sleep
+        case, model="qwen-test", post=fake_post, image_encoder=lambda p: "abc123", retries=0
+    )
+
+    assert pred["decision"] == "abstain"
+    assert pred["x_pct"] is None
+
+
+def test_write_predictions_is_loadable(tmp_path):
+    case = _load_one_case(tmp_path)
+    out = tmp_path / "preds.jsonl"
+    content = json.dumps(
+        {"decision": "abstain", "x_pct": None, "y_pct": None, "confidence": 0.2, "reason": "n/a"}
+    )
+
+    def fake_post(url, json=None, timeout=None):
+        return _FakeResponse(200, content)
+
+    write_openai_compat_predictions(
+        [case], out, model="qwen-test", post=fake_post, image_encoder=lambda p: "abc123"
+    )
+
+    preds = load_predictions(out)
+    assert len(preds) == 1
+    assert "visible" in preds
+    assert preds["visible"].decision == "abstain"
diff --git a/tools/lea_bench_openai_compat.py b/tools/lea_bench_openai_compat.py
new file mode 100644
index 000000000..2d4a8e156
--- /dev/null
+++ b/tools/lea_bench_openai_compat.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+"""CLI wrapper for the OpenAI-compatible LeaBench adapter (benchmark only)."""
+
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]  # directory that contains tools/
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))  # lets the `core` import below resolve when run as a script
+
+from core.evaluation.openai_compat_lea_bench_adapter import main
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())