feat(p1y-alpha): add OpenAI-compatible LeaBench adapter (benchmark only)
Adapter de benchmark isolé (hors runtime Lea) ciblant un serveur /v1/chat/completions à support vision (vLLM/SGLang/TGI), pour comparer plus tard à Ollama via LeaBench. Ne contrôle jamais le desktop.
- core/evaluation/openai_compat_lea_bench_adapter.py : payload data-URL image_url, parsing choices[0].message.content. Réutilise par import la logique prompt/parse/normalisation de ollama_lea_bench_adapter (zéro refactor).
- tools/lea_bench_openai_compat.py : wrapper CLI (--base-url défaut :8001).
- tests/unit/test_openai_compat_lea_bench_adapter.py : 6 tests mockés HTTP (data URL, pas de fuite expectation/click_region, prédiction valide, abstain safe sur HTTP!=200 et réponse malformée, JSONL rechargeable).
Aucun runtime Lea modifié. Aucun service lancé.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
191
core/evaluation/openai_compat_lea_bench_adapter.py
Normal file
191
core/evaluation/openai_compat_lea_bench_adapter.py
Normal file
@@ -0,0 +1,191 @@
|
||||
"""OpenAI-compatible adapter that writes LeaBench-compatible prediction JSONL.
|
||||
|
||||
Benchmark only — strictly outside Lea runtime. It targets any server exposing
|
||||
`POST /v1/chat/completions` with vision support (vLLM, SGLang, TGI, ...) and
|
||||
never controls the desktop.
|
||||
|
||||
Réutilise la logique de prompt/parsing/normalisation de l'adapter Ollama
|
||||
(`ollama_lea_bench_adapter`) pour garantir un comportement strictement aligné ;
|
||||
seuls le format du payload (data URL `image_url`) et le parsing de la réponse
|
||||
(`choices[0].message.content`) diffèrent.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable
|
||||
|
||||
import requests
|
||||
|
||||
from core.evaluation.computer_use_bench import BenchCase, load_cases
|
||||
from core.evaluation.ollama_lea_bench_adapter import (
|
||||
OLLAMA_SYSTEM_PROMPT,
|
||||
build_ollama_user_prompt,
|
||||
encode_screenshot_base64,
|
||||
extract_json_object,
|
||||
normalize_prediction,
|
||||
_safe_abstain,
|
||||
)
|
||||
|
||||
|
||||
# Model name placed in the request payload when the caller does not pass one.
DEFAULT_MODEL = "qwen3-vl:8b"
# Base URL of the target server; "/v1/chat/completions" is appended to it.
DEFAULT_BASE_URL = "http://localhost:8001"

# Injectable HTTP POST callable (defaults to `requests.post` in the runners);
# it is invoked as `post(url, json=payload, timeout=timeout)`.
HttpPost = Callable[..., Any]
# Injectable screenshot encoder: receives the case screenshot path and returns
# the string embedded after "base64," in the image data URL.
ImageEncoder = Callable[[Path], str]
|
||||
|
||||
|
||||
def build_openai_compat_payload(
    case: BenchCase,
    *,
    model: str,
    image_b64: str,
    temperature: float = 0.1,
    max_tokens: int = 200,
    json_response_format: bool = True,
) -> dict[str, Any]:
    """Build a vision-capable `/v1/chat/completions` request body.

    The screenshot travels as a JPEG data URL
    (`data:image/jpeg;base64,...`) inside an `image_url` content part. The
    system and user prompts are the ones defined by the Ollama adapter.

    Args:
        case: Bench case used to render the user prompt.
        model: Model name placed in the `model` field.
        image_b64: Base64 text embedded in the data URL.
        temperature: Sampling temperature forwarded as-is.
        max_tokens: Completion token budget forwarded as-is.
        json_response_format: When true, request `{"type": "json_object"}`
            through the `response_format` field.

    Returns:
        The request body as a plain dict, ready to be sent as JSON.
    """
    system_message = {"role": "system", "content": OLLAMA_SYSTEM_PROMPT.strip()}
    text_part = {"type": "text", "text": build_ollama_user_prompt(case)}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
    }
    user_message = {"role": "user", "content": [text_part, image_part]}

    request_body: dict[str, Any] = {
        "model": model,
        "messages": [system_message, user_message],
        "stream": False,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    if not json_response_format:
        return request_body

    # Per the original author's note: supported by OpenAI, vLLM (>=0.4) and
    # SGLang; servers that do not know the field are expected to ignore it
    # (TODO confirm for each target server).
    request_body["response_format"] = {"type": "json_object"}
    return request_body
|
||||
|
||||
|
||||
def _extract_content(response_json: Any) -> str | None:
|
||||
"""Extrait `choices[0].message.content` d'une réponse OpenAI-compatible."""
|
||||
if not isinstance(response_json, dict):
|
||||
return None
|
||||
choices = response_json.get("choices")
|
||||
if not isinstance(choices, list) or not choices:
|
||||
return None
|
||||
message = choices[0].get("message") if isinstance(choices[0], dict) else None
|
||||
if not isinstance(message, dict):
|
||||
return None
|
||||
content = message.get("content")
|
||||
return content if isinstance(content, str) else None
|
||||
|
||||
|
||||
def run_openai_compat_case(
    case: BenchCase,
    *,
    model: str = DEFAULT_MODEL,
    base_url: str = DEFAULT_BASE_URL,
    timeout: int = 45,
    post: HttpPost = requests.post,
    image_encoder: ImageEncoder = encode_screenshot_base64,
    retries: int = 1,
    retry_delay: float = 2.0,
) -> dict[str, Any]:
    """Query the server for one bench case and return a prediction dict.

    The request is attempted up to `retries + 1` times. A non-200 status, a
    response without `choices[0].message.content`, or any exception raised
    while posting/decoding counts as a failed attempt. When the content is
    present but no JSON object can be extracted from it, the request is
    re-sent immediately with a reminder of the JSON contract appended to the
    user prompt. Once the attempts are exhausted, a safe abstain prediction
    is returned instead of raising.

    Args:
        case: Bench case to evaluate.
        model: Model name sent to the server and recorded in the prediction.
        base_url: Server base URL; a trailing slash is tolerated.
        timeout: Timeout forwarded to `post`.
        post: HTTP POST callable, invoked as
            `post(url, json=payload, timeout=timeout)`.
        image_encoder: Callable turning the screenshot path into base64 text.
        retries: Extra attempts after the first one; negative values are
            treated as 0 so that at least one request is always sent.
        retry_delay: Seconds slept between failed attempts; 0 or a negative
            value disables the pause (useful in tests).

    Returns:
        The output of `normalize_prediction` on success, otherwise the output
        of `_safe_abstain` carrying the last error (truncated to 80 chars).
    """
    # The encoder runs outside the retry loop: a missing screenshot is a
    # dataset problem and is deliberately left to propagate to the caller.
    image_b64 = image_encoder(case.screenshot_path)
    payload = build_openai_compat_payload(case, model=model, image_b64=image_b64)
    url = f"{base_url.rstrip('/')}/v1/chat/completions"

    # Keep the pristine prompt so the JSON reminder is appended at most once,
    # however many JSON retries happen.
    text_part = payload["messages"][1]["content"][0]
    base_prompt = text_part["text"]
    json_reminder = "\nYour previous answer was not valid JSON. Output JSON only."

    max_retries = max(0, retries)
    last_error = ""
    for attempt in range(max_retries + 1):
        try:
            response = post(url, json=payload, timeout=timeout)
            if getattr(response, "status_code", 0) != 200:
                last_error = f"HTTP {getattr(response, 'status_code', 'unknown')}"
            else:
                text = _extract_content(response.json())
                if text is None:
                    last_error = "missing_choices_content"
                else:
                    parsed = extract_json_object(text)
                    if parsed is None and attempt < max_retries:
                        # Retry right away (no back-off) with the JSON
                        # contract restated in the user prompt.
                        text_part["text"] = base_prompt + json_reminder
                        continue
                    return normalize_prediction(case, parsed, model=model, raw_text=text)
        except Exception as exc:  # pragma: no cover - exercised via fake response paths
            # Best-effort benchmark run: record the failure and keep going.
            # Fall back to the class name for exceptions with no message.
            last_error = str(exc) or type(exc).__name__

        if attempt < max_retries and retry_delay > 0:
            time.sleep(retry_delay)

    return _safe_abstain(case, model, f"openai_compat_error: {last_error[:80]}")
|
||||
|
||||
|
||||
def write_openai_compat_predictions(
    cases: list[BenchCase],
    output_path: str | Path,
    *,
    model: str = DEFAULT_MODEL,
    base_url: str = DEFAULT_BASE_URL,
    timeout: int = 45,
    post: HttpPost = requests.post,
    image_encoder: ImageEncoder = encode_screenshot_base64,
) -> None:
    """Run every case against the server and write one JSON line per prediction.

    Missing parent directories of `output_path` are created, and an existing
    file is overwritten. Predictions are serialized with
    `ensure_ascii=False` and flushed as they are produced.

    Args:
        cases: Bench cases to evaluate, processed in order.
        output_path: Destination JSONL file.
        model: Forwarded to `run_openai_compat_case`.
        base_url: Forwarded to `run_openai_compat_case`.
        timeout: Forwarded to `run_openai_compat_case`.
        post: Forwarded to `run_openai_compat_case`.
        image_encoder: Forwarded to `run_openai_compat_case`.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    runner_options = {
        "model": model,
        "base_url": base_url,
        "timeout": timeout,
        "post": post,
        "image_encoder": image_encoder,
    }
    with destination.open("w", encoding="utf-8") as sink:
        for bench_case in cases:
            record = run_openai_compat_case(bench_case, **runner_options)
            serialized = json.dumps(record, ensure_ascii=False)
            sink.write(serialized + "\n")
            sink.flush()
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Parse CLI options, run the benchmark and return the process exit code.

    Args:
        argv: Argument list without the program name; `None` lets argparse
            read the process arguments itself.

    Returns:
        0 once the predictions file has been written.
    """
    parser = argparse.ArgumentParser(
        description="Run an OpenAI-compatible vision server on LeaBench cases."
    )

    # Declaration order is kept so the generated --help output is unchanged.
    required_paths = (
        ("--cases", "Path to LeaBench cases JSONL."),
        ("--output", "Output predictions JSONL."),
    )
    for flag, help_text in required_paths:
        parser.add_argument(flag, required=True, help=help_text)

    defaulted_strings = (
        ("--repo-root", ".", "Repository root for relative screenshot paths."),
        ("--base-url", DEFAULT_BASE_URL, "OpenAI-compatible base URL."),
        ("--model", DEFAULT_MODEL, "Model name served by the endpoint."),
    )
    for flag, default, help_text in defaulted_strings:
        parser.add_argument(flag, default=default, help=help_text)

    parser.add_argument("--timeout", type=int, default=45, help="Per-case timeout in seconds.")

    options = parser.parse_args(argv)

    bench_cases = load_cases(options.cases, repo_root=options.repo_root)
    write_openai_compat_predictions(
        bench_cases,
        options.output,
        model=options.model,
        base_url=options.base_url,
        timeout=options.timeout,
    )
    print(f"Wrote OpenAI-compatible predictions: {options.output}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: forward the CLI arguments (program name stripped) and
# use the return value of main() as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
|
||||
163
tests/unit/test_openai_compat_lea_bench_adapter.py
Normal file
163
tests/unit/test_openai_compat_lea_bench_adapter.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""Tests P1.y-alpha — adapter OpenAI-compatible LeaBench (benchmark isolé).
|
||||
|
||||
Le module est hors runtime Lea : il benchmarke un modèle vision servi en
|
||||
`/v1/chat/completions` (vLLM/SGLang/TGI) contre des screenshots statiques,
|
||||
sans jamais contrôler le desktop. Tests mockés HTTP uniquement.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from core.evaluation.computer_use_bench import load_cases, load_predictions
|
||||
from core.evaluation.openai_compat_lea_bench_adapter import (
|
||||
build_openai_compat_payload,
|
||||
run_openai_compat_case,
|
||||
write_openai_compat_predictions,
|
||||
)
|
||||
|
||||
|
||||
class _FakeResponse:
|
||||
"""Imite une réponse `requests` OpenAI-compatible."""
|
||||
|
||||
def __init__(self, status_code: int, content: str = "", *, raw: dict | None = None):
|
||||
self.status_code = status_code
|
||||
self._content = content
|
||||
self._raw = raw
|
||||
|
||||
def json(self):
|
||||
if self._raw is not None:
|
||||
return self._raw
|
||||
return {"choices": [{"message": {"content": self._content}}]}
|
||||
|
||||
|
||||
def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write `rows` to `path` as JSON Lines, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(entry) + "\n" for entry in rows)
|
||||
|
||||
|
||||
def _write_image(path: Path) -> None:
    """Save a 32x24 all-white RGB image at `path`."""
    blank = Image.new("RGB", (32, 24), color=(255, 255, 255))
    blank.save(path)
|
||||
|
||||
|
||||
def _case_rows(screenshot: Path) -> list[dict]:
    """Return the single raw bench-case row used by these tests.

    The row points at `screenshot` and expects a click in a region centred
    on (0.5, 0.8).
    """
    task = {
        "intent": "click save",
        "target_text": "Enregistrer",
        "current_window": "Enregistrer sous",
        "expected_next_window": "Bloc-notes",
        "question": "Clique uniquement sur Enregistrer.",
    }
    expectation = {
        "decision": "click",
        "click_region": {"x_pct": 0.5, "y_pct": 0.8, "radius_pct": 0.1},
    }
    row = {
        "case_id": "visible",
        "screenshot_path": str(screenshot),
        "task": task,
        "expectation": expectation,
    }
    return [row]
|
||||
|
||||
|
||||
def _load_one_case(tmp_path: Path):
    """Create a screenshot and a one-row cases file under `tmp_path`, then load it.

    Returns the first case produced by `load_cases`.
    """
    image_file = tmp_path / "screen.png"
    _write_image(image_file)

    jsonl_file = tmp_path / "cases.jsonl"
    _write_jsonl(jsonl_file, _case_rows(image_file))

    loaded = load_cases(jsonl_file, repo_root=tmp_path)
    return loaded[0]
|
||||
|
||||
|
||||
def test_payload_embeds_image_as_data_url(tmp_path):
    """The user message carries the screenshot as a JPEG base64 data URL."""
    bench_case = _load_one_case(tmp_path)

    request = build_openai_compat_payload(bench_case, model="qwen-test", image_b64="abc123")

    assert request["model"] == "qwen-test"
    user_message = next(filter(lambda msg: msg["role"] == "user", request["messages"]))
    image_parts = []
    for part in user_message["content"]:
        if part.get("type") == "image_url":
            image_parts.append(part)
    assert image_parts, "le message user doit contenir une part image_url"
    first_url = image_parts[0]["image_url"]["url"]
    assert first_url == "data:image/jpeg;base64,abc123"
|
||||
|
||||
|
||||
def test_payload_does_not_leak_expectation(tmp_path):
    """Nothing from the case expectation may appear in the request payload."""
    bench_case = _load_one_case(tmp_path)

    request = build_openai_compat_payload(bench_case, model="qwen-test", image_b64="abc123")
    dumped = json.dumps(request)

    # "0.8" is the expected y coordinate; it must not leak to the model.
    for forbidden in ("click_region", "expectation", "0.8"):
        assert forbidden not in dumped
|
||||
|
||||
|
||||
def test_valid_response_yields_valid_click_prediction(tmp_path):
    """A well-formed JSON click answer comes back as a click prediction."""
    bench_case = _load_one_case(tmp_path)
    answer = {"decision": "click", "x_pct": 0.5, "y_pct": 0.8, "confidence": 0.9, "reason": "ok"}
    body = json.dumps(answer)

    def fake_post(url, json=None, timeout=None):
        assert url.endswith("/v1/chat/completions")
        return _FakeResponse(200, body)

    prediction = run_openai_compat_case(
        bench_case,
        model="qwen-test",
        post=fake_post,
        image_encoder=lambda p: "abc123",
    )

    assert prediction["case_id"] == "visible"
    assert prediction["model"] == "qwen-test"
    assert prediction["decision"] == "click"
    assert prediction["x_pct"] == 0.5
    assert prediction["y_pct"] == 0.8
|
||||
|
||||
|
||||
def test_http_error_returns_safe_abstain(tmp_path):
    """A non-200 HTTP status yields a safe abstain prediction.

    `retries=0` keeps the test to a single attempt: with the default
    `retries=1` the failed first attempt triggers the runner's real 2-second
    back-off sleep, which only slows the unit test down.
    """
    case = _load_one_case(tmp_path)

    def fake_post(url, json=None, timeout=None):
        return _FakeResponse(500, "")

    pred = run_openai_compat_case(
        case,
        model="qwen-test",
        post=fake_post,
        image_encoder=lambda p: "abc123",
        retries=0,
    )

    assert pred["decision"] == "abstain"
    assert pred["x_pct"] is None and pred["y_pct"] is None
    assert pred["confidence"] == 0.0
|
||||
|
||||
|
||||
def test_malformed_response_returns_safe_abstain(tmp_path):
    """A 200 response without `choices` yields a safe abstain prediction.

    `retries=0` keeps the test to a single attempt: with the default
    `retries=1` the failed first attempt triggers the runner's real 2-second
    back-off sleep, which only slows the unit test down.
    """
    case = _load_one_case(tmp_path)

    def fake_post(url, json=None, timeout=None):
        return _FakeResponse(200, raw={"unexpected": "shape"})

    pred = run_openai_compat_case(
        case,
        model="qwen-test",
        post=fake_post,
        image_encoder=lambda p: "abc123",
        retries=0,
    )

    assert pred["decision"] == "abstain"
    assert pred["x_pct"] is None
|
||||
|
||||
|
||||
def test_write_predictions_is_loadable(tmp_path):
    """The JSONL written by the adapter can be read back by `load_predictions`."""
    bench_case = _load_one_case(tmp_path)
    predictions_file = tmp_path / "preds.jsonl"
    answer = {"decision": "abstain", "x_pct": None, "y_pct": None, "confidence": 0.2, "reason": "n/a"}
    body = json.dumps(answer)

    def fake_post(url, json=None, timeout=None):
        return _FakeResponse(200, body)

    write_openai_compat_predictions(
        [bench_case],
        predictions_file,
        model="qwen-test",
        post=fake_post,
        image_encoder=lambda p: "abc123",
    )

    loaded = load_predictions(predictions_file)
    assert len(loaded) == 1
    assert "visible" in loaded
    assert loaded["visible"].decision == "abstain"
|
||||
15
tools/lea_bench_openai_compat.py
Normal file
15
tools/lea_bench_openai_compat.py
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env python3
"""CLI wrapper for the OpenAI-compatible LeaBench adapter (benchmark only)."""

import sys
from pathlib import Path

# Two levels up from this file, i.e. the parent of the directory holding the
# script. It is put first on sys.path so the `core` package imports below
# resolve when the script is run directly.
ROOT = Path(__file__).resolve().parents[1]
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))

from core.evaluation.openai_compat_lea_bench_adapter import main


if __name__ == "__main__":
    # main() reads the process arguments itself when called without argv.
    sys.exit(main())
|
||||
Reference in New Issue
Block a user