feat(p1y-alpha): add OpenAI-compatible LeaBench adapter (benchmark only)

Adapter de benchmark isolé (hors runtime Lea) ciblant un serveur /v1/chat/completions à support vision (vLLM/SGLang/TGI), pour comparer plus tard à Ollama via LeaBench. Ne contrôle jamais le desktop. - core/evaluation/openai_compat_lea_bench_adapter.py : payload data-URL image_url, parsing choices[0].message.content. Réutilise par import la logique prompt/parse/normalisation de ollama_lea_bench_adapter (zéro refactor). - tools/lea_bench_openai_compat.py : wrapper CLI (--base-url défaut :8001). - tests/unit/test_openai_compat_lea_bench_adapter.py : 6 tests mockés HTTP (data URL, pas de fuite expectation/click_region, prédiction valide, abstain safe sur HTTP!=200 et réponse malformée, JSONL rechargeable). Aucun runtime Lea modifié. Aucun service lancé. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 16:49:53 +02:00
parent 806cc04b82
commit 0f122a512f
3 changed files with 369 additions and 0 deletions
--- a/tests/unit/test_openai_compat_lea_bench_adapter.py
+++ b/tests/unit/test_openai_compat_lea_bench_adapter.py
@@ -0,0 +1,163 @@
+"""Tests P1.y-alpha — adapter OpenAI-compatible LeaBench (benchmark isolé).
+
+Le module est hors runtime Lea : il benchmarke un modèle vision servi en
+`/v1/chat/completions` (vLLM/SGLang/TGI) contre des screenshots statiques,
+sans jamais contrôler le desktop. Tests mockés HTTP uniquement.
+"""
+
+import json
+from pathlib import Path
+
+from PIL import Image
+
+from core.evaluation.computer_use_bench import load_cases, load_predictions
+from core.evaluation.openai_compat_lea_bench_adapter import (
+    build_openai_compat_payload,
+    run_openai_compat_case,
+    write_openai_compat_predictions,
+)
+
+
+class _FakeResponse:
+    """Mimic an OpenAI-compatible `requests` response.
+
+    Exposes only a `status_code` attribute and a `json()` method.
+    """
+
+    def __init__(self, status_code: int, content: str = "", *, raw: dict | None = None):
+        # `content` is the message text that `json()` wraps in a
+        # chat-completions-shaped body; `raw`, when given, is returned by
+        # `json()` as-is instead of that wrapper.
+        self.status_code = status_code
+        self._content = content
+        self._raw = raw
+
+    def json(self):
+        """Return `raw` when it was provided, else a minimal completion body."""
+        if self._raw is not None:
+            return self._raw
+        # Minimal shape: the text lives at choices[0].message.content.
+        return {"choices": [{"message": {"content": self._content}}]}
+
+
+def _write_jsonl(path: Path, rows: list[dict]) -> None:
+    """Write `rows` to `path` as JSON Lines, one serialized row per line.
+
+    Missing parent directories are created; the file is opened in write
+    mode with UTF-8 encoding, so existing content is replaced.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as f:
+        for row in rows:
+            f.write(json.dumps(row) + "\n")
+
+
+def _write_image(path: Path) -> None:
+    """Save a blank white 32x24 RGB image to `path`."""
+    Image.new("RGB", (32, 24), color=(255, 255, 255)).save(path)
+
+
+def _case_rows(screenshot: Path) -> list[dict]:
+    """Return a one-element list holding the raw row for the "visible" case.
+
+    The row points at `screenshot` (stored as a string path) and contains a
+    `task` section plus an `expectation` section with the expected decision
+    and click region.
+    """
+    return [
+        {
+            "case_id": "visible",
+            "screenshot_path": str(screenshot),
+            "task": {
+                "intent": "click save",
+                "target_text": "Enregistrer",
+                "current_window": "Enregistrer sous",
+                "expected_next_window": "Bloc-notes",
+                "question": "Clique uniquement sur Enregistrer.",
+            },
+            # Expected answer for the case; test_payload_does_not_leak_expectation
+            # asserts that none of it appears in the request payload.
+            "expectation": {
+                "decision": "click",
+                "click_region": {"x_pct": 0.5, "y_pct": 0.8, "radius_pct": 0.1},
+            },
+        }
+    ]
+
+
+def _load_one_case(tmp_path: Path):
+    """Build a one-case fixture under `tmp_path` and return the loaded case.
+
+    Writes `screen.png` and `cases.jsonl` into `tmp_path`, then loads the
+    cases file with `load_cases` (using `tmp_path` as `repo_root`) and
+    returns its first element.
+    """
+    screenshot = tmp_path / "screen.png"
+    _write_image(screenshot)
+    cases_path = tmp_path / "cases.jsonl"
+    _write_jsonl(cases_path, _case_rows(screenshot))
+    return load_cases(cases_path, repo_root=tmp_path)[0]
+
+
+def test_payload_embeds_image_as_data_url(tmp_path):
+    """The user message must carry the image as a base64 `data:` URL."""
+    case = _load_one_case(tmp_path)
+
+    payload = build_openai_compat_payload(case, model="qwen-test", image_b64="abc123")
+
+    assert payload["model"] == "qwen-test"
+    # Take the first user-role message, then keep only its image_url parts.
+    user_msg = next(m for m in payload["messages"] if m["role"] == "user")
+    image_parts = [p for p in user_msg["content"] if p.get("type") == "image_url"]
+    assert image_parts, "le message user doit contenir une part image_url"
+    # The base64 string passed in must follow the JPEG data-URL prefix verbatim.
+    assert image_parts[0]["image_url"]["url"] == "data:image/jpeg;base64,abc123"
+
+
+def test_payload_does_not_leak_expectation(tmp_path):
+    """The request payload must not reveal the case's expected answer."""
+    case = _load_one_case(tmp_path)
+
+    payload = build_openai_compat_payload(case, model="qwen-test", image_b64="abc123")
+    # Serialize the whole payload so the checks below cover every nested field.
+    serialized = json.dumps(payload)
+
+    assert "click_region" not in serialized
+    assert "expectation" not in serialized
+    assert "0.8" not in serialized  # the expected coordinate must not leak
+
+
+def test_valid_response_yields_valid_click_prediction(tmp_path):
+    """A 200 response carrying a JSON click decision yields a click prediction."""
+    case = _load_one_case(tmp_path)
+    # Model answer, serialized as the text content of the fake completion.
+    content = json.dumps(
+        {"decision": "click", "x_pct": 0.5, "y_pct": 0.8, "confidence": 0.9, "reason": "ok"}
+    )
+
+    # Stand-in for the HTTP POST callable; also checks the endpoint path.
+    # NOTE(review): the `json`/`timeout` keywords presumably mirror how the
+    # adapter invokes `post` — confirm against the adapter.
+    def fake_post(url, json=None, timeout=None):
+        assert url.endswith("/v1/chat/completions")
+        return _FakeResponse(200, content)
+
+    # The image encoder is stubbed with a lambda returning a fixed string.
+    pred = run_openai_compat_case(
+        case, model="qwen-test", post=fake_post, image_encoder=lambda p: "abc123"
+    )
+
+    assert pred["case_id"] == "visible"
+    assert pred["model"] == "qwen-test"
+    assert pred["decision"] == "click"
+    assert pred["x_pct"] == 0.5 and pred["y_pct"] == 0.8
+
+
+def test_http_error_returns_safe_abstain(tmp_path):
+    """An HTTP 500 response must produce an abstain prediction."""
+    case = _load_one_case(tmp_path)
+
+    # Stand-in for the HTTP POST callable: always answers with status 500.
+    def fake_post(url, json=None, timeout=None):
+        return _FakeResponse(500, "")
+
+    pred = run_openai_compat_case(
+        case, model="qwen-test", post=fake_post, image_encoder=lambda p: "abc123"
+    )
+
+    # Safe fallback: no coordinates and zero confidence.
+    assert pred["decision"] == "abstain"
+    assert pred["x_pct"] is None and pred["y_pct"] is None
+    assert pred["confidence"] == 0.0
+
+
+def test_malformed_response_returns_safe_abstain(tmp_path):
+    """A 200 response whose body has no `choices` key must produce an abstain."""
+    case = _load_one_case(tmp_path)
+
+    # Stand-in for the HTTP POST callable: status 200 with an unexpected body.
+    def fake_post(url, json=None, timeout=None):
+        return _FakeResponse(200, raw={"unexpected": "shape"})
+
+    pred = run_openai_compat_case(
+        case, model="qwen-test", post=fake_post, image_encoder=lambda p: "abc123"
+    )
+
+    assert pred["decision"] == "abstain"
+    assert pred["x_pct"] is None
+
+
+def test_write_predictions_is_loadable(tmp_path):
+    """Predictions written to disk must be readable by `load_predictions`."""
+    case = _load_one_case(tmp_path)
+    out = tmp_path / "preds.jsonl"
+    # Model answer, serialized as the text content of the fake completion.
+    content = json.dumps(
+        {"decision": "abstain", "x_pct": None, "y_pct": None, "confidence": 0.2, "reason": "n/a"}
+    )
+
+    # Stand-in for the HTTP POST callable: always answers 200 with `content`.
+    def fake_post(url, json=None, timeout=None):
+        return _FakeResponse(200, content)
+
+    write_openai_compat_predictions(
+        [case], out, model="qwen-test", post=fake_post, image_encoder=lambda p: "abc123"
+    )
+
+    # Round trip: the loaded result holds one entry, indexable by case id.
+    preds = load_predictions(out)
+    assert len(preds) == 1
+    assert "visible" in preds
+    assert preds["visible"].decision == "abstain"