feat(grounding): câblage Qwen3-VL-4B/vLLM (RPA_GROUNDING_ENGINE, défaut off)

Active via RPA_GROUNDING_ENGINE=qwen3vl_vllm (défaut OFF = legacy Qwen2.5-VL inchangé, byte-identique). Mode qwen3vl : port 8001/Qwen3-VL-4B, prompt point 0-1, think=false, parse /1000 (dissout DETTE-006), method "grounding" gardée (seuil 0.60), pas de fallback Ollama (abstention si vLLM down). Grounder validé au bench Easily réel (0.933, ~1s/cas). TDD : 4 tests (normalisation 0-1000, think=false, prompt fractions 0-1, gating score bas). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-13 08:39:29 +02:00
parent b20d17882e
commit 5c5ce747b0
2 changed files with 243 additions and 21 deletions
--- a/tests/unit/test_resolve_engine_qwen3vl_vllm_cabling.py
+++ b/tests/unit/test_resolve_engine_qwen3vl_vllm_cabling.py
@@ -0,0 +1,177 @@
+"""Câblage resolve_engine ← Qwen3-VL-4B/vLLM (grounder POC validé 0.933, nuit 12→13/06).
+
+Contrat (approche A, env-gated, défaut OFF) : quand RPA_GROUNDING_ENGINE=qwen3vl_vllm,
+`_resolve_by_grounding` doit :
+  1. parser les coordonnées Qwen3-VL en 0-1000 (divisor=1000), PAS en pixels image
+     → dissout DETTE-006 (résolution-indépendant) ;
+  2. poser think=false dans le payload vLLM (chat_template_kwargs.enable_thinking=False) ;
+  3. émettre une `method` GARDÉE par _RESOLUTION_MIN_SCORES (sinon Check-1 du
+     validateur est sauté → clic non-gardé).
+
+Réf design : inbox_codex/2026-06-13_0210_claude-to-codex_DESIGN-CABLAGE-RESOLVE-ENGINE-QWEN3VL.md
+Le 0.933 est une propriété de (modèle+moteur+prompt+parser+think), pas juste (modèle+moteur).
+"""
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+
+ROOT = Path(__file__).resolve().parents[2]  # third ancestor of this file; for tests/unit/<file>.py that is the directory containing tests/
+if str(ROOT) not in sys.path:  # avoid inserting a duplicate entry
+    sys.path.insert(0, str(ROOT))  # presumably so the in-test "from agent_v0..." imports resolve — confirm against the repo layout
+
+
+def _gated(method: str, table: dict) -> bool:
+    """Return True if `method` is a key of `table` or starts with a key ending in "_" (per the original note, mirrors the matching in _validate_resolution_quality (:2751) — not verifiable from this diff)."""
+    if method in table:  # exact key match
+        return True
+    return any(p.endswith("_") and method.startswith(p) for p in table)  # keys ending in "_" act as prefixes
+
+
+def _make_vllm_post(captured: list):
+    """Build a requests.post stand-in: records each call in `captured`; chat-completions URLs get a Qwen3-VL 0-1000 bbox centred on (500,500)."""
+    def fake_post(url, json=None, timeout=None):
+        captured.append({"url": url, "payload": json})  # keep URL and JSON payload for later assertions
+        resp = MagicMock()
+        if "/v1/chat/completions" in url:
+            resp.ok = True
+            # Qwen3-VL: coordinates normalised to 0-1000. Centre = (500,500).
+            resp.json.return_value = {
+                "choices": [{"message": {"content": '{"bbox_2d": [490, 490, 510, 510]}'}}]
+            }
+        else:  # Ollama fallback must not be reached in this mode
+            resp.ok = False
+            resp.json.return_value = {"message": {"content": ""}}
+        return resp
+    return fake_post
+
+
+@pytest.mark.unit
+def test_qwen3vl_vllm_grounding_normalise_0_1000_et_method_gardee(monkeypatch, tmp_path):  # checks /1000 normalisation and that the method is threshold-gated
+    from PIL import Image
+
+    # Deliberately small image (200x120): if the code divided by the image's
+    # pixel size instead of 1000, the centre 500 → 500/200 = 2.5 (out of bounds,
+    # → None). That is what makes this test RED on the current code.
+    shot = tmp_path / "shot.png"
+    Image.new("RGB", (200, 120), (255, 255, 255)).save(shot)
+
+    monkeypatch.setenv("RPA_GROUNDING_ENGINE", "qwen3vl_vllm")  # opt in to the env-gated engine
+    captured: list = []
+    import requests
+    monkeypatch.setattr(requests, "post", _make_vllm_post(captured))  # no real HTTP call is made
+
+    from agent_v0.server_v1 import resolve_engine as re_module
+
+    result = re_module._resolve_by_grounding(
+        screenshot_path=str(shot),
+        target_spec={"by_text": "Synthèse"},
+        screen_width=200,
+        screen_height=120,
+    )
+
+    # (1) resolved and normalised by /1000 → centre ~ (0.5, 0.5)
+    assert result is not None, "Résolution None : coords Qwen3-VL 0-1000 mal normalisées (DETTE-006)"
+    assert abs(result["x_pct"] - 0.5) < 0.02, f"x_pct={result['x_pct']} (attendu ~0.5 via /1000)"
+    assert abs(result["y_pct"] - 0.5) < 0.02, f"y_pct={result['y_pct']} (attendu ~0.5 via /1000)"
+
+    # (3) method is guarded by the threshold table
+    assert _gated(result["method"], re_module._RESOLUTION_MIN_SCORES), (
+        f"method {result['method']!r} non gardée → Check-1 du validateur sauté (clic non-gardé)"
+    )
+
+
+@pytest.mark.unit
+def test_qwen3vl_vllm_payload_think_false(monkeypatch, tmp_path):  # checks the vLLM payload disables thinking
+    from PIL import Image
+
+    shot = tmp_path / "shot.png"
+    Image.new("RGB", (200, 120), (255, 255, 255)).save(shot)  # blank white 200x120 screenshot
+
+    monkeypatch.setenv("RPA_GROUNDING_ENGINE", "qwen3vl_vllm")  # opt in to the env-gated engine
+    captured: list = []
+    import requests
+    monkeypatch.setattr(requests, "post", _make_vllm_post(captured))  # no real HTTP call is made
+
+    from agent_v0.server_v1 import resolve_engine as re_module
+    re_module._resolve_by_grounding(
+        screenshot_path=str(shot),
+        target_spec={"by_text": "Synthèse"},
+        screen_width=200,
+        screen_height=120,
+    )
+
+    vllm = [c for c in captured if "/v1/chat/completions" in c["url"]]  # keep only chat-completions calls
+    assert vllm, "Aucun appel vLLM capturé"
+    payload = vllm[0]["payload"]  # first captured chat-completions request body
+    # think=false: for vLLM this is expressed as chat_template_kwargs.enable_thinking=False
+    cek = payload.get("chat_template_kwargs", {})
+    assert cek.get("enable_thinking") is False, (
+        f"think non désactivé dans payload vLLM : chat_template_kwargs={cek} "
+        f"(Qwen3-VL penserait → grounding inutilisable, cf. bench)"
+    )
+
+
+@pytest.mark.unit
+def test_qwen3vl_vllm_prompt_demande_fractions_0_1(monkeypatch, tmp_path):
+    """Fidelity to the validated tuple (0.933): the qwen3vl prompt asks for a click
+    point in 0-1 FRACTIONS ({"x","y"} format), not a generic 'bounding box'."""
+    from PIL import Image
+
+    shot = tmp_path / "shot.png"
+    Image.new("RGB", (200, 120), (255, 255, 255)).save(shot)  # blank white 200x120 screenshot
+
+    monkeypatch.setenv("RPA_GROUNDING_ENGINE", "qwen3vl_vllm")  # opt in to the env-gated engine
+    captured: list = []
+    import requests
+    monkeypatch.setattr(requests, "post", _make_vllm_post(captured))  # no real HTTP call is made
+
+    from agent_v0.server_v1 import resolve_engine as re_module
+    re_module._resolve_by_grounding(
+        screenshot_path=str(shot),
+        target_spec={"by_text": "Synthèse"},
+        screen_width=200,
+        screen_height=120,
+    )
+
+    vllm = [c for c in captured if "/v1/chat/completions" in c["url"]]  # keep only chat-completions calls
+    assert vllm, "Aucun appel vLLM capturé"
+    user_txt = ""  # concatenated text of every message in the first captured request
+    for m in vllm[0]["payload"]["messages"]:
+        c = m.get("content")
+        if isinstance(c, list):  # list-of-parts content: join the "text" field of each dict part
+            user_txt += " ".join(p.get("text", "") for p in c if isinstance(p, dict))
+        elif isinstance(c, str):  # plain string content
+            user_txt += " " + c
+
+    assert "Synthèse" in user_txt, "cible non injectée dans le prompt"
+    low = user_txt.lower()  # case-insensitive search for the word "fraction"
+    assert "fraction" in low or ("0.0" in user_txt and "1.0" in user_txt), (
+        f"prompt qwen3vl ne demande pas un point 0-1 (tuple validé non reproduit) : {user_txt!r}"
+    )
+
+
+@pytest.mark.unit
+def test_qwen3vl_vllm_method_grounding_rejetee_si_score_bas():
+    """The qwen3vl method must trigger the threshold guard, not skip Check-1."""
+    from agent_v0.server_v1 import resolve_engine as re_module
+
+    out = re_module._validate_resolution_quality(
+        {
+            "resolved": True,
+            "method": "grounding",
+            "score": 0.10,  # low score: the assertions below expect a rejection
+            "x_pct": 0.50,
+            "y_pct": 0.50,
+        },
+        0.50,  # NOTE(review): meaning of these two positional arguments is not visible in this diff — confirm against the signature
+        0.50,
+    )
+
+    assert out is not None
+    assert out["resolved"] is False
+    assert out["method"] == "rejected_low_score_grounding"
+    assert out["original_method"] == "grounding"  # the rejected result keeps the method it came from