feat(grounding): câblage Qwen3-VL-4B/vLLM (RPA_GROUNDING_ENGINE, défaut off)

Active via RPA_GROUNDING_ENGINE=qwen3vl_vllm (défaut OFF = legacy Qwen2.5-VL inchangé, byte-identique). Mode qwen3vl : port 8001/Qwen3-VL-4B, prompt point 0-1, think=false, parse /1000 (dissout DETTE-006), method "grounding" gardée (seuil 0.60), pas de fallback Ollama (abstention si vLLM down). Grounder validé au bench Easily réel (0.933, ~1s/cas). TDD : 4 tests (normalisation 0-1000, think=false, prompt fractions 0-1, gating score bas). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-13 08:39:29 +02:00
parent b20d17882e
commit 5c5ce747b0
2 changed files with 243 additions and 21 deletions
--- a/agent_v0/server_v1/resolve_engine.py
+++ b/agent_v0/server_v1/resolve_engine.py
@@ -953,26 +953,58 @@ def _resolve_by_grounding(
    import requests as _requests
    content = ""
-    # Port vLLM configurable via env
+    # Grounder POC validé (bench Easily réel 12→13/06, 0.933) : Qwen3-VL-4B/vLLM.
-    _vllm_port = os.environ.get("VLLM_PORT", "8100")
+    # Activé via RPA_GROUNDING_ENGINE=qwen3vl_vllm (défaut OFF = legacy Qwen2.5-VL
-    _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
+    # inchangé, byte-identique). Le 0.933 est une propriété de
    # (modèle+moteur+prompt+parser+think) → ce mode reproduit le tuple validé :
    # prompt point 0-1, think=false, parse /1000 (dissout DETTE-006), method gardée.
    # Réf design : inbox_codex/2026-06-13_0210_..._DESIGN-CABLAGE-RESOLVE-ENGINE-QWEN3VL.md
    _grounding_engine = os.environ.get("RPA_GROUNDING_ENGINE", "").strip().lower()
    _use_qwen3vl = _grounding_engine == "qwen3vl_vllm"
    if _use_qwen3vl:
        _vllm_port = os.environ.get("VLLM_PORT", "8001")
        _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen3-VL-4B-Instruct")
        _sys_prompt = (
            "Tu localises une cible sur une capture d'écran d'interface. "
            "Si la cible n'est pas clairement visible, réponds par une abstention."
        )
        _user_text = (
            f"Cible : « {description} ». Donne le point de clic en FRACTIONS de "
            "l'image : x et y entre 0.0 et 1.0 (0,0 = coin haut-gauche, "
            '1,1 = coin bas-droite). Réponds UNIQUEMENT par un JSON '
            '{"x":0.xx,"y":0.xx} ou {"abstain":true} si la cible n\'est pas '
            "clairement visible."
        )
    else:
        _vllm_port = os.environ.get("VLLM_PORT", "8100")
        _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
        _sys_prompt = "You locate UI elements on screenshots. Return coordinates."
        _user_text = prompt
    # Essai 1 : vLLM (API OpenAI-compatible, GPU)
    try:
        _vllm_payload = {
            "model": _vllm_model,
            "messages": [
                {"role": "system", "content": _sys_prompt},
                {"role": "user", "content": [
                    {"type": "text", "text": _user_text},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
                ]},
            ],
            "temperature": 0.1,
            "max_tokens": 80,
        }
        if _use_qwen3vl:
            # think=false obligatoire (Qwen3-VL/vLLM) : sinon raisonnement →
            # grounding inutilisable (observé au bench).
            _vllm_payload["chat_template_kwargs"] = {"enable_thinking": False}
            _vllm_payload["temperature"] = 0.0
            _vllm_payload["max_tokens"] = 256
        vllm_resp = _requests.post(
            f"http://localhost:{_vllm_port}/v1/chat/completions",
-            json={
+            json=_vllm_payload,
                "model": _vllm_model,
                "messages": [
                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
                    {"role": "user", "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
                    ]},
                ],
                "temperature": 0.1,
                "max_tokens": 80,
            },
            timeout=30,
        )
        if vllm_resp.ok:
@@ -982,8 +1014,11 @@ def _resolve_by_grounding(
    except Exception as e:
        logger.debug("vLLM non disponible (%s), fallback Ollama", e)
-    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif)
+    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif).
-    if not content:
+    # En mode qwen3vl_vllm, PAS de fallback Ollama (modèle non-viable/dangereux
    # prouvé au bench) : si vLLM échoue, on abstient (None) et la cascade externe
    # (OCR/template/SoM) prend le relais.
    if not content and not _use_qwen3vl:
        try:
            resp = _requests.post("http://localhost:11434/api/chat", json={
                "model": _grounding_model,
@@ -1003,12 +1038,19 @@ def _resolve_by_grounding(
    elapsed = time.time() - t0
    # Parser la réponse — délégué à core.grounding.bbox_parser
-    x_pct, y_pct = parse_bbox_to_norm(content, small_w, small_h)
+    if _use_qwen3vl:
        # Qwen3-VL : 0-1 (consigne respectée) OU 0-1000 natif. divisor=1000 gère
        # les DEUX (xy_json ≤1 pris tel quel ; bbox_2d / valeurs >1 → ÷1000).
        # Résolution-indépendant → dissout le bug d'échelle DETTE-006.
        x_pct, y_pct = parse_bbox_to_norm(content, 1000, 1000)
    else:
        x_pct, y_pct = parse_bbox_to_norm(content, small_w, small_h)
    if x_pct is None or y_pct is None:
-        # Fallback multi-image : screenshot + crop → grounding sans description
+        # Fallback multi-image : screenshot + crop → grounding sans description.
        # Skippé en mode qwen3vl_vllm (le fallback s'appuie sur Ollama qwen2.5vl).
        anchor_b64 = target_spec.get("anchor_image_base64", "")
-        if anchor_b64:
+        if anchor_b64 and not _use_qwen3vl:
            try:
                prompt_mi = (
                    "Image 1 is a screenshot. Image 2 shows a UI element.\n"
@@ -1073,7 +1115,10 @@ def _resolve_by_grounding(
    return {
        "resolved": True,
-        "method": "grounding_vlm",
+        # method gardée par _RESOLUTION_MIN_SCORES : en mode qwen3vl, "grounding"
        # (clé exacte, seuil 0.60) → Check-1 du validateur s'applique. Le legacy
        # garde "grounding_vlm" (non gardé aujourd'hui — bug latent, DETTE séparée).
        "method": "grounding" if _use_qwen3vl else "grounding_vlm",
        "x_pct": round(x_pct, 6),
        "y_pct": round(y_pct, 6),
        "matched_element": {
--- a/tests/unit/test_resolve_engine_qwen3vl_vllm_cabling.py
+++ b/tests/unit/test_resolve_engine_qwen3vl_vllm_cabling.py
@@ -0,0 +1,177 @@
 """Câblage resolve_engine ← Qwen3-VL-4B/vLLM (grounder POC validé 0.933, nuit 12→13/06).
 Contrat (approche A, env-gated, défaut OFF) : quand RPA_GROUNDING_ENGINE=qwen3vl_vllm,
 `_resolve_by_grounding` doit :
  1. parser les coordonnées Qwen3-VL en 0-1000 (divisor=1000), PAS en pixels image
     → dissout DETTE-006 (résolution-indépendant) ;
  2. poser think=false dans le payload vLLM (chat_template_kwargs.enable_thinking=False) ;
  3. émettre une `method` GARDÉE par _RESOLUTION_MIN_SCORES (sinon Check-1 du
     validateur est sauté → clic non-gardé).
 Réf design : inbox_codex/2026-06-13_0210_claude-to-codex_DESIGN-CABLAGE-RESOLVE-ENGINE-QWEN3VL.md
 Le 0.933 est une propriété de (modèle+moteur+prompt+parser+think), pas juste (modèle+moteur).
 """
 from __future__ import annotations
 import sys
 from pathlib import Path
 from unittest.mock import MagicMock
 import pytest
 ROOT = Path(__file__).resolve().parents[2]
 if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
 def _gated(method: str, table: dict) -> bool:
    """Reproduit la logique de matching de _validate_resolution_quality (:2751)."""
    if method in table:
        return True
    return any(p.endswith("_") and method.startswith(p) for p in table)
 def _make_vllm_post(captured: list):
    """Mock requests.post : vLLM renvoie un bbox Qwen3-VL 0-1000 centré (500,500)."""
    def fake_post(url, json=None, timeout=None):
        captured.append({"url": url, "payload": json})
        resp = MagicMock()
        if "/v1/chat/completions" in url:
            resp.ok = True
            # Qwen3-VL : coordonnées normalisées 0-1000. Centre = (500,500).
            resp.json.return_value = {
                "choices": [{"message": {"content": '{"bbox_2d": [490, 490, 510, 510]}'}}]
            }
        else:  # Ollama fallback ne doit pas être atteint dans ce mode
            resp.ok = False
            resp.json.return_value = {"message": {"content": ""}}
        return resp
    return fake_post
@pytest.mark.unit
def test_qwen3vl_vllm_grounding_normalise_0_1000_et_method_gardee(monkeypatch, tmp_path):
    """Check 0-1000 normalisation and score gating in ``qwen3vl_vllm`` mode.

    With ``RPA_GROUNDING_ENGINE=qwen3vl_vllm`` and a mocked vLLM reply of
    ``bbox_2d = [490, 490, 510, 510]``, the resolved point must land near
    (0.5, 0.5) and the returned ``method`` must match an entry of
    ``_RESOLUTION_MIN_SCORES``.
    """
    from PIL import Image

    # Deliberately tiny image (200x120): dividing the 0-1000 coordinates by the
    # pixel size instead of 1000 would turn the centre 500 into 500 / 200 = 2.5,
    # an out-of-range value that the test author expects to resolve to None.
    width, height = 200, 120
    screenshot = tmp_path / "shot.png"
    blank = Image.new("RGB", (width, height), (255, 255, 255))
    blank.save(screenshot)

    monkeypatch.setenv("RPA_GROUNDING_ENGINE", "qwen3vl_vllm")

    import requests

    captured: list = []
    monkeypatch.setattr(requests, "post", _make_vllm_post(captured))

    # Imported only after the environment and requests.post are patched.
    from agent_v0.server_v1 import resolve_engine as re_module

    call_kwargs = {
        "screenshot_path": str(screenshot),
        "target_spec": {"by_text": "Synthèse"},
        "screen_width": width,
        "screen_height": height,
    }
    result = re_module._resolve_by_grounding(**call_kwargs)

    # (1) Resolved, and normalised by /1000: the centre is about (0.5, 0.5).
    assert result is not None, "Résolution None : coords Qwen3-VL 0-1000 mal normalisées (DETTE-006)"
    assert abs(result["x_pct"] - 0.5) < 0.02, f"x_pct={result['x_pct']} (attendu ~0.5 via /1000)"
    assert abs(result["y_pct"] - 0.5) < 0.02, f"y_pct={result['y_pct']} (attendu ~0.5 via /1000)"

    # (3) The method name must be covered by the minimum-score table.
    is_guarded = _gated(result["method"], re_module._RESOLUTION_MIN_SCORES)
    assert is_guarded, (
        f"method {result['method']!r} non gardée → Check-1 du validateur sauté (clic non-gardé)"
    )
@pytest.mark.unit
def test_qwen3vl_vllm_payload_think_false(monkeypatch, tmp_path):
    """Check that the vLLM payload disables thinking in ``qwen3vl_vllm`` mode.

    The first captured call to ``/v1/chat/completions`` must carry
    ``chat_template_kwargs["enable_thinking"]`` set to exactly ``False``.
    """
    from PIL import Image

    width, height = 200, 120
    screenshot = tmp_path / "shot.png"
    blank = Image.new("RGB", (width, height), (255, 255, 255))
    blank.save(screenshot)

    monkeypatch.setenv("RPA_GROUNDING_ENGINE", "qwen3vl_vllm")

    import requests

    captured: list = []
    monkeypatch.setattr(requests, "post", _make_vllm_post(captured))

    # Imported only after the environment and requests.post are patched.
    from agent_v0.server_v1 import resolve_engine as re_module

    call_kwargs = {
        "screenshot_path": str(screenshot),
        "target_spec": {"by_text": "Synthèse"},
        "screen_width": width,
        "screen_height": height,
    }
    re_module._resolve_by_grounding(**call_kwargs)

    vllm_calls = [entry for entry in captured if "/v1/chat/completions" in entry["url"]]
    assert vllm_calls, "Aucun appel vLLM capturé"

    # For vLLM, think=false is expressed as chat_template_kwargs.enable_thinking=False.
    first_payload = vllm_calls[0]["payload"]
    cek = first_payload.get("chat_template_kwargs", {})
    thinking_flag = cek.get("enable_thinking")
    assert thinking_flag is False, (
        f"think non désactivé dans payload vLLM : chat_template_kwargs={cek} "
        f"(Qwen3-VL penserait → grounding inutilisable, cf. bench)"
    )
@pytest.mark.unit
def test_qwen3vl_vllm_prompt_demande_fractions_0_1(monkeypatch, tmp_path):
    """Check that the qwen3vl prompt asks for a click point in 0-1 fractions.

    The text of the messages sent to vLLM must contain the target label and
    must either mention "fraction" or both bounds "0.0" and "1.0", rather than
    asking for a generic bounding box.
    """
    from PIL import Image

    width, height = 200, 120
    screenshot = tmp_path / "shot.png"
    blank = Image.new("RGB", (width, height), (255, 255, 255))
    blank.save(screenshot)

    monkeypatch.setenv("RPA_GROUNDING_ENGINE", "qwen3vl_vllm")

    import requests

    captured: list = []
    monkeypatch.setattr(requests, "post", _make_vllm_post(captured))

    # Imported only after the environment and requests.post are patched.
    from agent_v0.server_v1 import resolve_engine as re_module

    call_kwargs = {
        "screenshot_path": str(screenshot),
        "target_spec": {"by_text": "Synthèse"},
        "screen_width": width,
        "screen_height": height,
    }
    re_module._resolve_by_grounding(**call_kwargs)

    vllm_calls = [entry for entry in captured if "/v1/chat/completions" in entry["url"]]
    assert vllm_calls, "Aucun appel vLLM capturé"

    # Gather the text of every message: multi-part contents contribute their
    # "text" fields joined by spaces, plain string contents are prefixed by a space.
    pieces = []
    for message in vllm_calls[0]["payload"]["messages"]:
        body = message.get("content")
        if isinstance(body, list):
            texts = [part.get("text", "") for part in body if isinstance(part, dict)]
            pieces.append(" ".join(texts))
        elif isinstance(body, str):
            pieces.append(" " + body)
    user_txt = "".join(pieces)

    assert "Synthèse" in user_txt, "cible non injectée dans le prompt"

    mentions_fraction = "fraction" in user_txt.lower()
    mentions_bounds = "0.0" in user_txt and "1.0" in user_txt
    assert mentions_fraction or mentions_bounds, (
        f"prompt qwen3vl ne demande pas un point 0-1 (tuple validé non reproduit) : {user_txt!r}"
    )
@pytest.mark.unit
def test_qwen3vl_vllm_method_grounding_rejetee_si_score_bas():
    """Check that a low-score ``grounding`` result is rejected by the validator.

    The qwen3vl method name must trigger the minimum-score guard instead of
    bypassing it: a score of 0.10 is expected to come back unresolved, tagged
    ``rejected_low_score_grounding`` with the original method preserved.
    """
    from agent_v0.server_v1 import resolve_engine as re_module

    low_score_candidate = {
        "resolved": True,
        "method": "grounding",
        "score": 0.10,
        "x_pct": 0.50,
        "y_pct": 0.50,
    }
    out = re_module._validate_resolution_quality(low_score_candidate, 0.50, 0.50)

    assert out is not None
    assert out["resolved"] is False
    assert out["method"] == "rejected_low_score_grounding"
    assert out["original_method"] == "grounding"