From 5c5ce747b032db7b88b5cdffc0bdd22ed1d12fdb Mon Sep 17 00:00:00 2001 From: Dom Date: Sat, 13 Jun 2026 08:39:29 +0200 Subject: [PATCH] =?UTF-8?q?feat(grounding):=20c=C3=A2blage=20Qwen3-VL-4B/v?= =?UTF-8?q?LLM=20(RPA=5FGROUNDING=5FENGINE,=20d=C3=A9faut=20off)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Active via RPA_GROUNDING_ENGINE=qwen3vl_vllm (défaut OFF = legacy Qwen2.5-VL inchangé, byte-identique). Mode qwen3vl : port 8001/Qwen3-VL-4B, prompt point 0-1, think=false, parse /1000 (dissout DETTE-006), method "grounding" gardée (seuil 0.60), pas de fallback Ollama (abstention si vLLM down). Grounder validé au bench Easily réel (0.933, ~1s/cas). TDD : 4 tests (normalisation 0-1000, think=false, prompt fractions 0-1, gating score bas). Co-Authored-By: Claude Opus 4.8 (1M context) --- agent_v0/server_v1/resolve_engine.py | 87 ++++++--- ...est_resolve_engine_qwen3vl_vllm_cabling.py | 177 ++++++++++++++++++ 2 files changed, 243 insertions(+), 21 deletions(-) create mode 100644 tests/unit/test_resolve_engine_qwen3vl_vllm_cabling.py diff --git a/agent_v0/server_v1/resolve_engine.py b/agent_v0/server_v1/resolve_engine.py index 12872debf..4e4b99486 100644 --- a/agent_v0/server_v1/resolve_engine.py +++ b/agent_v0/server_v1/resolve_engine.py @@ -953,26 +953,58 @@ def _resolve_by_grounding( import requests as _requests content = "" - # Port vLLM configurable via env - _vllm_port = os.environ.get("VLLM_PORT", "8100") - _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ") + # Grounder POC validé (bench Easily réel 12→13/06, 0.933) : Qwen3-VL-4B/vLLM. + # Activé via RPA_GROUNDING_ENGINE=qwen3vl_vllm (défaut OFF = legacy Qwen2.5-VL + # inchangé, byte-identique). Le 0.933 est une propriété de + # (modèle+moteur+prompt+parser+think) → ce mode reproduit le tuple validé : + # prompt point 0-1, think=false, parse /1000 (dissout DETTE-006), method gardée. + # Réf design : inbox_codex/2026-06-13_0210_..._DESIGN-CABLAGE-RESOLVE-ENGINE-QWEN3VL.md + _grounding_engine = os.environ.get("RPA_GROUNDING_ENGINE", "").strip().lower() + _use_qwen3vl = _grounding_engine == "qwen3vl_vllm" + + if _use_qwen3vl: + _vllm_port = os.environ.get("VLLM_PORT", "8001") + _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen3-VL-4B-Instruct") + _sys_prompt = ( + "Tu localises une cible sur une capture d'écran d'interface. " + "Si la cible n'est pas clairement visible, réponds par une abstention." + ) + _user_text = ( + f"Cible : « {description} ». Donne le point de clic en FRACTIONS de " + "l'image : x et y entre 0.0 et 1.0 (0,0 = coin haut-gauche, " + '1,1 = coin bas-droite). Réponds UNIQUEMENT par un JSON ' + '{"x":0.xx,"y":0.xx} ou {"abstain":true} si la cible n\'est pas ' + "clairement visible." + ) + else: + _vllm_port = os.environ.get("VLLM_PORT", "8100") + _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ") + _sys_prompt = "You locate UI elements on screenshots. Return coordinates." + _user_text = prompt # Essai 1 : vLLM (API OpenAI-compatible, GPU) try: + _vllm_payload = { + "model": _vllm_model, + "messages": [ + {"role": "system", "content": _sys_prompt}, + {"role": "user", "content": [ + {"type": "text", "text": _user_text}, + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}}, + ]}, + ], + "temperature": 0.1, + "max_tokens": 80, + } + if _use_qwen3vl: + # think=false obligatoire (Qwen3-VL/vLLM) : sinon raisonnement → + # grounding inutilisable (observé au bench). + _vllm_payload["chat_template_kwargs"] = {"enable_thinking": False} + _vllm_payload["temperature"] = 0.0 + _vllm_payload["max_tokens"] = 256 vllm_resp = _requests.post( f"http://localhost:{_vllm_port}/v1/chat/completions", - json={ - "model": _vllm_model, - "messages": [ - {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."}, - {"role": "user", "content": [ - {"type": "text", "text": prompt}, - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}}, - ]}, - ], - "temperature": 0.1, - "max_tokens": 80, - }, + json=_vllm_payload, timeout=30, ) if vllm_resp.ok: @@ -982,8 +1014,11 @@ def _resolve_by_grounding( except Exception as e: logger.debug("vLLM non disponible (%s), fallback Ollama", e) - # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif) - if not content: + # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif). + # En mode qwen3vl_vllm, PAS de fallback Ollama (modèle non-viable/dangereux + # prouvé au bench) : si vLLM échoue, on abstient (None) et la cascade externe + # (OCR/template/SoM) prend le relais. + if not content and not _use_qwen3vl: try: resp = _requests.post("http://localhost:11434/api/chat", json={ "model": _grounding_model, @@ -1003,12 +1038,19 @@ def _resolve_by_grounding( elapsed = time.time() - t0 # Parser la réponse — délégué à core.grounding.bbox_parser - x_pct, y_pct = parse_bbox_to_norm(content, small_w, small_h) + if _use_qwen3vl: + # Qwen3-VL : 0-1 (consigne respectée) OU 0-1000 natif. divisor=1000 gère + # les DEUX (xy_json ≤1 pris tel quel ; bbox_2d / valeurs >1 → ÷1000). + # Résolution-indépendant → dissout le bug d'échelle DETTE-006. + x_pct, y_pct = parse_bbox_to_norm(content, 1000, 1000) + else: + x_pct, y_pct = parse_bbox_to_norm(content, small_w, small_h) if x_pct is None or y_pct is None: - # Fallback multi-image : screenshot + crop → grounding sans description + # Fallback multi-image : screenshot + crop → grounding sans description. + # Skippé en mode qwen3vl_vllm (le fallback s'appuie sur Ollama qwen2.5vl). anchor_b64 = target_spec.get("anchor_image_base64", "") - if anchor_b64: + if anchor_b64 and not _use_qwen3vl: try: prompt_mi = ( "Image 1 is a screenshot. Image 2 shows a UI element.\n" @@ -1073,7 +1115,10 @@ def _resolve_by_grounding( return { "resolved": True, - "method": "grounding_vlm", + # method gardée par _RESOLUTION_MIN_SCORES : en mode qwen3vl, "grounding" + # (clé exacte, seuil 0.60) → Check-1 du validateur s'applique. Le legacy + # garde "grounding_vlm" (non gardé aujourd'hui — bug latent, DETTE séparée). + "method": "grounding" if _use_qwen3vl else "grounding_vlm", "x_pct": round(x_pct, 6), "y_pct": round(y_pct, 6), "matched_element": { diff --git a/tests/unit/test_resolve_engine_qwen3vl_vllm_cabling.py b/tests/unit/test_resolve_engine_qwen3vl_vllm_cabling.py new file mode 100644 index 000000000..5f22955e0 --- /dev/null +++ b/tests/unit/test_resolve_engine_qwen3vl_vllm_cabling.py @@ -0,0 +1,177 @@ +"""Câblage resolve_engine ← Qwen3-VL-4B/vLLM (grounder POC validé 0.933, nuit 12→13/06). + +Contrat (approche A, env-gated, défaut OFF) : quand RPA_GROUNDING_ENGINE=qwen3vl_vllm, +`_resolve_by_grounding` doit : + 1. parser les coordonnées Qwen3-VL en 0-1000 (divisor=1000), PAS en pixels image + → dissout DETTE-006 (résolution-indépendant) ; + 2. poser think=false dans le payload vLLM (chat_template_kwargs.enable_thinking=False) ; + 3. émettre une `method` GARDÉE par _RESOLUTION_MIN_SCORES (sinon Check-1 du + validateur est sauté → clic non-gardé). + +Réf design : inbox_codex/2026-06-13_0210_claude-to-codex_DESIGN-CABLAGE-RESOLVE-ENGINE-QWEN3VL.md +Le 0.933 est une propriété de (modèle+moteur+prompt+parser+think), pas juste (modèle+moteur). +""" +from __future__ import annotations + +import sys +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + + +def _gated(method: str, table: dict) -> bool: + """Reproduit la logique de matching de _validate_resolution_quality (:2751).""" + if method in table: + return True + return any(p.endswith("_") and method.startswith(p) for p in table) + + +def _make_vllm_post(captured: list): + """Mock requests.post : vLLM renvoie un bbox Qwen3-VL 0-1000 centré (500,500).""" + def fake_post(url, json=None, timeout=None): + captured.append({"url": url, "payload": json}) + resp = MagicMock() + if "/v1/chat/completions" in url: + resp.ok = True + # Qwen3-VL : coordonnées normalisées 0-1000. Centre = (500,500). + resp.json.return_value = { + "choices": [{"message": {"content": '{"bbox_2d": [490, 490, 510, 510]}'}}] + } + else: # Ollama fallback ne doit pas être atteint dans ce mode + resp.ok = False + resp.json.return_value = {"message": {"content": ""}} + return resp + return fake_post + + +@pytest.mark.unit +def test_qwen3vl_vllm_grounding_normalise_0_1000_et_method_gardee(monkeypatch, tmp_path): + from PIL import Image + + # Image volontairement petite (200x120) : si le code divisait par les + # pixels image au lieu de 1000, le centre 500 → 500/200 = 2.5 (hors bornes, + # → None). C'est ce qui rend ce test RED sur le code actuel. + shot = tmp_path / "shot.png" + Image.new("RGB", (200, 120), (255, 255, 255)).save(shot) + + monkeypatch.setenv("RPA_GROUNDING_ENGINE", "qwen3vl_vllm") + captured: list = [] + import requests + monkeypatch.setattr(requests, "post", _make_vllm_post(captured)) + + from agent_v0.server_v1 import resolve_engine as re_module + + result = re_module._resolve_by_grounding( + screenshot_path=str(shot), + target_spec={"by_text": "Synthèse"}, + screen_width=200, + screen_height=120, + ) + + # (1) résolu et normalisé par /1000 → centre ~ (0.5, 0.5) + assert result is not None, "Résolution None : coords Qwen3-VL 0-1000 mal normalisées (DETTE-006)" + assert abs(result["x_pct"] - 0.5) < 0.02, f"x_pct={result['x_pct']} (attendu ~0.5 via /1000)" + assert abs(result["y_pct"] - 0.5) < 0.02, f"y_pct={result['y_pct']} (attendu ~0.5 via /1000)" + + # (3) method gardée par le seuil + assert _gated(result["method"], re_module._RESOLUTION_MIN_SCORES), ( + f"method {result['method']!r} non gardée → Check-1 du validateur sauté (clic non-gardé)" + ) + + +@pytest.mark.unit +def test_qwen3vl_vllm_payload_think_false(monkeypatch, tmp_path): + from PIL import Image + + shot = tmp_path / "shot.png" + Image.new("RGB", (200, 120), (255, 255, 255)).save(shot) + + monkeypatch.setenv("RPA_GROUNDING_ENGINE", "qwen3vl_vllm") + captured: list = [] + import requests + monkeypatch.setattr(requests, "post", _make_vllm_post(captured)) + + from agent_v0.server_v1 import resolve_engine as re_module + re_module._resolve_by_grounding( + screenshot_path=str(shot), + target_spec={"by_text": "Synthèse"}, + screen_width=200, + screen_height=120, + ) + + vllm = [c for c in captured if "/v1/chat/completions" in c["url"]] + assert vllm, "Aucun appel vLLM capturé" + payload = vllm[0]["payload"] + # think=false : pour vLLM via chat_template_kwargs.enable_thinking=False + cek = payload.get("chat_template_kwargs", {}) + assert cek.get("enable_thinking") is False, ( + f"think non désactivé dans payload vLLM : chat_template_kwargs={cek} " + f"(Qwen3-VL penserait → grounding inutilisable, cf. bench)" + ) + + +@pytest.mark.unit +def test_qwen3vl_vllm_prompt_demande_fractions_0_1(monkeypatch, tmp_path): + """Fidélité au tuple validé (0.933) : le prompt qwen3vl demande un point de + clic en FRACTIONS 0-1 (format {"x","y"}), pas un 'bounding box' générique.""" + from PIL import Image + + shot = tmp_path / "shot.png" + Image.new("RGB", (200, 120), (255, 255, 255)).save(shot) + + monkeypatch.setenv("RPA_GROUNDING_ENGINE", "qwen3vl_vllm") + captured: list = [] + import requests + monkeypatch.setattr(requests, "post", _make_vllm_post(captured)) + + from agent_v0.server_v1 import resolve_engine as re_module + re_module._resolve_by_grounding( + screenshot_path=str(shot), + target_spec={"by_text": "Synthèse"}, + screen_width=200, + screen_height=120, + ) + + vllm = [c for c in captured if "/v1/chat/completions" in c["url"]] + assert vllm, "Aucun appel vLLM capturé" + user_txt = "" + for m in vllm[0]["payload"]["messages"]: + c = m.get("content") + if isinstance(c, list): + user_txt += " ".join(p.get("text", "") for p in c if isinstance(p, dict)) + elif isinstance(c, str): + user_txt += " " + c + + assert "Synthèse" in user_txt, "cible non injectée dans le prompt" + low = user_txt.lower() + assert "fraction" in low or ("0.0" in user_txt and "1.0" in user_txt), ( + f"prompt qwen3vl ne demande pas un point 0-1 (tuple validé non reproduit) : {user_txt!r}" + ) + + +@pytest.mark.unit +def test_qwen3vl_vllm_method_grounding_rejetee_si_score_bas(): + """Le method qwen3vl doit activer le garde-seuil, pas sauter Check-1.""" + from agent_v0.server_v1 import resolve_engine as re_module + + out = re_module._validate_resolution_quality( + { + "resolved": True, + "method": "grounding", + "score": 0.10, + "x_pct": 0.50, + "y_pct": 0.50, + }, + 0.50, + 0.50, + ) + + assert out is not None + assert out["resolved"] is False + assert out["method"] == "rejected_low_score_grounding" + assert out["original_method"] == "grounding"