feat(grounding): câblage Qwen3-VL-4B/vLLM (RPA_GROUNDING_ENGINE, défaut off)

Activé via RPA_GROUNDING_ENGINE=qwen3vl_vllm (défaut OFF = legacy Qwen2.5-VL inchangé, byte-identique).

Mode qwen3vl : port 8001/Qwen3-VL-4B, prompt point 0-1, think=false, parse /1000 (dissout DETTE-006), method "grounding" gardée (seuil 0.60), pas de fallback Ollama (abstention si vLLM down).

Grounder validé au bench Easily réel (0.933, ~1s/cas). TDD : 4 tests (normalisation 0-1000, think=false, prompt fractions 0-1, gating score bas).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-13 08:39:29 +02:00
parent b20d17882e
commit 5c5ce747b0
2 changed files with 243 additions and 21 deletions
--- a/agent_v0/server_v1/resolve_engine.py
+++ b/agent_v0/server_v1/resolve_engine.py
@@ -953,26 +953,58 @@ def _resolve_by_grounding(
    import requests as _requests
    content = ""

-    # Port vLLM configurable via env
-    _vllm_port = os.environ.get("VLLM_PORT", "8100")
-    _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
+    # Grounder POC validé (bench Easily réel 12→13/06, 0.933) : Qwen3-VL-4B/vLLM.
+    # Activé via RPA_GROUNDING_ENGINE=qwen3vl_vllm (défaut OFF = legacy Qwen2.5-VL
+    # inchangé, byte-identique). Le 0.933 est une propriété de
+    # (modèle+moteur+prompt+parser+think) → ce mode reproduit le tuple validé :
+    # prompt point 0-1, think=false, parse /1000 (dissout DETTE-006), method gardée.
+    # Réf design : inbox_codex/2026-06-13_0210_..._DESIGN-CABLAGE-RESOLVE-ENGINE-QWEN3VL.md
+    _grounding_engine = os.environ.get("RPA_GROUNDING_ENGINE", "").strip().lower()
+    _use_qwen3vl = _grounding_engine == "qwen3vl_vllm"
+
+    if _use_qwen3vl:
+        _vllm_port = os.environ.get("VLLM_PORT", "8001")
+        _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen3-VL-4B-Instruct")
+        _sys_prompt = (
+            "Tu localises une cible sur une capture d'écran d'interface. "
+            "Si la cible n'est pas clairement visible, réponds par une abstention."
+        )
+        _user_text = (
+            f"Cible : « {description} ». Donne le point de clic en FRACTIONS de "
+            "l'image : x et y entre 0.0 et 1.0 (0,0 = coin haut-gauche, "
+            '1,1 = coin bas-droite). Réponds UNIQUEMENT par un JSON '
+            '{"x":0.xx,"y":0.xx} ou {"abstain":true} si la cible n\'est pas '
+            "clairement visible."
+        )
+    else:
+        _vllm_port = os.environ.get("VLLM_PORT", "8100")
+        _vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
+        _sys_prompt = "You locate UI elements on screenshots. Return coordinates."
+        _user_text = prompt

    # Essai 1 : vLLM (API OpenAI-compatible, GPU)
    try:
+        _vllm_payload = {
+            "model": _vllm_model,
+            "messages": [
+                {"role": "system", "content": _sys_prompt},
+                {"role": "user", "content": [
+                    {"type": "text", "text": _user_text},
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
+                ]},
+            ],
+            "temperature": 0.1,
+            "max_tokens": 80,
+        }
+        if _use_qwen3vl:
+            # think=false obligatoire (Qwen3-VL/vLLM) : sinon raisonnement →
+            # grounding inutilisable (observé au bench).
+            _vllm_payload["chat_template_kwargs"] = {"enable_thinking": False}
+            _vllm_payload["temperature"] = 0.0
+            _vllm_payload["max_tokens"] = 256
        vllm_resp = _requests.post(
            f"http://localhost:{_vllm_port}/v1/chat/completions",
-            json={
-                "model": _vllm_model,
-                "messages": [
-                    {"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
-                    {"role": "user", "content": [
-                        {"type": "text", "text": prompt},
-                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
-                    ]},
-                ],
-                "temperature": 0.1,
-                "max_tokens": 80,
-            },
+            json=_vllm_payload,
            timeout=30,
        )
        if vllm_resp.ok:
@@ -982,8 +1014,11 @@ def _resolve_by_grounding(
    except Exception as e:
        logger.debug("vLLM non disponible (%s), fallback Ollama", e)

-    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif)
-    if not content:
+    # Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif).
+    # En mode qwen3vl_vllm, PAS de fallback Ollama (modèle non-viable/dangereux
+    # prouvé au bench) : si vLLM échoue, on abstient (None) et la cascade externe
+    # (OCR/template/SoM) prend le relais.
+    if not content and not _use_qwen3vl:
        try:
            resp = _requests.post("http://localhost:11434/api/chat", json={
                "model": _grounding_model,
@@ -1003,12 +1038,19 @@ def _resolve_by_grounding(
    elapsed = time.time() - t0

    # Parser la réponse — délégué à core.grounding.bbox_parser
-    x_pct, y_pct = parse_bbox_to_norm(content, small_w, small_h)
+    if _use_qwen3vl:
+        # Qwen3-VL : 0-1 (consigne respectée) OU 0-1000 natif. divisor=1000 gère
+        # les DEUX (xy_json ≤1 pris tel quel ; bbox_2d / valeurs >1 → ÷1000).
+        # Résolution-indépendant → dissout le bug d'échelle DETTE-006.
+        x_pct, y_pct = parse_bbox_to_norm(content, 1000, 1000)
+    else:
+        x_pct, y_pct = parse_bbox_to_norm(content, small_w, small_h)

    if x_pct is None or y_pct is None:
-        # Fallback multi-image : screenshot + crop → grounding sans description
+        # Fallback multi-image : screenshot + crop → grounding sans description.
+        # Skippé en mode qwen3vl_vllm (le fallback s'appuie sur Ollama qwen2.5vl).
        anchor_b64 = target_spec.get("anchor_image_base64", "")
-        if anchor_b64:
+        if anchor_b64 and not _use_qwen3vl:
            try:
                prompt_mi = (
                    "Image 1 is a screenshot. Image 2 shows a UI element.\n"
@@ -1073,7 +1115,10 @@ def _resolve_by_grounding(

    return {
        "resolved": True,
-        "method": "grounding_vlm",
+        # method gardée par _RESOLUTION_MIN_SCORES : en mode qwen3vl, "grounding"
+        # (clé exacte, seuil 0.60) → Check-1 du validateur s'applique. Le legacy
+        # garde "grounding_vlm" (non gardé aujourd'hui — bug latent, DETTE séparée).
+        "method": "grounding" if _use_qwen3vl else "grounding_vlm",
        "x_pct": round(x_pct, 6),
        "y_pct": round(y_pct, 6),
        "matched_element": {