feat(grounding): câblage Qwen3-VL-4B/vLLM (RPA_GROUNDING_ENGINE, défaut off)
Activé via RPA_GROUNDING_ENGINE=qwen3vl_vllm (défaut OFF = legacy Qwen2.5-VL inchangé, byte-identique).

Mode qwen3vl : port 8001/Qwen3-VL-4B, prompt point 0-1, think=false, parse /1000 (dissout DETTE-006), method "grounding" gardée (seuil 0.60), pas de fallback Ollama (abstention si vLLM down).

Grounder validé au bench Easily réel (0.933, ~1s/cas).

TDD : 4 tests (normalisation 0-1000, think=false, prompt fractions 0-1, gating score bas).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -953,26 +953,58 @@ def _resolve_by_grounding(
|
||||
import requests as _requests
|
||||
content = ""
|
||||
|
||||
# Port vLLM configurable via env
|
||||
_vllm_port = os.environ.get("VLLM_PORT", "8100")
|
||||
_vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
|
||||
# Grounder POC validé (bench Easily réel 12→13/06, 0.933) : Qwen3-VL-4B/vLLM.
|
||||
# Activé via RPA_GROUNDING_ENGINE=qwen3vl_vllm (défaut OFF = legacy Qwen2.5-VL
|
||||
# inchangé, byte-identique). Le 0.933 est une propriété de
|
||||
# (modèle+moteur+prompt+parser+think) → ce mode reproduit le tuple validé :
|
||||
# prompt point 0-1, think=false, parse /1000 (dissout DETTE-006), method gardée.
|
||||
# Réf design : inbox_codex/2026-06-13_0210_..._DESIGN-CABLAGE-RESOLVE-ENGINE-QWEN3VL.md
|
||||
_grounding_engine = os.environ.get("RPA_GROUNDING_ENGINE", "").strip().lower()
|
||||
_use_qwen3vl = _grounding_engine == "qwen3vl_vllm"
|
||||
|
||||
if _use_qwen3vl:
|
||||
_vllm_port = os.environ.get("VLLM_PORT", "8001")
|
||||
_vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen3-VL-4B-Instruct")
|
||||
_sys_prompt = (
|
||||
"Tu localises une cible sur une capture d'écran d'interface. "
|
||||
"Si la cible n'est pas clairement visible, réponds par une abstention."
|
||||
)
|
||||
_user_text = (
|
||||
f"Cible : « {description} ». Donne le point de clic en FRACTIONS de "
|
||||
"l'image : x et y entre 0.0 et 1.0 (0,0 = coin haut-gauche, "
|
||||
'1,1 = coin bas-droite). Réponds UNIQUEMENT par un JSON '
|
||||
'{"x":0.xx,"y":0.xx} ou {"abstain":true} si la cible n\'est pas '
|
||||
"clairement visible."
|
||||
)
|
||||
else:
|
||||
_vllm_port = os.environ.get("VLLM_PORT", "8100")
|
||||
_vllm_model = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
|
||||
_sys_prompt = "You locate UI elements on screenshots. Return coordinates."
|
||||
_user_text = prompt
|
||||
|
||||
# Essai 1 : vLLM (API OpenAI-compatible, GPU)
|
||||
try:
|
||||
_vllm_payload = {
|
||||
"model": _vllm_model,
|
||||
"messages": [
|
||||
{"role": "system", "content": _sys_prompt},
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": _user_text},
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
|
||||
]},
|
||||
],
|
||||
"temperature": 0.1,
|
||||
"max_tokens": 80,
|
||||
}
|
||||
if _use_qwen3vl:
|
||||
# think=false obligatoire (Qwen3-VL/vLLM) : sinon raisonnement →
|
||||
# grounding inutilisable (observé au bench).
|
||||
_vllm_payload["chat_template_kwargs"] = {"enable_thinking": False}
|
||||
_vllm_payload["temperature"] = 0.0
|
||||
_vllm_payload["max_tokens"] = 256
|
||||
vllm_resp = _requests.post(
|
||||
f"http://localhost:{_vllm_port}/v1/chat/completions",
|
||||
json={
|
||||
"model": _vllm_model,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You locate UI elements on screenshots. Return coordinates."},
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{shot_b64}"}},
|
||||
]},
|
||||
],
|
||||
"temperature": 0.1,
|
||||
"max_tokens": 80,
|
||||
},
|
||||
json=_vllm_payload,
|
||||
timeout=30,
|
||||
)
|
||||
if vllm_resp.ok:
|
||||
@@ -982,8 +1014,11 @@ def _resolve_by_grounding(
|
||||
except Exception as e:
|
||||
logger.debug("vLLM non disponible (%s), fallback Ollama", e)
|
||||
|
||||
# Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif)
|
||||
if not content:
|
||||
# Essai 2 : Ollama (qwen2.5vl:7b pour le grounding — format bbox_2d natif).
|
||||
# En mode qwen3vl_vllm, PAS de fallback Ollama (modèle non-viable/dangereux
|
||||
# prouvé au bench) : si vLLM échoue, on abstient (None) et la cascade externe
|
||||
# (OCR/template/SoM) prend le relais.
|
||||
if not content and not _use_qwen3vl:
|
||||
try:
|
||||
resp = _requests.post("http://localhost:11434/api/chat", json={
|
||||
"model": _grounding_model,
|
||||
@@ -1003,12 +1038,19 @@ def _resolve_by_grounding(
|
||||
elapsed = time.time() - t0
|
||||
|
||||
# Parser la réponse — délégué à core.grounding.bbox_parser
|
||||
x_pct, y_pct = parse_bbox_to_norm(content, small_w, small_h)
|
||||
if _use_qwen3vl:
|
||||
# Qwen3-VL : 0-1 (consigne respectée) OU 0-1000 natif. divisor=1000 gère
|
||||
# les DEUX (xy_json ≤1 pris tel quel ; bbox_2d / valeurs >1 → ÷1000).
|
||||
# Résolution-indépendant → dissout le bug d'échelle DETTE-006.
|
||||
x_pct, y_pct = parse_bbox_to_norm(content, 1000, 1000)
|
||||
else:
|
||||
x_pct, y_pct = parse_bbox_to_norm(content, small_w, small_h)
|
||||
|
||||
if x_pct is None or y_pct is None:
|
||||
# Fallback multi-image : screenshot + crop → grounding sans description
|
||||
# Fallback multi-image : screenshot + crop → grounding sans description.
|
||||
# Skippé en mode qwen3vl_vllm (le fallback s'appuie sur Ollama qwen2.5vl).
|
||||
anchor_b64 = target_spec.get("anchor_image_base64", "")
|
||||
if anchor_b64:
|
||||
if anchor_b64 and not _use_qwen3vl:
|
||||
try:
|
||||
prompt_mi = (
|
||||
"Image 1 is a screenshot. Image 2 shows a UI element.\n"
|
||||
@@ -1073,7 +1115,10 @@ def _resolve_by_grounding(
|
||||
|
||||
return {
|
||||
"resolved": True,
|
||||
"method": "grounding_vlm",
|
||||
# method gardée par _RESOLUTION_MIN_SCORES : en mode qwen3vl, "grounding"
|
||||
# (clé exacte, seuil 0.60) → Check-1 du validateur s'applique. Le legacy
|
||||
# garde "grounding_vlm" (non gardé aujourd'hui — bug latent, DETTE séparée).
|
||||
"method": "grounding" if _use_qwen3vl else "grounding_vlm",
|
||||
"x_pct": round(x_pct, 6),
|
||||
"y_pct": round(y_pct, 6),
|
||||
"matched_element": {
|
||||
|
||||
Reference in New Issue
Block a user