- core/navigation/ : visual_verifier (presence=OCR, role=VLM ancre sur tokens), grounding (OCR-anchor first, VLM fallback, cache coords valide par la vue), visual_login (verify_before/after, DETTE-023), action_resolver (pont runtime) - api_stream/replay_engine : dispatch action navigate server-side, never-fail -> needs_review, import depuis core.navigation (boot 5005 garanti) - 131 tests verts (wiring boot, e2e handler, unit modules) Chantier Qwen 01-02/07/2026, revue croisee Claude (plan deploy v2). Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
407 lines
15 KiB
Python
407 lines
15 KiB
Python
"""Tests for core/navigation/grounding.py — OCR-anchored grounding + VLM fallback + coords cache."""
|
|
|
|
import json
|
|
import pytest
|
|
from core.navigation.grounding import (
|
|
OcrTokenInfo,
|
|
GroundedElement,
|
|
CoordsCacheEntry,
|
|
CoordsCache,
|
|
bbox_center,
|
|
make_element_key,
|
|
ocr_anchor_ground,
|
|
build_grounder_prompt,
|
|
parse_grounder_response,
|
|
ground_element,
|
|
)
|
|
from core.navigation.visual_verifier import normalize_text
|
|
|
|
|
|
# ── Mock factories ─────────────────────────────────────────────────────
|
|
|
|
|
|
def mock_ocr_detailed_client_factory(tokens: list):
    """Build a stand-in OCR client that always yields the given tokens.

    Args:
        tokens: The token list the fake client should return (the tests in
            this module pass lists of ``OcrTokenInfo``).

    Returns:
        A callable ``client(image_path)`` that ignores ``image_path`` and
        returns ``tokens`` unchanged.
    """
    def client(image_path: str) -> list:
        # The image is never read: the canned tokens are returned as-is.
        return tokens

    return client
|
|
|
|
|
|
def mock_vlm_client_factory(response_json: dict):
    """Build a stand-in VLM client that always answers with the given JSON.

    Args:
        response_json: The payload the fake VLM should answer with.

    Returns:
        A callable ``client(image_path, prompt)`` that ignores both arguments
        and returns ``response_json`` serialized as a JSON string.
    """
    def client(image_path: str, prompt: str) -> str:
        # Serialized on every call, so the reply is always a fresh string.
        return json.dumps(response_json)

    return client
|
|
|
|
|
|
# ── bbox_center tests ──────────────────────────────────────────────────
|
|
|
|
|
|
class TestBboxCenter:
    """bbox_center is expected to return the midpoint of a 4-value box."""

    def test_basic(self):
        assert bbox_center((100, 200, 300, 400)) == (200, 300)

    def test_zero_origin(self):
        assert bbox_center((0, 0, 100, 100)) == (50, 50)

    def test_symmetric(self):
        assert bbox_center((10, 10, 20, 20)) == (15, 15)
|
|
|
|
|
|
# ── make_element_key tests ─────────────────────────────────────────────
|
|
|
|
|
|
class TestMakeElementKey:
    """make_element_key is expected to build a normalized ``role:text`` key."""

    def test_basic(self):
        key = make_element_key("bouton", "Rechercher")
        assert key == "bouton:rechercher"

    def test_normalized(self):
        # Accents and case are expected to be folded away in the key.
        key = make_element_key("champ", "Nom Prénom")
        assert "nom" in key and "prenom" in key

    def test_consistent(self):
        # Same element always produces same key
        assert make_element_key("bouton", "Connexion") == make_element_key("bouton", "CONNEXION")
|
|
|
|
|
|
# ── ocr_anchor_ground tests ────────────────────────────────────────────
|
|
|
|
|
|
class TestOcrAnchorGround:
    """ocr_anchor_ground: locate a target by matching its text against OCR tokens."""

    def test_exact_match(self):
        tokens = [OcrTokenInfo(text="Rechercher", bbox=(100, 50, 250, 90), confidence=0.95)]
        result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"})
        assert result is not None
        assert result.method == "ocr_anchor"
        assert result.bbox == (100, 50, 250, 90)
        assert result.center == (175, 70)
        # The confidence is expected to be carried over from the matched token.
        assert result.confidence == 0.95

    def test_fuzzy_match(self):
        # The OCR text carries an accent that the target text does not have.
        tokens = [OcrTokenInfo(text="Rechércher", bbox=(100, 50, 250, 90))]
        result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"})
        assert result is not None
        assert result.source_ocr_text == "Rechércher"

    def test_no_match(self):
        tokens = [OcrTokenInfo(text="Accueil", bbox=(100, 50, 250, 90))]
        result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"})
        assert result is None

    def test_token_without_bbox(self):
        tokens = [OcrTokenInfo(text="Rechercher", bbox=None)]
        result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"})
        assert result is None  # found text but no bbox → can't ground

    def test_no_text_target(self):
        tokens = [OcrTokenInfo(text="Dashboard", bbox=(0, 0, 1920, 1080))]
        result = ocr_anchor_ground(tokens, {"role": "page"})  # no text key
        assert result is None  # no text to match

    def test_multiple_tokens_first_match(self):
        tokens = [
            OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40)),
            OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90)),
        ]
        result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Connexion"})
        assert result is not None
        assert result.bbox == (200, 50, 350, 90)
|
|
|
|
|
|
# ── build_grounder_prompt tests ────────────────────────────────────────
|
|
|
|
|
|
class TestBuildGrounderPrompt:
    """build_grounder_prompt must surface the target fields in the VLM prompt."""

    def test_basic_prompt(self):
        prompt = build_grounder_prompt({"role": "bouton", "text": "Connexion"})
        assert "bouton" in prompt
        assert "Connexion" in prompt
        assert "bbox" in prompt

    def test_with_context(self):
        prompt = build_grounder_prompt(
            {"role": "champ", "text": "Login"},
            context="page login DPI",
        )
        assert "page login DPI" in prompt

    def test_with_extra(self):
        prompt = build_grounder_prompt(
            {"role": "champ", "text": "IPP", "extra": "colonne gauche"},
        )
        assert "colonne gauche" in prompt
|
|
|
|
|
|
# ── parse_grounder_response tests ──────────────────────────────────────
|
|
|
|
|
|
class TestParseGrounderResponse:
    """parse_grounder_response: turn raw VLM text into a grounded element.

    Every case uses a 1920x1080 screen and fractional bbox values in the VLM
    reply; the expected bbox values are in pixels.
    """

    def test_valid_response(self):
        vlm_text = json.dumps({
            "found": True,
            "bbox": [0.1, 0.2, 0.3, 0.4],
            "confidence": 0.92,
            "description": "login button",
        })
        result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"})
        assert result is not None
        assert result.method == "vlm_grounder"
        assert result.bbox == (192, 216, 576, 432)  # 0.1*1920, 0.2*1080, 0.3*1920, 0.4*1080
        # Parsed float: compare with a tolerance rather than exact equality.
        assert result.confidence == pytest.approx(0.92)

    def test_not_found(self):
        vlm_text = json.dumps({"found": False, "bbox": [], "confidence": 0.0})
        result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"})
        assert result is None

    def test_json_in_markdown(self):
        # The JSON payload is wrapped in a fenced markdown code block.
        vlm_text = "```json\n{\"found\": true, \"bbox\": [0.5, 0.5, 0.6, 0.6], \"confidence\": 0.8}\n```"
        result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"})
        assert result is not None

    def test_garbled_response(self):
        result = parse_grounder_response("I cannot find the element", 1920, 1080, {"role": "bouton", "text": "Connexion"})
        assert result is None

    def test_invalid_bbox_format(self):
        vlm_text = json.dumps({"found": True, "bbox": [0.1, 0.2], "confidence": 0.8})
        result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"})
        assert result is None  # bbox must have 4 values

    def test_confidence_as_string(self):
        vlm_text = json.dumps({"found": True, "bbox": [0.1, 0.2, 0.3, 0.4], "confidence": "0.85"})
        result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"})
        assert result is not None
        # The string confidence must be coerced to a number.
        assert result.confidence == pytest.approx(0.85)

    def test_bbox_clamped_to_screen(self):
        # Out-of-range fractions must be clamped to the screen bounds.
        vlm_text = json.dumps({"found": True, "bbox": [-0.1, -0.1, 1.5, 1.5], "confidence": 0.7})
        result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"})
        assert result is not None
        assert result.bbox[0] >= 0
        assert result.bbox[1] >= 0
        assert result.bbox[2] <= 1920
        assert result.bbox[3] <= 1080
|
|
|
|
|
|
# ── ground_element (composition) tests ─────────────────────────────────
|
|
|
|
|
|
class TestGroundElement:
    """ground_element: composition of coords cache, OCR anchor and VLM fallback.

    The OCR and VLM clients are replaced by in-memory fakes, so the image
    paths passed in are never opened by these tests.
    """

    def test_ocr_anchor_success(self):
        """OCR finds text with bbox → grounded via OCR (deterministic)."""
        ocr = mock_ocr_detailed_client_factory([
            OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90), confidence=0.95),
        ])
        vlm = mock_vlm_client_factory({})
        result = ground_element(
            "/tmp/login.png",
            {"role": "bouton", "text": "Connexion"},
            ocr_client=ocr,
            vlm_client=vlm,
        )
        assert result is not None
        assert result.method == "ocr_anchor"
        assert result.bbox == (200, 50, 350, 90)

    def test_vlm_fallback(self):
        """OCR doesn't find text → VLM grounder succeeds."""
        ocr = mock_ocr_detailed_client_factory([
            OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40)),
        ])
        vlm = mock_vlm_client_factory({
            "found": True,
            "bbox": [0.2, 0.3, 0.4, 0.5],
            "confidence": 0.85,
        })
        result = ground_element(
            "/tmp/login.png",
            {"role": "bouton", "text": "Connexion"},
            ocr_client=ocr,
            vlm_client=vlm,
        )
        assert result is not None
        assert result.method == "vlm_grounder"

    def test_not_found_any_method(self):
        """Both OCR and VLM fail → None."""
        ocr = mock_ocr_detailed_client_factory([OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40))])
        vlm = mock_vlm_client_factory({"found": False, "bbox": [], "confidence": 0.0})
        result = ground_element(
            "/tmp/login.png",
            {"role": "bouton", "text": "Connexion"},
            ocr_client=ocr,
            vlm_client=vlm,
        )
        assert result is None

    def test_ocr_error_vlm_fallback(self):
        """OCR engine fails → VLM fallback."""
        def failing_ocr(image_path):
            raise RuntimeError("OCR engine down")

        vlm = mock_vlm_client_factory({
            "found": True,
            "bbox": [0.2, 0.3, 0.4, 0.5],
            "confidence": 0.8,
        })
        result = ground_element(
            "/tmp/login.png",
            {"role": "bouton", "text": "Connexion"},
            ocr_client=failing_ocr,
            vlm_client=vlm,
        )
        assert result is not None
        assert result.method == "vlm_grounder"

    def test_vlm_error_ocr_success(self):
        """VLM fails but OCR succeeds → OCR anchor used."""
        ocr = mock_ocr_detailed_client_factory([
            OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90)),
        ])

        def failing_vlm(image_path, prompt):
            raise RuntimeError("VLM down")

        result = ground_element(
            "/tmp/login.png",
            {"role": "bouton", "text": "Connexion"},
            ocr_client=ocr,
            vlm_client=failing_vlm,
        )
        assert result is not None
        assert result.method == "ocr_anchor"

    def test_both_fail(self):
        """OCR + VLM both fail → None."""
        def failing_ocr(image_path):
            raise RuntimeError("OCR down")

        def failing_vlm(image_path, prompt):
            raise RuntimeError("VLM down")

        result = ground_element(
            "/tmp/login.png",
            {"role": "bouton", "text": "Connexion"},
            ocr_client=failing_ocr,
            vlm_client=failing_vlm,
        )
        assert result is None

    def test_no_text_target(self):
        """Target without text → no grounding result (None).

        NOTE(review): the fake VLM answers ``{}``, so this test only pins the
        ``None`` outcome; it does not verify whether the VLM is actually
        skipped for text-less targets.
        """
        ocr = mock_ocr_detailed_client_factory([])
        vlm = mock_vlm_client_factory({})
        result = ground_element(
            "/tmp/page.png",
            {"role": "page"},
            ocr_client=ocr,
            vlm_client=vlm,
        )
        assert result is None

    def test_cache_hit(self):
        """Cached coords exist → returned directly."""
        cache = CoordsCache()
        cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor")

        # Neither fake client can locate the element, so a non-None result
        # can only come from the cache.
        ocr = mock_ocr_detailed_client_factory([])
        vlm = mock_vlm_client_factory({})
        result = ground_element(
            "/tmp/login.png",
            {"role": "bouton", "text": "Connexion"},
            ocr_client=ocr,
            vlm_client=vlm,
            coords_cache=cache,
        )
        assert result is not None
        assert result.method == "cache"
        assert result.bbox == (200, 50, 350, 90)

    def test_cache_stored_on_ocr_anchor(self):
        """OCR anchor result → stored in cache."""
        cache = CoordsCache()
        ocr = mock_ocr_detailed_client_factory([
            OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90)),
        ])
        vlm = mock_vlm_client_factory({})
        ground_element(
            "/tmp/login.png",
            {"role": "bouton", "text": "Connexion"},
            ocr_client=ocr,
            vlm_client=vlm,
            coords_cache=cache,
        )
        cached = cache.get("bouton:connexion")
        assert cached is not None
        assert cached.bbox == (200, 50, 350, 90)
        assert cached.method == "ocr_anchor"

    def test_cache_stored_on_vlm_grounder(self):
        """VLM grounder result → stored in cache."""
        cache = CoordsCache()
        ocr = mock_ocr_detailed_client_factory([])
        vlm = mock_vlm_client_factory({
            "found": True,
            "bbox": [0.2, 0.3, 0.4, 0.5],
            "confidence": 0.85,
        })
        ground_element(
            "/tmp/login.png",
            {"role": "bouton", "text": "Connexion"},
            ocr_client=ocr,
            vlm_client=vlm,
            coords_cache=cache,
        )
        cached = cache.get("bouton:connexion")
        assert cached is not None
        assert cached.method == "vlm_grounder"
|
|
|
|
|
|
# ── CoordsCache tests ──────────────────────────────────────────────────
|
|
|
|
|
|
class TestCoordsCache:
    """CoordsCache: put / get / invalidate / clear / keys and validation counting."""

    def test_put_and_get(self):
        cache = CoordsCache()
        cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor")
        entry = cache.get("bouton:connexion")
        assert entry is not None
        assert entry.bbox == (200, 50, 350, 90)

    def test_get_missing(self):
        cache = CoordsCache()
        assert cache.get("bouton:connexion") is None

    def test_invalidate(self):
        cache = CoordsCache()
        cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor")
        cache.invalidate("bouton:connexion")
        assert cache.get("bouton:connexion") is None

    def test_clear(self):
        cache = CoordsCache()
        cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor")
        cache.put("b", (0, 0, 20, 20), (10, 10), "vlm_grounder")
        cache.clear()
        assert cache.get("a") is None
        assert cache.get("b") is None

    def test_keys(self):
        cache = CoordsCache()
        cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor")
        cache.put("b", (0, 0, 20, 20), (10, 10), "vlm_grounder")
        assert sorted(cache.keys()) == ["a", "b"]

    def test_update_existing(self):
        cache = CoordsCache()
        cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor")
        cache.put("bouton:connexion", (300, 60, 400, 100), (350, 80), "vlm_grounder")
        entry = cache.get("bouton:connexion")
        assert entry is not None
        assert entry.bbox == (300, 60, 400, 100)  # updated
        assert entry.validation_count == 2

    def test_validation_count_increments(self):
        cache = CoordsCache()
        cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor")
        # Guard against None so a missing entry fails as an assertion, not as
        # an AttributeError on ``.validation_count``.
        first = cache.get("a")
        assert first is not None
        assert first.validation_count == 1
        cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor")
        second = cache.get("a")
        assert second is not None
        assert second.validation_count == 2
|