"""Tests for core/navigation/grounding.py — OCR-anchored grounding + VLM fallback + coords cache.""" import json import pytest from core.navigation.grounding import ( OcrTokenInfo, GroundedElement, CoordsCacheEntry, CoordsCache, bbox_center, make_element_key, ocr_anchor_ground, build_grounder_prompt, parse_grounder_response, ground_element, ) from core.navigation.visual_verifier import normalize_text # ── Mock factories ───────────────────────────────────────────────────── def mock_ocr_detailed_client_factory(tokens: list): """Factory for mock OcrDetailedClient returning List[OcrTokenInfo].""" def client(image_path: str) -> list: return tokens return client def mock_vlm_client_factory(response_json: dict): """Factory for mock VlmClient returning given JSON.""" def client(image_path: str, prompt: str) -> str: return json.dumps(response_json) return client # ── bbox_center tests ────────────────────────────────────────────────── class TestBboxCenter: def test_basic(self): assert bbox_center((100, 200, 300, 400)) == (200, 300) def test_zero_origin(self): assert bbox_center((0, 0, 100, 100)) == (50, 50) def test_symmetric(self): assert bbox_center((10, 10, 20, 20)) == (15, 15) # ── make_element_key tests ───────────────────────────────────────────── class TestMakeElementKey: def test_basic(self): key = make_element_key("bouton", "Rechercher") assert key == "bouton:rechercher" def test_normalized(self): key = make_element_key("champ", "Nom Prénom") assert "nom" in key and "prenom" in key def test_consistent(self): # Same element always produces same key assert make_element_key("bouton", "Connexion") == make_element_key("bouton", "CONNEXION") # ── ocr_anchor_ground tests ──────────────────────────────────────────── class TestOcrAnchorGround: def test_exact_match(self): tokens = [OcrTokenInfo(text="Rechercher", bbox=(100, 50, 250, 90), confidence=0.95)] result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"}) assert result is not None assert result.method == "ocr_anchor" assert result.bbox == (100, 50, 250, 90) assert result.center == (175, 70) assert result.confidence == 0.95 def test_fuzzy_match(self): tokens = [OcrTokenInfo(text="Rechércher", bbox=(100, 50, 250, 90))] result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"}) assert result is not None assert result.source_ocr_text == "Rechércher" def test_no_match(self): tokens = [OcrTokenInfo(text="Accueil", bbox=(100, 50, 250, 90))] result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"}) assert result is None def test_token_without_bbox(self): tokens = [OcrTokenInfo(text="Rechercher", bbox=None)] result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"}) assert result is None # found text but no bbox → can't ground def test_no_text_target(self): tokens = [OcrTokenInfo(text="Dashboard", bbox=(0, 0, 1920, 1080))] result = ocr_anchor_ground(tokens, {"role": "page"}) # no text key assert result is None # no text to match def test_multiple_tokens_first_match(self): tokens = [ OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40)), OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90)), ] result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Connexion"}) assert result is not None assert result.bbox == (200, 50, 350, 90) # ── build_grounder_prompt tests ──────────────────────────────────────── class TestBuildGrounderPrompt: def test_basic_prompt(self): prompt = build_grounder_prompt({"role": "bouton", "text": "Connexion"}) assert "bouton" in prompt assert "Connexion" in prompt assert "bbox" in prompt def test_with_context(self): prompt = build_grounder_prompt( {"role": "champ", "text": "Login"}, context="page login DPI", ) assert "page login DPI" in prompt def test_with_extra(self): prompt = build_grounder_prompt( {"role": "champ", "text": "IPP", "extra": "colonne gauche"}, ) assert "colonne gauche" in prompt # ── parse_grounder_response tests ────────────────────────────────────── class TestParseGrounderResponse: def test_valid_response(self): vlm_text = json.dumps({ "found": True, "bbox": [0.1, 0.2, 0.3, 0.4], "confidence": 0.92, "description": "login button", }) result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) assert result is not None assert result.method == "vlm_grounder" assert result.bbox == (192, 216, 576, 432) # 0.1*1920, 0.2*1080, 0.3*1920, 0.4*1080 assert result.confidence == 0.92 def test_not_found(self): vlm_text = json.dumps({"found": False, "bbox": [], "confidence": 0.0}) result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) assert result is None def test_json_in_markdown(self): vlm_text = "```json\n{\"found\": true, \"bbox\": [0.5, 0.5, 0.6, 0.6], \"confidence\": 0.8}\n```" result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) assert result is not None def test_garbled_response(self): result = parse_grounder_response("I cannot find the element", 1920, 1080, {"role": "bouton", "text": "Connexion"}) assert result is None def test_invalid_bbox_format(self): vlm_text = json.dumps({"found": True, "bbox": [0.1, 0.2], "confidence": 0.8}) result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) assert result is None # bbox must have 4 values def test_confidence_as_string(self): vlm_text = json.dumps({"found": True, "bbox": [0.1, 0.2, 0.3, 0.4], "confidence": "0.85"}) result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) assert result is not None assert result.confidence == 0.85 def test_bbox_clamped_to_screen(self): vlm_text = json.dumps({"found": True, "bbox": [-0.1, -0.1, 1.5, 1.5], "confidence": 0.7}) result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) assert result is not None assert result.bbox[0] >= 0 assert result.bbox[1] >= 0 assert result.bbox[2] <= 1920 assert result.bbox[3] <= 1080 # ── ground_element (composition) tests ───────────────────────────────── class TestGroundElement: def test_ocr_anchor_success(self): """OCR finds text with bbox → grounded via OCR (deterministic).""" ocr = mock_ocr_detailed_client_factory([ OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90), confidence=0.95), ]) vlm = mock_vlm_client_factory({}) result = ground_element( "/tmp/login.png", {"role": "bouton", "text": "Connexion"}, ocr_client=ocr, vlm_client=vlm, ) assert result is not None assert result.method == "ocr_anchor" assert result.bbox == (200, 50, 350, 90) def test_vlm_fallback(self): """OCR doesn't find text → VLM grounder succeeds.""" ocr = mock_ocr_detailed_client_factory([ OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40)), ]) vlm = mock_vlm_client_factory({ "found": True, "bbox": [0.2, 0.3, 0.4, 0.5], "confidence": 0.85, }) result = ground_element( "/tmp/login.png", {"role": "bouton", "text": "Connexion"}, ocr_client=ocr, vlm_client=vlm, ) assert result is not None assert result.method == "vlm_grounder" def test_not_found_any_method(self): """Both OCR and VLM fail → None.""" ocr = mock_ocr_detailed_client_factory([OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40))]) vlm = mock_vlm_client_factory({"found": False, "bbox": [], "confidence": 0.0}) result = ground_element( "/tmp/login.png", {"role": "bouton", "text": "Connexion"}, ocr_client=ocr, vlm_client=vlm, ) assert result is None def test_ocr_error_vlm_fallback(self): """OCR engine fails → VLM fallback.""" def failing_ocr(image_path): raise RuntimeError("OCR engine down") vlm = mock_vlm_client_factory({ "found": True, "bbox": [0.2, 0.3, 0.4, 0.5], "confidence": 0.8, }) result = ground_element( "/tmp/login.png", {"role": "bouton", "text": "Connexion"}, ocr_client=failing_ocr, vlm_client=vlm, ) assert result is not None assert result.method == "vlm_grounder" def test_vlm_error_ocr_success(self): """VLM fails but OCR succeeds → OCR anchor used.""" ocr = mock_ocr_detailed_client_factory([ OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90)), ]) def failing_vlm(image_path, prompt): raise RuntimeError("VLM down") result = ground_element( "/tmp/login.png", {"role": "bouton", "text": "Connexion"}, ocr_client=ocr, vlm_client=failing_vlm, ) assert result is not None assert result.method == "ocr_anchor" def test_both_fail(self): """OCR + VLM both fail → None.""" def failing_ocr(image_path): raise RuntimeError("OCR down") def failing_vlm(image_path, prompt): raise RuntimeError("VLM down") result = ground_element( "/tmp/login.png", {"role": "bouton", "text": "Connexion"}, ocr_client=failing_ocr, vlm_client=failing_vlm, ) assert result is None def test_no_text_target(self): """Target without text → VLM grounder skipped, None.""" ocr = mock_ocr_detailed_client_factory([]) vlm = mock_vlm_client_factory({}) result = ground_element( "/tmp/page.png", {"role": "page"}, ocr_client=ocr, vlm_client=vlm, ) assert result is None def test_cache_hit(self): """Cached coords exist → returned directly.""" cache = CoordsCache() cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor") ocr = mock_ocr_detailed_client_factory([]) vlm = mock_vlm_client_factory({}) result = ground_element( "/tmp/login.png", {"role": "bouton", "text": "Connexion"}, ocr_client=ocr, vlm_client=vlm, coords_cache=cache, ) assert result is not None assert result.method == "cache" assert result.bbox == (200, 50, 350, 90) def test_cache_stored_on_ocr_anchor(self): """OCR anchor result → stored in cache.""" cache = CoordsCache() ocr = mock_ocr_detailed_client_factory([ OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90)), ]) vlm = mock_vlm_client_factory({}) ground_element( "/tmp/login.png", {"role": "bouton", "text": "Connexion"}, ocr_client=ocr, vlm_client=vlm, coords_cache=cache, ) cached = cache.get("bouton:connexion") assert cached is not None assert cached.bbox == (200, 50, 350, 90) assert cached.method == "ocr_anchor" def test_cache_stored_on_vlm_grounder(self): """VLM grounder result → stored in cache.""" cache = CoordsCache() ocr = mock_ocr_detailed_client_factory([]) vlm = mock_vlm_client_factory({ "found": True, "bbox": [0.2, 0.3, 0.4, 0.5], "confidence": 0.85, }) ground_element( "/tmp/login.png", {"role": "bouton", "text": "Connexion"}, ocr_client=ocr, vlm_client=vlm, coords_cache=cache, ) cached = cache.get("bouton:connexion") assert cached is not None assert cached.method == "vlm_grounder" # ── CoordsCache tests ────────────────────────────────────────────────── class TestCoordsCache: def test_put_and_get(self): cache = CoordsCache() cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor") entry = cache.get("bouton:connexion") assert entry is not None assert entry.bbox == (200, 50, 350, 90) def test_get_missing(self): cache = CoordsCache() assert cache.get("bouton:connexion") is None def test_invalidate(self): cache = CoordsCache() cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor") cache.invalidate("bouton:connexion") assert cache.get("bouton:connexion") is None def test_clear(self): cache = CoordsCache() cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor") cache.put("b", (0, 0, 20, 20), (10, 10), "vlm_grounder") cache.clear() assert cache.get("a") is None assert cache.get("b") is None def test_keys(self): cache = CoordsCache() cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor") cache.put("b", (0, 0, 20, 20), (10, 10), "vlm_grounder") assert sorted(cache.keys()) == ["a", "b"] def test_update_existing(self): cache = CoordsCache() cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor") cache.put("bouton:connexion", (300, 60, 400, 100), (350, 80), "vlm_grounder") entry = cache.get("bouton:connexion") assert entry is not None assert entry.bbox == (300, 60, 400, 100) # updated assert entry.validation_count == 2 def test_validation_count_increments(self): cache = CoordsCache() cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor") assert cache.get("a").validation_count == 1 cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor") assert cache.get("a").validation_count == 2