Files
rpa_vision_v3/tests/unit/test_grounding.py
Dom f9a0531325
Some checks failed
tests / Lint (ruff + black) (push) Failing after 1m52s
tests / Tests unitaires (sans GPU) (push) Failing after 1m58s
tests / Tests sécurité (critique) (push) Has been skipped
feat(navigation): brique login visuel OCR-ancre + action navigate au replay
- core/navigation/ : visual_verifier (presence=OCR, role=VLM ancré sur tokens),
  grounding (OCR-anchor first, VLM fallback, cache coords validé par la vue),
  visual_login (verify_before/after, DETTE-023), action_resolver (pont runtime)
- api_stream/replay_engine : dispatch action navigate server-side,
  never-fail -> needs_review, import depuis core.navigation (boot 5005 garanti)
- 131 tests verts (wiring boot, e2e handler, unit modules)

Chantier Qwen 01-02/07/2026, revue croisée Claude (plan deploy v2).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 10:31:44 +02:00

407 lines
15 KiB
Python

"""Tests for core/navigation/grounding.py — OCR-anchored grounding + VLM fallback + coords cache."""
import json
import pytest
from core.navigation.grounding import (
OcrTokenInfo,
GroundedElement,
CoordsCacheEntry,
CoordsCache,
bbox_center,
make_element_key,
ocr_anchor_ground,
build_grounder_prompt,
parse_grounder_response,
ground_element,
)
from core.navigation.visual_verifier import normalize_text
# ── Mock factories ─────────────────────────────────────────────────────
def mock_ocr_detailed_client_factory(tokens: list):
    """Build a stand-in OCR client that always answers with ``tokens``.

    Args:
        tokens: The token list the fake client hands back on every call
            (the very same object, not a copy).

    Returns:
        A callable taking ``image_path`` and returning ``tokens``.
    """

    def _fake_ocr(image_path: str) -> list:
        # The image path is ignored on purpose: the answer is canned.
        return tokens

    return _fake_ocr
def mock_vlm_client_factory(response_json: dict):
    """Build a stand-in VLM client that replies with ``response_json`` as JSON text.

    Args:
        response_json: The payload to serialize; it is encoded with
            ``json.dumps`` each time the fake client is called.

    Returns:
        A callable taking ``image_path`` and ``prompt`` and returning the
        JSON-encoded payload as a string.
    """

    def _fake_vlm(image_path: str, prompt: str) -> str:
        # Both arguments are ignored: the reply does not depend on them.
        encoded = json.dumps(response_json)
        return encoded

    return _fake_vlm
# ── bbox_center tests ──────────────────────────────────────────────────
class TestBboxCenter:
    """bbox_center must return the midpoint of a 4-value bounding box."""

    def test_basic(self):
        """A box away from the origin yields the expected centre."""
        box = (100, 200, 300, 400)
        assert bbox_center(box) == (200, 300)

    def test_zero_origin(self):
        """A box starting at (0, 0) yields half of its far corner."""
        box = (0, 0, 100, 100)
        assert bbox_center(box) == (50, 50)

    def test_symmetric(self):
        """A small square box yields its exact middle point."""
        box = (10, 10, 20, 20)
        assert bbox_center(box) == (15, 15)
# ── make_element_key tests ─────────────────────────────────────────────
class TestMakeElementKey:
    """make_element_key must build a stable, normalized cache key."""

    def test_basic(self):
        """Role and text are joined with a colon and the text is lowercased."""
        assert make_element_key("bouton", "Rechercher") == "bouton:rechercher"

    def test_normalized(self):
        """Both words of an accented, mixed-case label appear normalized in the key."""
        produced = make_element_key("champ", "Nom Prénom")
        assert "nom" in produced
        assert "prenom" in produced

    def test_consistent(self):
        """The same element yields the same key whatever the letter case."""
        mixed_case = make_element_key("bouton", "Connexion")
        upper_case = make_element_key("bouton", "CONNEXION")
        assert mixed_case == upper_case
# ── ocr_anchor_ground tests ────────────────────────────────────────────
class TestOcrAnchorGround:
    """ocr_anchor_ground must locate a target from OCR tokens alone."""

    @staticmethod
    def _button(label):
        """Return a fresh button target dict carrying the given label."""
        return {"role": "bouton", "text": label}

    def test_exact_match(self):
        """An identical token text grounds the target on that token's box."""
        ocr_tokens = [
            OcrTokenInfo(text="Rechercher", bbox=(100, 50, 250, 90), confidence=0.95),
        ]
        grounded = ocr_anchor_ground(ocr_tokens, self._button("Rechercher"))
        assert grounded is not None
        assert grounded.method == "ocr_anchor"
        assert grounded.bbox == (100, 50, 250, 90)
        assert grounded.center == (175, 70)
        assert grounded.confidence == 0.95

    def test_fuzzy_match(self):
        """A token differing only by an accent still matches; raw OCR text is kept."""
        ocr_tokens = [OcrTokenInfo(text="Rechércher", bbox=(100, 50, 250, 90))]
        grounded = ocr_anchor_ground(ocr_tokens, self._button("Rechercher"))
        assert grounded is not None
        assert grounded.source_ocr_text == "Rechércher"

    def test_no_match(self):
        """An unrelated token gives no grounding."""
        ocr_tokens = [OcrTokenInfo(text="Accueil", bbox=(100, 50, 250, 90))]
        grounded = ocr_anchor_ground(ocr_tokens, self._button("Rechercher"))
        assert grounded is None

    def test_token_without_bbox(self):
        """A matching token that has no box cannot be grounded."""
        ocr_tokens = [OcrTokenInfo(text="Rechercher", bbox=None)]
        grounded = ocr_anchor_ground(ocr_tokens, self._button("Rechercher"))
        # The text is found, but without coordinates there is nothing to click.
        assert grounded is None

    def test_no_text_target(self):
        """A target without a ``text`` key has nothing to match against."""
        ocr_tokens = [OcrTokenInfo(text="Dashboard", bbox=(0, 0, 1920, 1080))]
        textless_target = {"role": "page"}
        grounded = ocr_anchor_ground(ocr_tokens, textless_target)
        assert grounded is None

    def test_multiple_tokens_first_match(self):
        """Among several tokens, the box of the matching one is returned."""
        ocr_tokens = [
            OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40)),
            OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90)),
        ]
        grounded = ocr_anchor_ground(ocr_tokens, self._button("Connexion"))
        assert grounded is not None
        assert grounded.bbox == (200, 50, 350, 90)
# ── build_grounder_prompt tests ────────────────────────────────────────
class TestBuildGrounderPrompt:
    """build_grounder_prompt must embed the target details in the prompt text."""

    def test_basic_prompt(self):
        """Role, label and the word ``bbox`` all appear in the prompt."""
        target = {"role": "bouton", "text": "Connexion"}
        text = build_grounder_prompt(target)
        for fragment in ("bouton", "Connexion", "bbox"):
            assert fragment in text

    def test_with_context(self):
        """A caller-supplied context string is carried into the prompt."""
        target = {"role": "champ", "text": "Login"}
        text = build_grounder_prompt(target, context="page login DPI")
        assert "page login DPI" in text

    def test_with_extra(self):
        """The target's ``extra`` hint is carried into the prompt."""
        target = {"role": "champ", "text": "IPP", "extra": "colonne gauche"}
        text = build_grounder_prompt(target)
        assert "colonne gauche" in text
# ── parse_grounder_response tests ──────────────────────────────────────
class TestParseGrounderResponse:
    """parse_grounder_response must turn raw VLM text into a grounded element."""

    # Screen size every test parses against.
    SCREEN_W = 1920
    SCREEN_H = 1080

    def _parse(self, raw_text):
        """Parse ``raw_text`` for the login button on the reference screen."""
        target = {"role": "bouton", "text": "Connexion"}
        return parse_grounder_response(raw_text, self.SCREEN_W, self.SCREEN_H, target)

    def test_valid_response(self):
        """A well-formed answer yields pixel coordinates and the reported confidence."""
        payload = {
            "found": True,
            "bbox": [0.1, 0.2, 0.3, 0.4],
            "confidence": 0.92,
            "description": "login button",
        }
        parsed = self._parse(json.dumps(payload))
        assert parsed is not None
        assert parsed.method == "vlm_grounder"
        # Relative coordinates scaled by the screen: 0.1*1920, 0.2*1080, 0.3*1920, 0.4*1080.
        assert parsed.bbox == (192, 216, 576, 432)
        assert parsed.confidence == 0.92

    def test_not_found(self):
        """An explicit ``found: false`` answer yields nothing."""
        payload = {"found": False, "bbox": [], "confidence": 0.0}
        assert self._parse(json.dumps(payload)) is None

    def test_json_in_markdown(self):
        """JSON wrapped in a markdown code fence is still understood."""
        fenced = '```json\n{"found": true, "bbox": [0.5, 0.5, 0.6, 0.6], "confidence": 0.8}\n```'
        assert self._parse(fenced) is not None

    def test_garbled_response(self):
        """Free text containing no JSON yields nothing."""
        assert self._parse("I cannot find the element") is None

    def test_invalid_bbox_format(self):
        """A bbox that does not have exactly four values is rejected."""
        payload = {"found": True, "bbox": [0.1, 0.2], "confidence": 0.8}
        assert self._parse(json.dumps(payload)) is None

    def test_confidence_as_string(self):
        """A confidence delivered as a string is converted to a number."""
        payload = {"found": True, "bbox": [0.1, 0.2, 0.3, 0.4], "confidence": "0.85"}
        parsed = self._parse(json.dumps(payload))
        assert parsed is not None
        assert parsed.confidence == 0.85

    def test_bbox_clamped_to_screen(self):
        """Out-of-range relative coordinates end up inside the screen bounds."""
        payload = {"found": True, "bbox": [-0.1, -0.1, 1.5, 1.5], "confidence": 0.7}
        parsed = self._parse(json.dumps(payload))
        assert parsed is not None
        box = parsed.bbox
        assert box[0] >= 0
        assert box[1] >= 0
        assert box[2] <= self.SCREEN_W
        assert box[3] <= self.SCREEN_H
# ── ground_element (composition) tests ─────────────────────────────────
class TestGroundElement:
    """ground_element composition: coords cache, OCR anchor and VLM grounder together."""

    # Screenshot path used by every test except the text-less target one.
    LOGIN_SHOT = "/tmp/login.png"

    @staticmethod
    def _login_button():
        """Return a fresh target dict describing the login button."""
        return {"role": "bouton", "text": "Connexion"}

    def _ground(self, ocr, vlm, *, cache=None, shot=LOGIN_SHOT, target=None):
        """Call ground_element with the given clients.

        ``coords_cache`` is forwarded only when a cache is supplied, so tests
        that do not use a cache leave that argument at its default.
        """
        wanted = self._login_button() if target is None else target
        options = {"ocr_client": ocr, "vlm_client": vlm}
        if cache is not None:
            options["coords_cache"] = cache
        return ground_element(shot, wanted, **options)

    def test_ocr_anchor_success(self):
        """OCR sees the label with a box, so the element is grounded via OCR."""
        ocr = mock_ocr_detailed_client_factory(
            [OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90), confidence=0.95)]
        )
        vlm = mock_vlm_client_factory({})
        outcome = self._ground(ocr, vlm)
        assert outcome is not None
        assert outcome.method == "ocr_anchor"
        assert outcome.bbox == (200, 50, 350, 90)

    def test_vlm_fallback(self):
        """OCR misses the label, so the VLM grounder provides the answer."""
        ocr = mock_ocr_detailed_client_factory(
            [OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40))]
        )
        vlm = mock_vlm_client_factory(
            {"found": True, "bbox": [0.2, 0.3, 0.4, 0.5], "confidence": 0.85}
        )
        outcome = self._ground(ocr, vlm)
        assert outcome is not None
        assert outcome.method == "vlm_grounder"

    def test_not_found_any_method(self):
        """Neither OCR nor the VLM locates the element, so the result is None."""
        ocr = mock_ocr_detailed_client_factory(
            [OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40))]
        )
        vlm = mock_vlm_client_factory({"found": False, "bbox": [], "confidence": 0.0})
        assert self._ground(ocr, vlm) is None

    def test_ocr_error_vlm_fallback(self):
        """The OCR client raises, and the VLM grounder still answers."""

        def broken_ocr(image_path):
            raise RuntimeError("OCR engine down")

        vlm = mock_vlm_client_factory(
            {"found": True, "bbox": [0.2, 0.3, 0.4, 0.5], "confidence": 0.8}
        )
        outcome = self._ground(broken_ocr, vlm)
        assert outcome is not None
        assert outcome.method == "vlm_grounder"

    def test_vlm_error_ocr_success(self):
        """The VLM client would raise, but the OCR anchor already succeeds."""
        ocr = mock_ocr_detailed_client_factory(
            [OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90))]
        )

        def broken_vlm(image_path, prompt):
            raise RuntimeError("VLM down")

        outcome = self._ground(ocr, broken_vlm)
        assert outcome is not None
        assert outcome.method == "ocr_anchor"

    def test_both_fail(self):
        """Both clients raise, and the result is None rather than an exception."""

        def broken_ocr(image_path):
            raise RuntimeError("OCR down")

        def broken_vlm(image_path, prompt):
            raise RuntimeError("VLM down")

        assert self._ground(broken_ocr, broken_vlm) is None

    def test_no_text_target(self):
        """A target that carries no text cannot be grounded; the result is None."""
        ocr = mock_ocr_detailed_client_factory([])
        vlm = mock_vlm_client_factory({})
        outcome = self._ground(ocr, vlm, shot="/tmp/page.png", target={"role": "page"})
        assert outcome is None

    def test_cache_hit(self):
        """Coordinates already in the cache are returned with method ``cache``."""
        store = CoordsCache()
        store.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor")
        ocr = mock_ocr_detailed_client_factory([])
        vlm = mock_vlm_client_factory({})
        outcome = self._ground(ocr, vlm, cache=store)
        assert outcome is not None
        assert outcome.method == "cache"
        assert outcome.bbox == (200, 50, 350, 90)

    def test_cache_stored_on_ocr_anchor(self):
        """A successful OCR anchor result is written to the cache."""
        store = CoordsCache()
        ocr = mock_ocr_detailed_client_factory(
            [OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90))]
        )
        vlm = mock_vlm_client_factory({})
        self._ground(ocr, vlm, cache=store)
        remembered = store.get("bouton:connexion")
        assert remembered is not None
        assert remembered.bbox == (200, 50, 350, 90)
        assert remembered.method == "ocr_anchor"

    def test_cache_stored_on_vlm_grounder(self):
        """A successful VLM grounder result is written to the cache."""
        store = CoordsCache()
        ocr = mock_ocr_detailed_client_factory([])
        vlm = mock_vlm_client_factory(
            {"found": True, "bbox": [0.2, 0.3, 0.4, 0.5], "confidence": 0.85}
        )
        self._ground(ocr, vlm, cache=store)
        remembered = store.get("bouton:connexion")
        assert remembered is not None
        assert remembered.method == "vlm_grounder"
# ── CoordsCache tests ──────────────────────────────────────────────────
class TestCoordsCache:
    """CoordsCache storage, lookup, removal and validation counting."""

    # Key used for the login button entry in several tests.
    LOGIN_KEY = "bouton:connexion"

    def _cache_with_login(self):
        """Return a new cache holding one OCR-anchored login button entry."""
        store = CoordsCache()
        store.put(self.LOGIN_KEY, (200, 50, 350, 90), (275, 70), "ocr_anchor")
        return store

    def test_put_and_get(self):
        """An entry that was put can be read back with its box."""
        store = self._cache_with_login()
        found = store.get(self.LOGIN_KEY)
        assert found is not None
        assert found.bbox == (200, 50, 350, 90)

    def test_get_missing(self):
        """Looking up a key that was never stored gives None."""
        empty = CoordsCache()
        assert empty.get(self.LOGIN_KEY) is None

    def test_invalidate(self):
        """An invalidated key is no longer returned."""
        store = self._cache_with_login()
        store.invalidate(self.LOGIN_KEY)
        assert store.get(self.LOGIN_KEY) is None

    def test_clear(self):
        """Clearing the cache drops every entry."""
        store = CoordsCache()
        store.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor")
        store.put("b", (0, 0, 20, 20), (10, 10), "vlm_grounder")
        store.clear()
        assert store.get("a") is None
        assert store.get("b") is None

    def test_keys(self):
        """keys() lists every stored key."""
        store = CoordsCache()
        store.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor")
        store.put("b", (0, 0, 20, 20), (10, 10), "vlm_grounder")
        listed = sorted(store.keys())
        assert listed == ["a", "b"]

    def test_update_existing(self):
        """A second put on the same key replaces the box and counts as a validation."""
        store = self._cache_with_login()
        store.put(self.LOGIN_KEY, (300, 60, 400, 100), (350, 80), "vlm_grounder")
        found = store.get(self.LOGIN_KEY)
        assert found is not None
        assert found.bbox == (300, 60, 400, 100)  # the newer box wins
        assert found.validation_count == 2

    def test_validation_count_increments(self):
        """Each put on the same key raises validation_count by one, starting at 1."""
        store = CoordsCache()
        for expected_count in (1, 2):
            store.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor")
            assert store.get("a").validation_count == expected_count