- core/navigation/ : visual_verifier (presence=OCR, role=VLM ancre sur tokens), grounding (OCR-anchor first, VLM fallback, cache coords valide par la vue), visual_login (verify_before/after, DETTE-023), action_resolver (pont runtime) - api_stream/replay_engine : dispatch action navigate server-side, never-fail -> needs_review, import depuis core.navigation (boot 5005 garanti) - 131 tests verts (wiring boot, e2e handler, unit modules) Chantier Qwen 01-02/07/2026, revue croisee Claude (plan deploy v2). Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
376 lines
12 KiB
Python
376 lines
12 KiB
Python
"""Grounding — visual resolution of UI elements → coords (bbox + center).

OCR-anchored architecture (aligned with visual_verifier):
- STRATEGY 1: OCR-anchor — if the target text is found by OCR,
  use the bbox of the OCR token (deterministic, zero hallucination).
- STRATEGY 2: VLM grounder — if OCR does not find the text,
  the VLM locates the element visually (fallback, controlled risk).
- Coords CACHE: memoizes resolved coords, validated by vision before use.
  If cached coords fail → visual re-resolution.

Coords = local cache validated by the view (Dom/Claude reframing 01/07).
Vision = source of truth, coords = validated shortcut.

Internal BBox format: LTRB (x1, y1, x2, y2) absolute pixels —
consistent with SomElement, OcrToken, DetectedUIElement.
"""
|
|
|
|
from __future__ import annotations

import json
import logging
import math
import re
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Tuple

from core.navigation.visual_verifier import (
    fuzzy_match,
    normalize_text,
    OcrClient,
    VlmClient,
)
|
|
|
|
# Module-level logger; every message in this file is prefixed "grounding:".
logger = logging.getLogger(__name__)

# BBox format: LTRB pixels (x1, y1, x2, y2)
BBox = Tuple[int, int, int, int]
|
|
|
|
|
|
# ── Dataclasses ──────────────────────────────────────────────────────
|
|
|
|
|
|
@dataclass
class OcrTokenInfo:
    """OCR token with an optional bounding box, used for grounding.

    Carries more than a text-only token: the bbox lets a matching token
    be turned directly into click coordinates.
    """

    text: str  # text recognised by OCR
    bbox: Optional[BBox] = None  # (x1, y1, x2, y2) LTRB pixels; None if unknown
    confidence: float = 1.0  # OCR confidence; defaults to 1.0 when not supplied


# Type alias — injectable OCR client returning tokens with bbox.
# More detailed than visual_verifier's OcrClient (which returns List[str]).
OcrDetailedClient = Callable[[str], List[OcrTokenInfo]]
|
|
|
|
|
|
@dataclass
class GroundedElement:
    """A UI element grounded on screen with coordinates."""

    role: str  # role copied from the target description
    text: str  # text copied from the target description
    bbox: BBox  # (x1, y1, x2, y2) LTRB pixels
    center: Tuple[int, int]  # (cx, cy) — click target
    confidence: float
    method: str  # "ocr_anchor" or "vlm_grounder" or "cache"
    source_ocr_text: str = ""  # actual OCR text that matched (for fuzzy)
|
|
|
|
|
|
@dataclass
class CoordsCacheEntry:
    """Cached coordinates for a UI element."""

    element_key: str  # "role:text"
    bbox: BBox  # (x1, y1, x2, y2) LTRB pixels
    center: Tuple[int, int]  # (cx, cy) click target
    method: str  # how it was originally resolved
    validation_count: int = 0  # incremented by the cache on each store / reuse
|
|
|
|
|
|
class CoordsCache:
    """In-memory cache of grounded coordinates.

    Entries are meant to be validated by vision before use (verify_after);
    that validation happens outside this class. When cached coords fail
    verification the caller is expected to invalidate the key and
    re-resolve the element.
    """

    def __init__(self) -> None:
        # element_key ("role:text") -> cached entry
        self._entries: Dict[str, CoordsCacheEntry] = {}

    def get(self, element_key: str) -> Optional[CoordsCacheEntry]:
        """Return the entry stored under *element_key*, or None."""
        return self._entries.get(element_key)

    def put(
        self,
        element_key: str,
        bbox: BBox,
        center: Tuple[int, int],
        method: str,
    ) -> None:
        """Store or refresh the coordinates of an element.

        An existing entry is updated in place and its validation_count is
        incremented; a new entry starts with validation_count == 1.
        """
        entry = self._entries.get(element_key)
        if entry is not None:
            entry.bbox = bbox
            entry.center = center
            entry.method = method
            entry.validation_count += 1
            return

        self._entries[element_key] = CoordsCacheEntry(
            element_key=element_key,
            bbox=bbox,
            center=center,
            method=method,
            validation_count=1,
        )

    def invalidate(self, element_key: str) -> None:
        """Drop the entry for *element_key*; a missing key is ignored."""
        self._entries.pop(element_key, None)

    def clear(self) -> None:
        """Remove every cached entry."""
        self._entries.clear()

    def keys(self) -> List[str]:
        """Return a snapshot list of the cached element keys."""
        return list(self._entries)
|
|
|
|
|
|
# ── Helper functions ─────────────────────────────────────────────────
|
|
|
|
|
|
def bbox_center(bbox: BBox) -> Tuple[int, int]:
    """Compute the center point of an LTRB bbox.

    Args:
        bbox: (x1, y1, x2, y2) in pixels.

    Returns:
        (cx, cy), each the floor of the midpoint of the two edges.
    """
    x1, y1, x2, y2 = bbox
    return ((x1 + x2) // 2, (y1 + y2) // 2)
|
|
|
|
|
|
def make_element_key(role: str, text: str) -> str:
    """Create a cache key of the form "role:text" from role + text.

    The text part is passed through visual_verifier.normalize_text, so the
    stability of the key depends on that helper's normalization.
    """
    return f"{role}:{normalize_text(text)}"
|
|
|
|
|
|
# ── OCR-anchored grounding (deterministic) ───────────────────────────
|
|
|
|
|
|
def ocr_anchor_ground(
    ocr_tokens: List[OcrTokenInfo],
    target: Dict[str, Any],
    fuzzy_threshold: float = 0.8,
) -> Optional[GroundedElement]:
    """Ground an element using OCR tokens with bbox (deterministic).

    Scans the tokens in order and returns the first one whose text
    fuzzy-matches the target text and that carries a bbox.

    Args:
        ocr_tokens: OCR tokens for the current screenshot.
        target: element description; reads "text" (required to ground) and
            "role" (defaults to "?").
        fuzzy_threshold: threshold forwarded to fuzzy_match.

    Returns:
        A GroundedElement built from the matching token (method
        "ocr_anchor", confidence = the token's confidence), or None when
        the target has no text or no usable token matches.
    """
    target_text = target.get("text", "")
    target_role = target.get("role", "?")

    if not target_text:
        return None

    for token in ocr_tokens:
        if not fuzzy_match(target_text, token.text, threshold=fuzzy_threshold):
            continue
        if token.bbox is None:
            # Text matched but there are no coordinates to click: keep looking.
            continue

        return GroundedElement(
            role=target_role,
            text=target_text,
            bbox=token.bbox,
            center=bbox_center(token.bbox),
            confidence=token.confidence,
            method="ocr_anchor",
            source_ocr_text=token.text,
        )

    return None
|
|
|
|
|
|
# ── VLM grounder (fallback) ─────────────────────────────────────────
|
|
|
|
|
|
def build_grounder_prompt(
    target: Dict[str, Any],
    context: str = "",
) -> str:
    """Build the VLM prompt for locating a UI element on screen.

    The prompt asks for a JSON answer whose bounding box is expressed in
    normalized coordinates [0-1].

    Args:
        target: element description; reads "role" (default "?"), "text"
            (default "") and the optional "extra" hint.
        context: optional free-text context, emitted on its own line when
            non-empty.

    Returns:
        The complete prompt string.
    """
    role = target.get("role", "?")
    text = target.get("text", "")
    extra = target.get("extra", "")

    parts = [
        "You are a UI element locator. Find the specified element on this "
        "screenshot and return its bounding box.\n"
    ]
    if context:
        parts.append(f"Context: {context}\n")
    parts.append(f"Target element: {role} with text \"{text}\"")
    if extra:
        parts.append(f" ({extra})")
    # Plain (non f-) strings below: the braces are literal JSON braces.
    parts.append(
        "\n\nRespond in JSON format:\n"
        "{\"found\": true/false, "
        "\"bbox\": [x1_norm, y1_norm, x2_norm, y2_norm], "
        "\"confidence\": 0.0-1.0, "
        "\"description\": \"...\"}\n"
        "bbox coordinates are normalized [0.0-1.0] relative to image dimensions "
        "(x1=left, y1=top, x2=right, y2=bottom). "
        "Only return found=true if you can clearly locate the element."
    )
    return "".join(parts)
|
|
|
|
|
|
def parse_grounder_response(
    vlm_text: str,
    screen_width: int,
    screen_height: int,
    target: Dict[str, Any],
) -> Optional[GroundedElement]:
    """Parse a VLM grounder response into a GroundedElement.

    The reply is expected to be a JSON object (optionally wrapped in extra
    prose) with "found", "bbox" and "confidence" keys. The normalized bbox
    [0-1] is converted to absolute pixels and clamped to the screen.

    VLM output is untrusted, so malformed replies yield None instead of
    raising: non-string replies, non-object JSON, non-numeric or
    non-finite bbox values.

    Args:
        vlm_text: raw text returned by the VLM.
        screen_width: screen width in pixels, used to scale x coordinates.
        screen_height: screen height in pixels, used to scale y coordinates.
        target: element description; "role" and "text" are copied into the
            result.

    Returns:
        A GroundedElement with method "vlm_grounder", or None when the
        element was not found or the reply could not be interpreted.
    """
    if not isinstance(vlm_text, str):
        # json.loads would raise TypeError here, not JSONDecodeError.
        logger.warning("grounding: VLM response not parseable as JSON")
        return None

    try:
        data = json.loads(vlm_text)
    except json.JSONDecodeError:
        # The model may wrap the JSON in prose: take the outermost {...}.
        json_match = re.search(r"\{[\s\S]*\}", vlm_text)
        if json_match is None:
            return None
        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError:
            logger.warning("grounding: VLM response not parseable as JSON")
            return None

    if not isinstance(data, dict):
        # Valid JSON but not an object (list, string, number...).
        logger.warning("grounding: VLM response not parseable as JSON")
        return None

    found = data.get("found", False)
    if isinstance(found, str):
        # A quoted "false" is a truthy string; interpret it as a boolean.
        found = found.strip().lower() in ("true", "yes", "1")
    if not found:
        return None

    bbox_norm = data.get("bbox", [])
    if not isinstance(bbox_norm, list) or len(bbox_norm) != 4:
        logger.warning("grounding: invalid bbox format from VLM")
        return None

    try:
        nx1, ny1, nx2, ny2 = (float(value) for value in bbox_norm)
    except (ValueError, TypeError, OverflowError):
        logger.warning("grounding: bbox values not numeric")
        return None
    if not all(math.isfinite(value) for value in (nx1, ny1, nx2, ny2)):
        # json.loads accepts NaN / Infinity; int() cannot convert them.
        logger.warning("grounding: bbox values not numeric")
        return None

    # Inverted corners would otherwise collapse to a zero-size box below.
    nx1, nx2 = min(nx1, nx2), max(nx1, nx2)
    ny1, ny2 = min(ny1, ny2), max(ny1, ny2)

    # Convert normalized [0-1] to absolute pixels
    x1 = int(nx1 * screen_width)
    y1 = int(ny1 * screen_height)
    x2 = int(nx2 * screen_width)
    y2 = int(ny2 * screen_height)

    # Clamp to screen bounds
    x1 = max(0, min(x1, screen_width))
    y1 = max(0, min(y1, screen_height))
    x2 = max(x1, min(x2, screen_width))
    y2 = max(y1, min(y2, screen_height))

    # Confidence may arrive as a number, a numeric string, null or junk.
    try:
        confidence = float(data.get("confidence", 0.5))
    except (ValueError, TypeError, OverflowError):
        confidence = 0.5
    if not math.isfinite(confidence):
        confidence = 0.5
    confidence = max(0.0, min(confidence, 1.0))

    bbox_abs: BBox = (x1, y1, x2, y2)

    return GroundedElement(
        role=target.get("role", "?"),
        text=target.get("text", ""),
        bbox=bbox_abs,
        center=bbox_center(bbox_abs),
        confidence=confidence,
        method="vlm_grounder",
    )
|
|
|
|
|
|
# ── Core grounding function (composition) ───────────────────────────
|
|
|
|
|
|
def ground_element(
    screenshot_path: str,
    target: Dict[str, Any],
    ocr_client: OcrDetailedClient,
    vlm_client: VlmClient,
    screen_width: int = 1920,
    screen_height: int = 1080,
    coords_cache: Optional[CoordsCache] = None,
    context: str = "",
    fuzzy_threshold: float = 0.8,
) -> Optional[GroundedElement]:
    """Ground a UI element on screen — OCR-anchor first, VLM fallback.

    Resolution strategy:
    1. Cache: if cached coords exist → return cached (validated separately)
    2. OCR-anchor: deterministic, zero hallucination
    3. VLM grounder: fallback when OCR can't find the text

    Failures of the injected OCR / VLM clients are logged and treated as
    "not found"; they are never propagated to the caller.

    Args:
        screenshot_path: path to screenshot image
        target: {"role": "bouton", "text": "Connexion"} — element to find
        ocr_client: injectable OCR client returning List[OcrTokenInfo]
        vlm_client: injectable VLM client (image_path, prompt) -> text
        screen_width: screen width for pixel conversion
        screen_height: screen height for pixel conversion
        coords_cache: optional CoordsCache for memoization
        context: optional context (e.g. "page login DPI")
        fuzzy_threshold: fuzzy match threshold for OCR anchoring

    Returns:
        GroundedElement with bbox + center, or None if not found
    """
    target_text = target.get("text", "")
    target_role = target.get("role", "?")
    element_key = make_element_key(target_role, target_text)

    # Step 0: Check cache. Compare against None explicitly so the check does
    # not depend on the cache object's truthiness.
    if coords_cache is not None:
        cached = coords_cache.get(element_key)
        if cached is not None:
            cached.validation_count += 1
            logger.info("grounding: using cached coords for %s", element_key)
            return GroundedElement(
                role=target_role,
                text=target_text,
                bbox=cached.bbox,
                center=cached.center,
                confidence=1.0,  # cached = previously validated
                method="cache",
            )

    # Step 1: OCR-anchor (deterministic)
    try:
        # "or []": tolerate a client that returns None instead of a list.
        ocr_tokens = ocr_client(screenshot_path) or []
    except Exception as e:
        logger.warning("grounding: OCR call failed (%s)", e)
        ocr_tokens = []

    ocr_result = ocr_anchor_ground(ocr_tokens, target, fuzzy_threshold)

    if ocr_result:
        if coords_cache is not None:
            coords_cache.put(element_key, ocr_result.bbox, ocr_result.center, "ocr_anchor")
        logger.info(
            "grounding: OCR-anchor found '%s' (matched OCR='%s', conf=%.2f)",
            target_text, ocr_result.source_ocr_text, ocr_result.confidence,
        )
        return ocr_result

    # Step 2: VLM grounder (fallback)
    if not target_text:
        logger.warning("grounding: no text for target, VLM grounder needs text")
        return None

    prompt = build_grounder_prompt(target, context)

    try:
        vlm_text = vlm_client(screenshot_path, prompt)
    except Exception as e:
        logger.warning("grounding: VLM grounder call failed (%s)", e)
        return None

    vlm_result = parse_grounder_response(vlm_text, screen_width, screen_height, target)

    if vlm_result:
        if coords_cache is not None:
            coords_cache.put(element_key, vlm_result.bbox, vlm_result.center, "vlm_grounder")
        logger.info(
            "grounding: VLM grounder found '%s' (conf=%.2f)",
            target_text, vlm_result.confidence,
        )
        return vlm_result

    logger.warning("grounding: element '%s' not found by OCR or VLM", target_text)
    return None
|