"""Grounding — visual resolution of UI elements to coordinates (bbox + center).

OCR-anchored architecture (aligned with visual_verifier):

- STRATEGY 1: OCR-anchor — if the target text is found by OCR, use the bbox
  of the matching OCR token (deterministic, zero hallucination).
- STRATEGY 2: VLM grounder — if OCR does not find the text, the VLM locates
  the element visually (fallback, controlled risk).
- Coords CACHE: memoizes resolved coordinates, which are validated by vision
  before use. If cached coords fail, the element is re-resolved visually.

Coords = local cache validated by vision (Dom/Claude reframing 01/07).
Vision = source of truth, coords = validated shortcut.

Internal BBox format: LTRB (x1, y1, x2, y2) in absolute pixels — consistent
with SomElement, OcrToken, DetectedUIElement.
"""

from __future__ import annotations

import json
import logging
import math
import re
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Tuple

from core.navigation.visual_verifier import (
    fuzzy_match,
    normalize_text,
    OcrClient,
    VlmClient,
)

logger = logging.getLogger(__name__)

# BBox format: LTRB pixels (x1, y1, x2, y2)
BBox = Tuple[int, int, int, int]


# ── Dataclasses ──────────────────────────────────────────────────────


@dataclass
class OcrTokenInfo:
    """OCR token with bounding box — for grounding (richer than text-only)."""

    text: str
    bbox: Optional[BBox] = None  # (x1, y1, x2, y2) LTRB pixels
    confidence: float = 1.0


# Type alias — injectable OCR client returning tokens with bbox.
# More detailed than visual_verifier's OcrClient (which returns List[str]).
OcrDetailedClient = Callable[[str], List[OcrTokenInfo]]


@dataclass
class GroundedElement:
    """A UI element grounded on screen with coordinates."""

    role: str
    text: str
    bbox: BBox  # (x1, y1, x2, y2) LTRB pixels
    center: Tuple[int, int]  # (cx, cy) — click target
    confidence: float
    method: str  # "ocr_anchor" or "vlm_grounder" or "cache"
    source_ocr_text: str = ""  # actual OCR text that matched (for fuzzy)


@dataclass
class CoordsCacheEntry:
    """Cached coordinates for a UI element."""

    element_key: str  # "role:text"
    bbox: BBox
    center: Tuple[int, int]
    method: str  # how it was originally resolved
    validation_count: int = 0


class CoordsCache:
    """In-memory cache of grounded coordinates.

    Entries are validated by vision before use (verify_after).
    If cached coords fail verification → invalidate + re-resolve.
    """

    def __init__(self) -> None:
        self._entries: Dict[str, CoordsCacheEntry] = {}

    def get(self, element_key: str) -> Optional[CoordsCacheEntry]:
        """Return the entry stored under *element_key*, or None."""
        return self._entries.get(element_key)

    def put(
        self,
        element_key: str,
        bbox: BBox,
        center: Tuple[int, int],
        method: str,
    ) -> None:
        """Insert or refresh an entry, incrementing its validation counter."""
        entry = self._entries.get(element_key)
        if entry:
            entry.bbox = bbox
            entry.center = center
            entry.method = method
            entry.validation_count += 1
        else:
            self._entries[element_key] = CoordsCacheEntry(
                element_key=element_key,
                bbox=bbox,
                center=center,
                method=method,
                validation_count=1,
            )

    def invalidate(self, element_key: str) -> None:
        """Drop the entry for *element_key* if present (no error if absent)."""
        self._entries.pop(element_key, None)

    def clear(self) -> None:
        """Remove every cached entry."""
        self._entries.clear()

    def keys(self) -> List[str]:
        """Return a snapshot list of the cached element keys."""
        return list(self._entries.keys())


# ── Helper functions ─────────────────────────────────────────────────


def bbox_center(bbox: BBox) -> Tuple[int, int]:
    """Compute center point from LTRB bbox (integer floor division)."""
    x1, y1, x2, y2 = bbox
    return ((x1 + x2) // 2, (y1 + y2) // 2)


def make_element_key(role: str, text: str) -> str:
    """Create a stable cache key from role + normalized text."""
    return f"{role}:{normalize_text(text)}"


# ── OCR-anchored grounding (deterministic) ───────────────────────────


def ocr_anchor_ground(
    ocr_tokens: List[OcrTokenInfo],
    target: Dict[str, Any],
    fuzzy_threshold: float = 0.8,
) -> Optional[GroundedElement]:
    """Ground an element using OCR tokens with bbox (deterministic).

    Scans *ocr_tokens* in order and returns a GroundedElement built from the
    first token that fuzzy-matches ``target["text"]`` and carries a bbox.

    Args:
        ocr_tokens: OCR tokens for the current screenshot.
        target: dict with optional "text" and "role" keys.
        fuzzy_threshold: threshold forwarded to ``fuzzy_match``.

    Returns:
        The grounded element, or None when the target has no text or no
        token with a bbox matches.
    """
    target_text = target.get("text", "")
    target_role = target.get("role", "?")

    if not target_text:
        return None

    for token in ocr_tokens:
        if token.bbox is None:
            continue  # a token without bbox can never be used for grounding
        if fuzzy_match(target_text, token.text, threshold=fuzzy_threshold):
            return GroundedElement(
                role=target_role,
                text=target_text,
                bbox=token.bbox,
                center=bbox_center(token.bbox),
                confidence=token.confidence,
                method="ocr_anchor",
                source_ocr_text=token.text,
            )

    return None


# ── VLM grounder (fallback) ─────────────────────────────────────────


def build_grounder_prompt(
    target: Dict[str, Any],
    context: str = "",
) -> str:
    """Build VLM prompt for locating a UI element on screen.

    Asks for a bounding box in normalized coordinates [0-1].

    Args:
        target: dict with optional "role", "text" and "extra" keys.
        context: optional free-text context inserted into the prompt.

    Returns:
        The prompt string to send to the VLM.
    """
    role = target.get("role", "?")
    text = target.get("text", "")
    extra = target.get("extra", "")

    prompt = (
        "You are a UI element locator. Find the specified element on this "
        "screenshot and return its bounding box.\n"
    )
    if context:
        prompt += f"Context: {context}\n"
    prompt += f"Target element: {role} with text \"{text}\""
    if extra:
        prompt += f" ({extra})"
    prompt += (
        "\n\nRespond in JSON format:\n"
        "{\"found\": true/false, "
        "\"bbox\": [x1_norm, y1_norm, x2_norm, y2_norm], "
        "\"confidence\": 0.0-1.0, "
        "\"description\": \"...\"}\n"
        "bbox coordinates are normalized [0.0-1.0] relative to image dimensions "
        "(x1=left, y1=top, x2=right, y2=bottom). "
        "Only return found=true if you can clearly locate the element."
    )
    return prompt


def _load_json_object(vlm_text: str) -> Optional[Dict[str, Any]]:
    """Extract a JSON object (dict) from raw VLM text, or None."""
    try:
        data = json.loads(vlm_text)
    except json.JSONDecodeError:
        data = None
    if isinstance(data, dict):
        return data

    # Either the text is not pure JSON, or the top-level value is not an
    # object (e.g. a list wrapping the answer): look for an embedded {...}.
    json_match = re.search(r"\{[\s\S]*\}", vlm_text)
    if json_match is None:
        return None
    try:
        data = json.loads(json_match.group())
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None


def _coerce_confidence(raw: Any, default: float = 0.5) -> float:
    """Convert a VLM-provided confidence to a float in [0.0, 1.0]."""
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return default
    if math.isnan(value):
        return default
    return max(0.0, min(value, 1.0))


def parse_grounder_response(
    vlm_text: str,
    screen_width: int,
    screen_height: int,
    target: Dict[str, Any],
) -> Optional[GroundedElement]:
    """Parse VLM grounder response into GroundedElement.

    Converts the normalized bbox [0-1] to absolute pixels, clamped to the
    screen bounds.

    Args:
        vlm_text: raw text returned by the VLM (JSON, possibly wrapped in
            surrounding prose).
        screen_width: screen width in pixels.
        screen_height: screen height in pixels.
        target: dict with optional "role" and "text" keys, copied into the
            result.

    Returns:
        The grounded element, or None when the response is unparseable, is
        not a JSON object, reports ``found`` as false, or carries an
        invalid / non-finite / zero-area bbox.
    """
    if not isinstance(vlm_text, str):
        logger.warning("grounding: VLM response not parseable as JSON")
        return None

    data = _load_json_object(vlm_text)
    if data is None:
        logger.warning("grounding: VLM response not parseable as JSON")
        return None

    found = data.get("found", False)
    if isinstance(found, str):
        # A quoted "false" is truthy as a Python string; interpret the text.
        found = found.strip().lower() in ("true", "yes", "1")
    if not found:
        return None

    bbox_norm = data.get("bbox", [])
    if not isinstance(bbox_norm, (list, tuple)) or len(bbox_norm) != 4:
        logger.warning("grounding: invalid bbox format from VLM")
        return None

    # Convert normalized [0-1] to absolute pixels.
    # int(nan) raises ValueError and int(inf) raises OverflowError; both can
    # occur because json.loads accepts the NaN / Infinity literals.
    try:
        x1 = int(float(bbox_norm[0]) * screen_width)
        y1 = int(float(bbox_norm[1]) * screen_height)
        x2 = int(float(bbox_norm[2]) * screen_width)
        y2 = int(float(bbox_norm[3]) * screen_height)
    except (ValueError, TypeError, OverflowError):
        logger.warning("grounding: bbox values not numeric")
        return None

    # Clamp to screen bounds
    x1 = max(0, min(x1, screen_width))
    y1 = max(0, min(y1, screen_height))
    x2 = max(x1, min(x2, screen_width))
    y2 = max(y1, min(y2, screen_height))

    # A box with no extent (swapped or far out-of-range coordinates collapse
    # to this after clamping) is not a usable click target.
    if x2 <= x1 or y2 <= y1:
        logger.warning("grounding: degenerate bbox from VLM after clamping")
        return None

    confidence = _coerce_confidence(data.get("confidence", 0.5))

    bbox_abs: BBox = (x1, y1, x2, y2)
    return GroundedElement(
        role=target.get("role", "?"),
        text=target.get("text", ""),
        bbox=bbox_abs,
        center=bbox_center(bbox_abs),
        confidence=confidence,
        method="vlm_grounder",
    )


# ── Core grounding function (composition) ───────────────────────────


def ground_element(
    screenshot_path: str,
    target: Dict[str, Any],
    ocr_client: OcrDetailedClient,
    vlm_client: VlmClient,
    screen_width: int = 1920,
    screen_height: int = 1080,
    coords_cache: Optional[CoordsCache] = None,
    context: str = "",
    fuzzy_threshold: float = 0.8,
) -> Optional[GroundedElement]:
    """Ground a UI element on screen — OCR-anchor first, VLM fallback.

    Resolution strategy:
    1. Cache: if cached coords exist → return cached (validated separately)
    2. OCR-anchor: deterministic, zero hallucination
    3. VLM grounder: fallback when OCR can't find the text

    Args:
        screenshot_path: path to screenshot image
        target: {"role": "bouton", "text": "Connexion"} — element to find
        ocr_client: injectable OCR client returning List[OcrTokenInfo]
        vlm_client: injectable VLM client (image_path, prompt) -> text
        screen_width/height: screen dimensions for pixel conversion
        coords_cache: optional CoordsCache for memoization
        context: optional context (e.g. "page login DPI")
        fuzzy_threshold: fuzzy match threshold for OCR anchoring

    Returns:
        GroundedElement with bbox + center, or None if not found
    """
    target_text = target.get("text", "")
    target_role = target.get("role", "?")
    element_key = make_element_key(target_role, target_text)

    # Compare against None explicitly so the cache is still consulted and
    # populated even if CoordsCache ever becomes falsy when empty.
    use_cache = coords_cache is not None

    # Step 0: Check cache
    if use_cache:
        cached = coords_cache.get(element_key)
        if cached:
            cached.validation_count += 1
            logger.info("grounding: using cached coords for %s", element_key)
            return GroundedElement(
                role=target_role,
                text=target_text,
                bbox=cached.bbox,
                center=cached.center,
                confidence=1.0,  # cached = previously validated
                method="cache",
            )

    # Step 1: OCR-anchor (deterministic)
    # The client is injected, so any failure is treated as "no tokens" and
    # resolution falls through to the VLM.
    try:
        ocr_tokens = ocr_client(screenshot_path) or []
    except Exception as e:
        logger.warning("grounding: OCR call failed (%s)", e)
        ocr_tokens = []

    ocr_result = ocr_anchor_ground(ocr_tokens, target, fuzzy_threshold)
    if ocr_result:
        if use_cache:
            coords_cache.put(
                element_key, ocr_result.bbox, ocr_result.center, "ocr_anchor"
            )
        logger.info(
            "grounding: OCR-anchor found '%s' (matched OCR='%s', conf=%.2f)",
            target_text,
            ocr_result.source_ocr_text,
            ocr_result.confidence,
        )
        return ocr_result

    # Step 2: VLM grounder (fallback)
    if not target_text:
        logger.warning("grounding: no text for target, VLM grounder needs text")
        return None

    prompt = build_grounder_prompt(target, context)
    try:
        vlm_text = vlm_client(screenshot_path, prompt)
    except Exception as e:
        logger.warning("grounding: VLM grounder call failed (%s)", e)
        return None

    vlm_result = parse_grounder_response(
        vlm_text, screen_width, screen_height, target
    )
    if vlm_result:
        if use_cache:
            coords_cache.put(
                element_key, vlm_result.bbox, vlm_result.center, "vlm_grounder"
            )
        logger.info(
            "grounding: VLM grounder found '%s' (conf=%.2f)",
            target_text,
            vlm_result.confidence,
        )
        return vlm_result

    logger.warning("grounding: element '%s' not found by OCR or VLM", target_text)
    return None