rpa_vision_v3/core/navigation/grounding.py

"""Grounding — résolution visuelle d'éléments UI → coords (bbox + center).

Architecture OCR-ancrée (alignée avec visual_verifier) :
- STRATÉGIE 1 : OCR-anchor — si le texte cible est trouvé par OCR,
  utiliser le bbox du token OCR (déterministe, zero hallucination).
- STRATÉGIE 2 : VLM grounder — si OCR ne trouve pas le texte,
  le VLM localise l'élément visuellement (fallback, risque contrôlé).
- CACHE coords : mémorise les coords résolues, validées par vision avant usage.
  Si cached coords fail → re-résolution visuelle.

Coords = cache local validé par vue (Dom/Claude recadrage 01/07).
Vision = source de vérité, coords = shortcut validé.

BBox format interne : LTRB (x1, y1, x2, y2) pixels absolus —
cohérent avec SomElement, OcrToken, DetectedUIElement.
"""

from __future__ import annotations

import json
import logging
import re
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Tuple

from core.navigation.visual_verifier import (
    fuzzy_match,
    normalize_text,
    OcrClient,
    VlmClient,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Bounding box in LTRB order (x1, y1, x2, y2), expressed in absolute pixels.
BBox = Tuple[int, int, int, int]


# ── Dataclasses ──────────────────────────────────────────────────────


@dataclass
class OcrTokenInfo:
    """One OCR token together with its on-screen location.

    Carries more than the bare string so that a matched token can be turned
    directly into a bounding box and click point by the grounding functions.
    """

    # Text of the token as returned by the OCR client.
    text: str
    # Token location; None when the OCR client supplied no box, in which case
    # ocr_anchor_ground skips the token even if its text matches.
    bbox: Optional[BBox] = None  # (x1, y1, x2, y2) LTRB pixels
    # Confidence attached to the token; copied into the GroundedElement
    # built from it. Defaults to 1.0 when not supplied.
    confidence: float = 1.0


# Type alias for the injectable OCR client used for grounding: it is called
# with the screenshot path and returns the tokens together with their boxes.
# Per the original note, this is richer than visual_verifier's OcrClient,
# which returns only the token strings (List[str]).
OcrDetailedClient = Callable[[str], List[OcrTokenInfo]]


@dataclass
class GroundedElement:
    """A UI element located on screen, with the coordinates needed to act on it.

    Built by ocr_anchor_ground, by parse_grounder_response, and by the cache
    branch of ground_element.
    """

    # Role taken from the target description; the producers in this module
    # fall back to "?" when the target has no "role" key.
    role: str
    # The target text that was searched for (not the text OCR actually read).
    text: str
    # Location of the element in absolute pixels.
    bbox: BBox  # (x1, y1, x2, y2) LTRB pixels
    # Point to click; computed with bbox_center or copied from the cache entry.
    center: Tuple[int, int]  # (cx, cy) — click target
    # OCR token confidence, VLM-reported confidence, or 1.0 for a cache hit.
    confidence: float
    # Name of the strategy that produced this result.
    method: str  # "ocr_anchor" or "vlm_grounder" or "cache"
    # Only filled by the OCR-anchor path; stays empty for VLM and cache results.
    source_ocr_text: str = ""  # actual OCR text that matched (for fuzzy)


@dataclass
class CoordsCacheEntry:
    """Coordinates remembered for one UI element, as stored in CoordsCache."""

    # Cache key; make_element_key builds it as "<role>:<normalized text>".
    element_key: str  # "role:text"
    # LTRB pixel box recorded when the element was grounded.
    bbox: BBox
    # Click point recorded alongside the box.
    center: Tuple[int, int]
    # Strategy that produced the coords; ground_element stores "ocr_anchor"
    # or "vlm_grounder" here.
    method: str  # how it was originally resolved
    # Usage counter: CoordsCache.put creates entries with 1 and increments on
    # every overwrite; ground_element also increments it on each cache hit.
    validation_count: int = 0


class CoordsCache:
    """Process-local store mapping element keys to their last grounded coords.

    The cache performs no checking of its own. Per the module design, entries
    are validated by vision before use (verify_after); when cached coords fail
    that verification the entry should be invalidated and the element
    resolved again.
    """

    def __init__(self) -> None:
        # element_key -> CoordsCacheEntry
        self._entries: Dict[str, CoordsCacheEntry] = dict()

    def get(self, element_key: str) -> Optional[CoordsCacheEntry]:
        """Return the entry stored under *element_key*, or None if absent."""
        try:
            return self._entries[element_key]
        except KeyError:
            return None

    def put(
        self,
        element_key: str,
        bbox: BBox,
        center: Tuple[int, int],
        method: str,
    ) -> None:
        """Store coords for *element_key*.

        A new key gets a fresh entry whose counter starts at 1. An existing
        entry is updated in place (same object) and its counter is bumped.
        """
        existing = self._entries.get(element_key)
        if not existing:
            self._entries[element_key] = CoordsCacheEntry(
                element_key=element_key,
                bbox=bbox,
                center=center,
                method=method,
                validation_count=1,
            )
            return

        existing.bbox, existing.center, existing.method = bbox, center, method
        existing.validation_count += 1

    def invalidate(self, element_key: str) -> None:
        """Drop the entry for *element_key*; unknown keys are ignored."""
        if element_key in self._entries:
            del self._entries[element_key]

    def clear(self) -> None:
        """Remove every entry."""
        self._entries.clear()

    def keys(self) -> List[str]:
        """Return a snapshot list of the cached element keys."""
        return [key for key in self._entries]


# ── Helper functions ─────────────────────────────────────────────────


def bbox_center(bbox: BBox) -> Tuple[int, int]:
    """Return the integer midpoint (cx, cy) of an LTRB box.

    Each coordinate is the floor of the mean of the two opposite edges.
    """
    left, top, right, bottom = bbox
    cx = (left + right) // 2
    cy = (top + bottom) // 2
    return cx, cy


def make_element_key(role: str, text: str) -> str:
    """Build the cache key for an element: role, a colon, then the normalized text."""
    normalized = normalize_text(text)
    return "{}:{}".format(role, normalized)


# ── OCR-anchored grounding (deterministic) ───────────────────────────


def ocr_anchor_ground(
    ocr_tokens: List[OcrTokenInfo],
    target: Dict[str, Any],
    fuzzy_threshold: float = 0.8,
) -> Optional[GroundedElement]:
    """Locate the target by anchoring on an OCR token (deterministic).

    Walks the tokens in order and returns a GroundedElement for the first one
    whose text fuzzy-matches the target text and that carries a bbox.

    Args:
        ocr_tokens: OCR tokens, scanned in the order given.
        target: element description; reads the "text" and "role" keys.
        fuzzy_threshold: threshold forwarded to fuzzy_match.

    Returns:
        The grounded element (method "ocr_anchor"), or None when the target
        has no text or no usable token matches.
    """
    wanted_text = target.get("text", "")
    if not wanted_text:
        return None
    wanted_role = target.get("role", "?")

    for candidate in ocr_tokens:
        if not fuzzy_match(wanted_text, candidate.text, threshold=fuzzy_threshold):
            continue

        box = candidate.bbox
        if box is None:
            # Text matched but there is nothing to click on; keep looking.
            continue

        return GroundedElement(
            role=wanted_role,
            text=wanted_text,
            bbox=box,
            center=bbox_center(box),
            confidence=candidate.confidence,
            method="ocr_anchor",
            source_ocr_text=candidate.text,
        )

    return None


# ── VLM grounder (fallback) ─────────────────────────────────────────


def build_grounder_prompt(
    target: Dict[str, Any],
    context: str = "",
) -> str:
    """Compose the VLM prompt asking where a UI element is on the screenshot.

    The prompt names the element by role and text (plus the optional "extra"
    hint and the optional context line) and requests a JSON answer whose
    bbox is expressed in normalized [0-1] coordinates.

    Args:
        target: element description; reads "role" (default "?"), "text"
            (default "") and "extra" (default "").
        context: optional free-text context, emitted as its own line.

    Returns:
        The full prompt string.
    """
    role = target.get("role", "?")
    text = target.get("text", "")
    extra = target.get("extra", "")

    pieces = [
        "You are a UI element locator. Find the specified element on this "
        "screenshot and return its bounding box.\n"
    ]

    if context:
        pieces.append(f"Context: {context}\n")

    pieces.append(f"Target element: {role} with text \"{text}\"")

    if extra:
        pieces.append(f" ({extra})")

    # Fixed answer-format instructions, always appended last.
    pieces.append(
        "\n\nRespond in JSON format:\n"
        "{\"found\": true/false, "
        "\"bbox\": [x1_norm, y1_norm, x2_norm, y2_norm], "
        "\"confidence\": 0.0-1.0, "
        "\"description\": \"...\"}\n"
        "bbox coordinates are normalized [0.0-1.0] relative to image dimensions "
        "(x1=left, y1=top, x2=right, y2=bottom). "
        "Only return found=true if you can clearly locate the element."
    )
    return "".join(pieces)


def parse_grounder_response(
    vlm_text: str,
    screen_width: int,
    screen_height: int,
    target: Dict[str, Any],
) -> Optional[GroundedElement]:
    """Parse the VLM grounder reply into a GroundedElement.

    The reply is expected to be a JSON object with "found", "bbox"
    (normalized [0-1], LTRB order) and optionally "confidence". If the reply
    as a whole is not valid JSON, the outermost ``{...}`` span is tried.
    The bbox is scaled to absolute pixels and clamped to the screen; a box
    whose right/bottom edge ends up before its left/top edge is collapsed
    onto that left/top edge.

    Args:
        vlm_text: raw text returned by the VLM client.
        screen_width: screen width in pixels, used to scale x coordinates.
        screen_height: screen height in pixels, used to scale y coordinates.
        target: element description; "role" and "text" are copied into the
            result.

    Returns:
        A GroundedElement with method "vlm_grounder", or None when the reply
        is unusable (not text, not a JSON object, found is falsy, or the
        bbox is malformed, non-numeric or non-finite).
    """
    try:
        data = json.loads(vlm_text)
    except TypeError:
        # Non-text reply (e.g. None): nothing to parse.
        logger.warning("grounding: VLM response is not text")
        return None
    except json.JSONDecodeError:
        # Models often wrap the JSON in prose or code fences; retry on the
        # outermost brace-delimited span.
        json_match = re.search(r"\{[\s\S]*\}", vlm_text)
        if json_match is None:
            return None
        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError:
            logger.warning("grounding: VLM response not parseable as JSON")
            return None

    # Valid JSON is not necessarily an object ("null", a list, a number...).
    if not isinstance(data, dict):
        logger.warning("grounding: VLM response is not a JSON object")
        return None

    if not data.get("found", False):
        return None

    bbox_norm = data.get("bbox", [])
    if not isinstance(bbox_norm, list) or len(bbox_norm) != 4:
        logger.warning("grounding: invalid bbox format from VLM")
        return None

    # Convert normalized [0-1] to absolute pixels
    try:
        x1 = int(float(bbox_norm[0]) * screen_width)
        y1 = int(float(bbox_norm[1]) * screen_height)
        x2 = int(float(bbox_norm[2]) * screen_width)
        y2 = int(float(bbox_norm[3]) * screen_height)
    except (ValueError, TypeError):
        # Covers non-numeric values and NaN (int(nan) raises ValueError).
        logger.warning("grounding: bbox values not numeric")
        return None
    except OverflowError:
        # json.loads accepts Infinity / 1e999; int(inf) raises OverflowError.
        logger.warning("grounding: bbox values not finite")
        return None

    # Clamp to screen bounds
    x1 = max(0, min(x1, screen_width))
    y1 = max(0, min(y1, screen_height))
    x2 = max(x1, min(x2, screen_width))
    y2 = max(y1, min(y2, screen_height))

    # Always hand back a float in [0, 1]: downstream code formats it with
    # "%.2f", and the model may send a string, null, NaN or a percentage.
    try:
        confidence = float(data.get("confidence", 0.5))
    except (ValueError, TypeError):
        confidence = 0.5
    if confidence != confidence:  # NaN is the only value unequal to itself
        confidence = 0.5
    confidence = min(1.0, max(0.0, confidence))

    bbox_abs: BBox = (x1, y1, x2, y2)

    return GroundedElement(
        role=target.get("role", "?"),
        text=target.get("text", ""),
        bbox=bbox_abs,
        center=bbox_center(bbox_abs),
        confidence=confidence,
        method="vlm_grounder",
    )


# ── Core grounding function (composition) ───────────────────────────


def ground_element(
    screenshot_path: str,
    target: Dict[str, Any],
    ocr_client: OcrDetailedClient,
    vlm_client: VlmClient,
    screen_width: int = 1920,
    screen_height: int = 1080,
    coords_cache: Optional[CoordsCache] = None,
    context: str = "",
    fuzzy_threshold: float = 0.8,
) -> Optional[GroundedElement]:
    """Ground a UI element on screen — OCR-anchor first, VLM fallback.

    Resolution strategy:
    1. Cache: if cached coords exist, return them as-is (this function does
       not re-verify them; validation is the caller's responsibility).
    2. OCR-anchor: deterministic match of the target text against OCR tokens.
    3. VLM grounder: fallback when OCR cannot find the text.

    A successful OCR or VLM resolution is written to the cache when one is
    supplied. Failures of the injected OCR/VLM clients are logged and treated
    as "not found by that strategy" rather than propagated.

    Args:
        screenshot_path: path to the screenshot image.
        target: element to find, e.g. {"role": "bouton", "text": "Connexion"}.
        ocr_client: injectable OCR client returning List[OcrTokenInfo].
        vlm_client: injectable VLM client, called as (image_path, prompt).
        screen_width: screen width used to scale the VLM's normalized bbox.
        screen_height: screen height used to scale the VLM's normalized bbox.
        coords_cache: optional CoordsCache for memoization.
        context: optional context passed to the VLM prompt
            (e.g. "page login DPI").
        fuzzy_threshold: fuzzy match threshold for OCR anchoring.

    Returns:
        GroundedElement with bbox + center, or None if not found.
    """
    target_text = target.get("text", "")
    target_role = target.get("role", "?")
    element_key = make_element_key(target_role, target_text)

    # Step 0: Check cache. Compare against None explicitly so that a cache
    # object which happens to be falsy (e.g. empty, if it ever defines
    # __len__) is still consulted and populated.
    if coords_cache is not None:
        cached = coords_cache.get(element_key)
        if cached is not None:
            cached.validation_count += 1
            logger.info("grounding: using cached coords for %s", element_key)
            return GroundedElement(
                role=target_role,
                text=target_text,
                bbox=cached.bbox,
                center=cached.center,
                confidence=1.0,  # cached = previously validated
                method="cache",
            )

    # Both strategies key on the target text: the OCR anchor returns None
    # without it and the VLM grounder is refused. Bail out before paying for
    # an OCR pass whose result could not be used.
    if not target_text:
        logger.warning("grounding: no text for target, VLM grounder needs text")
        return None

    # Step 1: OCR-anchor (deterministic)
    try:
        # A client returning None is treated like "no tokens".
        ocr_tokens = ocr_client(screenshot_path) or []
    except Exception as e:
        # Deliberate best-effort boundary around the injected client:
        # any OCR failure just means we fall through to the VLM.
        logger.warning("grounding: OCR call failed (%s)", e)
        ocr_tokens = []

    ocr_result = ocr_anchor_ground(ocr_tokens, target, fuzzy_threshold)

    if ocr_result:
        if coords_cache is not None:
            coords_cache.put(element_key, ocr_result.bbox, ocr_result.center, "ocr_anchor")
        logger.info(
            "grounding: OCR-anchor found '%s' (matched OCR='%s', conf=%.2f)",
            target_text, ocr_result.source_ocr_text, ocr_result.confidence,
        )
        return ocr_result

    # Step 2: VLM grounder (fallback)
    prompt = build_grounder_prompt(target, context)

    try:
        vlm_text = vlm_client(screenshot_path, prompt)
    except Exception as e:
        # Same best-effort boundary as for OCR: no VLM answer, no grounding.
        logger.warning("grounding: VLM grounder call failed (%s)", e)
        return None

    vlm_result = parse_grounder_response(vlm_text, screen_width, screen_height, target)

    if vlm_result:
        if coords_cache is not None:
            coords_cache.put(element_key, vlm_result.bbox, vlm_result.center, "vlm_grounder")
        logger.info(
            "grounding: VLM grounder found '%s' (conf=%.2f)",
            target_text, vlm_result.confidence,
        )
        return vlm_result

    logger.warning("grounding: element '%s' not found by OCR or VLM", target_text)
    return None