# rpa_vision_v3/core/navigation/action_resolver.py

"""Action resolver — pont entre modules navigation et runtime replay.

Orchestre verify → ground → store coords pour le handler replay_engine.
Convertit coords pixels → normalisé (x_pct/y_pct) pour le client Agent V1.

Architecture :
- handler replay_engine = thin wrapper (appelle action_resolver)
- action_resolver = bridge (adapte OCR/VLM runtime → interfaces navigation)
- modules navigation = pure functions (ne connaissent pas le runtime)
"""

from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Tuple

from core.navigation.grounding import (
    BBox,
    CoordsCache,
    GroundedElement,
    OcrDetailedClient,
    OcrTokenInfo,
    ground_element,
)
from core.navigation.visual_login import (
    LoginFormConfig,
    LoginResolution,
    dpi_urgences_login_config,
    resolve_login_form,
    verify_login_visible,
    verify_login_success,
)
from core.navigation.visual_verifier import (
    OcrClient,
    ScreenMatchResult,
    VlmClient,
)

logger = logging.getLogger(__name__)


# ── Dataclasses ──────────────────────────────────────────────────────


@dataclass
class NavigateCoords:
    """Screen position of a grounded element, expressed as fractions of the screen.

    This is the coordinate format handed to the Agent V1 client: the element
    center plus, optionally, its bounding box, all divided by the screen size.
    """

    # Horizontal position of the element center, normalized to [0-1].
    x_pct: float
    # Vertical position of the element center, normalized to [0-1].
    y_pct: float
    # Normalized bounding box as (x1, y1, x2, y2); None when unknown.
    bbox_pct: Optional[Tuple[float, float, float, float]] = None
    # Name of the grounding method that produced the element.
    method: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of these coords.

        The keys ``x_pct``, ``y_pct`` and ``method`` are always present.
        ``bbox_pct`` is added (as a list) only when the bbox is truthy,
        i.e. neither None nor an empty tuple.
        """
        payload: Dict[str, Any] = {
            "x_pct": self.x_pct,
            "y_pct": self.y_pct,
            "method": self.method,
        }
        if not self.bbox_pct:
            return payload
        payload["bbox_pct"] = [*self.bbox_pct]
        return payload


@dataclass
class NavigateResult:
    """Outcome of a navigate action: normalized coords for each resolved field.

    Every field has a default, so a failure result can be built from just
    ``all_resolved=False`` and an ``error`` message, as ``navigate_login``
    does on its early-return paths.
    """

    # Normalized coords of the login (username) field; None when not resolved.
    login_coords: Optional[NavigateCoords] = None
    # Normalized coords of the password field; None when not resolved.
    password_coords: Optional[NavigateCoords] = None
    # Normalized coords of the submit button; None when not resolved.
    submit_coords: Optional[NavigateCoords] = None
    # True only when grounding reported every field as resolved.
    all_resolved: bool = False
    # Result of the pre-grounding screen check; None when it was skipped
    # or never reached.
    pre_verify: Optional[ScreenMatchResult] = None
    # Not filled in by navigate_login; per the original note it is set later
    # by verify_after (not visible in this module — confirm with the caller).
    post_verify: Optional[ScreenMatchResult] = None  # set later by verify_after
    # Failure reason; empty string when no error was recorded.
    error: str = ""


# ── Coordinate conversion ────────────────────────────────────────────


def grounded_to_coords(
    element: GroundedElement,
    screen_width: int,
    screen_height: int,
) -> NavigateCoords:
    """Normalize a pixel-space GroundedElement into a NavigateCoords.

    Horizontal values (``center[0]``, ``bbox[0]``, ``bbox[2]``) are divided by
    ``screen_width``; vertical values (``center[1]``, ``bbox[1]``, ``bbox[3]``)
    by ``screen_height``.

    Args:
        element: Grounded element exposing ``center``, ``bbox`` and ``method``.
        screen_width: Screen width used as the horizontal divisor.
        screen_height: Screen height used as the vertical divisor.

    Returns:
        NavigateCoords carrying the normalized center, the normalized bbox
        and the element's grounding method. When a dimension is falsy (0),
        every value on that axis is 0 and the element's pixel values for
        that axis are not read at all.
    """
    # Horizontal axis: only touch the element's pixel values when the
    # divisor is usable, so a zero width can never raise.
    if screen_width:
        center_x = element.center[0] / screen_width
        box_x1 = element.bbox[0] / screen_width
        box_x2 = element.bbox[2] / screen_width
    else:
        center_x = box_x1 = box_x2 = 0

    # Vertical axis: same rule with the screen height.
    if screen_height:
        center_y = element.center[1] / screen_height
        box_y1 = element.bbox[1] / screen_height
        box_y2 = element.bbox[3] / screen_height
    else:
        center_y = box_y1 = box_y2 = 0

    normalized_box = (box_x1, box_y1, box_x2, box_y2)
    return NavigateCoords(
        x_pct=center_x,
        y_pct=center_y,
        bbox_pct=normalized_box,
        method=element.method,
    )


# ── OCR adapter ──────────────────────────────────────────────────────


def make_ocr_detailed_from_grid(
    grid_fn: Callable[[str], List[List[Dict[str, Any]]]],
) -> OcrDetailedClient:
    """Wrap a grid-extraction function as an OcrDetailedClient.

    The returned client runs ``grid_fn`` on an image path, flattens the grid
    (rows of cell dicts) through ``tokens_from_grid`` and re-packs each token
    as an ``OcrTokenInfo`` carrying the same text, bbox and confidence.

    Args:
        grid_fn: Callable taking an image path and returning the grid.

    Returns:
        A callable ``client(image_path) -> List[OcrTokenInfo]``.

    Note:
        The bbox is copied unchanged from the tokens produced by
        ``tokens_from_grid`` — presumably normalized LTRB; confirm against
        core.extraction.role_mapper.
    """
    # Imported when the adapter is built (not at module import time).
    from core.extraction.role_mapper import tokens_from_grid

    def _as_token_info(token: Any) -> OcrTokenInfo:
        # Field-for-field copy into the navigation-side token type.
        return OcrTokenInfo(
            text=token.text,
            bbox=token.bbox,
            confidence=token.confidence,
        )

    def client(image_path: str) -> List[OcrTokenInfo]:
        flat_tokens = tokens_from_grid(grid_fn(image_path))
        return list(map(_as_token_info, flat_tokens))

    return client


def make_ocr_simple_from_detailed(
    ocr_detailed: OcrDetailedClient,
) -> OcrClient:
    """Build a text-only OcrClient on top of an OcrDetailedClient.

    Args:
        ocr_detailed: Callable taking an image path and returning tokens
            that expose a ``text`` attribute.

    Returns:
        A callable ``client(image_path) -> List[str]`` yielding the ``text``
        of each token, in the order the detailed client returned them.
    """
    def client(image_path: str) -> List[str]:
        texts: List[str] = []
        for token in ocr_detailed(image_path):
            texts.append(token.text)
        return texts

    return client


# ── Navigate login orchestration ─────────────────────────────────────


def navigate_login(
    screenshot_path: str,
    config: Optional[LoginFormConfig] = None,
    ocr_client: Optional[OcrDetailedClient] = None,
    vlm_client: Optional[VlmClient] = None,
    screen_width: int = 1920,
    screen_height: int = 1080,
    coords_cache: Optional[CoordsCache] = None,
    skip_pre_verify: bool = False,
) -> NavigateResult:
    """Run the login navigation pipeline: verify, ground, then normalize coords.

    Args:
        screenshot_path: Path of the screenshot to analyse.
        config: Login form description; defaults to
            ``dpi_urgences_login_config()`` when None.
        ocr_client: Detailed OCR client (required).
        vlm_client: VLM client (required).
        screen_width: Screen width passed to grounding and used to
            normalize x values.
        screen_height: Screen height passed to grounding and used to
            normalize y values.
        coords_cache: Optional cache forwarded to ``resolve_login_form``.
        skip_pre_verify: When True, the "login screen visible" check is
            not performed.

    Returns:
        NavigateResult. On success ``all_resolved`` is True and the three
        coords fields hold normalized coords (None for any field the
        resolution left falsy). On failure ``all_resolved`` is False and
        ``error`` explains why: missing clients, failed pre-verification,
        or incomplete grounding. ``post_verify`` is never set here.
    """
    form_config = config if config is not None else dpi_urgences_login_config()

    # Both clients are mandatory; report the problem instead of raising.
    clients_ready = ocr_client is not None and vlm_client is not None
    if not clients_ready:
        return NavigateResult(
            all_resolved=False,
            error="ocr_client and vlm_client required",
        )

    text_only_ocr = make_ocr_simple_from_detailed(ocr_client)

    # Stage 1 — optional check that the login screen is actually displayed.
    screen_check = None
    if not skip_pre_verify:
        screen_check = verify_login_visible(
            screenshot_path,
            form_config,
            text_only_ocr,
            vlm_client,
        )
        if not screen_check.match:
            logger.warning("navigate_login: pre-verify failed — %s", screen_check.describe())
            failure_reason = f"pre-verify failed: {screen_check.describe()}"
            return NavigateResult(
                all_resolved=False,
                pre_verify=screen_check,
                error=failure_reason,
            )

    # Stage 2 — locate every form field on the screenshot.
    form = resolve_login_form(
        screenshot_path,
        form_config,
        ocr_client,
        vlm_client,
        screen_width=screen_width,
        screen_height=screen_height,
        coords_cache=coords_cache,
    )
    if not form.all_resolved:
        logger.warning("navigate_login: incomplete resolution — %s", form.describe())
        failure_reason = f"incomplete resolution: {form.describe()}"
        return NavigateResult(
            all_resolved=False,
            pre_verify=screen_check,
            error=failure_reason,
        )

    # Stage 3 — pixel coords to normalized coords; falsy fields stay None.
    def _normalize(field):
        if not field:
            return None
        return grounded_to_coords(field, screen_width, screen_height)

    return NavigateResult(
        login_coords=_normalize(form.login_field),
        password_coords=_normalize(form.password_field),
        submit_coords=_normalize(form.submit_button),
        all_resolved=True,
        pre_verify=screen_check,
    )