"""Action resolver — bridge between the navigation modules and the replay runtime.

Orchestrates verify → ground → store coords for the replay_engine handler.
Converts pixel coords → normalized (x_pct/y_pct) for the Agent V1 client.

Architecture:
- replay_engine handler = thin wrapper (calls action_resolver)
- action_resolver = bridge (adapts runtime OCR/VLM → navigation interfaces)
- navigation modules = pure functions (they do not know about the runtime)
"""

from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Tuple

from core.navigation.grounding import (
    BBox,
    CoordsCache,
    GroundedElement,
    OcrDetailedClient,
    OcrTokenInfo,
    ground_element,
)
from core.navigation.visual_login import (
    LoginFormConfig,
    LoginResolution,
    dpi_urgences_login_config,
    resolve_login_form,
    verify_login_visible,
    verify_login_success,
)
from core.navigation.visual_verifier import (
    OcrClient,
    ScreenMatchResult,
    VlmClient,
)

logger = logging.getLogger(__name__)


# ── Dataclasses ──────────────────────────────────────────────────────


@dataclass
class NavigateCoords:
    """Screen position of one grounded element, in the Agent V1 client format.

    All positions are fractions of the screen size rather than pixels.
    """

    x_pct: float  # center x, normalized [0-1]
    y_pct: float  # center y, normalized [0-1]
    # (x1, y1, x2, y2), normalized; None when no bounding box is available
    bbox_pct: Optional[Tuple[float, float, float, float]] = None
    method: str = ""  # grounding method that produced the element

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form of these coords.

        ``bbox_pct`` is emitted (as a list) only when it is set and non-empty;
        the other three keys are always present.
        """
        payload: Dict[str, Any] = {
            "x_pct": self.x_pct,
            "y_pct": self.y_pct,
            "method": self.method,
        }
        if not self.bbox_pct:
            return payload
        payload["bbox_pct"] = list(self.bbox_pct)
        return payload


@dataclass
class NavigateResult:
    """Outcome of a navigate action: one set of coords per resolved field."""

    login_coords: Optional[NavigateCoords] = None
    password_coords: Optional[NavigateCoords] = None
    submit_coords: Optional[NavigateCoords] = None
    all_resolved: bool = False  # True only when every field was grounded
    pre_verify: Optional[ScreenMatchResult] = None
    post_verify: Optional[ScreenMatchResult] = None  # set later by verify_after
    error: str = ""  # human-readable failure reason; empty on success
# ── Coordinate conversion ────────────────────────────────────────────


def _normalize(value: float, extent: Optional[int]) -> float:
    """Return ``value / extent``, or ``0.0`` when *extent* is not a positive size."""
    if extent and extent > 0:
        return value / extent
    return 0.0


def grounded_to_coords(
    element: GroundedElement,
    screen_width: int,
    screen_height: int,
) -> NavigateCoords:
    """Convert a GroundedElement (pixels) to NavigateCoords (normalized pct).

    Args:
        element: Grounded element; its ``center`` (x, y), ``bbox``
            (indices 0-3 read as x1, y1, x2, y2) and ``method`` are used.
        screen_width: Screen width in pixels, used to scale x values.
        screen_height: Screen height in pixels, used to scale y values.

    Returns:
        NavigateCoords whose center and bbox are divided by the screen size.
        If a dimension is zero, negative or missing, the values on that axis
        fall back to ``0.0`` (always a float) and a warning is logged, because
        all-zero coords would point the client at the top-left corner.
    """
    if not (screen_width and screen_width > 0 and screen_height and screen_height > 0):
        logger.warning(
            "grounded_to_coords: invalid screen size %sx%s — affected coords fall back to 0.0",
            screen_width,
            screen_height,
        )

    center = element.center
    bbox = element.bbox
    return NavigateCoords(
        x_pct=_normalize(center[0], screen_width),
        y_pct=_normalize(center[1], screen_height),
        bbox_pct=(
            _normalize(bbox[0], screen_width),
            _normalize(bbox[1], screen_height),
            _normalize(bbox[2], screen_width),
            _normalize(bbox[3], screen_height),
        ),
        method=element.method,
    )


# ── OCR adapter ──────────────────────────────────────────────────────


def make_ocr_detailed_from_grid(
    grid_fn: Callable[[str], List[List[Dict[str, Any]]]],
) -> OcrDetailedClient:
    """Adapt a grid-extraction function into an OcrDetailedClient.

    The returned client runs *grid_fn* on an image path, flattens the grid
    (list of rows of cells) with ``tokens_from_grid`` and re-wraps each token
    as an OcrTokenInfo, copying ``text``, ``bbox`` and ``confidence`` as-is.

    NOTE(review): the original documentation describes the bbox as normalized
    LTRB; this adapter does not convert it, so that depends on what
    ``tokens_from_grid`` produces — confirm against role_mapper.

    Args:
        grid_fn: Callable taking an image path and returning the OCR grid.

    Returns:
        A callable mapping an image path to a flat list of OcrTokenInfo.
    """
    # Imported at call time, as in the original, rather than at module import.
    from core.extraction.role_mapper import tokens_from_grid

    def client(image_path: str) -> List[OcrTokenInfo]:
        grid = grid_fn(image_path)
        ocr_tokens = tokens_from_grid(grid)
        return [
            OcrTokenInfo(
                text=t.text,
                bbox=t.bbox,
                confidence=t.confidence,
            )
            for t in ocr_tokens
        ]

    return client


def make_ocr_simple_from_detailed(
    ocr_detailed: OcrDetailedClient,
) -> OcrClient:
    """Derive a text-only OcrClient from an OcrDetailedClient.

    The returned client calls *ocr_detailed* and keeps only each token's
    ``text``, in the order the detailed client returned them.
    """

    def client(image_path: str) -> List[str]:
        return [t.text for t in ocr_detailed(image_path)]

    return client


# ── Navigate login orchestration ─────────────────────────────────────


def navigate_login(
    screenshot_path: str,
    config: Optional[LoginFormConfig] = None,
    ocr_client: Optional[OcrDetailedClient] = None,
    vlm_client: Optional[VlmClient] = None,
    screen_width: int = 1920,
    screen_height: int = 1080,
    coords_cache: Optional[CoordsCache] = None,
    skip_pre_verify: bool = False,
) -> NavigateResult:
    """Orchestrate login navigation: verify → ground → convert coords.

    Args:
        screenshot_path: Path of the screenshot to analyse.
        config: Login form description; defaults to
            ``dpi_urgences_login_config()`` when None.
        ocr_client: Detailed OCR client (required).
        vlm_client: VLM client (required).
        screen_width: Screen width in pixels, used for grounding and
            normalization.
        screen_height: Screen height in pixels, used for grounding and
            normalization.
        coords_cache: Optional cache forwarded to ``resolve_login_form``.
        skip_pre_verify: When True, skip the "login screen visible" check.

    Returns:
        NavigateResult. On success ``all_resolved`` is True and the three
        ``*_coords`` fields hold normalized coords. On failure
        ``all_resolved`` is False and ``error`` says why (missing client,
        failed pre-verification, or incomplete grounding). ``pre_verify`` is
        filled in whenever the pre-verification step ran. Per the original
        notes, the replay handler stores the coords for the subsequent
        type/click actions.
    """
    if config is None:
        config = dpi_urgences_login_config()

    if ocr_client is None or vlm_client is None:
        return NavigateResult(
            all_resolved=False,
            error="ocr_client and vlm_client required",
        )

    # The verifier only needs token text, so derive a text-only client.
    ocr_simple = make_ocr_simple_from_detailed(ocr_client)

    # Step 1: Pre-verification (optional)
    pre_verify = None
    if not skip_pre_verify:
        pre_verify = verify_login_visible(
            screenshot_path,
            config,
            ocr_simple,
            vlm_client,
        )
        if not pre_verify.match:
            logger.warning("navigate_login: pre-verify failed — %s", pre_verify.describe())
            return NavigateResult(
                all_resolved=False,
                pre_verify=pre_verify,
                error=f"pre-verify failed: {pre_verify.describe()}",
            )

    # Step 2: Ground all fields
    resolution = resolve_login_form(
        screenshot_path,
        config,
        ocr_client,
        vlm_client,
        screen_width=screen_width,
        screen_height=screen_height,
        coords_cache=coords_cache,
    )
    if not resolution.all_resolved:
        logger.warning("navigate_login: incomplete resolution — %s", resolution.describe())
        return NavigateResult(
            all_resolved=False,
            pre_verify=pre_verify,
            error=f"incomplete resolution: {resolution.describe()}",
        )

    # Step 3: Convert to normalized coords
    def to_coords(element: Optional[GroundedElement]) -> Optional[NavigateCoords]:
        # A missing element stays None instead of being converted.
        if not element:
            return None
        return grounded_to_coords(element, screen_width, screen_height)

    return NavigateResult(
        login_coords=to_coords(resolution.login_field),
        password_coords=to_coords(resolution.password_field),
        submit_coords=to_coords(resolution.submit_button),
        all_resolved=True,
        pre_verify=pre_verify,
    )