feat(navigation): brique login visuel OCR-ancre + action navigate au replay

- core/navigation/ : visual_verifier (presence=OCR, role=VLM ancre sur tokens), grounding (OCR-anchor first, VLM fallback, cache coords valide par la vue), visual_login (verify_before/after, DETTE-023), action_resolver (pont runtime)
- api_stream/replay_engine : dispatch action navigate server-side, never-fail -> needs_review, import depuis core.navigation (boot 5005 garanti)
- 131 tests verts (wiring boot, e2e handler, unit modules)

Chantier Qwen 01-02/07/2026, revue croisee Claude (plan deploy v2).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 10:31:44 +02:00
parent ab78ae390a
commit f9a0531325
13 changed files with 2998 additions and 0 deletions
--- a/core/navigation/action_resolver.py
+++ b/core/navigation/action_resolver.py
@@ -0,0 +1,205 @@
+"""Action resolver — pont entre modules navigation et runtime replay.
+
+Orchestre verify → ground → store coords pour le handler replay_engine.
+Convertit coords pixels → normalisé (x_pct/y_pct) pour le client Agent V1.
+
+Architecture :
+- handler replay_engine = thin wrapper (appelle action_resolver)
+- action_resolver = bridge (adapte OCR/VLM runtime → interfaces navigation)
+- modules navigation = pure functions (ne connaissent pas le runtime)
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+from core.navigation.grounding import (
+    BBox,
+    CoordsCache,
+    GroundedElement,
+    OcrDetailedClient,
+    OcrTokenInfo,
+    ground_element,
+)
+from core.navigation.visual_login import (
+    LoginFormConfig,
+    LoginResolution,
+    dpi_urgences_login_config,
+    resolve_login_form,
+    verify_login_visible,
+    verify_login_success,
+)
+from core.navigation.visual_verifier import (
+    OcrClient,
+    ScreenMatchResult,
+    VlmClient,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ── Dataclasses ──────────────────────────────────────────────────────
+
+
+@dataclass
+class NavigateCoords:
+    """Position of one grounded element as fractions of the screen size.
+
+    This is the coordinate format sent to the Agent V1 client; use
+    ``to_dict()`` to obtain the serializable form.
+    """
+
+    x_pct: float  # horizontal center, normalized to [0-1]
+    y_pct: float  # vertical center, normalized to [0-1]
+    # Normalized bounding box as (x1, y1, x2, y2); None when no box is attached.
+    bbox_pct: Optional[Tuple[float, float, float, float]] = None
+    method: str = ""  # name of the grounding method that produced the element
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the coords as a plain dict.
+
+        ``bbox_pct`` is emitted (as a list) only when it is set and non-empty;
+        ``x_pct``, ``y_pct`` and ``method`` are always present.
+        """
+        payload = {"x_pct": self.x_pct, "y_pct": self.y_pct, "method": self.method}
+        if not self.bbox_pct:
+            return payload
+        payload["bbox_pct"] = list(self.bbox_pct)
+        return payload
+
+
+@dataclass
+class NavigateResult:
+    """Result of a navigate action — coords for each resolved field.
+
+    Every field has a default, so a failure result can be built from just
+    ``all_resolved``, ``pre_verify`` and ``error``.
+    """
+
+    # Normalized coords of the login field; None when not resolved.
+    login_coords: Optional[NavigateCoords] = None
+    # Normalized coords of the password field; None when not resolved.
+    password_coords: Optional[NavigateCoords] = None
+    # Normalized coords of the submit button; None when not resolved.
+    submit_coords: Optional[NavigateCoords] = None
+    # True only when the whole form was grounded; False on every error path.
+    all_resolved: bool = False
+    # Outcome of the pre-verification step; None when that step was skipped
+    # or never reached.
+    pre_verify: Optional[ScreenMatchResult] = None
+    # Not assigned when the result is built; filled in afterwards by the
+    # verify_after step.
+    post_verify: Optional[ScreenMatchResult] = None  # set later by verify_after
+    # Human-readable failure reason; empty string on success.
+    error: str = ""
+
+
+# ── Coordinate conversion ────────────────────────────────────────────
+
+
+def grounded_to_coords(
+    element: GroundedElement,
+    screen_width: int,
+    screen_height: int,
+) -> NavigateCoords:
+    """Convert a GroundedElement (pixels) into NavigateCoords (screen fractions).
+
+    Args:
+        element: Grounded element. Its ``center`` (x, y), ``bbox``
+            (x1, y1, x2, y2) and ``method`` attributes are read.
+        screen_width: Screen width in pixels, used to scale x values.
+        screen_height: Screen height in pixels, used to scale y values.
+
+    Returns:
+        NavigateCoords with the center and bbox divided by the screen size.
+        When a dimension is 0 the values on that axis are ``0.0`` instead of
+        raising ZeroDivisionError.
+    """
+
+    def fraction(pixels: float, extent: int) -> float:
+        # The fallback is 0.0 (not int 0) so every field keeps the float type
+        # announced by NavigateCoords, including in the serialized payload.
+        return pixels / extent if extent else 0.0
+
+    center_x, center_y = element.center[0], element.center[1]
+    box = element.bbox
+    return NavigateCoords(
+        x_pct=fraction(center_x, screen_width),
+        y_pct=fraction(center_y, screen_height),
+        bbox_pct=(
+            fraction(box[0], screen_width),
+            fraction(box[1], screen_height),
+            fraction(box[2], screen_width),
+            fraction(box[3], screen_height),
+        ),
+        method=element.method,
+    )
+
+
+# ── OCR adapter ──────────────────────────────────────────────────────
+
+
+def make_ocr_detailed_from_grid(
+    grid_fn: Callable[[str], List[List[Dict[str, Any]]]],
+) -> OcrDetailedClient:
+    """Wrap a grid-extraction callable so it can be used as an OcrDetailedClient.
+
+    The returned callable takes an image path, runs ``grid_fn`` on it, flattens
+    the resulting grid (rows of cells) with ``tokens_from_grid`` and re-wraps
+    every token as an OcrTokenInfo. ``text``, ``bbox`` and ``confidence`` are
+    copied unchanged from the flattened tokens.
+
+    NOTE(review): the bbox is expected to be a normalized LTRB box; that
+    format comes from ``tokens_from_grid`` and is not checked here — confirm
+    against that helper.
+    """
+    # Function-scope import: resolved when the adapter is built, not when this
+    # module is imported.
+    from core.extraction.role_mapper import tokens_from_grid
+
+    def detailed_client(image_path: str) -> List[OcrTokenInfo]:
+        flat_tokens = tokens_from_grid(grid_fn(image_path))
+        infos: List[OcrTokenInfo] = []
+        for token in flat_tokens:
+            infos.append(
+                OcrTokenInfo(
+                    text=token.text,
+                    bbox=token.bbox,
+                    confidence=token.confidence,
+                )
+            )
+        return infos
+
+    return detailed_client
+
+
+def make_ocr_simple_from_detailed(
+    ocr_detailed: OcrDetailedClient,
+) -> OcrClient:
+    """Build a text-only OcrClient on top of an OcrDetailedClient.
+
+    The returned callable runs ``ocr_detailed`` on the image path and keeps
+    only the ``text`` of each token, in the order the tokens were returned.
+    """
+
+    def text_client(image_path: str) -> List[str]:
+        texts: List[str] = []
+        for token in ocr_detailed(image_path):
+            texts.append(token.text)
+        return texts
+
+    return text_client
+
+
+# ── Navigate login orchestration ─────────────────────────────────────
+
+
+def navigate_login(
+    screenshot_path: str,
+    config: Optional[LoginFormConfig] = None,
+    ocr_client: Optional[OcrDetailedClient] = None,
+    vlm_client: Optional[VlmClient] = None,
+    screen_width: int = 1920,
+    screen_height: int = 1080,
+    coords_cache: Optional[CoordsCache] = None,
+    skip_pre_verify: bool = False,
+) -> NavigateResult:
+    """Run the login navigation pipeline: verify, ground, then normalize coords.
+
+    Args:
+        screenshot_path: Path of the screenshot handed to the verification
+            and grounding steps.
+        config: Login form description; defaults to
+            ``dpi_urgences_login_config()`` when None.
+        ocr_client: Detailed OCR client (required).
+        vlm_client: VLM client (required).
+        screen_width: Screen width in pixels, used for grounding and for
+            normalizing coords.
+        screen_height: Screen height in pixels, used the same way.
+        coords_cache: Optional cache forwarded to ``resolve_login_form``.
+        skip_pre_verify: When True, the "login screen is visible" check is
+            not run and ``pre_verify`` stays None.
+
+    Returns:
+        NavigateResult. On success ``all_resolved`` is True and the three
+        ``*_coords`` fields hold normalized coords, which the handler stores
+        in replay_state variables for subsequent type/click actions. On
+        failure (missing client, failed pre-verification, incomplete
+        grounding) ``all_resolved`` is False and ``error`` says why.
+
+    Exceptions raised by the clients or the navigation helpers are not caught
+    here.
+    """
+    form_config = dpi_urgences_login_config() if config is None else config
+
+    # Both clients are mandatory: report the problem instead of raising.
+    if ocr_client is None or vlm_client is None:
+        return NavigateResult(
+            all_resolved=False,
+            error="ocr_client and vlm_client required",
+        )
+
+    # The verification step only needs token texts, not their boxes.
+    text_only_ocr = make_ocr_simple_from_detailed(ocr_client)
+
+    # Step 1 (optional): make sure the login screen is actually displayed.
+    screen_check = None
+    if not skip_pre_verify:
+        screen_check = verify_login_visible(
+            screenshot_path,
+            form_config,
+            text_only_ocr,
+            vlm_client,
+        )
+        if not screen_check.match:
+            logger.warning("navigate_login: pre-verify failed — %s", screen_check.describe())
+            return NavigateResult(
+                all_resolved=False,
+                pre_verify=screen_check,
+                error=f"pre-verify failed: {screen_check.describe()}",
+            )
+
+    # Step 2: locate every field of the form on the screenshot.
+    form = resolve_login_form(
+        screenshot_path,
+        form_config,
+        ocr_client,
+        vlm_client,
+        screen_width=screen_width,
+        screen_height=screen_height,
+        coords_cache=coords_cache,
+    )
+    if not form.all_resolved:
+        logger.warning("navigate_login: incomplete resolution — %s", form.describe())
+        return NavigateResult(
+            all_resolved=False,
+            pre_verify=screen_check,
+            error=f"incomplete resolution: {form.describe()}",
+        )
+
+    # Step 3: pixels -> normalized coords; a missing field stays None.
+    def as_pct(field: Optional[GroundedElement]) -> Optional[NavigateCoords]:
+        if not field:
+            return None
+        return grounded_to_coords(field, screen_width, screen_height)
+
+    return NavigateResult(
+        login_coords=as_pct(form.login_field),
+        password_coords=as_pct(form.password_field),
+        submit_coords=as_pct(form.submit_button),
+        all_resolved=True,
+        pre_verify=screen_check,
+    )