feat(navigation): brique login visuel OCR-ancre + action navigate au replay
- core/navigation/ : visual_verifier (presence=OCR, role=VLM ancre sur tokens), grounding (OCR-anchor first, VLM fallback, cache coords valide par la vue), visual_login (verify_before/after, DETTE-023), action_resolver (pont runtime) - api_stream/replay_engine : dispatch action navigate server-side, never-fail -> needs_review, import depuis core.navigation (boot 5005 garanti) - 131 tests verts (wiring boot, e2e handler, unit modules) Chantier Qwen 01-02/07/2026, revue croisee Claude (plan deploy v2). Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
205
core/navigation/action_resolver.py
Normal file
205
core/navigation/action_resolver.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""Action resolver — pont entre modules navigation et runtime replay.
|
||||
|
||||
Orchestre verify → ground → store coords pour le handler replay_engine.
|
||||
Convertit coords pixels → normalisé (x_pct/y_pct) pour le client Agent V1.
|
||||
|
||||
Architecture :
|
||||
- handler replay_engine = thin wrapper (appelle action_resolver)
|
||||
- action_resolver = bridge (adapte OCR/VLM runtime → interfaces navigation)
|
||||
- modules navigation = pure functions (ne connaissent pas le runtime)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple
|
||||
|
||||
from core.navigation.grounding import (
|
||||
BBox,
|
||||
CoordsCache,
|
||||
GroundedElement,
|
||||
OcrDetailedClient,
|
||||
OcrTokenInfo,
|
||||
ground_element,
|
||||
)
|
||||
from core.navigation.visual_login import (
|
||||
LoginFormConfig,
|
||||
LoginResolution,
|
||||
dpi_urgences_login_config,
|
||||
resolve_login_form,
|
||||
verify_login_visible,
|
||||
verify_login_success,
|
||||
)
|
||||
from core.navigation.visual_verifier import (
|
||||
OcrClient,
|
||||
ScreenMatchResult,
|
||||
VlmClient,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ── Dataclasses ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@dataclass
class NavigateCoords:
    """Screen position of a grounded element in the Agent V1 client format.

    Values are fractions of the screen size rather than pixels.
    """

    # Center of the element, normalized to [0-1] along each axis.
    x_pct: float
    y_pct: float
    # Optional bounding box as (x1, y1, x2, y2), normalized the same way.
    bbox_pct: Optional[Tuple[float, float, float, float]] = None
    # Name of the grounding method that produced the element.
    method: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the coords as a plain dict.

        ``bbox_pct`` is emitted (as a list) only when it is truthy, so both
        ``None`` and an empty tuple leave the key out.
        """
        payload = {
            "x_pct": self.x_pct,
            "y_pct": self.y_pct,
            "method": self.method,
        }
        if not self.bbox_pct:
            return payload
        payload["bbox_pct"] = list(self.bbox_pct)
        return payload
|
||||
|
||||
|
||||
@dataclass
class NavigateResult:
    """Outcome of a navigate action: one set of coords per resolved field."""

    # Normalized coords of each login-form element; None when not resolved.
    login_coords: Optional[NavigateCoords] = None
    password_coords: Optional[NavigateCoords] = None
    submit_coords: Optional[NavigateCoords] = None

    # Whether every field was resolved.
    all_resolved: bool = False

    # Screen check performed before grounding, when one was run.
    pre_verify: Optional[ScreenMatchResult] = None
    # Screen check performed after the action; filled in later by verify_after.
    post_verify: Optional[ScreenMatchResult] = None

    # Failure description; empty string by default.
    error: str = ""
|
||||
|
||||
|
||||
# ── Coordinate conversion ────────────────────────────────────────────
|
||||
|
||||
|
||||
def grounded_to_coords(
    element: GroundedElement,
    screen_width: int,
    screen_height: int,
) -> NavigateCoords:
    """Turn a pixel-space GroundedElement into normalized NavigateCoords.

    The element's center and the four components of its bbox are divided by
    the screen width (x values) or the screen height (y values). When a
    dimension is falsy (e.g. 0), the matching values fall back to 0 instead
    of dividing.
    """

    def _norm(values, index, extent):
        # Guarded division: a falsy extent yields 0 and the value is not read.
        if not extent:
            return 0
        return values[index] / extent

    center_x = _norm(element.center, 0, screen_width)
    center_y = _norm(element.center, 1, screen_height)
    # bbox components are indexed 0..3 and scaled as x, y, x, y.
    box = (
        _norm(element.bbox, 0, screen_width),
        _norm(element.bbox, 1, screen_height),
        _norm(element.bbox, 2, screen_width),
        _norm(element.bbox, 3, screen_height),
    )
    return NavigateCoords(
        x_pct=center_x,
        y_pct=center_y,
        bbox_pct=box,
        method=element.method,
    )
|
||||
|
||||
|
||||
# ── OCR adapter ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def make_ocr_detailed_from_grid(
    grid_fn: Callable[[str], List[List[Dict[str, Any]]]],
) -> OcrDetailedClient:
    """Wrap a grid-producing OCR function as an OcrDetailedClient.

    The returned callable takes an image path, runs ``grid_fn`` on it,
    flattens the grid with ``tokens_from_grid`` and re-wraps every token as
    an ``OcrTokenInfo`` carrying the same text, bbox and confidence.

    NOTE(review): the original notes describe the bbox as normalized LTRB;
    that depends on ``tokens_from_grid`` and is not checked here.
    """
    # Function-scope import: resolved when this factory is called.
    from core.extraction.role_mapper import tokens_from_grid

    def _detailed_ocr(image_path: str) -> List[OcrTokenInfo]:
        flat_tokens = tokens_from_grid(grid_fn(image_path))
        converted = []
        for token in flat_tokens:
            converted.append(
                OcrTokenInfo(
                    text=token.text,
                    bbox=token.bbox,
                    confidence=token.confidence,
                )
            )
        return converted

    return _detailed_ocr
|
||||
|
||||
|
||||
def make_ocr_simple_from_detailed(
    ocr_detailed: OcrDetailedClient,
) -> OcrClient:
    """Build a text-only OcrClient on top of an OcrDetailedClient.

    The returned callable forwards the image path to ``ocr_detailed`` and
    keeps only the ``text`` of each token, in the order they were returned.
    """

    def _text_only(image_path: str) -> List[str]:
        texts = []
        for token in ocr_detailed(image_path):
            texts.append(token.text)
        return texts

    return _text_only
|
||||
|
||||
|
||||
# ── Navigate login orchestration ─────────────────────────────────────
|
||||
|
||||
|
||||
def navigate_login(
    screenshot_path: str,
    config: Optional[LoginFormConfig] = None,
    ocr_client: Optional[OcrDetailedClient] = None,
    vlm_client: Optional[VlmClient] = None,
    screen_width: int = 1920,
    screen_height: int = 1080,
    coords_cache: Optional[CoordsCache] = None,
    skip_pre_verify: bool = False,
) -> NavigateResult:
    """Run the login navigation pipeline: verify, ground, convert coords.

    Args:
        screenshot_path: Path of the screenshot to analyse.
        config: Login form description; when None, the value returned by
            ``dpi_urgences_login_config()`` is used.
        ocr_client: Detailed OCR callable. Required.
        vlm_client: VLM callable. Required.
        screen_width: Screen width used for grounding and normalization.
        screen_height: Screen height used for grounding and normalization.
        coords_cache: Optional cache forwarded to ``resolve_login_form``.
        skip_pre_verify: When True, the login-visible check is not run.

    Returns:
        A NavigateResult. On success ``all_resolved`` is True and the three
        ``*_coords`` fields hold normalized coords. On failure (missing
        clients, failed pre-verification, incomplete resolution)
        ``all_resolved`` is False and ``error`` describes the cause.
        Per the original notes, the replay handler stores these coords in
        replay_state variables for the subsequent type/click actions.
    """
    if config is None:
        config = dpi_urgences_login_config()

    if ocr_client is None or vlm_client is None:
        return NavigateResult(
            all_resolved=False,
            error="ocr_client and vlm_client required",
        )

    def _normalized(field):
        # An unresolved (falsy) field maps to None rather than to coords.
        if not field:
            return None
        return grounded_to_coords(field, screen_width, screen_height)

    text_only_ocr = make_ocr_simple_from_detailed(ocr_client)

    # Step 1: optional check that the login screen is actually displayed.
    screen_check = None
    if not skip_pre_verify:
        screen_check = verify_login_visible(
            screenshot_path,
            config,
            text_only_ocr,
            vlm_client,
        )
        if not screen_check.match:
            logger.warning("navigate_login: pre-verify failed — %s", screen_check.describe())
            return NavigateResult(
                all_resolved=False,
                pre_verify=screen_check,
                error=f"pre-verify failed: {screen_check.describe()}",
            )

    # Step 2: locate every field of the form on the screenshot.
    form = resolve_login_form(
        screenshot_path,
        config,
        ocr_client,
        vlm_client,
        screen_width=screen_width,
        screen_height=screen_height,
        coords_cache=coords_cache,
    )
    if not form.all_resolved:
        logger.warning("navigate_login: incomplete resolution — %s", form.describe())
        return NavigateResult(
            all_resolved=False,
            pre_verify=screen_check,
            error=f"incomplete resolution: {form.describe()}",
        )

    # Step 3: pixels -> normalized coords, one field at a time.
    login_pct = _normalized(form.login_field)
    password_pct = _normalized(form.password_field)
    submit_pct = _normalized(form.submit_button)

    return NavigateResult(
        login_coords=login_pct,
        password_coords=password_pct,
        submit_coords=submit_pct,
        all_resolved=True,
        pre_verify=screen_check,
    )
|
||||
Reference in New Issue
Block a user