- core/navigation/ : visual_verifier (presence=OCR, role=VLM ancre sur tokens), grounding (OCR-anchor first, VLM fallback, cache coords valide par la vue), visual_login (verify_before/after, DETTE-023), action_resolver (pont runtime) - api_stream/replay_engine : dispatch action navigate server-side, never-fail -> needs_review, import depuis core.navigation (boot 5005 garanti) - 131 tests verts (wiring boot, e2e handler, unit modules) Chantier Qwen 01-02/07/2026, revue croisee Claude (plan deploy v2). Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
206 lines
7.0 KiB
Python
206 lines
7.0 KiB
Python
"""Action resolver — pont entre modules navigation et runtime replay.
|
|
|
|
Orchestre verify → ground → store coords pour le handler replay_engine.
|
|
Convertit coords pixels → normalisé (x_pct/y_pct) pour le client Agent V1.
|
|
|
|
Architecture :
|
|
- handler replay_engine = thin wrapper (appelle action_resolver)
|
|
- action_resolver = bridge (adapte OCR/VLM runtime → interfaces navigation)
|
|
- modules navigation = pure functions (ne connaissent pas le runtime)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
|
|
from core.navigation.grounding import (
|
|
BBox,
|
|
CoordsCache,
|
|
GroundedElement,
|
|
OcrDetailedClient,
|
|
OcrTokenInfo,
|
|
ground_element,
|
|
)
|
|
from core.navigation.visual_login import (
|
|
LoginFormConfig,
|
|
LoginResolution,
|
|
dpi_urgences_login_config,
|
|
resolve_login_form,
|
|
verify_login_visible,
|
|
verify_login_success,
|
|
)
|
|
from core.navigation.visual_verifier import (
|
|
OcrClient,
|
|
ScreenMatchResult,
|
|
VlmClient,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ── Dataclasses ──────────────────────────────────────────────────────
|
|
|
|
|
|
@dataclass
class NavigateCoords:
    """Position of one grounded element, normalized for the Agent V1 client.

    All values are fractions of the screen size rather than pixels.
    """

    # Horizontal centre of the element, normalized to [0-1].
    x_pct: float
    # Vertical centre of the element, normalized to [0-1].
    y_pct: float
    # Normalized bounding box as (x1, y1, x2, y2); None when not provided.
    bbox_pct: Optional[Tuple[float, float, float, float]] = None
    # Name of the grounding method that produced the element.
    method: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the coords as a plain dict.

        ``bbox_pct`` is emitted (as a list) only when it is set and non-empty;
        the three other keys are always present.
        """
        payload: Dict[str, Any] = {
            "x_pct": self.x_pct,
            "y_pct": self.y_pct,
            "method": self.method,
        }
        if not self.bbox_pct:
            return payload
        payload["bbox_pct"] = list(self.bbox_pct)
        return payload
|
|
|
|
|
|
@dataclass
class NavigateResult:
    """Outcome of a navigate action.

    Holds one optional ``NavigateCoords`` per resolved form field, plus the
    verification results and an error message for the failure paths.
    """

    # Normalized coords of the login field, when resolved.
    login_coords: Optional[NavigateCoords] = None
    # Normalized coords of the password field, when resolved.
    password_coords: Optional[NavigateCoords] = None
    # Normalized coords of the submit button, when resolved.
    submit_coords: Optional[NavigateCoords] = None
    # True only when the whole form was resolved.
    all_resolved: bool = False
    # Result of the pre-verification step; None when that step was skipped.
    pre_verify: Optional[ScreenMatchResult] = None
    # Not filled in at construction time: set later by verify_after.
    post_verify: Optional[ScreenMatchResult] = None
    # Human-readable failure reason; empty string when there is none.
    error: str = ""
|
|
|
|
|
|
# ── Coordinate conversion ────────────────────────────────────────────
|
|
|
|
|
|
def grounded_to_coords(
    element: GroundedElement,
    screen_width: int,
    screen_height: int,
) -> NavigateCoords:
    """Turn a pixel-space GroundedElement into normalized NavigateCoords.

    Args:
        element: Grounded element; its ``center`` (x, y), ``bbox``
            (indices 0-3) and ``method`` attributes are read.
        screen_width: Screen width used to divide x values.
        screen_height: Screen height used to divide y values.

    Returns:
        NavigateCoords with centre and bbox divided by the screen size. When
        a dimension is falsy (e.g. 0), every value on that axis falls back
        to 0 instead of dividing.
    """
    # Fallbacks used when the matching screen dimension is falsy.
    center_x = left = right = 0
    center_y = top = bottom = 0

    # Horizontal axis: only touch the element when a division is possible.
    if screen_width:
        center_x = element.center[0] / screen_width
        left = element.bbox[0] / screen_width
        right = element.bbox[2] / screen_width

    # Vertical axis, same rule.
    if screen_height:
        center_y = element.center[1] / screen_height
        top = element.bbox[1] / screen_height
        bottom = element.bbox[3] / screen_height

    return NavigateCoords(
        x_pct=center_x,
        y_pct=center_y,
        bbox_pct=(left, top, right, bottom),
        method=element.method,
    )
|
|
|
|
|
|
# ── OCR adapter ──────────────────────────────────────────────────────
|
|
|
|
|
|
def make_ocr_detailed_from_grid(
    grid_fn: Callable[[str], List[List[Dict[str, Any]]]],
) -> OcrDetailedClient:
    """Wrap a grid-extraction function as an OcrDetailedClient.

    The returned callable runs ``grid_fn`` on an image path, flattens the
    grid with ``tokens_from_grid`` and re-packs each token as an
    ``OcrTokenInfo`` (text, bbox, confidence copied as-is).

    Args:
        grid_fn: Callable taking an image path and returning the grid
            (list of rows of cell dicts).

    Returns:
        A callable ``image_path -> List[OcrTokenInfo]``.
    """
    # Resolved when the factory is called, not on every OCR request.
    from core.extraction.role_mapper import tokens_from_grid

    def detailed_client(image_path: str) -> List[OcrTokenInfo]:
        converted: List[OcrTokenInfo] = []
        for token in tokens_from_grid(grid_fn(image_path)):
            # NOTE(review): bbox is forwarded unchanged; presumably already
            # normalized LTRB as produced by tokens_from_grid — confirm.
            converted.append(
                OcrTokenInfo(
                    text=token.text,
                    bbox=token.bbox,
                    confidence=token.confidence,
                )
            )
        return converted

    return detailed_client
|
|
|
|
|
|
def make_ocr_simple_from_detailed(
    ocr_detailed: OcrDetailedClient,
) -> OcrClient:
    """Build a text-only OcrClient on top of an OcrDetailedClient.

    Args:
        ocr_detailed: Callable returning detailed tokens for an image path.

    Returns:
        A callable ``image_path -> List[str]`` yielding each token's
        ``text``, in the order the detailed client returned them.
    """
    def texts_only(image_path: str) -> List[str]:
        detailed_tokens = ocr_detailed(image_path)
        return [token.text for token in detailed_tokens]

    return texts_only
|
|
|
|
|
|
# ── Navigate login orchestration ─────────────────────────────────────
|
|
|
|
|
|
def navigate_login(
    screenshot_path: str,
    config: Optional[LoginFormConfig] = None,
    ocr_client: Optional[OcrDetailedClient] = None,
    vlm_client: Optional[VlmClient] = None,
    screen_width: int = 1920,
    screen_height: int = 1080,
    coords_cache: Optional[CoordsCache] = None,
    skip_pre_verify: bool = False,
) -> NavigateResult:
    """Run the login navigation pipeline: verify, ground, then normalize.

    Args:
        screenshot_path: Path of the screenshot to analyse.
        config: Login form description; defaults to
            ``dpi_urgences_login_config()`` when None.
        ocr_client: Detailed OCR client (required).
        vlm_client: VLM client (required).
        screen_width: Screen width used for grounding and normalization.
        screen_height: Screen height used for grounding and normalization.
        coords_cache: Optional cache forwarded to ``resolve_login_form``.
        skip_pre_verify: When True, the pre-verification step is not run.

    Returns:
        NavigateResult. On success ``all_resolved`` is True and the three
        coords fields hold normalized positions. On any failure (missing
        client, pre-verify mismatch, incomplete resolution)
        ``all_resolved`` is False and ``error`` explains why. The handler
        stores the coords in replay_state variables for the following
        type/click actions.
    """
    # The default config is built before the clients are checked.
    form_config = dpi_urgences_login_config() if config is None else config

    clients_ready = ocr_client is not None and vlm_client is not None
    if not clients_ready:
        return NavigateResult(
            all_resolved=False,
            error="ocr_client and vlm_client required",
        )

    text_only_ocr = make_ocr_simple_from_detailed(ocr_client)

    # Step 1: optional pre-verification with the text-only OCR view.
    pre_check = None
    if not skip_pre_verify:
        pre_check = verify_login_visible(
            screenshot_path,
            form_config,
            text_only_ocr,
            vlm_client,
        )
        if not pre_check.match:
            logger.warning("navigate_login: pre-verify failed — %s", pre_check.describe())
            return NavigateResult(
                all_resolved=False,
                pre_verify=pre_check,
                error=f"pre-verify failed: {pre_check.describe()}",
            )

    # Step 2: ground every field of the form.
    form = resolve_login_form(
        screenshot_path,
        form_config,
        ocr_client,
        vlm_client,
        screen_width=screen_width,
        screen_height=screen_height,
        coords_cache=coords_cache,
    )
    if not form.all_resolved:
        logger.warning("navigate_login: incomplete resolution — %s", form.describe())
        return NavigateResult(
            all_resolved=False,
            pre_verify=pre_check,
            error=f"incomplete resolution: {form.describe()}",
        )

    # Step 3: pixel coords -> normalized coords; a falsy field maps to None.
    def _normalized(field):
        if not field:
            return None
        return grounded_to_coords(field, screen_width, screen_height)

    login_coords = _normalized(form.login_field)
    password_coords = _normalized(form.password_field)
    submit_coords = _normalized(form.submit_button)

    return NavigateResult(
        login_coords=login_coords,
        password_coords=password_coords,
        submit_coords=submit_coords,
        all_resolved=True,
        pre_verify=pre_check,
    )
|