diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py index 0dd590f06..26f64b460 100644 --- a/agent_v0/server_v1/api_stream.py +++ b/agent_v0/server_v1/api_stream.py @@ -436,6 +436,9 @@ from .replay_engine import ( _notify_error_callback as _notify_error_callback_impl, ) +# Navigate handler — import direct depuis core/navigation (pas via replay_engine) +from core.navigation import _handle_navigate_action + # Wrappers pour les fonctions replay_engine qui accèdent aux variables globales du module. @@ -4453,6 +4456,15 @@ async def get_next_action(session_id: str, machine_id: str = "default"): ), timeout=180, ) + elif type_ == "navigate": + await asyncio.wait_for( + loop.run_in_executor( + None, + _handle_navigate_action, + action, owning_replay, session_id, + ), + timeout=180, + ) elif type_ == "t2a_decision": await asyncio.wait_for( loop.run_in_executor( diff --git a/agent_v0/server_v1/replay_engine.py b/agent_v0/server_v1/replay_engine.py index 3c08b2eaf..37a996a36 100644 --- a/agent_v0/server_v1/replay_engine.py +++ b/agent_v0/server_v1/replay_engine.py @@ -41,6 +41,7 @@ _ALLOWED_ACTION_TYPES = { "extract_text", # OCR serveur sur dernier heartbeat → variable workflow "extract_table", # OCR serveur + filtre regex → liste structurée (boucle) "extract_dossier", # OCR grille structurée → dossier patient persisté (brique 3) + "navigate", # Navigation visuelle → coords login/recherche (brique navigation) "extract_text_scroll", # Marker côté graphe — expansé en sous-actions par _edge_to_normalized_actions "_concat_text_vars", # Action serveur interne (générée par expansion extract_text_scroll) "t2a_decision", # Analyse LLM facturation T2A → variable workflow @@ -55,6 +56,7 @@ _SERVER_SIDE_ACTION_TYPES = { "extract_text", "extract_table", "extract_dossier", + "navigate", "t2a_decision", "llm_generate", "_concat_text_vars", diff --git a/core/navigation/__init__.py b/core/navigation/__init__.py new file mode 100644 index 000000000..114b4c01d --- 
/dev/null +++ b/core/navigation/__init__.py @@ -0,0 +1,119 @@ +"""Navigation brique — login visuel, recherche dossiers, vérification écran. + +Modules : +- visual_verifier : verify_before / verify_after chaque action (vision = validateur, OCR-ancré) +- grounding : résolution visuelle d'éléments UI (OCR-anchor first, VLM fallback, coords cache) +- visual_login : login form resolution + verification (DPI urgences default config) +- action_resolver : pont navigation → runtime (coords normalisés, OCR/VLM adapters) + +Pattern d'injection : VlmClient + OcrClient + OcrDetailedClient injectables +""" + +from .visual_verifier import verify_screen_match, ScreenMatchResult +from .action_resolver import navigate_login, NavigateResult + +__all__ = [ + "verify_screen_match", + "ScreenMatchResult", + "navigate_login", + "NavigateResult", + "_handle_navigate_action", +] + +# Handler pour replay_engine — importé par api_stream.py +def _handle_navigate_action( + action: dict, + replay_state: dict, + session_id: str, +) -> bool: + """Handler serveur pour action navigate (branchement replay_engine). + + Thin wrapper : résout coords du login form et les stocke dans + replay_state["variables"] pour les actions type/click suivantes. + + N'échoue jamais le replay — toute erreur → log + needs_review. 
+ """ + import logging + logger = logging.getLogger("navigation._handle_navigate_action") + + params = action.get("parameters") or {} + navigate_action = params.get("action", "login") + + # Noms des variables output (configurable) + login_var = (params.get("login_coords_var") or "navigate_login_coords").strip() + password_var = (params.get("password_coords_var") or "navigate_password_coords").strip() + submit_var = (params.get("submit_coords_var") or "navigate_submit_coords").strip() + + variables = replay_state.setdefault("variables", {}) + + try: + screenshot_path = "" + # Résoudre screenshot depuis replay_state + if "last_screenshot_path" in replay_state: + screenshot_path = replay_state["last_screenshot_path"] + elif "last_heartbeat" in replay_state: + hb = replay_state["last_heartbeat"] + screenshot_path = hb.get("screenshot_path", "") if isinstance(hb, dict) else "" + + if not screenshot_path: + logger.warning("navigate: no screenshot for session %s", session_id) + variables[login_var] = {"error": "no_screenshot"} + return False + + # Dimensions écran (fallback 1920×1080) + screen_width = replay_state.get("screen_width", 1920) + screen_height = replay_state.get("screen_height", 1080) + + # OCR/VLM clients — lazy import pour éviter circular dependency + from core.llm import extract_grid_from_image + from core.extraction.vlm_client import make_vllm_client + from core.navigation.action_resolver import make_ocr_detailed_from_grid + + ocr_detailed = make_ocr_detailed_from_grid(extract_grid_from_image) + vlm_client = make_vllm_client() + + # Config login + from core.navigation.visual_login import LoginFormConfig, dpi_urgences_login_config + config = dpi_urgences_login_config() + if "login_field" in params: + config = LoginFormConfig( + login_field=params.get("login_field", config.login_field), + password_field=params.get("password_field", config.password_field), + submit_button=params.get("submit_button", config.submit_button), + 
success_elements=params.get("success_elements", config.success_elements), + context=params.get("context", config.context), + ) + + # Orchestration navigate + from core.navigation.action_resolver import navigate_login + result = navigate_login( + screenshot_path, config=config, + ocr_client=ocr_detailed, vlm_client=vlm_client, + screen_width=screen_width, screen_height=screen_height, + ) + + # Stocker coords dans variables (format dict pour substitution) + if result.login_coords: + variables[login_var] = result.login_coords.to_dict() + if result.password_coords: + variables[password_var] = result.password_coords.to_dict() + if result.submit_coords: + variables[submit_var] = result.submit_coords.to_dict() + + variables["navigate_result"] = { + "all_resolved": result.all_resolved, + "method": result.login_coords.method if result.login_coords else "", + "error": result.error, + } + + if not result.all_resolved: + logger.warning("navigate: incomplete — %s", result.error) + return False + + logger.info("navigate: login form resolved OK (method=%s)", result.login_coords.method if result.login_coords else "?") + return True + + except Exception as e: + logger.warning("navigate: exception (%s) — needs_review", e) + variables["navigate_result"] = {"all_resolved": False, "error": str(e)} + return False diff --git a/core/navigation/action_resolver.py b/core/navigation/action_resolver.py new file mode 100644 index 000000000..f5680a2cc --- /dev/null +++ b/core/navigation/action_resolver.py @@ -0,0 +1,205 @@ +"""Action resolver — pont entre modules navigation et runtime replay. + +Orchestre verify → ground → store coords pour le handler replay_engine. +Convertit coords pixels → normalisé (x_pct/y_pct) pour le client Agent V1. 
def grounded_to_coords(
    element: GroundedElement,
    screen_width: int,
    screen_height: int,
) -> NavigateCoords:
    """Turn a pixel-based ``GroundedElement`` into normalized ``NavigateCoords``.

    Each x value is divided by ``screen_width`` and each y value by
    ``screen_height``. When a dimension is falsy (0 / None) the matching
    components are set to ``0`` instead of dividing.
    """
    # Horizontal components: center x, bbox left, bbox right.
    if screen_width:
        center_x = element.center[0] / screen_width
        left = element.bbox[0] / screen_width
        right = element.bbox[2] / screen_width
    else:
        center_x = left = right = 0

    # Vertical components: center y, bbox top, bbox bottom.
    if screen_height:
        center_y = element.center[1] / screen_height
        top = element.bbox[1] / screen_height
        bottom = element.bbox[3] / screen_height
    else:
        center_y = top = bottom = 0

    return NavigateCoords(
        x_pct=center_x,
        y_pct=center_y,
        bbox_pct=(left, top, right, bottom),
        method=element.method,
    )
def navigate_login(
    screenshot_path: str,
    config: Optional[LoginFormConfig] = None,
    ocr_client: Optional[OcrDetailedClient] = None,
    vlm_client: Optional[VlmClient] = None,
    screen_width: int = 1920,
    screen_height: int = 1080,
    coords_cache: Optional[CoordsCache] = None,
    skip_pre_verify: bool = False,
) -> NavigateResult:
    """Orchestrate login navigation: verify, ground, then normalize coords.

    Args:
        screenshot_path: Path of the screenshot to analyse.
        config: Login form description; defaults to
            ``dpi_urgences_login_config()``.
        ocr_client: OCR client returning tokens with bounding boxes (required).
        vlm_client: VLM client ``(image_path, prompt) -> text`` (required).
        screen_width: Screen width in pixels, used for normalization.
        screen_height: Screen height in pixels, used for normalization.
        coords_cache: Optional cache forwarded to ``resolve_login_form``.
        skip_pre_verify: When true, skip the "login form visible" check.

    Returns:
        ``NavigateResult``. Coordinates are only populated when every field
        was resolved; otherwise ``all_resolved`` is ``False`` and ``error``
        explains why.
    """
    if config is None:
        config = dpi_urgences_login_config()

    if ocr_client is None or vlm_client is None:
        return NavigateResult(
            all_resolved=False,
            error="ocr_client and vlm_client required",
        )

    # grounded_to_coords() returns 0 for every coordinate when a dimension is
    # falsy, which would report success with a click target at (0, 0).
    # Reject unusable dimensions up front instead.
    dims_valid = all(
        isinstance(dim, (int, float)) and dim > 0
        for dim in (screen_width, screen_height)
    )
    if not dims_valid:
        return NavigateResult(
            all_resolved=False,
            error=f"invalid screen dimensions: {screen_width!r}x{screen_height!r}",
        )

    # Step 1: pre-verification (optional). It works on text-only OCR output.
    pre_verify = None
    if not skip_pre_verify:
        ocr_simple = make_ocr_simple_from_detailed(ocr_client)
        pre_verify = verify_login_visible(
            screenshot_path, config, ocr_simple, vlm_client,
        )
        if not pre_verify.match:
            logger.warning("navigate_login: pre-verify failed — %s", pre_verify.describe())
            return NavigateResult(
                all_resolved=False,
                pre_verify=pre_verify,
                error=f"pre-verify failed: {pre_verify.describe()}",
            )

    # Step 2: ground all fields.
    resolution = resolve_login_form(
        screenshot_path, config, ocr_client, vlm_client,
        screen_width=screen_width, screen_height=screen_height,
        coords_cache=coords_cache,
    )

    if not resolution.all_resolved:
        logger.warning("navigate_login: incomplete resolution — %s", resolution.describe())
        return NavigateResult(
            all_resolved=False,
            pre_verify=pre_verify,
            error=f"incomplete resolution: {resolution.describe()}",
        )

    # Step 3: convert pixel coordinates to normalized coordinates.
    def _normalized(grounded):
        if not grounded:
            return None
        return grounded_to_coords(grounded, screen_width, screen_height)

    return NavigateResult(
        login_coords=_normalized(resolution.login_field),
        password_coords=_normalized(resolution.password_field),
        submit_coords=_normalized(resolution.submit_button),
        all_resolved=True,
        pre_verify=pre_verify,
    )
→ coords (bbox + center). + +Architecture OCR-ancrée (alignée avec visual_verifier) : +- STRATÉGIE 1 : OCR-anchor — si le texte cible est trouvé par OCR, + utiliser le bbox du token OCR (déterministe, zero hallucination). +- STRATÉGIE 2 : VLM grounder — si OCR ne trouve pas le texte, + le VLM localise l'élément visuellement (fallback, risque contrôlé). +- CACHE coords : mémorise les coords résolues, validées par vision avant usage. + Si cached coords fail → re-résolution visuelle. + +Coords = cache local validé par vue (Dom/Claude recadrage 01/07). +Vision = source de vérité, coords = shortcut validé. + +BBox format interne : LTRB (x1, y1, x2, y2) pixels absolus — +cohérent avec SomElement, OcrToken, DetectedUIElement. +""" + +from __future__ import annotations + +import json +import logging +import re +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional, Tuple + +from core.navigation.visual_verifier import ( + fuzzy_match, + normalize_text, + OcrClient, + VlmClient, +) + +logger = logging.getLogger(__name__) + +# BBox format: LTRB pixels (x1, y1, x2, y2) +BBox = Tuple[int, int, int, int] + + +# ── Dataclasses ────────────────────────────────────────────────────── + + +@dataclass +class OcrTokenInfo: + """OCR token with bounding box — for grounding (richer than text-only).""" + + text: str + bbox: Optional[BBox] = None # (x1, y1, x2, y2) LTRB pixels + confidence: float = 1.0 + + +# Type alias — injectable OCR client returning tokens with bbox +# More detailed than visual_verifier's OcrClient (which returns List[str]) +OcrDetailedClient = Callable[[str], List[OcrTokenInfo]] + + +@dataclass +class GroundedElement: + """A UI element grounded on screen with coordinates.""" + + role: str + text: str + bbox: BBox # (x1, y1, x2, y2) LTRB pixels + center: Tuple[int, int] # (cx, cy) — click target + confidence: float + method: str # "ocr_anchor" or "vlm_grounder" or "cache" + source_ocr_text: str = "" # actual OCR text that 
class CoordsCache:
    """In-memory store of grounded coordinates, keyed by element key.

    Entries are plain records: this class does no validation itself.
    Callers are expected to invalidate an entry when its coordinates stop
    matching the screen, then resolve it again.
    """

    def __init__(self) -> None:
        self._entries: Dict[str, CoordsCacheEntry] = {}

    def get(self, element_key: str) -> Optional[CoordsCacheEntry]:
        """Return the entry stored under *element_key*, or ``None``."""
        return self._entries.get(element_key)

    def put(
        self,
        element_key: str,
        bbox: BBox,
        center: Tuple[int, int],
        method: str,
    ) -> None:
        """Insert a new entry, or refresh an existing one in place.

        A refreshed entry keeps its identity and has ``validation_count``
        incremented; a new entry starts with ``validation_count == 1``.
        """
        existing = self._entries.get(element_key)
        if not existing:
            self._entries[element_key] = CoordsCacheEntry(
                element_key=element_key,
                bbox=bbox,
                center=center,
                method=method,
                validation_count=1,
            )
            return

        existing.bbox = bbox
        existing.center = center
        existing.method = method
        existing.validation_count += 1

    def invalidate(self, element_key: str) -> None:
        """Drop the entry for *element_key*; a missing key is ignored."""
        self._entries.pop(element_key, None)

    def clear(self) -> None:
        """Remove every entry."""
        self._entries.clear()

    def keys(self) -> List[str]:
        """Return the cached element keys as a new list."""
        return list(self._entries)
def parse_grounder_response(
    vlm_text: str,
    screen_width: int,
    screen_height: int,
    target: Dict[str, Any],
) -> Optional[GroundedElement]:
    """Parse a VLM grounder reply into a ``GroundedElement``.

    The reply is expected to be a JSON object with ``found``, ``bbox``
    (four normalized [0-1] LTRB values) and ``confidence``. When the reply
    is not pure JSON, the span from the first ``{`` to the last ``}`` is
    tried instead. The bbox is converted to absolute pixels and clamped to
    the screen.

    Args:
        vlm_text: Raw text returned by the VLM.
        screen_width: Screen width in pixels.
        screen_height: Screen height in pixels.
        target: Target description; ``role`` and ``text`` are copied into
            the result.

    Returns:
        The grounded element, or ``None`` when the reply is unusable (not
        text, not a JSON object, ``found`` false, malformed or non-finite
        bbox).
    """
    if not isinstance(vlm_text, str):
        # json.loads()/re.search() would raise TypeError on None or other types.
        logger.warning("grounding: VLM response is not text")
        return None

    try:
        data = json.loads(vlm_text)
    except json.JSONDecodeError:
        # The model may have wrapped the JSON in prose: extract the outermost
        # brace-delimited span and retry once.
        json_match = re.search(r"\{[\s\S]*\}", vlm_text)
        if not json_match:
            return None
        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError:
            logger.warning("grounding: VLM response not parseable as JSON")
            return None

    if not isinstance(data, dict):
        # Valid JSON that is not an object (list, number, string, ...).
        logger.warning("grounding: VLM response is not a JSON object")
        return None

    if not data.get("found", False):
        return None

    bbox_norm = data.get("bbox", [])
    if not isinstance(bbox_norm, list) or len(bbox_norm) != 4:
        logger.warning("grounding: invalid bbox format from VLM")
        return None

    # Convert normalized [0-1] values to absolute pixels.
    # int() raises ValueError on NaN and OverflowError on +/-Infinity, both
    # of which json.loads() accepts by default.
    try:
        x1 = int(float(bbox_norm[0]) * screen_width)
        y1 = int(float(bbox_norm[1]) * screen_height)
        x2 = int(float(bbox_norm[2]) * screen_width)
        y2 = int(float(bbox_norm[3]) * screen_height)
    except (ValueError, TypeError, OverflowError):
        logger.warning("grounding: bbox values not numeric")
        return None

    # Clamp to screen bounds and keep the box well-formed (x2 >= x1, y2 >= y1).
    # NOTE(review): a reply given in pixels instead of normalized values is
    # clamped to the screen edge here rather than rejected — confirm this is
    # the intended behavior.
    x1 = max(0, min(x1, screen_width))
    y1 = max(0, min(y1, screen_height))
    x2 = max(x1, min(x2, screen_width))
    y2 = max(y1, min(y2, screen_height))

    # Confidence may arrive as a number, a numeric string, null or junk.
    try:
        confidence = float(data.get("confidence", 0.5))
    except (ValueError, TypeError):
        confidence = 0.5
    if confidence != confidence:  # NaN is the only value unequal to itself
        confidence = 0.5
    confidence = max(0.0, min(confidence, 1.0))

    bbox_abs: BBox = (x1, y1, x2, y2)

    return GroundedElement(
        role=target.get("role", "?"),
        text=target.get("text", ""),
        bbox=bbox_abs,
        center=bbox_center(bbox_abs),
        confidence=confidence,
        method="vlm_grounder",
    )
Cache: if cached coords exist → return cached (validated separately) + 2. OCR-anchor: deterministic, zero hallucination + 3. VLM grounder: fallback when OCR can't find the text + + Args: + screenshot_path: path to screenshot image + target: {"role": "bouton", "text": "Connexion"} — element to find + ocr_client: injectable OCR client returning List[OcrTokenInfo] + vlm_client: injectable VLM client (image_path, prompt) -> text + screen_width/height: screen dimensions for pixel conversion + coords_cache: optional CoordsCache for memoization + context: optional context (e.g. "page login DPI") + fuzzy_threshold: fuzzy match threshold for OCR anchoring + + Returns: + GroundedElement with bbox + center, or None if not found + """ + target_text = target.get("text", "") + target_role = target.get("role", "?") + element_key = make_element_key(target_role, target_text) + + # Step 0: Check cache + if coords_cache: + cached = coords_cache.get(element_key) + if cached: + cached.validation_count += 1 + logger.info("grounding: using cached coords for %s", element_key) + return GroundedElement( + role=target_role, + text=target_text, + bbox=cached.bbox, + center=cached.center, + confidence=1.0, # cached = previously validated + method="cache", + ) + + # Step 1: OCR-anchor (deterministic) + try: + ocr_tokens = ocr_client(screenshot_path) + except Exception as e: + logger.warning("grounding: OCR call failed (%s)", e) + ocr_tokens = [] + + ocr_result = ocr_anchor_ground(ocr_tokens, target, fuzzy_threshold) + + if ocr_result: + if coords_cache: + coords_cache.put(element_key, ocr_result.bbox, ocr_result.center, "ocr_anchor") + logger.info( + "grounding: OCR-anchor found '%s' (matched OCR='%s', conf=%.2f)", + target_text, ocr_result.source_ocr_text, ocr_result.confidence, + ) + return ocr_result + + # Step 2: VLM grounder (fallback) + if not target_text: + logger.warning("grounding: no text for target, VLM grounder needs text") + return None + + prompt = build_grounder_prompt(target, 
context) + + try: + vlm_text = vlm_client(screenshot_path, prompt) + except Exception as e: + logger.warning("grounding: VLM grounder call failed (%s)", e) + return None + + vlm_result = parse_grounder_response(vlm_text, screen_width, screen_height, target) + + if vlm_result: + if coords_cache: + coords_cache.put(element_key, vlm_result.bbox, vlm_result.center, "vlm_grounder") + logger.info( + "grounding: VLM grounder found '%s' (conf=%.2f)", + target_text, vlm_result.confidence, + ) + return vlm_result + + logger.warning("grounding: element '%s' not found by OCR or VLM", target_text) + return None diff --git a/core/navigation/visual_login.py b/core/navigation/visual_login.py new file mode 100644 index 000000000..b1647dd9d --- /dev/null +++ b/core/navigation/visual_login.py @@ -0,0 +1,227 @@ +"""Visual login — résolution + vérification du formulaire de login par grounding. + +Architecture (alignée visual_verifier + grounding) : +- verify_before : formulaire login visible (champs + bouton présents) +- resolve_login_form : ground chaque champ (login, password, bouton) → coords +- verify_after : dashboard/accueil visible (post-login) +- Chaque étape encadrée par vision (DETTE-023 couvert) + +Coords = cache local validé par vue (Dom/Claude recadrage). +Le runtime exécute les actions (type/click) — ce module résout + valide. 
@dataclass
class LoginResolution:
    """Outcome of grounding a login form: one optional element per field."""

    login_field: Optional[GroundedElement] = None
    password_field: Optional[GroundedElement] = None
    submit_button: Optional[GroundedElement] = None
    all_resolved: bool = False
    method: str = ""  # "ocr_anchor", "vlm_grounder", "mixed", "cache"

    def describe(self) -> str:
        """Return a one-line summary: status, then each field's center and method."""
        labelled = (
            ("login", self.login_field),
            ("password", self.password_field),
            ("button", self.submit_button),
        )
        fragments = [
            f"{label}@{grounded.center} ({grounded.method})"
            if grounded
            else f"{label}: NOT FOUND"
            for label, grounded in labelled
        ]
        verdict = "OK" if self.all_resolved else "INCOMPLETE"
        return f"Login resolution [{verdict}]: " + ", ".join(fragments)
────────────────────────────────────────────────── + + +def dpi_urgences_login_config() -> LoginFormConfig: + """Default config for DPI urgences login form.""" + return LoginFormConfig( + login_field={"role": "champ", "text": "Login", "extra": "champ identifiant"}, + password_field={"role": "champ", "text": "Mot de passe", "extra": "champ password"}, + submit_button={"role": "bouton", "text": "Connexion", "extra": "bouton submit"}, + success_elements=[ + {"role": "page", "text": "Accueil"}, + {"role": "page", "text": "Dashboard"}, + ], + context="DPI urgences — page login", + ) + + +# ── Helper ─────────────────────────────────────────────────────────── + + +def _ocr_detailed_to_simple(ocr_detailed: OcrDetailedClient) -> OcrClient: + """Convert OcrDetailedClient (text+bbox) to OcrClient (text-only) for verification.""" + def client(image_path: str) -> List[str]: + return [t.text for t in ocr_detailed(image_path)] + return client + + +# ── Core functions ─────────────────────────────────────────────────── + + +def verify_login_visible( + screenshot_path: str, + config: LoginFormConfig, + ocr_client: OcrClient, + vlm_client: VlmClient, +) -> ScreenMatchResult: + """Verify login form is visible on screen (pre-condition). + + Checks that login field, password field, and submit button are present. + Uses OCR-anchored verification (deterministic presence, VLM role). + """ + expected = [ + config.login_field, + config.password_field, + config.submit_button, + ] + return verify_before( + screenshot_path, expected, ocr_client, vlm_client, + context=config.context, + ) + + +def verify_login_success( + screenshot_path: str, + config: LoginFormConfig, + ocr_client: OcrClient, + vlm_client: VlmClient, +) -> ScreenMatchResult: + """Verify dashboard/accueil visible after login (post-condition). + + Higher threshold (verify_after = 0.8) — false positive = Léa proceeds wrong. 
def resolve_login_form(
    screenshot_path: str,
    config: LoginFormConfig,
    ocr_client: OcrDetailedClient,
    vlm_client: VlmClient,
    screen_width: int = 1920,
    screen_height: int = 1080,
    coords_cache: Optional[CoordsCache] = None,
) -> LoginResolution:
    """Ground the login field, password field and submit button.

    Each element is resolved through ``ground_element``. OCR output is
    memoized for the duration of this call, so the same screenshot is
    OCR-ed at most once instead of once per element.

    Args:
        screenshot_path: Path of the screenshot to analyse.
        config: Login form description (targets and context).
        ocr_client: OCR client returning tokens with bounding boxes.
        vlm_client: VLM client ``(image_path, prompt) -> text``.
        screen_width: Screen width in pixels.
        screen_height: Screen height in pixels.
        coords_cache: Optional coordinate cache forwarded to ``ground_element``.

    Returns:
        ``LoginResolution`` with one grounded element per field (``None``
        when not found), ``all_resolved`` and the overall ``method``
        (the single method used, ``"mixed"``, or ``""`` when nothing resolved).
    """
    ocr_results: Dict[str, List[OcrTokenInfo]] = {}

    def ocr_once(image_path: str) -> List[OcrTokenInfo]:
        # Only successful results are stored: a failing OCR call raises
        # exactly as the unwrapped client would, and is retried next time.
        if image_path not in ocr_results:
            ocr_results[image_path] = ocr_client(image_path)
        return ocr_results[image_path]

    def ground(target: Dict[str, Any]) -> Optional[GroundedElement]:
        return ground_element(
            screenshot_path, target,
            ocr_client=ocr_once, vlm_client=vlm_client,
            screen_width=screen_width, screen_height=screen_height,
            coords_cache=coords_cache, context=config.context,
        )

    login_el = ground(config.login_field)
    password_el = ground(config.password_field)
    button_el = ground(config.submit_button)

    resolved = [el for el in (login_el, password_el, button_el) if el is not None]
    all_resolved = len(resolved) == 3

    # Overall method: the single method used, "mixed" when they differ,
    # "" when nothing was resolved.
    unique_methods = {el.method for el in resolved}
    if len(unique_methods) == 1:
        method = unique_methods.pop()
    elif unique_methods:
        method = "mixed"
    else:
        method = ""

    resolution = LoginResolution(
        login_field=login_el,
        password_field=password_el,
        submit_button=button_el,
        all_resolved=all_resolved,
        method=method,
    )

    if all_resolved:
        logger.info("resolve_login_form: %s", resolution.describe())
    else:
        logger.warning("resolve_login_form: incomplete — %s", resolution.describe())

    return resolution
def normalize_text(text: str) -> str:
    """Normalize text for fuzzy matching.

    Lowercases, strips surrounding whitespace, removes accents (NFKD
    decomposition followed by dropping combining marks) and collapses
    whitespace runs to a single space.
    """
    text = text.lower().strip()
    # Strip accents: decompose, then drop the combining marks.
    text = unicodedata.normalize("NFKD", text)
    text = "".join(c for c in text if not unicodedata.combining(c))
    # Collapse whitespace.
    text = re.sub(r"\s+", " ", text)
    return text


def fuzzy_match(expected: str, observed: str, threshold: float = 0.8) -> bool:
    """Tell whether *observed* text fuzzy-matches *expected* text.

    Three strategies, tried in order (the first hit wins):
    1. Exact match after normalization.
    2. Substring containment, in either direction.
    3. ``SequenceMatcher`` ratio >= *threshold*.

    A string that normalizes to empty only matches another empty string.
    Without this guard an empty OCR token would be a substring of every
    expected text and would match everything.

    Args:
        expected: Text that should be on screen.
        observed: Text actually read (e.g. an OCR token).
        threshold: Minimum similarity ratio for strategy 3.

    Returns:
        ``True`` when one of the strategies matches.
    """
    norm_expected = normalize_text(expected)
    norm_observed = normalize_text(observed)

    if norm_expected == norm_observed:
        return True

    # "" is contained in every string: never let an empty side match.
    if not norm_expected or not norm_observed:
        return False

    # NOTE(review): containment also accepts very short fragments (a token
    # "n" is contained in "connexion"). Whether a minimum length is needed
    # depends on the OCR tokenization — confirm against real OCR output.
    if norm_expected in norm_observed or norm_observed in norm_expected:
        return True

    ratio = SequenceMatcher(None, norm_expected, norm_observed).ratio()
    return ratio >= threshold
+ """ + found_texts: Dict[str, str] = {} + missing: List[str] = [] + + for el in expected_elements: + expected_text = el.get("text", "") + if not expected_text: + found_texts[""] = "" + continue + + matched_ocr = "" + for token in ocr_tokens: + if fuzzy_match(expected_text, token, threshold=fuzzy_threshold): + matched_ocr = token + break + + if matched_ocr: + found_texts[expected_text] = matched_ocr + else: + found_texts[expected_text] = "" + missing.append(f"{el.get('role', '?')}: {expected_text}") + + all_found = len(missing) == 0 + return OcrPresenceResult( + found_texts=found_texts, + missing=missing, + all_found=all_found, + ) + + +# ── VLM role confirmation (semantic, anchored on found OCR texts) ──── + + +def build_role_confirm_prompt( + found_elements: List[Dict[str, Any]], + expected_elements: List[Dict[str, Any]], + context: str = "", +) -> str: + """Build VLM prompt for role confirmation of OCR-found elements. + + VLM receives found texts and confirms their ROLE only — never presence. + """ + found_lines = [] + for i, el in enumerate(found_elements): + matched_ocr = el.get("matched_ocr", "") + expected_role = el.get("expected_role", "?") + line = f"{i+1}. Text \"{matched_ocr}\" — expected role: {expected_role}" + found_lines.append(line) + + found_block = "\n".join(found_lines) + + prompt = ( + "You are a screen role validator. OCR has confirmed these texts are " + "present on the screen. Your job is ONLY to confirm their ROLE — " + "do NOT re-declare whether they are present.\n" + ) + if context: + prompt += f"Context: {context}\n" + prompt += ( + f"Found texts with expected roles:\n{found_block}\n\n" + "Respond in JSON format:\n" + "{\"confirmed\": [{\"index\": 1, \"role_confirmed\": true/false, " + "\"actual_role\": \"...\", \"confidence\": 0.0-1.0}], " + "\"overall_confidence\": 0.0-1.0}\n" + "Only confirm role_confirmed=true if the text clearly plays the " + "expected role (e.g., a button, not just a label with the same text)." 
+ ) + return prompt + + +def parse_role_confirm_response(vlm_text: str) -> Dict[str, Any]: + """Parse VLM role confirmation JSON response.""" + try: + data = json.loads(vlm_text) + except json.JSONDecodeError: + json_match = re.search(r"\{[\s\S]*\}", vlm_text) + if json_match: + try: + data = json.loads(json_match.group()) + except json.JSONDecodeError: + logger.warning("role_confirm: VLM response not parseable as JSON") + return {"confirmed": [], "overall_confidence": 0.0} + else: + return {"confirmed": [], "overall_confidence": 0.0} + + confirmed = data.get("confirmed", []) + overall_conf = data.get("overall_confidence", 0.0) + if isinstance(overall_conf, str): + try: + overall_conf = float(overall_conf) + except ValueError: + overall_conf = 0.0 + + return { + "confirmed": confirmed, + "overall_confidence": float(overall_conf), + } + + +# ── Core verification (OCR-anchored composition) ──────────────────── + + +def verify_screen_match( + screenshot_path: str, + expected_elements: List[Dict[str, Any]], + ocr_client: OcrClient, + vlm_client: VlmClient, + context: str = "", + min_confidence: float = 0.7, +) -> ScreenMatchResult: + """Verify screen state with OCR-anchored presence + VLM role confirmation. + + Step 1: OCR screenshot → tokens → deterministic presence check + Step 2: VLM confirms role of found elements (not presence!) + + Eliminates VLM self-report hallucination for presence checks. 
+ """ + if not expected_elements: + return ScreenMatchResult( + match=True, + confidence=1.0, + reason="no expected elements to verify", + ) + + # Step 1: OCR presence check (deterministic) + try: + ocr_tokens = ocr_client(screenshot_path) + except Exception as e: + logger.warning("verify_screen_match: OCR call failed (%s)", e) + return ScreenMatchResult( + match=False, + confidence=0.0, + reason=f"OCR error: {e}", + expected_elements=expected_elements, + ) + + presence = ocr_presence_check(ocr_tokens, expected_elements) + + if not presence.all_found: + observed = [] + for el in expected_elements: + text = el.get("text", "") + matched = presence.found_texts.get(text, "") + observed.append({ + "role": el.get("role", "?"), + "expected_text": text, + "matched_ocr": matched, + "found": matched != "", + }) + return ScreenMatchResult( + match=False, + confidence=presence.presence_ratio, + reason="OCR presence check: some texts not found", + observed_elements=observed, + expected_elements=expected_elements, + mismatches=presence.missing, + ) + + # Step 2: VLM role confirmation (only for found elements) + found_elements = [] + for el in expected_elements: + text = el.get("text", "") + matched_ocr = presence.found_texts.get(text, "") + if text and matched_ocr: + found_elements.append({ + "text": text, + "expected_role": el.get("role", "?"), + "matched_ocr": matched_ocr, + }) + + if not found_elements: + # All elements had no text → presence trivially OK + return ScreenMatchResult( + match=True, + confidence=1.0, + reason="no text-based elements to verify", + expected_elements=expected_elements, + ) + + prompt = build_role_confirm_prompt(found_elements, expected_elements, context) + + try: + vlm_text = vlm_client(screenshot_path, prompt) + except Exception as e: + logger.warning("verify_screen_match: VLM role confirm failed (%s)", e) + observed = [] + for el in expected_elements: + text = el.get("text", "") + observed.append({ + "role": el.get("role", "?"), + 
"expected_text": text, + "matched_ocr": presence.found_texts.get(text, ""), + "found": True, + "role_confirmed": False, + "role_confidence": 0.0, + }) + return ScreenMatchResult( + match=True, + confidence=0.5, + reason=f"OCR presence OK, VLM role confirm failed: {e}", + observed_elements=observed, + expected_elements=expected_elements, + ) + + parsed = parse_role_confirm_response(vlm_text) + overall_conf = parsed.get("overall_confidence", 0.0) + confirmed = parsed.get("confirmed", []) + + observed = [] + role_mismatches = [] + for i, el in enumerate(expected_elements): + text = el.get("text", "") + expected_role = el.get("role", "?") + matched_ocr = presence.found_texts.get(text, "") + + role_entry = None + for c in confirmed: + if c.get("index") == i + 1: + role_entry = c + break + + role_confirmed = False + actual_role = "" + role_confidence = 0.0 + + if role_entry: + role_confirmed = role_entry.get("role_confirmed", False) + actual_role = role_entry.get("actual_role", "") + role_confidence = role_entry.get("confidence", 0.0) + if isinstance(role_confidence, str): + try: + role_confidence = float(role_confidence) + except ValueError: + role_confidence = 0.0 + + observed.append({ + "role": expected_role, + "expected_text": text, + "matched_ocr": matched_ocr, + "found": True, + "role_confirmed": role_confirmed, + "actual_role": actual_role, + "role_confidence": role_confidence, + }) + + if not role_confirmed or role_confidence < min_confidence: + role_mismatches.append( + f"{expected_role}: {text} (actual={actual_role}, conf={role_confidence:.2f})" + ) + + is_match = len(role_mismatches) == 0 and overall_conf >= min_confidence + + return ScreenMatchResult( + match=is_match, + confidence=overall_conf, + reason=f"OCR presence: {presence.presence_ratio:.0%}, VLM role: {overall_conf:.2f}", + observed_elements=observed, + expected_elements=expected_elements, + mismatches=presence.missing + role_mismatches, + ) + + +def verify_before( + screenshot_path: str, + 
expected_elements: List[Dict[str, Any]], + ocr_client: OcrClient, + vlm_client: VlmClient, + context: str = "", +) -> ScreenMatchResult: + """Verify screen state BEFORE an action (OCR-anchored). + + Checks pre-conditions: expected texts present + roles correct. + min_confidence=0.7 — some tolerance for pre-action verification. + """ + return verify_screen_match( + screenshot_path, expected_elements, ocr_client, vlm_client, + context=f"PRE-ACTION: {context}", min_confidence=0.7, + ) + + +def verify_after( + screenshot_path: str, + expected_elements: List[Dict[str, Any]], + ocr_client: OcrClient, + vlm_client: VlmClient, + context: str = "", +) -> ScreenMatchResult: + """Verify screen state AFTER an action (OCR-anchored). + + Checks post-conditions with higher threshold (0.8). + False positive = Léa proceeds on wrong assumption → stricter gate. + """ + return verify_screen_match( + screenshot_path, expected_elements, ocr_client, vlm_client, + context=f"POST-ACTION: {context}", min_confidence=0.8, + ) diff --git a/tests/unit/test_action_resolver.py b/tests/unit/test_action_resolver.py new file mode 100644 index 000000000..ee1001cc9 --- /dev/null +++ b/tests/unit/test_action_resolver.py @@ -0,0 +1,205 @@ +"""Tests for core/navigation/action_resolver.py — coordinate conversion + OCR adapters.""" + +import json +import pytest +from core.navigation.action_resolver import ( + NavigateCoords, + NavigateResult, + grounded_to_coords, + make_ocr_simple_from_detailed, + navigate_login, +) +from core.navigation.grounding import ( + CoordsCache, + GroundedElement, + OcrTokenInfo, + OcrDetailedClient, +) +from core.navigation.visual_verifier import VlmClient + + +# ── Mock factories ───────────────────────────────────────────────────── + + +def mock_ocr_detailed_client_factory(tokens: list): + def client(image_path: str) -> list: + return tokens + return client + + +def mock_vlm_client_factory(response_json: dict): + def client(image_path: str, prompt: str) -> str: + return 
json.dumps(response_json) + return client + + +# ── grounded_to_coords tests ─────────────────────────────────────────── + + +class TestGroundedToCoords: + def test_basic_conversion(self): + el = GroundedElement( + role="bouton", text="Connexion", + bbox=(200, 50, 400, 100), center=(300, 75), + confidence=0.9, method="ocr_anchor", + ) + coords = grounded_to_coords(el, 1920, 1080) + assert coords.x_pct == pytest.approx(300 / 1920, abs=0.01) + assert coords.y_pct == pytest.approx(75 / 1080, abs=0.01) + assert coords.method == "ocr_anchor" + assert coords.bbox_pct is not None + + def test_to_dict(self): + coords = NavigateCoords(x_pct=0.15, y_pct=0.07, method="ocr_anchor") + d = coords.to_dict() + assert d["x_pct"] == 0.15 + assert d["y_pct"] == 0.07 + assert d["method"] == "ocr_anchor" + + def test_to_dict_with_bbox(self): + coords = NavigateCoords( + x_pct=0.15, y_pct=0.07, + bbox_pct=(0.10, 0.05, 0.20, 0.09), + method="vlm_grounder", + ) + d = coords.to_dict() + assert "bbox_pct" in d + assert len(d["bbox_pct"]) == 4 + + +# ── make_ocr_simple_from_detailed tests ──────────────────────────────── + + +class TestMakeOcrSimpleFromDetailed: + def test_conversion(self): + tokens = [ + OcrTokenInfo(text="Login", bbox=(100, 50, 250, 90)), + OcrTokenInfo(text="Password", bbox=(100, 100, 250, 140)), + ] + detailed = mock_ocr_detailed_client_factory(tokens) + simple = make_ocr_simple_from_detailed(detailed) + result = simple("/tmp/test.png") + assert result == ["Login", "Password"] + + def test_empty_tokens(self): + detailed = mock_ocr_detailed_client_factory([]) + simple = make_ocr_simple_from_detailed(detailed) + result = simple("/tmp/test.png") + assert result == [] + + +# ── navigate_login tests ─────────────────────────────────────────────── + + +class TestNavigateLogin: + def test_full_success(self): + """All fields grounded → NavigateResult with coords.""" + ocr = mock_ocr_detailed_client_factory([ + OcrTokenInfo(text="Login", bbox=(100, 50, 250, 90), confidence=0.95), 
+ OcrTokenInfo(text="Mot de passe", bbox=(100, 100, 250, 140), confidence=0.95), + OcrTokenInfo(text="Connexion", bbox=(100, 150, 250, 190), confidence=0.95), + ]) + vlm = mock_vlm_client_factory({ + "confirmed": [ + {"index": 1, "role_confirmed": True, "actual_role": "champ", "confidence": 0.9}, + {"index": 2, "role_confirmed": True, "actual_role": "champ", "confidence": 0.9}, + {"index": 3, "role_confirmed": True, "actual_role": "bouton", "confidence": 0.9}, + ], + "overall_confidence": 0.9, + }) + result = navigate_login( + "/tmp/login.png", + ocr_client=ocr, vlm_client=vlm, + skip_pre_verify=True, + ) + assert result.all_resolved == True + assert result.login_coords is not None + assert result.password_coords is not None + assert result.submit_coords is not None + assert result.submit_coords.x_pct > 0 + assert result.submit_coords.y_pct > 0 + + def test_no_clients_error(self): + """Missing OCR/VLM clients → error.""" + result = navigate_login("/tmp/login.png", ocr_client=None, vlm_client=None) + assert result.all_resolved == False + assert "required" in result.error + + def test_pre_verify_fail(self): + """Pre-verify fails → early abort.""" + ocr = mock_ocr_detailed_client_factory([ + OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40)), + ]) + vlm = mock_vlm_client_factory({}) + result = navigate_login( + "/tmp/page.png", + ocr_client=ocr, vlm_client=vlm, + skip_pre_verify=False, + ) + assert result.all_resolved == False + assert result.pre_verify is not None + assert result.pre_verify.match == False + + def test_skip_pre_verify(self): + """Skip pre-verify → proceed to grounding even if form incomplete.""" + ocr = mock_ocr_detailed_client_factory([ + OcrTokenInfo(text="Login", bbox=(100, 50, 250, 90)), + OcrTokenInfo(text="Mot de passe", bbox=(100, 100, 250, 140)), + OcrTokenInfo(text="Connexion", bbox=(100, 150, 250, 190)), + ]) + vlm = mock_vlm_client_factory({}) + result = navigate_login( + "/tmp/login.png", + ocr_client=ocr, vlm_client=vlm, + 
skip_pre_verify=True, + ) + assert result.pre_verify is None # skipped + assert result.all_resolved == True + + +# ── NavigateResult dataclass tests ───────────────────────────────────── + + +class TestNavigateResult: + def test_default(self): + result = NavigateResult() + assert result.all_resolved == False + assert result.login_coords is None + assert result.error == "" + + def test_with_coords(self): + result = NavigateResult( + login_coords=NavigateCoords(x_pct=0.15, y_pct=0.07, method="ocr_anchor"), + all_resolved=True, + ) + assert result.login_coords.x_pct == 0.15 + + +# ── Import validation ────────────────────────────────────────────────── + + +class TestImportValidation: + def test_action_resolver_imports(self): + """Verify action_resolver module imports cleanly.""" + from core.navigation.action_resolver import ( + NavigateCoords, + NavigateResult, + grounded_to_coords, + make_ocr_detailed_from_grid, + make_ocr_simple_from_detailed, + navigate_login, + ) + assert NavigateCoords is not None + assert NavigateResult is not None + + def test_navigation_package_handler(self): + """Verify _handle_navigate_action is importable from package.""" + from core.navigation import _handle_navigate_action + assert callable(_handle_navigate_action) + + def test_navigation_package_exports(self): + """Verify package __all__ includes navigate exports.""" + import core.navigation as nav + assert "navigate_login" in nav.__all__ + assert "NavigateResult" in nav.__all__ + assert "_handle_navigate_action" in nav.__all__ diff --git a/tests/unit/test_grounding.py b/tests/unit/test_grounding.py new file mode 100644 index 000000000..746535407 --- /dev/null +++ b/tests/unit/test_grounding.py @@ -0,0 +1,406 @@ +"""Tests for core/navigation/grounding.py — OCR-anchored grounding + VLM fallback + coords cache.""" + +import json +import pytest +from core.navigation.grounding import ( + OcrTokenInfo, + GroundedElement, + CoordsCacheEntry, + CoordsCache, + bbox_center, + make_element_key, + 
ocr_anchor_ground, + build_grounder_prompt, + parse_grounder_response, + ground_element, +) +from core.navigation.visual_verifier import normalize_text + + +# ── Mock factories ───────────────────────────────────────────────────── + + +def mock_ocr_detailed_client_factory(tokens: list): + """Factory for mock OcrDetailedClient returning List[OcrTokenInfo].""" + def client(image_path: str) -> list: + return tokens + return client + + +def mock_vlm_client_factory(response_json: dict): + """Factory for mock VlmClient returning given JSON.""" + def client(image_path: str, prompt: str) -> str: + return json.dumps(response_json) + return client + + +# ── bbox_center tests ────────────────────────────────────────────────── + + +class TestBboxCenter: + def test_basic(self): + assert bbox_center((100, 200, 300, 400)) == (200, 300) + + def test_zero_origin(self): + assert bbox_center((0, 0, 100, 100)) == (50, 50) + + def test_symmetric(self): + assert bbox_center((10, 10, 20, 20)) == (15, 15) + + +# ── make_element_key tests ───────────────────────────────────────────── + + +class TestMakeElementKey: + def test_basic(self): + key = make_element_key("bouton", "Rechercher") + assert key == "bouton:rechercher" + + def test_normalized(self): + key = make_element_key("champ", "Nom Prénom") + assert "nom" in key and "prenom" in key + + def test_consistent(self): + # Same element always produces same key + assert make_element_key("bouton", "Connexion") == make_element_key("bouton", "CONNEXION") + + +# ── ocr_anchor_ground tests ──────────────────────────────────────────── + + +class TestOcrAnchorGround: + def test_exact_match(self): + tokens = [OcrTokenInfo(text="Rechercher", bbox=(100, 50, 250, 90), confidence=0.95)] + result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"}) + assert result is not None + assert result.method == "ocr_anchor" + assert result.bbox == (100, 50, 250, 90) + assert result.center == (175, 70) + assert result.confidence == 0.95 + + def 
test_fuzzy_match(self): + tokens = [OcrTokenInfo(text="Rechércher", bbox=(100, 50, 250, 90))] + result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"}) + assert result is not None + assert result.source_ocr_text == "Rechércher" + + def test_no_match(self): + tokens = [OcrTokenInfo(text="Accueil", bbox=(100, 50, 250, 90))] + result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"}) + assert result is None + + def test_token_without_bbox(self): + tokens = [OcrTokenInfo(text="Rechercher", bbox=None)] + result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Rechercher"}) + assert result is None # found text but no bbox → can't ground + + def test_no_text_target(self): + tokens = [OcrTokenInfo(text="Dashboard", bbox=(0, 0, 1920, 1080))] + result = ocr_anchor_ground(tokens, {"role": "page"}) # no text key + assert result is None # no text to match + + def test_multiple_tokens_first_match(self): + tokens = [ + OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40)), + OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90)), + ] + result = ocr_anchor_ground(tokens, {"role": "bouton", "text": "Connexion"}) + assert result is not None + assert result.bbox == (200, 50, 350, 90) + + +# ── build_grounder_prompt tests ──────────────────────────────────────── + + +class TestBuildGrounderPrompt: + def test_basic_prompt(self): + prompt = build_grounder_prompt({"role": "bouton", "text": "Connexion"}) + assert "bouton" in prompt + assert "Connexion" in prompt + assert "bbox" in prompt + + def test_with_context(self): + prompt = build_grounder_prompt( + {"role": "champ", "text": "Login"}, + context="page login DPI", + ) + assert "page login DPI" in prompt + + def test_with_extra(self): + prompt = build_grounder_prompt( + {"role": "champ", "text": "IPP", "extra": "colonne gauche"}, + ) + assert "colonne gauche" in prompt + + +# ── parse_grounder_response tests ────────────────────────────────────── + + +class TestParseGrounderResponse: + def 
test_valid_response(self): + vlm_text = json.dumps({ + "found": True, + "bbox": [0.1, 0.2, 0.3, 0.4], + "confidence": 0.92, + "description": "login button", + }) + result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) + assert result is not None + assert result.method == "vlm_grounder" + assert result.bbox == (192, 216, 576, 432) # 0.1*1920, 0.2*1080, 0.3*1920, 0.4*1080 + assert result.confidence == 0.92 + + def test_not_found(self): + vlm_text = json.dumps({"found": False, "bbox": [], "confidence": 0.0}) + result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) + assert result is None + + def test_json_in_markdown(self): + vlm_text = "```json\n{\"found\": true, \"bbox\": [0.5, 0.5, 0.6, 0.6], \"confidence\": 0.8}\n```" + result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) + assert result is not None + + def test_garbled_response(self): + result = parse_grounder_response("I cannot find the element", 1920, 1080, {"role": "bouton", "text": "Connexion"}) + assert result is None + + def test_invalid_bbox_format(self): + vlm_text = json.dumps({"found": True, "bbox": [0.1, 0.2], "confidence": 0.8}) + result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) + assert result is None # bbox must have 4 values + + def test_confidence_as_string(self): + vlm_text = json.dumps({"found": True, "bbox": [0.1, 0.2, 0.3, 0.4], "confidence": "0.85"}) + result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) + assert result is not None + assert result.confidence == 0.85 + + def test_bbox_clamped_to_screen(self): + vlm_text = json.dumps({"found": True, "bbox": [-0.1, -0.1, 1.5, 1.5], "confidence": 0.7}) + result = parse_grounder_response(vlm_text, 1920, 1080, {"role": "bouton", "text": "Connexion"}) + assert result is not None + assert result.bbox[0] >= 0 + assert result.bbox[1] >= 0 + 
assert result.bbox[2] <= 1920 + assert result.bbox[3] <= 1080 + + +# ── ground_element (composition) tests ───────────────────────────────── + + +class TestGroundElement: + def test_ocr_anchor_success(self): + """OCR finds text with bbox → grounded via OCR (deterministic).""" + ocr = mock_ocr_detailed_client_factory([ + OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90), confidence=0.95), + ]) + vlm = mock_vlm_client_factory({}) + result = ground_element( + "/tmp/login.png", + {"role": "bouton", "text": "Connexion"}, + ocr_client=ocr, + vlm_client=vlm, + ) + assert result is not None + assert result.method == "ocr_anchor" + assert result.bbox == (200, 50, 350, 90) + + def test_vlm_fallback(self): + """OCR doesn't find text → VLM grounder succeeds.""" + ocr = mock_ocr_detailed_client_factory([ + OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40)), + ]) + vlm = mock_vlm_client_factory({ + "found": True, + "bbox": [0.2, 0.3, 0.4, 0.5], + "confidence": 0.85, + }) + result = ground_element( + "/tmp/login.png", + {"role": "bouton", "text": "Connexion"}, + ocr_client=ocr, + vlm_client=vlm, + ) + assert result is not None + assert result.method == "vlm_grounder" + + def test_not_found_any_method(self): + """Both OCR and VLM fail → None.""" + ocr = mock_ocr_detailed_client_factory([OcrTokenInfo(text="Accueil", bbox=(0, 0, 100, 40))]) + vlm = mock_vlm_client_factory({"found": False, "bbox": [], "confidence": 0.0}) + result = ground_element( + "/tmp/login.png", + {"role": "bouton", "text": "Connexion"}, + ocr_client=ocr, + vlm_client=vlm, + ) + assert result is None + + def test_ocr_error_vlm_fallback(self): + """OCR engine fails → VLM fallback.""" + def failing_ocr(image_path): + raise RuntimeError("OCR engine down") + vlm = mock_vlm_client_factory({ + "found": True, + "bbox": [0.2, 0.3, 0.4, 0.5], + "confidence": 0.8, + }) + result = ground_element( + "/tmp/login.png", + {"role": "bouton", "text": "Connexion"}, + ocr_client=failing_ocr, + vlm_client=vlm, + ) + assert 
result is not None + assert result.method == "vlm_grounder" + + def test_vlm_error_ocr_success(self): + """VLM fails but OCR succeeds → OCR anchor used.""" + ocr = mock_ocr_detailed_client_factory([ + OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90)), + ]) + def failing_vlm(image_path, prompt): + raise RuntimeError("VLM down") + result = ground_element( + "/tmp/login.png", + {"role": "bouton", "text": "Connexion"}, + ocr_client=ocr, + vlm_client=failing_vlm, + ) + assert result is not None + assert result.method == "ocr_anchor" + + def test_both_fail(self): + """OCR + VLM both fail → None.""" + def failing_ocr(image_path): + raise RuntimeError("OCR down") + def failing_vlm(image_path, prompt): + raise RuntimeError("VLM down") + result = ground_element( + "/tmp/login.png", + {"role": "bouton", "text": "Connexion"}, + ocr_client=failing_ocr, + vlm_client=failing_vlm, + ) + assert result is None + + def test_no_text_target(self): + """Target without text → VLM grounder skipped, None.""" + ocr = mock_ocr_detailed_client_factory([]) + vlm = mock_vlm_client_factory({}) + result = ground_element( + "/tmp/page.png", + {"role": "page"}, + ocr_client=ocr, + vlm_client=vlm, + ) + assert result is None + + def test_cache_hit(self): + """Cached coords exist → returned directly.""" + cache = CoordsCache() + cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor") + + ocr = mock_ocr_detailed_client_factory([]) + vlm = mock_vlm_client_factory({}) + result = ground_element( + "/tmp/login.png", + {"role": "bouton", "text": "Connexion"}, + ocr_client=ocr, + vlm_client=vlm, + coords_cache=cache, + ) + assert result is not None + assert result.method == "cache" + assert result.bbox == (200, 50, 350, 90) + + def test_cache_stored_on_ocr_anchor(self): + """OCR anchor result → stored in cache.""" + cache = CoordsCache() + ocr = mock_ocr_detailed_client_factory([ + OcrTokenInfo(text="Connexion", bbox=(200, 50, 350, 90)), + ]) + vlm = mock_vlm_client_factory({}) + 
ground_element( + "/tmp/login.png", + {"role": "bouton", "text": "Connexion"}, + ocr_client=ocr, + vlm_client=vlm, + coords_cache=cache, + ) + cached = cache.get("bouton:connexion") + assert cached is not None + assert cached.bbox == (200, 50, 350, 90) + assert cached.method == "ocr_anchor" + + def test_cache_stored_on_vlm_grounder(self): + """VLM grounder result → stored in cache.""" + cache = CoordsCache() + ocr = mock_ocr_detailed_client_factory([]) + vlm = mock_vlm_client_factory({ + "found": True, + "bbox": [0.2, 0.3, 0.4, 0.5], + "confidence": 0.85, + }) + ground_element( + "/tmp/login.png", + {"role": "bouton", "text": "Connexion"}, + ocr_client=ocr, + vlm_client=vlm, + coords_cache=cache, + ) + cached = cache.get("bouton:connexion") + assert cached is not None + assert cached.method == "vlm_grounder" + + +# ── CoordsCache tests ────────────────────────────────────────────────── + + +class TestCoordsCache: + def test_put_and_get(self): + cache = CoordsCache() + cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor") + entry = cache.get("bouton:connexion") + assert entry is not None + assert entry.bbox == (200, 50, 350, 90) + + def test_get_missing(self): + cache = CoordsCache() + assert cache.get("bouton:connexion") is None + + def test_invalidate(self): + cache = CoordsCache() + cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor") + cache.invalidate("bouton:connexion") + assert cache.get("bouton:connexion") is None + + def test_clear(self): + cache = CoordsCache() + cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor") + cache.put("b", (0, 0, 20, 20), (10, 10), "vlm_grounder") + cache.clear() + assert cache.get("a") is None + assert cache.get("b") is None + + def test_keys(self): + cache = CoordsCache() + cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor") + cache.put("b", (0, 0, 20, 20), (10, 10), "vlm_grounder") + assert sorted(cache.keys()) == ["a", "b"] + + def test_update_existing(self): + cache = CoordsCache() 
+ cache.put("bouton:connexion", (200, 50, 350, 90), (275, 70), "ocr_anchor") + cache.put("bouton:connexion", (300, 60, 400, 100), (350, 80), "vlm_grounder") + entry = cache.get("bouton:connexion") + assert entry is not None + assert entry.bbox == (300, 60, 400, 100) # updated + assert entry.validation_count == 2 + + def test_validation_count_increments(self): + cache = CoordsCache() + cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor") + assert cache.get("a").validation_count == 1 + cache.put("a", (0, 0, 10, 10), (5, 5), "ocr_anchor") + assert cache.get("a").validation_count == 2 diff --git a/tests/unit/test_navigate_handler_e2e.py b/tests/unit/test_navigate_handler_e2e.py new file mode 100644 index 000000000..195f74fef --- /dev/null +++ b/tests/unit/test_navigate_handler_e2e.py @@ -0,0 +1,151 @@ +"""End-to-end mocked test for navigate action handler — 3 edge-case scenarios. + +Tests the _handle_navigate_action handler with mocked OCR/VLM, verifying: +- Nominal: all resolved, coords populated in variables +- OCR miss + VLM fail: no phantom coords, all_resolved=False +- No screenshot: error="no_screenshot", False return + +NOTE: The handler uses lazy imports inside its body. Mock targets must be +at the source module (core.navigation.action_resolver.navigate_login) rather +than the package-level re-export (core.navigation.navigate_login). 
+""" + +import pytest +from unittest.mock import patch, MagicMock +from core.navigation.action_resolver import NavigateCoords, NavigateResult +from core.navigation import _handle_navigate_action + + +def _patch_all_deps(navigate_login_result=None, navigate_login_side_effect=None): + """Return stacked patches for handler's lazy imports + navigate_login.""" + nl_mock = MagicMock(return_value=navigate_login_result) if navigate_login_result else None + if navigate_login_side_effect: + nl_mock = MagicMock(side_effect=navigate_login_side_effect) + + return ( + patch("core.llm.extract_grid_from_image", return_value=[]), + patch("core.extraction.vlm_client.make_vllm_client", return_value=MagicMock()), + patch("core.navigation.action_resolver.make_ocr_detailed_from_grid", + return_value=MagicMock(return_value=[])), + patch("core.navigation.action_resolver.navigate_login", nl_mock), + ) + + +class TestNominalCase: + """All fields grounded → coords populated, all_resolved=True.""" + + def test_nominal_coords_populated(self): + mock_result = NavigateResult( + login_coords=NavigateCoords(x_pct=0.15, y_pct=0.07, method="ocr_anchor"), + password_coords=NavigateCoords(x_pct=0.15, y_pct=0.25, method="ocr_anchor"), + submit_coords=NavigateCoords(x_pct=0.50, y_pct=0.35, method="ocr_anchor"), + all_resolved=True, + ) + + action = {"parameters": {"action": "login"}} + replay_state = { + "last_screenshot_path": "/tmp/login_screen.png", + "screen_width": 1920, + "screen_height": 1080, + } + + p1, p2, p3, p4 = _patch_all_deps(navigate_login_result=mock_result) + with p1, p2, p3, p4: + result = _handle_navigate_action(action, replay_state, "test-session") + + assert result is True + vars_ = replay_state["variables"] + assert "navigate_login_coords" in vars_ + assert vars_["navigate_login_coords"]["x_pct"] == 0.15 + assert "navigate_password_coords" in vars_ + assert "navigate_submit_coords" in vars_ + assert vars_["navigate_result"]["all_resolved"] is True + + +class TestOcrMissVlmFail: + 
"""OCR misses target + VLM grounder also fails → no phantom coords.""" + + def test_no_phantom_coords_on_failure(self): + mock_result = NavigateResult( + login_coords=None, + password_coords=None, + submit_coords=None, + all_resolved=False, + error="grounding failed — no login form elements found", + ) + + action = {"parameters": {"action": "login"}} + replay_state = { + "last_screenshot_path": "/tmp/no_login_form.png", + "screen_width": 1920, + "screen_height": 1080, + } + + p1, p2, p3, p4 = _patch_all_deps(navigate_login_result=mock_result) + with p1, p2, p3, p4: + result = _handle_navigate_action(action, replay_state, "test-session") + + assert result is False + vars_ = replay_state["variables"] + # No coords keys should be present (coords are None → not stored) + assert "navigate_login_coords" not in vars_ + assert "navigate_password_coords" not in vars_ + assert "navigate_submit_coords" not in vars_ + # Error must be non-empty + assert vars_["navigate_result"]["all_resolved"] is False + assert "grounding failed" in vars_["navigate_result"]["error"] + + +class TestNoScreenshot: + """No screenshot in replay_state → error="no_screenshot", False.""" + + def test_no_screenshot_error(self): + action = {"parameters": {"action": "login"}} + replay_state = {} # No screenshot at all + + result = _handle_navigate_action(action, replay_state, "test-session") + + assert result is False + vars_ = replay_state["variables"] + assert vars_["navigate_login_coords"]["error"] == "no_screenshot" + + def test_empty_screenshot_path(self): + action = {"parameters": {"action": "login"}} + replay_state = {"last_screenshot_path": ""} + + result = _handle_navigate_action(action, replay_state, "test-session") + + assert result is False + vars_ = replay_state["variables"] + assert vars_["navigate_login_coords"]["error"] == "no_screenshot" + + +class TestNeverFailReplay: + """Handler must never raise — even on malformed input, returns False.""" + + def test_missing_parameters(self): + 
action = {} # No "parameters" key + replay_state = {"last_screenshot_path": "/tmp/x.png"} + + mock_result = NavigateResult(all_resolved=False, error="no params") + p1, p2, p3, p4 = _patch_all_deps(navigate_login_result=mock_result) + with p1, p2, p3, p4: + result = _handle_navigate_action(action, replay_state, "test-session") + assert result is False + + def test_exception_in_inner_call(self): + action = {"parameters": {"action": "login"}} + replay_state = { + "last_screenshot_path": "/tmp/login.png", + "screen_width": 1920, + "screen_height": 1080, + } + + p1, p2, p3, p4 = _patch_all_deps(navigate_login_side_effect=RuntimeError("boom")) + with p1, p2, p3, p4: + result = _handle_navigate_action(action, replay_state, "test-session") + + assert result is False + vars_ = replay_state["variables"] + assert vars_["navigate_result"]["all_resolved"] is False + assert "boom" in vars_["navigate_result"]["error"] diff --git a/tests/unit/test_navigate_wiring.py b/tests/unit/test_navigate_wiring.py new file mode 100644 index 000000000..34a8e1a01 --- /dev/null +++ b/tests/unit/test_navigate_wiring.py @@ -0,0 +1,62 @@ +"""Boot non-regression test for navigate wiring — catches import/regression bugs. + +This test would have caught the ImportError where _handle_navigate_action +was incorrectly imported from replay_engine instead of core/navigation. 
+""" + +import pytest + + +class TestApiStreamImports: + """(1) api_stream must import without error.""" + + def test_import_api_stream(self): + from agent_v0.server_v1 import api_stream + assert api_stream is not None + + +class TestAllowedActionTypes: + """(2) 'navigate' must be in both _ALLOWED and _SERVER_SIDE.""" + + def test_navigate_in_allowed(self): + from agent_v0.server_v1.replay_engine import _ALLOWED_ACTION_TYPES + assert "navigate" in _ALLOWED_ACTION_TYPES + + def test_navigate_in_server_side(self): + from agent_v0.server_v1.replay_engine import _SERVER_SIDE_ACTION_TYPES + assert "navigate" in _SERVER_SIDE_ACTION_TYPES + + +class TestNavigateHandlerCallable: + """(3) _handle_navigate_action must be callable with correct signature.""" + + def test_handler_imported_from_core_navigation(self): + from core.navigation import _handle_navigate_action + assert callable(_handle_navigate_action) + + def test_handler_imported_in_api_stream(self): + from agent_v0.server_v1 import api_stream + handler = api_stream._handle_navigate_action + assert callable(handler) + + def test_handler_signature(self): + """Signature: (action: dict, replay_state: dict, session_id: str) -> bool.""" + from core.navigation import _handle_navigate_action + import inspect + sig = inspect.signature(_handle_navigate_action) + params = list(sig.parameters.keys()) + assert params == ["action", "replay_state", "session_id"] + assert sig.return_annotation == bool + + +class TestDispatchBlockExists: + """Verify the navigate dispatch block is wired in api_stream.""" + + def test_navigate_dispatch_reference(self): + """Source must contain the navigate dispatch elif block.""" + import agent_v0.server_v1.api_stream as mod + source = inspect.getsource(mod) + assert "type_ == \"navigate\"" in source + + +import inspect diff --git a/tests/unit/test_visual_login.py b/tests/unit/test_visual_login.py new file mode 100644 index 000000000..ce3917bbf --- /dev/null +++ b/tests/unit/test_visual_login.py @@ 
-0,0 +1,336 @@ +"""Tests for core/navigation/visual_login.py — login form resolution + verification.""" + +import json +import pytest +from core.navigation.visual_login import ( + LoginFormConfig, + LoginResolution, + dpi_urgences_login_config, + verify_login_visible, + verify_login_success, + resolve_login_form, + _ocr_detailed_to_simple, +) +from core.navigation.grounding import ( + CoordsCache, + GroundedElement, + OcrTokenInfo, + OcrDetailedClient, +) +from core.navigation.visual_verifier import ( + ScreenMatchResult, + VlmClient, + OcrClient, +) + + +# ── Mock factories ───────────────────────────────────────────────────── + + +def mock_ocr_detailed_client_factory(tokens: list): + """Factory for mock OcrDetailedClient.""" + def client(image_path: str) -> list: + return tokens + return client + + +def mock_ocr_simple_client_factory(tokens: list): + """Factory for mock OcrClient (text-only).""" + def client(image_path: str) -> list: + return tokens + return client + + +def mock_vlm_client_factory(response_json: dict): + """Factory for mock VlmClient.""" + def client(image_path: str, prompt: str) -> str: + return json.dumps(response_json) + return client + + +# ── Default config tests ─────────────────────────────────────────────── + + +class TestDpiUrgencesLoginConfig: + def test_default_config(self): + config = dpi_urgences_login_config() + assert config.login_field["role"] == "champ" + assert config.login_field["text"] == "Login" + assert config.password_field["text"] == "Mot de passe" + assert config.submit_button["text"] == "Connexion" + assert len(config.success_elements) >= 1 + assert config.context != "" + + def test_config_fields_are_dicts(self): + config = dpi_urgences_login_config() + assert isinstance(config.login_field, dict) + assert isinstance(config.password_field, dict) + assert isinstance(config.submit_button, dict) + + +# ── _ocr_detailed_to_simple tests ──────────────────────────────────── + + +class TestOcrDetailedToSimple: + def 
test_conversion(self): + tokens = [ + OcrTokenInfo(text="Login", bbox=(100, 50, 200, 90)), + OcrTokenInfo(text="Password", bbox=(100, 100, 200, 140)), + ] + detailed = mock_ocr_detailed_client_factory(tokens) + simple = _ocr_detailed_to_simple(detailed) + result = simple("/tmp/test.png") + assert result == ["Login", "Password"] + + def test_empty_tokens(self): + detailed = mock_ocr_detailed_client_factory([]) + simple = _ocr_detailed_to_simple(detailed) + result = simple("/tmp/test.png") + assert result == [] + + +# ── verify_login_visible tests ──────────────────────────────────────── + + +class TestVerifyLoginVisible: + def test_form_visible(self): + """All 3 fields found by OCR + roles confirmed → match.""" + config = LoginFormConfig( + login_field={"role": "champ", "text": "Login"}, + password_field={"role": "champ", "text": "Mot de passe"}, + submit_button={"role": "bouton", "text": "Connexion"}, + context="DPI login", + ) + ocr = mock_ocr_simple_client_factory(["Login", "Mot de passe", "Connexion"]) + vlm = mock_vlm_client_factory({ + "confirmed": [ + {"index": 1, "role_confirmed": True, "actual_role": "champ", "confidence": 0.9}, + {"index": 2, "role_confirmed": True, "actual_role": "champ", "confidence": 0.9}, + {"index": 3, "role_confirmed": True, "actual_role": "bouton", "confidence": 0.9}, + ], + "overall_confidence": 0.9, + }) + result = verify_login_visible("/tmp/login.png", config, ocr, vlm) + assert result.match == True + + def test_form_missing_button(self): + """Connexion button not found by OCR → mismatch.""" + config = LoginFormConfig( + login_field={"role": "champ", "text": "Login"}, + password_field={"role": "champ", "text": "Mot de passe"}, + submit_button={"role": "bouton", "text": "Connexion"}, + ) + ocr = mock_ocr_simple_client_factory(["Login", "Mot de passe"]) # missing Connexion + vlm = mock_vlm_client_factory({}) + result = verify_login_visible("/tmp/login.png", config, ocr, vlm) + assert result.match == False + + def 
test_form_wrong_role(self): + """OCR finds text but VLM says button is a label → mismatch.""" + config = LoginFormConfig( + login_field={"role": "champ", "text": "Login"}, + password_field={"role": "champ", "text": "Mot de passe"}, + submit_button={"role": "bouton", "text": "Connexion"}, + ) + ocr = mock_ocr_simple_client_factory(["Login", "Mot de passe", "Connexion"]) + vlm = mock_vlm_client_factory({ + "confirmed": [ + {"index": 1, "role_confirmed": True, "actual_role": "champ", "confidence": 0.9}, + {"index": 2, "role_confirmed": True, "actual_role": "champ", "confidence": 0.9}, + {"index": 3, "role_confirmed": False, "actual_role": "label", "confidence": 0.5}, + ], + "overall_confidence": 0.5, + }) + result = verify_login_visible("/tmp/login.png", config, ocr, vlm) + assert result.match == False + + +# ── verify_login_success tests ──────────────────────────────────────── + + +class TestVerifyLoginSuccess: + def test_dashboard_visible(self): + """Dashboard found by OCR + role confirmed → success.""" + config = LoginFormConfig( + login_field={"role": "champ", "text": "Login"}, + password_field={"role": "champ", "text": "Mot de passe"}, + submit_button={"role": "bouton", "text": "Connexion"}, + success_elements=[{"role": "page", "text": "Dashboard"}], + ) + ocr = mock_ocr_simple_client_factory(["Dashboard", "Accueil"]) + vlm = mock_vlm_client_factory({ + "confirmed": [ + {"index": 1, "role_confirmed": True, "actual_role": "page", "confidence": 0.92}, + ], + "overall_confidence": 0.92, + }) + result = verify_login_success("/tmp/dashboard.png", config, ocr, vlm) + assert result.match == True + + def test_no_success_elements(self): + """Config has no success_elements → can't verify.""" + config = LoginFormConfig( + login_field={"role": "champ", "text": "Login"}, + password_field={"role": "champ", "text": "Mot de passe"}, + submit_button={"role": "bouton", "text": "Connexion"}, + success_elements=[], # empty! 
+ ) + ocr = mock_ocr_simple_client_factory(["Dashboard"]) + vlm = mock_vlm_client_factory({}) + result = verify_login_success("/tmp/page.png", config, ocr, vlm) + assert result.match == False + assert "no success_elements" in result.reason + + def test_still_on_login_page(self): + """After login, still seeing login form → mismatch.""" + config = LoginFormConfig( + login_field={"role": "champ", "text": "Login"}, + password_field={"role": "champ", "text": "Mot de passe"}, + submit_button={"role": "bouton", "text": "Connexion"}, + success_elements=[{"role": "page", "text": "Dashboard"}], + ) + # OCR sees login form texts, not Dashboard + ocr = mock_ocr_simple_client_factory(["Login", "Mot de passe", "Connexion"]) + vlm = mock_vlm_client_factory({}) + result = verify_login_success("/tmp/still_login.png", config, ocr, vlm) + assert result.match == False + + +# ── resolve_login_form tests ────────────────────────────────────────── + + +class TestResolveLoginForm: + def test_all_fields_ocr_anchor(self): + """All 3 fields found by OCR with bbox → full resolution.""" + config = LoginFormConfig( + login_field={"role": "champ", "text": "Login"}, + password_field={"role": "champ", "text": "Mot de passe"}, + submit_button={"role": "bouton", "text": "Connexion"}, + ) + ocr = mock_ocr_detailed_client_factory([ + OcrTokenInfo(text="Login", bbox=(100, 50, 250, 90)), + OcrTokenInfo(text="Mot de passe", bbox=(100, 100, 250, 140)), + OcrTokenInfo(text="Connexion", bbox=(100, 150, 250, 190)), + ]) + vlm = mock_vlm_client_factory({}) + result = resolve_login_form("/tmp/login.png", config, ocr, vlm) + assert result.all_resolved == True + assert result.login_field is not None + assert result.login_field.method == "ocr_anchor" + assert result.password_field is not None + assert result.submit_button is not None + assert result.method == "ocr_anchor" + + def test_partial_ocr_vlm_fallback(self): + """Login + password by OCR, button by VLM → mixed method.""" + config = LoginFormConfig( + 
login_field={"role": "champ", "text": "Login"}, + password_field={"role": "champ", "text": "Password"}, + submit_button={"role": "bouton", "text": "Connexion"}, + ) + ocr = mock_ocr_detailed_client_factory([ + OcrTokenInfo(text="Login", bbox=(100, 50, 250, 90)), + OcrTokenInfo(text="Password", bbox=(100, 100, 250, 140)), + # Connexion not in OCR → VLM fallback + ]) + vlm = mock_vlm_client_factory({ + "found": True, + "bbox": [0.2, 0.4, 0.4, 0.5], + "confidence": 0.85, + }) + result = resolve_login_form("/tmp/login.png", config, ocr, vlm) + assert result.all_resolved == True + assert result.login_field.method == "ocr_anchor" + assert result.submit_button.method == "vlm_grounder" + assert result.method == "mixed" + + def test_incomplete_resolution(self): + """Button not found by OCR or VLM → incomplete.""" + config = LoginFormConfig( + login_field={"role": "champ", "text": "Login"}, + password_field={"role": "champ", "text": "Password"}, + submit_button={"role": "bouton", "text": "Connexion"}, + ) + ocr = mock_ocr_detailed_client_factory([ + OcrTokenInfo(text="Login", bbox=(100, 50, 250, 90)), + OcrTokenInfo(text="Password", bbox=(100, 100, 250, 140)), + ]) + vlm = mock_vlm_client_factory({"found": False, "bbox": [], "confidence": 0.0}) + result = resolve_login_form("/tmp/login.png", config, ocr, vlm) + assert result.all_resolved == False + assert result.submit_button is None + + def test_cache_hit(self): + """All fields cached → returned directly.""" + cache = CoordsCache() + cache.put("champ:login", (100, 50, 250, 90), (175, 70), "ocr_anchor") + cache.put("champ:mot de passe", (100, 100, 250, 140), (175, 120), "ocr_anchor") + cache.put("bouton:connexion", (100, 150, 250, 190), (175, 170), "ocr_anchor") + + config = LoginFormConfig( + login_field={"role": "champ", "text": "Login"}, + password_field={"role": "champ", "text": "Mot de passe"}, + submit_button={"role": "bouton", "text": "Connexion"}, + ) + ocr = mock_ocr_detailed_client_factory([]) + vlm = 
mock_vlm_client_factory({}) + result = resolve_login_form( + "/tmp/login.png", config, ocr, vlm, coords_cache=cache, + ) + assert result.all_resolved == True + assert result.method == "cache" + assert result.login_field.center == (175, 70) + + def test_with_dpi_default_config(self): + """Full flow with dpi_urgences_login_config.""" + config = dpi_urgences_login_config() + ocr = mock_ocr_detailed_client_factory([ + OcrTokenInfo(text="Login", bbox=(100, 50, 250, 90)), + OcrTokenInfo(text="Mot de passe", bbox=(100, 100, 250, 140)), + OcrTokenInfo(text="Connexion", bbox=(100, 150, 250, 190)), + ]) + vlm = mock_vlm_client_factory({}) + result = resolve_login_form("/tmp/login.png", config, ocr, vlm) + assert result.all_resolved == True + + +# ── LoginResolution describe tests ──────────────────────────────────── + + +class TestLoginResolutionDescribe: + def test_all_resolved(self): + resolution = LoginResolution( + login_field=GroundedElement( + role="champ", text="Login", + bbox=(100, 50, 250, 90), center=(175, 70), + confidence=0.9, method="ocr_anchor", + ), + password_field=GroundedElement( + role="champ", text="Mot de passe", + bbox=(100, 100, 250, 140), center=(175, 120), + confidence=0.9, method="ocr_anchor", + ), + submit_button=GroundedElement( + role="bouton", text="Connexion", + bbox=(100, 150, 250, 190), center=(175, 170), + confidence=0.9, method="ocr_anchor", + ), + all_resolved=True, + method="ocr_anchor", + ) + desc = resolution.describe() + assert "OK" in desc + assert "login@" in desc + assert "button@" in desc + + def test_incomplete(self): + resolution = LoginResolution( + login_field=None, + password_field=None, + submit_button=None, + all_resolved=False, + method="", + ) + desc = resolution.describe() + assert "INCOMPLETE" in desc + assert "NOT FOUND" in desc diff --git a/tests/unit/test_visual_verifier.py b/tests/unit/test_visual_verifier.py new file mode 100644 index 000000000..c318fa07b --- /dev/null +++ b/tests/unit/test_visual_verifier.py @@ 
-0,0 +1,490 @@ +"""Tests for core/navigation/visual_verifier.py — OCR-anchored architecture. + +Tests pure functions (normalize_text, fuzzy_match, ocr_presence_check, +build_role_confirm_prompt, parse_role_confirm_response) offline, +then verifies verify_screen_match with mock OcrClient + VlmClient. +""" + +import json +import pytest +from core.navigation.visual_verifier import ( + normalize_text, + fuzzy_match, + ocr_presence_check, + build_role_confirm_prompt, + parse_role_confirm_response, + verify_screen_match, + verify_before, + verify_after, + ScreenMatchResult, + OcrPresenceResult, +) + + +# ── Mock factories ───────────────────────────────────────────────────── + + +def mock_ocr_client_factory(tokens: list): + """Factory that creates a mock OcrClient returning the given tokens.""" + def client(image_path: str) -> list: + return tokens + return client + + +def mock_vlm_client_factory(response_json: dict): + """Factory that creates a mock VlmClient returning the given JSON.""" + def client(image_path: str, prompt: str) -> str: + return json.dumps(response_json) + return client + + +# ── normalize_text tests ────────────────────────────────────────────── + + +class TestNormalizeText: + def test_lowercase(self): + assert normalize_text("RECHERCHER") == "rechercher" + + def test_strip_accents(self): + assert normalize_text("Recherché") == "recherche" + + def test_collapse_whitespace(self): + assert normalize_text(" hello world ") == "hello world" + + def test_combined(self): + assert normalize_text(" Nom Prénom ") == "nom prenom" + + def test_empty(self): + assert normalize_text("") == "" + + def test_numbers_preserved(self): + assert normalize_text("IPP 12345") == "ipp 12345" + + +# ── fuzzy_match tests ───────────────────────────────────────────────── + + +class TestFuzzyMatch: + def test_exact_match(self): + assert fuzzy_match("Rechercher", "Rechercher") == True + + def test_case_insensitive(self): + assert fuzzy_match("rechercher", "RECHERCHER") == True + + 
def test_accent_match(self): + assert fuzzy_match("Recherché", "Recherche") == True + + def test_substring_containment(self): + # Short text contained in longer OCR token + assert fuzzy_match("Rechercher", "Bouton Rechercher") == True + + def test_reverse_containment(self): + # OCR token contained in expected text + assert fuzzy_match("Nom Prénom Patient", "Nom") == True + + def test_fuzzy_ratio(self): + # Similar but not exact/substring — ratio ~0.90 + assert fuzzy_match("Connexion", "Connection", threshold=0.8) == True + + def test_no_match(self): + assert fuzzy_match("Dashboard", "Login", threshold=0.8) == False + + def test_custom_threshold(self): + # "Connection" vs "Connexion" ratio ~0.90, passes at 0.8 but fails at 0.95 + assert fuzzy_match("Connexion", "Connection", threshold=0.95) == False + + +# ── ocr_presence_check tests ────────────────────────────────────────── + + +class TestOcrPresenceCheck: + def test_all_found(self): + tokens = ["Rechercher", "Connexion", "Nom Patient"] + elements = [ + {"role": "bouton", "text": "Rechercher"}, + {"role": "bouton", "text": "Connexion"}, + ] + result = ocr_presence_check(tokens, elements) + assert result.all_found == True + assert result.presence_ratio == 1.0 + assert len(result.missing) == 0 + assert result.found_texts["Rechercher"] == "Rechercher" + + def test_partial_found(self): + tokens = ["Rechercher"] + elements = [ + {"role": "bouton", "text": "Rechercher"}, + {"role": "bouton", "text": "Connexion"}, + ] + result = ocr_presence_check(tokens, elements) + assert result.all_found == False + assert result.presence_ratio == 0.5 + assert "bouton: Connexion" in result.missing + + def test_none_found(self): + tokens = ["Accueil", "Paramètres"] + elements = [ + {"role": "bouton", "text": "Rechercher"}, + ] + result = ocr_presence_check(tokens, elements) + assert result.all_found == False + assert result.presence_ratio == 0.0 + assert "bouton: Rechercher" in result.missing + + def test_fuzzy_match_in_presence(self): 
+ tokens = ["Rechércher"] # OCR with accent variation + elements = [{"role": "bouton", "text": "Rechercher"}] + result = ocr_presence_check(tokens, elements) + assert result.all_found == True + + def test_empty_tokens(self): + result = ocr_presence_check([], [{"role": "bouton", "text": "Login"}]) + assert result.all_found == False + assert result.presence_ratio == 0.0 + + def test_empty_elements(self): + result = ocr_presence_check(["Login", "Password"], []) + assert result.all_found == True + assert result.presence_ratio == 1.0 + + def test_no_text_key(self): + elements = [{"role": "page"}] # no text key + result = ocr_presence_check(["Dashboard"], elements) + assert result.all_found == True # no text to check → trivially found + + def test_multiple_elements_same_text(self): + tokens = ["Connexion"] + elements = [ + {"role": "bouton", "text": "Connexion"}, + {"role": "label", "text": "Connexion"}, + ] + result = ocr_presence_check(tokens, elements) + assert result.all_found == True + + +# ── build_role_confirm_prompt tests ─────────────────────────────────── + + +class TestBuildRoleConfirmPrompt: + def test_basic_prompt(self): + found = [ + {"text": "Rechercher", "expected_role": "bouton", "matched_ocr": "Rechercher"}, + ] + expected = [{"role": "bouton", "text": "Rechercher"}] + prompt = build_role_confirm_prompt(found, expected) + assert "Text \"Rechercher\"" in prompt + assert "expected role: bouton" in prompt + assert "role_confirmed" in prompt + + def test_with_context(self): + found = [ + {"text": "Connexion", "expected_role": "bouton", "matched_ocr": "Connexion"}, + ] + expected = [{"role": "bouton", "text": "Connexion"}] + prompt = build_role_confirm_prompt(found, expected, context="page login DPI") + assert "Context: page login DPI" in prompt + + def test_multiple_elements(self): + found = [ + {"text": "Login", "expected_role": "champ", "matched_ocr": "Login"}, + {"text": "Password", "expected_role": "champ", "matched_ocr": "Password"}, + {"text": 
"Connexion", "expected_role": "bouton", "matched_ocr": "Connexion"}, + ] + expected = [ + {"role": "champ", "text": "Login"}, + {"role": "champ", "text": "Password"}, + {"role": "bouton", "text": "Connexion"}, + ] + prompt = build_role_confirm_prompt(found, expected) + assert "1." in prompt + assert "2." in prompt + assert "3." in prompt + + def test_no_self_declaration(self): + """Prompt must NOT ask VLM to declare presence — only role.""" + found = [ + {"text": "Login", "expected_role": "champ", "matched_ocr": "Login"}, + ] + expected = [{"role": "champ", "text": "Login"}] + prompt = build_role_confirm_prompt(found, expected) + assert "present" not in prompt.lower() or "confirmed" in prompt.lower() + + +# ── parse_role_confirm_response tests ───────────────────────────────── + + +class TestParseRoleConfirmResponse: + def test_valid_json(self): + data = json.dumps({ + "confirmed": [ + {"index": 1, "role_confirmed": True, "actual_role": "bouton", "confidence": 0.92}, + ], + "overall_confidence": 0.92, + }) + result = parse_role_confirm_response(data) + assert len(result["confirmed"]) == 1 + assert result["overall_confidence"] == 0.92 + + def test_json_in_markdown(self): + vlm_text = "```json\n{\"confirmed\": [], \"overall_confidence\": 0.0}\n```" + result = parse_role_confirm_response(vlm_text) + assert result["overall_confidence"] == 0.0 + + def test_garbled_response(self): + result = parse_role_confirm_response("I cannot determine the roles") + assert result["overall_confidence"] == 0.0 + assert len(result["confirmed"]) == 0 + + def test_confidence_as_string(self): + data = json.dumps({"confirmed": [], "overall_confidence": "0.85"}) + result = parse_role_confirm_response(data) + assert result["overall_confidence"] == 0.85 + + +# ── verify_screen_match (OCR-anchored) tests ───────────────────────── + + +class TestVerifyScreenMatchOcrAnchored: + def test_full_match(self): + ocr = mock_ocr_client_factory(["Rechercher", "Connexion", "Dashboard"]) + vlm = 
mock_vlm_client_factory({ + "confirmed": [ + {"index": 1, "role_confirmed": True, "actual_role": "bouton", "confidence": 0.92}, + ], + "overall_confidence": 0.92, + }) + result = verify_screen_match( + "/tmp/test.png", + [{"role": "bouton", "text": "Rechercher"}], + ocr_client=ocr, + vlm_client=vlm, + ) + assert result.match == True + assert result.confidence >= 0.7 + + def test_ocr_presence_fail(self): + """OCR doesn't find expected text → mismatch (deterministic, no VLM needed).""" + ocr = mock_ocr_client_factory(["Accueil", "Paramètres"]) + vlm = mock_vlm_client_factory({}) + result = verify_screen_match( + "/tmp/test.png", + [{"role": "bouton", "text": "Rechercher"}], + ocr_client=ocr, + vlm_client=vlm, + ) + assert result.match == False + assert "OCR presence" in result.reason + assert len(result.mismatches) > 0 + + def test_role_not_confirmed(self): + """OCR finds text, VLM says it's a label not a button → mismatch.""" + ocr = mock_ocr_client_factory(["Rechercher"]) + vlm = mock_vlm_client_factory({ + "confirmed": [ + {"index": 1, "role_confirmed": False, "actual_role": "label", "confidence": 0.6}, + ], + "overall_confidence": 0.6, + }) + result = verify_screen_match( + "/tmp/test.png", + [{"role": "bouton", "text": "Rechercher"}], + ocr_client=ocr, + vlm_client=vlm, + ) + assert result.match == False + + def test_ocr_error(self): + """OCR engine fails → fail-safe mismatch.""" + def failing_ocr(image_path): + raise RuntimeError("OCR engine down") + vlm = mock_vlm_client_factory({}) + result = verify_screen_match( + "/tmp/test.png", + [{"role": "bouton", "text": "Rechercher"}], + ocr_client=failing_ocr, + vlm_client=vlm, + ) + assert result.match == False + assert "OCR error" in result.reason + + def test_vlm_error_partial_match(self): + """OCR finds texts, VLM fails → partial match (presence OK, role unknown).""" + ocr = mock_ocr_client_factory(["Rechercher"]) + def failing_vlm(image_path, prompt): + raise RuntimeError("VLM service down") + result = 
verify_screen_match( + "/tmp/test.png", + [{"role": "bouton", "text": "Rechercher"}], + ocr_client=ocr, + vlm_client=failing_vlm, + ) + # Presence confirmed by OCR → partial match, confidence=0.5 + assert result.match == True + assert result.confidence == 0.5 + assert "VLM role confirm failed" in result.reason + + def test_no_expected_elements(self): + ocr = mock_ocr_client_factory(["Login"]) + vlm = mock_vlm_client_factory({}) + result = verify_screen_match("/tmp/test.png", [], ocr_client=ocr, vlm_client=vlm) + assert result.match == True + assert result.confidence == 1.0 + + def test_describe_match(self): + result = ScreenMatchResult(match=True, confidence=0.92) + assert "OK" in result.describe() + + def test_describe_mismatch(self): + result = ScreenMatchResult( + match=False, confidence=0.3, + mismatches=["bouton: Rechercher"], + ) + assert "mismatch" in result.describe() + + def test_multiple_elements_mixed(self): + """2 elements: 1 found+role OK, 1 not found in OCR → mismatch.""" + ocr = mock_ocr_client_factory(["Connexion"]) + vlm = mock_vlm_client_factory({ + "confirmed": [ + {"index": 1, "role_confirmed": True, "actual_role": "bouton", "confidence": 0.9}, + ], + "overall_confidence": 0.9, + }) + result = verify_screen_match( + "/tmp/test.png", + [ + {"role": "bouton", "text": "Connexion"}, + {"role": "champ", "text": "Nom Patient"}, + ], + ocr_client=ocr, + vlm_client=vlm, + ) + assert result.match == False # "Nom Patient" not found by OCR + + def test_fuzzy_ocr_match(self): + """OCR reads 'Rechércher' (accent), expected 'Rechercher' → still found.""" + ocr = mock_ocr_client_factory(["Rechércher"]) + vlm = mock_vlm_client_factory({ + "confirmed": [ + {"index": 1, "role_confirmed": True, "actual_role": "bouton", "confidence": 0.9}, + ], + "overall_confidence": 0.9, + }) + result = verify_screen_match( + "/tmp/test.png", + [{"role": "bouton", "text": "Rechercher"}], + ocr_client=ocr, + vlm_client=vlm, + ) + assert result.match == True + + def 
test_no_text_elements_trivially_match(self): + """Elements without text key → no presence check needed → trivially OK.""" + ocr = mock_ocr_client_factory(["Dashboard"]) + vlm = mock_vlm_client_factory({}) + result = verify_screen_match( + "/tmp/test.png", + [{"role": "page"}], + ocr_client=ocr, + vlm_client=vlm, + ) + assert result.match == True + + +# ── verify_before / verify_after tests ──────────────────────────────── + + +class TestVerifyBeforeAfter: + def test_verify_before_match(self): + ocr = mock_ocr_client_factory(["Login", "Password", "Connexion"]) + vlm = mock_vlm_client_factory({ + "confirmed": [ + {"index": 1, "role_confirmed": True, "actual_role": "champ", "confidence": 0.85}, + ], + "overall_confidence": 0.85, + }) + result = verify_before( + "/tmp/login.png", + [{"role": "champ", "text": "Login"}], + ocr_client=ocr, + vlm_client=vlm, + context="page login", + ) + assert result.match == True + + def test_verify_after_higher_threshold(self): + """verify_after uses min_confidence=0.8. 
VLM returns 0.75 → mismatch.""" + ocr = mock_ocr_client_factory(["Dashboard"]) + vlm = mock_vlm_client_factory({ + "confirmed": [ + {"index": 1, "role_confirmed": True, "actual_role": "page", "confidence": 0.75}, + ], + "overall_confidence": 0.75, + }) + result = verify_after( + "/tmp/dashboard.png", + [{"role": "page", "text": "Dashboard"}], + ocr_client=ocr, + vlm_client=vlm, + ) + # 0.75 < 0.8 threshold → role mismatch + assert result.match == False + + def test_verify_after_passes_at_0_8(self): + ocr = mock_ocr_client_factory(["Dashboard"]) + vlm = mock_vlm_client_factory({ + "confirmed": [ + {"index": 1, "role_confirmed": True, "actual_role": "page", "confidence": 0.85}, + ], + "overall_confidence": 0.85, + }) + result = verify_after( + "/tmp/dashboard.png", + [{"role": "page", "text": "Dashboard"}], + ocr_client=ocr, + vlm_client=vlm, + ) + assert result.match == True + + def test_verify_before_ocr_missing(self): + """Pre-action: expected text not on screen → mismatch (can't proceed).""" + ocr = mock_ocr_client_factory(["Accueil"]) + vlm = mock_vlm_client_factory({}) + result = verify_before( + "/tmp/page.png", + [{"role": "bouton", "text": "Connexion"}], + ocr_client=ocr, + vlm_client=vlm, + context="pre-login", + ) + assert result.match == False + assert "OCR presence" in result.reason + + +# ── OcrPresenceResult dataclass tests ───────────────────────────────── + + +class TestOcrPresenceResult: + def test_presence_ratio_all_found(self): + result = OcrPresenceResult( + found_texts={"Login": "Login", "Password": "Password"}, + missing=[], + all_found=True, + ) + assert result.presence_ratio == 1.0 + + def test_presence_ratio_half_found(self): + result = OcrPresenceResult( + found_texts={"Login": "Login", "Password": ""}, + missing=["champ: Password"], + all_found=False, + ) + assert result.presence_ratio == 0.5 + + def test_presence_ratio_empty(self): + result = OcrPresenceResult( + found_texts={}, + missing=[], + all_found=True, + ) + assert 
result.presence_ratio == 1.0