"""Visual verifier — verify_before / verify_after avec ancrage OCR. Architecture OCR-ancrée (challenge Claude 01/07, gate-vert 30/06) : - PRESENCE = tokens OCR (déterministe, pas d'hallucination possible) - RÔLE = VLM confirmation (semantic, ancré sur tokens OCR trouvés) - VLM ne décide JAMAIS de la présence d'un élément - Faux positif impossible par construction ; faux négatif = retry acceptable Pattern d'injection : OcrClient + VlmClient injectables (tests sans réseau). """ from __future__ import annotations import json import logging import re import unicodedata from dataclasses import dataclass, field from difflib import SequenceMatcher from typing import Any, Callable, Dict, List, Optional logger = logging.getLogger(__name__) # Type aliases — injectable callables for offline testing VlmClient = Callable[[str, str], str] # (image_path, prompt) -> text OcrClient = Callable[[str], List[str]] # (image_path) -> list of OCR text strings @dataclass class ScreenMatchResult: """Result of a screen verification check.""" match: bool confidence: float = 0.0 reason: str = "" observed_elements: List[Dict[str, Any]] = field(default_factory=list) expected_elements: List[Dict[str, Any]] = field(default_factory=list) mismatches: List[str] = field(default_factory=list) def describe(self) -> str: if self.match: return f"Screen match OK (conf={self.confidence:.2f})" parts = [f"Screen mismatch (conf={self.confidence:.2f})"] if self.mismatches: parts.append("missing: " + ", ".join(self.mismatches)) if self.reason: parts.append(self.reason) return " | ".join(parts) # ── Text normalization (pure functions) ──────────────────────────────── def normalize_text(text: str) -> str: """Normalize text for fuzzy matching: lowercase, strip accents, collapse whitespace.""" text = text.lower().strip() # Strip accents: é→e, è→e, ê→e, à→a, etc. text = unicodedata.normalize("NFKD", text) text = "".join(c for c in text if not unicodedata.combining(c)) # Collapse whitespace text = re.sub(r"\s+", " ", text) return text def fuzzy_match(expected: str, observed: str, threshold: float = 0.8) -> bool: """Check if observed text fuzzy-matches expected text. Three strategies (any wins): 1. Exact match after normalization 2. Substring containment (either direction) 3. SequenceMatcher ratio >= threshold """ norm_expected = normalize_text(expected) norm_observed = normalize_text(observed) if norm_expected == norm_observed: return True if norm_expected in norm_observed or norm_observed in norm_expected: return True ratio = SequenceMatcher(None, norm_expected, norm_observed).ratio() return ratio >= threshold # ── OCR presence check (deterministic, no VLM) ────────────────────── @dataclass class OcrPresenceResult: """Result of OCR-based presence check.""" found_texts: Dict[str, str] = field(default_factory=dict) missing: List[str] = field(default_factory=list) all_found: bool = False @property def presence_ratio(self) -> float: if not self.found_texts: return 1.0 found_count = sum(1 for v in self.found_texts.values() if v != "") return found_count / len(self.found_texts) def ocr_presence_check( ocr_tokens: List[str], expected_elements: List[Dict[str, Any]], fuzzy_threshold: float = 0.8, ) -> OcrPresenceResult: """Check presence of expected texts against OCR tokens (deterministic). Pure function — no VLM call, zero hallucination risk. """ found_texts: Dict[str, str] = {} missing: List[str] = [] for el in expected_elements: expected_text = el.get("text", "") if not expected_text: found_texts[""] = "" continue matched_ocr = "" for token in ocr_tokens: if fuzzy_match(expected_text, token, threshold=fuzzy_threshold): matched_ocr = token break if matched_ocr: found_texts[expected_text] = matched_ocr else: found_texts[expected_text] = "" missing.append(f"{el.get('role', '?')}: {expected_text}") all_found = len(missing) == 0 return OcrPresenceResult( found_texts=found_texts, missing=missing, all_found=all_found, ) # ── VLM role confirmation (semantic, anchored on found OCR texts) ──── def build_role_confirm_prompt( found_elements: List[Dict[str, Any]], expected_elements: List[Dict[str, Any]], context: str = "", ) -> str: """Build VLM prompt for role confirmation of OCR-found elements. VLM receives found texts and confirms their ROLE only — never presence. """ found_lines = [] for i, el in enumerate(found_elements): matched_ocr = el.get("matched_ocr", "") expected_role = el.get("expected_role", "?") line = f"{i+1}. Text \"{matched_ocr}\" — expected role: {expected_role}" found_lines.append(line) found_block = "\n".join(found_lines) prompt = ( "You are a screen role validator. OCR has confirmed these texts are " "present on the screen. Your job is ONLY to confirm their ROLE — " "do NOT re-declare whether they are present.\n" ) if context: prompt += f"Context: {context}\n" prompt += ( f"Found texts with expected roles:\n{found_block}\n\n" "Respond in JSON format:\n" "{\"confirmed\": [{\"index\": 1, \"role_confirmed\": true/false, " "\"actual_role\": \"...\", \"confidence\": 0.0-1.0}], " "\"overall_confidence\": 0.0-1.0}\n" "Only confirm role_confirmed=true if the text clearly plays the " "expected role (e.g., a button, not just a label with the same text)." ) return prompt def parse_role_confirm_response(vlm_text: str) -> Dict[str, Any]: """Parse VLM role confirmation JSON response.""" try: data = json.loads(vlm_text) except json.JSONDecodeError: json_match = re.search(r"\{[\s\S]*\}", vlm_text) if json_match: try: data = json.loads(json_match.group()) except json.JSONDecodeError: logger.warning("role_confirm: VLM response not parseable as JSON") return {"confirmed": [], "overall_confidence": 0.0} else: return {"confirmed": [], "overall_confidence": 0.0} confirmed = data.get("confirmed", []) overall_conf = data.get("overall_confidence", 0.0) if isinstance(overall_conf, str): try: overall_conf = float(overall_conf) except ValueError: overall_conf = 0.0 return { "confirmed": confirmed, "overall_confidence": float(overall_conf), } # ── Core verification (OCR-anchored composition) ──────────────────── def verify_screen_match( screenshot_path: str, expected_elements: List[Dict[str, Any]], ocr_client: OcrClient, vlm_client: VlmClient, context: str = "", min_confidence: float = 0.7, ) -> ScreenMatchResult: """Verify screen state with OCR-anchored presence + VLM role confirmation. Step 1: OCR screenshot → tokens → deterministic presence check Step 2: VLM confirms role of found elements (not presence!) Eliminates VLM self-report hallucination for presence checks. """ if not expected_elements: return ScreenMatchResult( match=True, confidence=1.0, reason="no expected elements to verify", ) # Step 1: OCR presence check (deterministic) try: ocr_tokens = ocr_client(screenshot_path) except Exception as e: logger.warning("verify_screen_match: OCR call failed (%s)", e) return ScreenMatchResult( match=False, confidence=0.0, reason=f"OCR error: {e}", expected_elements=expected_elements, ) presence = ocr_presence_check(ocr_tokens, expected_elements) if not presence.all_found: observed = [] for el in expected_elements: text = el.get("text", "") matched = presence.found_texts.get(text, "") observed.append({ "role": el.get("role", "?"), "expected_text": text, "matched_ocr": matched, "found": matched != "", }) return ScreenMatchResult( match=False, confidence=presence.presence_ratio, reason="OCR presence check: some texts not found", observed_elements=observed, expected_elements=expected_elements, mismatches=presence.missing, ) # Step 2: VLM role confirmation (only for found elements) found_elements = [] for el in expected_elements: text = el.get("text", "") matched_ocr = presence.found_texts.get(text, "") if text and matched_ocr: found_elements.append({ "text": text, "expected_role": el.get("role", "?"), "matched_ocr": matched_ocr, }) if not found_elements: # All elements had no text → presence trivially OK return ScreenMatchResult( match=True, confidence=1.0, reason="no text-based elements to verify", expected_elements=expected_elements, ) prompt = build_role_confirm_prompt(found_elements, expected_elements, context) try: vlm_text = vlm_client(screenshot_path, prompt) except Exception as e: logger.warning("verify_screen_match: VLM role confirm failed (%s)", e) observed = [] for el in expected_elements: text = el.get("text", "") observed.append({ "role": el.get("role", "?"), "expected_text": text, "matched_ocr": presence.found_texts.get(text, ""), "found": True, "role_confirmed": False, "role_confidence": 0.0, }) return ScreenMatchResult( match=True, confidence=0.5, reason=f"OCR presence OK, VLM role confirm failed: {e}", observed_elements=observed, expected_elements=expected_elements, ) parsed = parse_role_confirm_response(vlm_text) overall_conf = parsed.get("overall_confidence", 0.0) confirmed = parsed.get("confirmed", []) observed = [] role_mismatches = [] for i, el in enumerate(expected_elements): text = el.get("text", "") expected_role = el.get("role", "?") matched_ocr = presence.found_texts.get(text, "") role_entry = None for c in confirmed: if c.get("index") == i + 1: role_entry = c break role_confirmed = False actual_role = "" role_confidence = 0.0 if role_entry: role_confirmed = role_entry.get("role_confirmed", False) actual_role = role_entry.get("actual_role", "") role_confidence = role_entry.get("confidence", 0.0) if isinstance(role_confidence, str): try: role_confidence = float(role_confidence) except ValueError: role_confidence = 0.0 observed.append({ "role": expected_role, "expected_text": text, "matched_ocr": matched_ocr, "found": True, "role_confirmed": role_confirmed, "actual_role": actual_role, "role_confidence": role_confidence, }) if not role_confirmed or role_confidence < min_confidence: role_mismatches.append( f"{expected_role}: {text} (actual={actual_role}, conf={role_confidence:.2f})" ) is_match = len(role_mismatches) == 0 and overall_conf >= min_confidence return ScreenMatchResult( match=is_match, confidence=overall_conf, reason=f"OCR presence: {presence.presence_ratio:.0%}, VLM role: {overall_conf:.2f}", observed_elements=observed, expected_elements=expected_elements, mismatches=presence.missing + role_mismatches, ) def verify_before( screenshot_path: str, expected_elements: List[Dict[str, Any]], ocr_client: OcrClient, vlm_client: VlmClient, context: str = "", ) -> ScreenMatchResult: """Verify screen state BEFORE an action (OCR-anchored). Checks pre-conditions: expected texts present + roles correct. min_confidence=0.7 — some tolerance for pre-action verification. """ return verify_screen_match( screenshot_path, expected_elements, ocr_client, vlm_client, context=f"PRE-ACTION: {context}", min_confidence=0.7, ) def verify_after( screenshot_path: str, expected_elements: List[Dict[str, Any]], ocr_client: OcrClient, vlm_client: VlmClient, context: str = "", ) -> ScreenMatchResult: """Verify screen state AFTER an action (OCR-anchored). Checks post-conditions with higher threshold (0.8). False positive = Léa proceeds on wrong assumption → stricter gate. """ return verify_screen_match( screenshot_path, expected_elements, ocr_client, vlm_client, context=f"POST-ACTION: {context}", min_confidence=0.8, )