# rpa_vision_v3/core/navigation/visual_verifier.py

"""Visual verifier — verify_before / verify_after avec ancrage OCR.

OCR-anchored architecture (challenge Claude 01/07, gate-vert 30/06):
- PRESENCE = OCR tokens (deterministic, no hallucination possible)
- ROLE = VLM confirmation (semantic, anchored on the OCR tokens that were found)
- The VLM NEVER decides whether an element is present
- False positive impossible by construction; false negative = acceptable retry

Injection pattern: OcrClient + VlmClient are injectable (tests without network).
"""

from __future__ import annotations

import json
import logging
import re
import unicodedata
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from typing import Any, Callable, Dict, List, Optional

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Type aliases — injectable callables for offline testing.
# VlmClient: called with an image path and a prompt, returns the reply text.
VlmClient = Callable[[str, str], str]  # (image_path, prompt) -> text
# OcrClient: called with an image path, returns the OCR text strings.
OcrClient = Callable[[str], List[str]]  # (image_path) -> list of OCR text strings


@dataclass
class ScreenMatchResult:
    """Outcome of checking a screenshot against a list of expected elements."""

    # Overall verdict of the check.
    match: bool
    # Confidence score attached to the verdict.
    confidence: float = 0.0
    # Free-text explanation of how the verdict was reached.
    reason: str = ""
    # Per-element observations collected during the check.
    observed_elements: List[Dict[str, Any]] = field(default_factory=list)
    # The expected elements the check was run against.
    expected_elements: List[Dict[str, Any]] = field(default_factory=list)
    # Human-readable descriptions of what did not match.
    mismatches: List[str] = field(default_factory=list)

    def describe(self) -> str:
        """Return a one-line summary of this result.

        A match yields a short OK message. A mismatch yields the header
        followed by the mismatch list and the reason, each included only
        when non-empty and separated by " | ".
        """
        if self.match:
            return f"Screen match OK (conf={self.confidence:.2f})"

        missing_part = (
            "missing: " + ", ".join(self.mismatches) if self.mismatches else ""
        )
        candidates = (
            f"Screen mismatch (conf={self.confidence:.2f})",
            missing_part,
            self.reason,
        )
        return " | ".join(piece for piece in candidates if piece)


# ── Text normalization (pure functions) ────────────────────────────────


def normalize_text(text: str) -> str:
    """Return a canonical form of *text* for fuzzy comparison.

    The text is lowercased and stripped, decomposed with NFKD so accents
    become separate combining marks, those marks are dropped (é -> e,
    à -> a), and every run of whitespace is collapsed to a single space.
    """
    lowered = text.lower().strip()
    decomposed = unicodedata.normalize("NFKD", lowered)
    # combining() returns 0 for base characters; only those are kept.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return re.sub(r"\s+", " ", "".join(base_chars))


def fuzzy_match(expected: str, observed: str, threshold: float = 0.8) -> bool:
    """Check whether *observed* fuzzy-matches *expected*.

    Both strings are normalized with ``normalize_text`` first. Strategies,
    tried in order (the first success wins):

    1. Exact match after normalization.
    2. Substring containment in either direction.
    3. ``SequenceMatcher`` ratio >= *threshold*.

    A string that normalizes to "" only matches another empty string. It is
    never allowed through the containment test, because ``"" in s`` is True
    for every ``s`` and would make an empty OCR token match anything.

    Args:
        expected: The text that should be on screen.
        observed: A candidate text (typically one OCR token).
        threshold: Minimum similarity ratio for strategy 3.

    Returns:
        True if any strategy accepts the pair, False otherwise.
    """
    norm_expected = normalize_text(expected)
    norm_observed = normalize_text(observed)

    if norm_expected == norm_observed:
        return True

    # Guard against the vacuous containment match on empty strings.
    if not norm_expected or not norm_observed:
        return False

    # NOTE(review): reverse containment still lets a very short observed
    # token (e.g. "a") match any expected text containing it. Kept as-is
    # for compatibility; consider a minimum-length rule.
    if norm_expected in norm_observed or norm_observed in norm_expected:
        return True

    ratio = SequenceMatcher(None, norm_expected, norm_observed).ratio()
    return ratio >= threshold


# ── OCR presence check (deterministic, no VLM) ──────────────────────


@dataclass
class OcrPresenceResult:
    """Result of the OCR-based presence check."""

    # Expected text -> OCR token that matched it ("" when nothing matched).
    # ocr_presence_check also stores a "" -> "" placeholder entry for
    # expected elements that carry no text.
    found_texts: Dict[str, str] = field(default_factory=dict)
    # "role: text" description of each expected text that was not found.
    missing: List[str] = field(default_factory=list)
    # True when no expected text is missing.
    all_found: bool = False

    @property
    def presence_ratio(self) -> float:
        """Fraction of expected texts that were matched by an OCR token.

        The "" placeholder key (text-less elements) is ignored: such an
        element has no text to look for, so it must not count as a miss.
        Returns 1.0 when there is no text-based expectation at all.
        """
        matches = [matched for text, matched in self.found_texts.items() if text]
        if not matches:
            return 1.0
        found_count = sum(1 for matched in matches if matched != "")
        return found_count / len(matches)


def ocr_presence_check(
    ocr_tokens: List[str],
    expected_elements: List[Dict[str, Any]],
    fuzzy_threshold: float = 0.8,
) -> OcrPresenceResult:
    """Check presence of expected texts against OCR tokens (deterministic).

    Pure function — no VLM call. For each expected element that has a
    "text", the first usable OCR token accepted by ``fuzzy_match`` is
    recorded as its match.

    Args:
        ocr_tokens: Text strings produced by OCR. ``None``, non-string
            entries and blank (empty / whitespace-only) entries are ignored.
        expected_elements: Dicts read through their "text" and "role" keys.
        fuzzy_threshold: Similarity threshold forwarded to ``fuzzy_match``.

    Returns:
        An ``OcrPresenceResult`` whose ``found_texts`` maps each expected
        text to its matching token ("" if none), whose ``missing`` lists
        the unmatched ones as "role: text", and whose ``all_found`` is True
        when nothing is missing. Elements without text are recorded under
        the "" key and never count as missing.
    """
    # Blank tokens are dropped up front. An empty token would otherwise
    # "match" through substring containment and stop the search with an
    # empty result (element wrongly reported missing), and a
    # whitespace-only token would be accepted as a real match.
    usable_tokens = [
        token
        for token in (ocr_tokens or [])
        if isinstance(token, str) and token.strip()
    ]

    found_texts: Dict[str, str] = {}
    missing: List[str] = []

    for el in expected_elements:
        expected_text = el.get("text", "")
        if not expected_text:
            # No text to look for: keep the placeholder entry.
            found_texts[""] = ""
            continue

        matched_ocr = next(
            (
                token
                for token in usable_tokens
                if fuzzy_match(expected_text, token, threshold=fuzzy_threshold)
            ),
            "",
        )

        found_texts[expected_text] = matched_ocr
        if not matched_ocr:
            missing.append(f"{el.get('role', '?')}: {expected_text}")

    return OcrPresenceResult(
        found_texts=found_texts,
        missing=missing,
        all_found=not missing,
    )


# ── VLM role confirmation (semantic, anchored on found OCR texts) ────


def build_role_confirm_prompt(
    found_elements: List[Dict[str, Any]],
    expected_elements: List[Dict[str, Any]],
    context: str = "",
) -> str:
    """Assemble the VLM prompt asking for the ROLE of OCR-found texts.

    Each entry of *found_elements* becomes one numbered line (starting at
    1) built from its "matched_ocr" and "expected_role" keys. The prompt
    tells the model to judge roles only, never presence, and to answer in
    a fixed JSON shape. *context* is inserted as its own line when
    non-empty. *expected_elements* is accepted but not read here.
    """
    numbered_lines = []
    for rank, item in enumerate(found_elements, start=1):
        ocr_text = item.get("matched_ocr", "")
        role = item.get("expected_role", "?")
        numbered_lines.append(f"{rank}. Text \"{ocr_text}\" — expected role: {role}")
    listing = "\n".join(numbered_lines)

    sections = [
        "You are a screen role validator. OCR has confirmed these texts are "
        "present on the screen. Your job is ONLY to confirm their ROLE — "
        "do NOT re-declare whether they are present.\n"
    ]
    if context:
        sections.append(f"Context: {context}\n")
    sections.append(f"Found texts with expected roles:\n{listing}\n\n")
    # Plain (non-f) literals below: the braces are part of the JSON example.
    sections.append(
        "Respond in JSON format:\n"
        "{\"confirmed\": [{\"index\": 1, \"role_confirmed\": true/false, "
        "\"actual_role\": \"...\", \"confidence\": 0.0-1.0}], "
        "\"overall_confidence\": 0.0-1.0}\n"
        "Only confirm role_confirmed=true if the text clearly plays the "
        "expected role (e.g., a button, not just a label with the same text)."
    )
    return "".join(sections)


def parse_role_confirm_response(vlm_text: str) -> Dict[str, Any]:
    """Parse the VLM role-confirmation reply into a normalized dict.

    The reply is parsed as JSON directly. If that fails, the span from the
    first "{" to the last "}" is extracted and parsed instead, which copes
    with prose or code fences around the JSON.

    The model output is untrusted, so every malformed shape degrades to an
    empty result instead of raising:

    - a non-string reply, unparseable text, or JSON that is not an object
      gives ``{"confirmed": [], "overall_confidence": 0.0}``;
    - a "confirmed" value that is not a list becomes ``[]``, and list
      entries that are not objects are dropped;
    - an "overall_confidence" that cannot be converted to float (``null``,
      non-numeric string, nested structure) becomes ``0.0``.

    Returns:
        A dict with "confirmed" (list of dicts) and "overall_confidence"
        (float).
    """
    if not isinstance(vlm_text, str):
        logger.warning("role_confirm: VLM response is not text")
        return {"confirmed": [], "overall_confidence": 0.0}

    try:
        data = json.loads(vlm_text)
    except json.JSONDecodeError:
        json_match = re.search(r"\{[\s\S]*\}", vlm_text)
        if json_match is None:
            logger.warning("role_confirm: no JSON object in VLM response")
            return {"confirmed": [], "overall_confidence": 0.0}
        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError:
            logger.warning("role_confirm: VLM response not parseable as JSON")
            return {"confirmed": [], "overall_confidence": 0.0}

    if not isinstance(data, dict):
        # Valid JSON but not an object (list, number, string, ...).
        logger.warning("role_confirm: VLM response is not a JSON object")
        return {"confirmed": [], "overall_confidence": 0.0}

    raw_confirmed = data.get("confirmed", [])
    if not isinstance(raw_confirmed, list):
        raw_confirmed = []
    # Consumers call .get() on each entry, so keep only objects.
    confirmed = [entry for entry in raw_confirmed if isinstance(entry, dict)]

    try:
        overall_conf = float(data.get("overall_confidence", 0.0))
    except (TypeError, ValueError):
        overall_conf = 0.0

    return {
        "confirmed": confirmed,
        "overall_confidence": overall_conf,
    }


# ── Core verification (OCR-anchored composition) ────────────────────


def _coerce_confidence(value: Any) -> float:
    """Convert a VLM-supplied confidence to float; unusable values give 0.0."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def _is_true(value: Any) -> bool:
    """Return True only for JSON ``true`` or the string "true" (any case).

    Plain truthiness is not used because the string "false" is truthy.
    """
    if isinstance(value, str):
        return value.strip().lower() == "true"
    return value is True


def _find_role_entry(confirmed: Any, prompt_index: int) -> Optional[Dict[str, Any]]:
    """Return the entry of *confirmed* whose "index" equals *prompt_index*.

    Non-list input and non-dict entries are ignored. A numeric string
    index ("2") is accepted.
    """
    if not isinstance(confirmed, list):
        return None
    for entry in confirmed:
        if not isinstance(entry, dict):
            continue
        raw_index = entry.get("index")
        if isinstance(raw_index, str):
            try:
                raw_index = int(raw_index.strip())
            except ValueError:
                continue
        if not isinstance(raw_index, bool) and raw_index == prompt_index:
            return entry
    return None


def verify_screen_match(
    screenshot_path: str,
    expected_elements: List[Dict[str, Any]],
    ocr_client: OcrClient,
    vlm_client: VlmClient,
    context: str = "",
    min_confidence: float = 0.7,
) -> ScreenMatchResult:
    """Verify screen state with OCR-anchored presence + VLM role confirmation.

    Step 1: OCR the screenshot and run the deterministic presence check.
            Any missing text ends the verification with ``match=False``.
    Step 2: Ask the VLM to confirm the ROLE of each found text (never its
            presence). Every confirmed role and the overall confidence must
            reach *min_confidence* for the result to be a match.

    Elements without a "text" are not sent to the VLM and are not
    role-checked. The numbering used in the prompt is tracked per element
    so each VLM answer is attached to the element it was asked about.

    Args:
        screenshot_path: Path of the screenshot passed to both clients.
        expected_elements: Dicts read through their "text" and "role" keys.
        ocr_client: Callable returning the OCR text strings for an image.
        vlm_client: Callable returning the VLM reply for (image, prompt).
        context: Optional context string inserted in the VLM prompt.
        min_confidence: Threshold applied to per-role and overall confidence.

    Returns:
        A ``ScreenMatchResult``. Client exceptions are caught and reported
        in the result instead of being raised.
    """
    if not expected_elements:
        return ScreenMatchResult(
            match=True,
            confidence=1.0,
            reason="no expected elements to verify",
        )

    # Step 1: OCR presence check (deterministic)
    try:
        ocr_tokens = ocr_client(screenshot_path)
    except Exception as e:
        logger.warning("verify_screen_match: OCR call failed (%s)", e)
        return ScreenMatchResult(
            match=False,
            confidence=0.0,
            reason=f"OCR error: {e}",
            expected_elements=expected_elements,
        )
    if ocr_tokens is None:
        # A client returning nothing is treated as "no text on screen".
        ocr_tokens = []

    presence = ocr_presence_check(ocr_tokens, expected_elements)

    if not presence.all_found:
        observed = []
        for el in expected_elements:
            text = el.get("text", "")
            matched = presence.found_texts.get(text, "")
            observed.append({
                "role": el.get("role", "?"),
                "expected_text": text,
                "matched_ocr": matched,
                "found": matched != "",
            })
        return ScreenMatchResult(
            match=False,
            confidence=presence.presence_ratio,
            reason="OCR presence check: some texts not found",
            observed_elements=observed,
            expected_elements=expected_elements,
            mismatches=presence.missing,
        )

    # Step 2: VLM role confirmation (only for found elements).
    # prompt_index maps a position in expected_elements to the 1-based
    # number that element gets in the prompt. The two differ as soon as a
    # text-less element is skipped.
    found_elements = []
    prompt_index: Dict[int, int] = {}
    for pos, el in enumerate(expected_elements):
        text = el.get("text", "")
        matched_ocr = presence.found_texts.get(text, "")
        if text and matched_ocr:
            found_elements.append({
                "text": text,
                "expected_role": el.get("role", "?"),
                "matched_ocr": matched_ocr,
            })
            prompt_index[pos] = len(found_elements)

    if not found_elements:
        # All elements had no text → presence trivially OK
        return ScreenMatchResult(
            match=True,
            confidence=1.0,
            reason="no text-based elements to verify",
            expected_elements=expected_elements,
        )

    prompt = build_role_confirm_prompt(found_elements, expected_elements, context)

    try:
        vlm_text = vlm_client(screenshot_path, prompt)
    except Exception as e:
        logger.warning("verify_screen_match: VLM role confirm failed (%s)", e)
        observed = []
        for el in expected_elements:
            text = el.get("text", "")
            observed.append({
                "role": el.get("role", "?"),
                "expected_text": text,
                "matched_ocr": presence.found_texts.get(text, ""),
                "found": True,
                "role_confirmed": False,
                "role_confidence": 0.0,
            })
        # NOTE(review): this degrades to an OCR-only match (match=True at
        # confidence 0.5) even though no role was confirmed and 0.5 is
        # below min_confidence. Kept as the existing behavior — confirm it
        # is intended, especially for post-action checks.
        return ScreenMatchResult(
            match=True,
            confidence=0.5,
            reason=f"OCR presence OK, VLM role confirm failed: {e}",
            observed_elements=observed,
            expected_elements=expected_elements,
        )

    try:
        parsed = parse_role_confirm_response(vlm_text)
    except (AttributeError, TypeError, ValueError) as e:
        # Malformed model output must fail the check, not crash it.
        logger.warning("verify_screen_match: VLM response unusable (%s)", e)
        parsed = {}
    if not isinstance(parsed, dict):
        parsed = {}
    overall_conf = _coerce_confidence(parsed.get("overall_confidence", 0.0))
    confirmed = parsed.get("confirmed", [])

    observed = []
    role_mismatches = []
    for pos, el in enumerate(expected_elements):
        text = el.get("text", "")
        expected_role = el.get("role", "?")
        matched_ocr = presence.found_texts.get(text, "")

        role_confirmed = False
        actual_role = ""
        role_confidence = 0.0

        index_in_prompt = prompt_index.get(pos)
        if index_in_prompt is not None:
            role_entry = _find_role_entry(confirmed, index_in_prompt)
            if role_entry is not None:
                role_confirmed = _is_true(role_entry.get("role_confirmed", False))
                actual_role = role_entry.get("actual_role", "")
                role_confidence = _coerce_confidence(role_entry.get("confidence", 0.0))

        observed.append({
            "role": expected_role,
            "expected_text": text,
            "matched_ocr": matched_ocr,
            "found": True,
            "role_confirmed": role_confirmed,
            "actual_role": actual_role,
            "role_confidence": role_confidence,
        })

        if index_in_prompt is None:
            # Text-less element: the VLM was never asked about it.
            continue

        # "not (x >= t)" instead of "x < t" so a NaN confidence is rejected.
        if not role_confirmed or not (role_confidence >= min_confidence):
            role_mismatches.append(
                f"{expected_role}: {text} (actual={actual_role}, conf={role_confidence:.2f})"
            )

    is_match = len(role_mismatches) == 0 and overall_conf >= min_confidence

    return ScreenMatchResult(
        match=is_match,
        confidence=overall_conf,
        reason=f"OCR presence: {presence.presence_ratio:.0%}, VLM role: {overall_conf:.2f}",
        observed_elements=observed,
        expected_elements=expected_elements,
        mismatches=presence.missing + role_mismatches,
    )


def verify_before(
    screenshot_path: str,
    expected_elements: List[Dict[str, Any]],
    ocr_client: OcrClient,
    vlm_client: VlmClient,
    context: str = "",
) -> ScreenMatchResult:
    """Check the pre-conditions of an action on the current screen.

    Delegates to ``verify_screen_match`` with the context prefixed by
    "PRE-ACTION: " and a confidence threshold of 0.7, which leaves some
    tolerance for pre-action verification.
    """
    pre_action_context = f"PRE-ACTION: {context}"
    return verify_screen_match(
        screenshot_path=screenshot_path,
        expected_elements=expected_elements,
        ocr_client=ocr_client,
        vlm_client=vlm_client,
        context=pre_action_context,
        min_confidence=0.7,
    )


def verify_after(
    screenshot_path: str,
    expected_elements: List[Dict[str, Any]],
    ocr_client: OcrClient,
    vlm_client: VlmClient,
    context: str = "",
) -> ScreenMatchResult:
    """Check the post-conditions of an action on the current screen.

    Delegates to ``verify_screen_match`` with the context prefixed by
    "POST-ACTION: " and a stricter confidence threshold of 0.8: a false
    positive here would let the caller proceed on a wrong assumption.
    """
    post_action_context = f"POST-ACTION: {context}"
    return verify_screen_match(
        screenshot_path=screenshot_path,
        expected_elements=expected_elements,
        ocr_client=ocr_client,
        vlm_client=vlm_client,
        context=post_action_context,
        min_confidence=0.8,
    )