- core/navigation/ : visual_verifier (presence=OCR, role=VLM ancre sur tokens), grounding (OCR-anchor first, VLM fallback, cache coords valide par la vue), visual_login (verify_before/after, DETTE-023), action_resolver (pont runtime) - api_stream/replay_engine : dispatch action navigate server-side, never-fail -> needs_review, import depuis core.navigation (boot 5005 garanti) - 131 tests verts (wiring boot, e2e handler, unit modules) Chantier Qwen 01-02/07/2026, revue croisee Claude (plan deploy v2). Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
409 lines
13 KiB
Python
409 lines
13 KiB
Python
"""Visual verifier — verify_before / verify_after avec ancrage OCR.
|
|
|
|
Architecture OCR-ancrée (challenge Claude 01/07, gate-vert 30/06) :
|
|
- PRESENCE = tokens OCR (déterministe, pas d'hallucination possible)
|
|
- RÔLE = VLM confirmation (semantic, ancré sur tokens OCR trouvés)
|
|
- VLM ne décide JAMAIS de la présence d'un élément
|
|
- Faux positif impossible par construction ; faux négatif = retry acceptable
|
|
|
|
Pattern d'injection : OcrClient + VlmClient injectables (tests sans réseau).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
import unicodedata
|
|
from dataclasses import dataclass, field
|
|
from difflib import SequenceMatcher
|
|
from typing import Any, Callable, Dict, List, Optional
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Type aliases — injectable callables for offline testing.
# VlmClient is called with (image_path, prompt) and returns the model's text.
VlmClient = Callable[[str, str], str]  # (image_path, prompt) -> text
# OcrClient is called with (image_path) and returns the recognised strings.
OcrClient = Callable[[str], List[str]]  # (image_path) -> list of OCR text strings
|
|
|
|
|
|
@dataclass
class ScreenMatchResult:
    """Outcome of checking a screenshot against a list of expected elements."""

    # True when the screen is considered to satisfy the expectations.
    match: bool
    # Score attached to the verdict; describe() prints it with two decimals.
    confidence: float = 0.0
    # Free-form explanation of the verdict.
    reason: str = ""
    # Per-element observations gathered while verifying.
    observed_elements: List[Dict[str, Any]] = field(default_factory=list)
    # The expectations the screenshot was checked against.
    expected_elements: List[Dict[str, Any]] = field(default_factory=list)
    # Human-readable descriptions of what did not match.
    mismatches: List[str] = field(default_factory=list)

    def describe(self) -> str:
        """Return a one-line, human-readable summary of this result.

        A successful result yields a single "OK" sentence. A failed one
        yields the mismatch header followed, when available, by the list of
        mismatches and the reason, joined with " | ".
        """
        if not self.match:
            summary = [f"Screen mismatch (conf={self.confidence:.2f})"]
            if self.mismatches:
                summary.append("missing: " + ", ".join(self.mismatches))
            if self.reason:
                summary.append(self.reason)
            return " | ".join(summary)
        return f"Screen match OK (conf={self.confidence:.2f})"
|
|
|
|
|
|
# ── Text normalization (pure functions) ────────────────────────────────
|
|
|
|
|
|
def normalize_text(text: str) -> str:
    """Normalize text for fuzzy matching.

    Steps: lowercase, NFKD-decompose, drop combining marks (é→e, à→a, ...),
    collapse every whitespace run to a single space, then strip the edges.

    Args:
        text: Raw text (expected label or OCR token).

    Returns:
        The normalized text, without leading or trailing whitespace.
    """
    decomposed = unicodedata.normalize("NFKD", text.lower())
    without_marks = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    # Strip LAST: NFKD turns spacing accents such as U+00B4 (´) or U+00A8 (¨)
    # into "space + combining mark", so stripping before decomposition would
    # leave a stray leading/trailing space once the mark is removed.
    return re.sub(r"\s+", " ", without_marks).strip()
|
|
|
|
|
|
def fuzzy_match(expected: str, observed: str, threshold: float = 0.8) -> bool:
    """Check whether observed text fuzzy-matches expected text.

    Both sides are normalized with ``normalize_text`` first. Three strategies
    are then tried in order (any one wins):
      1. Exact match after normalization
      2. Substring containment (either direction)
      3. SequenceMatcher ratio >= threshold

    A side that normalizes to the empty string only matches another empty
    side; it never matches non-empty text.

    Args:
        expected: Text that should be on screen.
        observed: Candidate text (e.g. one OCR token).
        threshold: Minimum SequenceMatcher ratio accepted by strategy 3.

    Returns:
        True when any strategy accepts the pair, False otherwise.
    """
    norm_expected = normalize_text(expected)
    norm_observed = normalize_text(observed)

    if norm_expected == norm_observed:
        return True

    # "" is a substring of every string, so without this guard an empty or
    # whitespace-only OCR token would "contain-match" any expected text.
    if not norm_expected or not norm_observed:
        return False

    # NOTE(review): containment also accepts very short tokens ("ok" is inside
    # "book a flight"); confirm whether a minimum token length is wanted.
    if norm_expected in norm_observed or norm_observed in norm_expected:
        return True

    similarity = SequenceMatcher(None, norm_expected, norm_observed).ratio()
    return similarity >= threshold
|
|
|
|
|
|
# ── OCR presence check (deterministic, no VLM) ──────────────────────
|
|
|
|
|
|
@dataclass
class OcrPresenceResult:
    """Outcome of the OCR-based presence check."""

    # Maps each expected text to the OCR token that matched it ("" = no match).
    found_texts: Dict[str, str] = field(default_factory=dict)
    # Descriptions of the expected elements that had no matching token.
    missing: List[str] = field(default_factory=list)
    # True when nothing is missing.
    all_found: bool = False

    @property
    def presence_ratio(self) -> float:
        """Fraction of ``found_texts`` entries with a non-empty match.

        Returns 1.0 when there are no entries at all.
        """
        if not self.found_texts:
            return 1.0
        hits = [token for token in self.found_texts.values() if token != ""]
        return len(hits) / len(self.found_texts)
|
|
|
|
|
|
def ocr_presence_check(
    ocr_tokens: List[str],
    expected_elements: List[Dict[str, Any]],
    fuzzy_threshold: float = 0.8,
) -> OcrPresenceResult:
    """Check presence of expected texts against OCR tokens (deterministic).

    For every expected element that carries a non-empty ``"text"``, the first
    OCR token accepted by ``fuzzy_match`` is recorded. No VLM is called here.

    Args:
        ocr_tokens: Text strings returned by the OCR client.
        expected_elements: Dicts read through their ``"text"`` and ``"role"``
            keys. Elements without text cannot be checked by OCR and are
            skipped: they are neither recorded as found nor as missing.
        fuzzy_threshold: Threshold forwarded to ``fuzzy_match``.

    Returns:
        An ``OcrPresenceResult`` where ``found_texts`` maps each expected
        text to its matched token ("" when none), ``missing`` lists
        ``"<role>: <text>"`` for unmatched elements, and ``all_found`` is
        True when ``missing`` is empty.
    """
    # Blank tokens carry no evidence. Left in, a blank token placed before the
    # real one would be picked as "the match" and, being empty, would make the
    # element look missing.
    usable_tokens = [
        token
        for token in (ocr_tokens or [])
        if isinstance(token, str) and token.strip()
    ]

    found_texts: Dict[str, str] = {}
    missing: List[str] = []

    for el in expected_elements:
        expected_text = el.get("text", "")
        if not expected_text:
            # Nothing to look for: do not store a "" key, which would be
            # counted as a missing entry by OcrPresenceResult.presence_ratio.
            continue

        matched_ocr = next(
            (
                token
                for token in usable_tokens
                if fuzzy_match(expected_text, token, threshold=fuzzy_threshold)
            ),
            "",
        )
        found_texts[expected_text] = matched_ocr
        if not matched_ocr:
            missing.append(f"{el.get('role', '?')}: {expected_text}")

    return OcrPresenceResult(
        found_texts=found_texts,
        missing=missing,
        all_found=not missing,
    )
|
|
|
|
|
|
# ── VLM role confirmation (semantic, anchored on found OCR texts) ────
|
|
|
|
|
|
def build_role_confirm_prompt(
    found_elements: List[Dict[str, Any]],
    expected_elements: List[Dict[str, Any]],
    context: str = "",
) -> str:
    """Assemble the VLM prompt asking to confirm the ROLE of OCR-found texts.

    The prompt lists each found text with a 1-based number and its expected
    role, and asks for a JSON answer. It tells the model not to re-decide
    presence.

    Args:
        found_elements: Dicts read through ``"matched_ocr"`` and
            ``"expected_role"``.
        expected_elements: Accepted for interface compatibility; not used
            when building the prompt.
        context: Optional context line, inserted only when non-empty.

    Returns:
        The full prompt text.
    """
    numbered = []
    for position, item in enumerate(found_elements, start=1):
        ocr_text = item.get("matched_ocr", "")
        role = item.get("expected_role", "?")
        numbered.append(f"{position}. Text \"{ocr_text}\" — expected role: {role}")
    listing = "\n".join(numbered)

    sections = [
        "You are a screen role validator. OCR has confirmed these texts are "
        "present on the screen. Your job is ONLY to confirm their ROLE — "
        "do NOT re-declare whether they are present.\n",
    ]
    if context:
        sections.append(f"Context: {context}\n")
    sections.append(f"Found texts with expected roles:\n{listing}\n\n")
    sections.append(
        "Respond in JSON format:\n"
        "{\"confirmed\": [{\"index\": 1, \"role_confirmed\": true/false, "
        "\"actual_role\": \"...\", \"confidence\": 0.0-1.0}], "
        "\"overall_confidence\": 0.0-1.0}\n"
        "Only confirm role_confirmed=true if the text clearly plays the "
        "expected role (e.g., a button, not just a label with the same text)."
    )
    return "".join(sections)
|
|
|
|
|
|
def parse_role_confirm_response(vlm_text: str) -> Dict[str, Any]:
    """Parse the VLM role-confirmation reply into a predictable structure.

    The reply is parsed as JSON; if that fails, the widest ``{...}`` span
    found in the text is parsed instead (models often wrap JSON in prose or
    code fences). Anything that does not yield a JSON object produces the
    empty result rather than an exception.

    Args:
        vlm_text: Raw text returned by the VLM client.

    Returns:
        A dict with exactly two keys:
          - ``"confirmed"``: list of dict entries (non-dict items dropped;
            a missing or non-list value becomes ``[]``).
          - ``"overall_confidence"``: float (missing or non-numeric → 0.0).
    """
    empty: Dict[str, Any] = {"confirmed": [], "overall_confidence": 0.0}

    try:
        data = json.loads(vlm_text)
    except json.JSONDecodeError:
        json_match = re.search(r"\{[\s\S]*\}", vlm_text)
        if not json_match:
            return empty
        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError:
            logger.warning("role_confirm: VLM response not parseable as JSON")
            return empty

    # Valid JSON is not necessarily an object ("[]", "null", "42", ...).
    if not isinstance(data, dict):
        logger.warning("role_confirm: VLM response is not a JSON object")
        return empty

    raw_confirmed = data.get("confirmed")
    if isinstance(raw_confirmed, list):
        # Callers read entries with .get(); keep only dict entries.
        confirmed = [entry for entry in raw_confirmed if isinstance(entry, dict)]
    else:
        confirmed = []

    try:
        overall_conf = float(data.get("overall_confidence", 0.0))
    except (TypeError, ValueError):
        # null, lists, or non-numeric strings all count as "no confidence".
        overall_conf = 0.0

    return {
        "confirmed": confirmed,
        "overall_confidence": overall_conf,
    }
|
|
|
|
|
|
# ── Core verification (OCR-anchored composition) ────────────────────
|
|
|
|
|
|
def _coerce_confidence(value: Any) -> float:
    """Convert a VLM-supplied confidence to float; non-numeric becomes 0.0."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def _role_entries_by_index(confirmed: Any) -> Dict[int, Dict[str, Any]]:
    """Map each VLM entry's 1-based ``index`` to the entry (first one wins)."""
    by_index: Dict[int, Dict[str, Any]] = {}
    if not isinstance(confirmed, list):
        return by_index
    for entry in confirmed:
        if not isinstance(entry, dict):
            continue
        try:
            index = int(entry.get("index"))
        except (TypeError, ValueError):
            continue
        by_index.setdefault(index, entry)
    return by_index


def _is_affirmative(value: Any) -> bool:
    """Accept only JSON ``true`` (or the string "true") as a confirmation."""
    if isinstance(value, str):
        # A model may quote booleans; the string "false" must not be truthy.
        return value.strip().lower() == "true"
    return value is True


def verify_screen_match(
    screenshot_path: str,
    expected_elements: List[Dict[str, Any]],
    ocr_client: OcrClient,
    vlm_client: VlmClient,
    context: str = "",
    min_confidence: float = 0.7,
) -> ScreenMatchResult:
    """Verify screen state with OCR-anchored presence + VLM role confirmation.

    Step 1: OCR the screenshot and run the deterministic presence check.
    Step 2: ask the VLM to confirm the role of the found texts (not presence).

    Args:
        screenshot_path: Path handed to both clients.
        expected_elements: Dicts read through their ``"text"`` and ``"role"``
            keys. Elements without text are never sent to the VLM and never
            produce a role mismatch.
        ocr_client: Callable returning the OCR text strings for an image.
        vlm_client: Callable returning the VLM reply for (image, prompt).
        context: Optional context forwarded into the VLM prompt.
        min_confidence: Minimum per-element and overall confidence required.

    Returns:
        A ``ScreenMatchResult``. Client exceptions are caught and reported
        through the result instead of propagating.
    """
    if not expected_elements:
        return ScreenMatchResult(
            match=True,
            confidence=1.0,
            reason="no expected elements to verify",
        )

    # Step 1: OCR presence check (deterministic)
    try:
        ocr_tokens = ocr_client(screenshot_path)
    except Exception as e:
        logger.warning("verify_screen_match: OCR call failed (%s)", e)
        return ScreenMatchResult(
            match=False,
            confidence=0.0,
            reason=f"OCR error: {e}",
            expected_elements=expected_elements,
        )

    presence = ocr_presence_check(ocr_tokens, expected_elements)

    if not presence.all_found:
        observed = []
        for el in expected_elements:
            text = el.get("text", "")
            matched = presence.found_texts.get(text, "")
            observed.append({
                "role": el.get("role", "?"),
                "expected_text": text,
                "matched_ocr": matched,
                "found": matched != "",
            })
        return ScreenMatchResult(
            match=False,
            confidence=presence.presence_ratio,
            reason="OCR presence check: some texts not found",
            observed_elements=observed,
            expected_elements=expected_elements,
            mismatches=presence.missing,
        )

    # Step 2: VLM role confirmation (only for found elements)
    found_elements = []
    # Position in expected_elements -> 1-based number used in the prompt.
    # The prompt numbers only the found elements, so the two sequences diverge
    # as soon as one expected element has no text.
    prompt_index: Dict[int, int] = {}
    for position, el in enumerate(expected_elements):
        text = el.get("text", "")
        matched_ocr = presence.found_texts.get(text, "")
        if text and matched_ocr:
            found_elements.append({
                "text": text,
                "expected_role": el.get("role", "?"),
                "matched_ocr": matched_ocr,
            })
            prompt_index[position] = len(found_elements)

    if not found_elements:
        # All elements had no text → presence trivially OK
        return ScreenMatchResult(
            match=True,
            confidence=1.0,
            reason="no text-based elements to verify",
            expected_elements=expected_elements,
        )

    prompt = build_role_confirm_prompt(found_elements, expected_elements, context)

    try:
        vlm_text = vlm_client(screenshot_path, prompt)
    except Exception as e:
        logger.warning("verify_screen_match: VLM role confirm failed (%s)", e)
        observed = []
        for el in expected_elements:
            text = el.get("text", "")
            observed.append({
                "role": el.get("role", "?"),
                "expected_text": text,
                "matched_ocr": presence.found_texts.get(text, ""),
                "found": True,
                "role_confirmed": False,
                "role_confidence": 0.0,
            })
        # NOTE(review): this path reports match=True with a fixed confidence
        # of 0.5, regardless of min_confidence. Behavior kept as-is; confirm
        # that accepting the screen on OCR presence alone is intended.
        return ScreenMatchResult(
            match=True,
            confidence=0.5,
            reason=f"OCR presence OK, VLM role confirm failed: {e}",
            observed_elements=observed,
            expected_elements=expected_elements,
        )

    parsed = parse_role_confirm_response(vlm_text)
    overall_conf = _coerce_confidence(parsed.get("overall_confidence", 0.0))
    entries = _role_entries_by_index(parsed.get("confirmed", []))

    observed = []
    role_mismatches = []
    for position, el in enumerate(expected_elements):
        text = el.get("text", "")
        expected_role = el.get("role", "?")
        matched_ocr = presence.found_texts.get(text, "")

        # Look the reply up by the number this element had IN THE PROMPT.
        number = prompt_index.get(position)
        role_entry = entries.get(number) if number is not None else None

        role_confirmed = False
        actual_role = ""
        role_confidence = 0.0
        if role_entry:
            role_confirmed = _is_affirmative(role_entry.get("role_confirmed", False))
            actual_role = role_entry.get("actual_role", "")
            role_confidence = _coerce_confidence(role_entry.get("confidence", 0.0))

        observed.append({
            "role": expected_role,
            "expected_text": text,
            "matched_ocr": matched_ocr,
            "found": True,
            "role_confirmed": role_confirmed,
            "actual_role": actual_role,
            "role_confidence": role_confidence,
        })

        if number is None:
            # Text-less element: it was never submitted to the VLM, so the
            # absence of a confirmation is not a role mismatch.
            continue

        # Written as ">=" so that a NaN confidence fails the gate.
        confident_enough = role_confidence >= min_confidence
        if not role_confirmed or not confident_enough:
            role_mismatches.append(
                f"{expected_role}: {text} (actual={actual_role}, conf={role_confidence:.2f})"
            )

    is_match = len(role_mismatches) == 0 and overall_conf >= min_confidence

    return ScreenMatchResult(
        match=is_match,
        confidence=overall_conf,
        reason=f"OCR presence: {presence.presence_ratio:.0%}, VLM role: {overall_conf:.2f}",
        observed_elements=observed,
        expected_elements=expected_elements,
        mismatches=presence.missing + role_mismatches,
    )
|
|
|
|
|
|
def verify_before(
    screenshot_path: str,
    expected_elements: List[Dict[str, Any]],
    ocr_client: OcrClient,
    vlm_client: VlmClient,
    context: str = "",
) -> ScreenMatchResult:
    """Check the pre-conditions of an action (OCR-anchored).

    Delegates to ``verify_screen_match`` with the context prefixed by
    "PRE-ACTION: " and a minimum confidence of 0.7, which leaves some
    tolerance for pre-action verification.
    """
    tagged_context = f"PRE-ACTION: {context}"
    return verify_screen_match(
        screenshot_path,
        expected_elements,
        ocr_client,
        vlm_client,
        context=tagged_context,
        min_confidence=0.7,
    )
|
|
|
|
|
|
def verify_after(
    screenshot_path: str,
    expected_elements: List[Dict[str, Any]],
    ocr_client: OcrClient,
    vlm_client: VlmClient,
    context: str = "",
) -> ScreenMatchResult:
    """Check the post-conditions of an action (OCR-anchored).

    Delegates to ``verify_screen_match`` with the context prefixed by
    "POST-ACTION: " and a stricter minimum confidence of 0.8: a false
    positive here would let the caller proceed on a wrong assumption.
    """
    tagged_context = f"POST-ACTION: {context}"
    return verify_screen_match(
        screenshot_path,
        expected_elements,
        ocr_client,
        vlm_client,
        context=tagged_context,
        min_confidence=0.8,
    )
|