From 1b4e64960b443c81133a80d9b4780e8afc3eb985 Mon Sep 17 00:00:00 2001 From: Dom Date: Sun, 24 May 2026 17:52:06 +0200 Subject: [PATCH] =?UTF-8?q?feat(validator):=20R1=20MVP=20P0=20=E2=80=94=20?= =?UTF-8?q?OcrRoiChecker=20+=20orchestrator=20(flag=20OFF=20default)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Package core/validation/ minimal : - result.py : Verdict, FailureCategory, ValidationResult - pixel_diff_checker.py : wrapper de ReplayVerifier.verify_action - ocr_roi_checker.py : ROI 80px autour du clic, détecte WRONG_APPLICATION via SUSPECT_TOKENS (edge/https/explorateur de fichiers/…) - orchestrator.py : Validator dispatch action_type → checkers + agrégation Wiring api_stream.py:3646 derrière RPA_VALIDATOR_V2_ENABLED (OFF par défaut). Si verdict ≠ COMPLETE, override report.success=False et expose failure_category dans result_entry. Zero régression flag OFF. Tests : - tests/unit/test_validator_v2.py : 13 tests (Checkers + Validator + sérialisation) - tests/integration/test_validator_step10.py : 2 tests reproduisant le bug replay_sess_4c38dbb8 / act_raw_6c1432b3 (clic Enregistrer fait basculer vers Explorateur de fichiers) — Validator retourne WRONG_APPLICATION Activation pour test live : RPA_VALIDATOR_V2_ENABLED=true Cf. docs/recherche/SPEC_VALIDATOR_MATRICE.md, AXE_B2_DEEP_VALIDATOR.md. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- agent_v0/server_v1/api_stream.py | 70 ++++++ core/validation/__init__.py | 31 +++ core/validation/ocr_roi_checker.py | 171 ++++++++++++++ core/validation/orchestrator.py | 79 +++++++ core/validation/pixel_diff_checker.py | 68 ++++++ core/validation/result.py | 53 +++++ tests/integration/test_validator_step10.py | 130 +++++++++++ tests/unit/test_validator_v2.py | 249 +++++++++++++++++++++ 8 files changed, 851 insertions(+) create mode 100644 core/validation/__init__.py create mode 100644 core/validation/ocr_roi_checker.py create mode 100644 core/validation/orchestrator.py create mode 100644 core/validation/pixel_diff_checker.py create mode 100644 core/validation/result.py create mode 100644 tests/integration/test_validator_step10.py create mode 100644 tests/unit/test_validator_v2.py diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py index 0044f6834..4e2f2feae 100644 --- a/agent_v0/server_v1/api_stream.py +++ b/agent_v0/server_v1/api_stream.py @@ -53,6 +53,42 @@ _replay_verifier = ReplayVerifier() _replay_learner = ReplayLearner() _audit_trail = AuditTrail() +# Validator V2 (MVP P0) — flag RPA_VALIDATOR_V2_ENABLED, OFF par défaut. +# Lazy init au premier appel : EasyOCR/docTR sont chargés à la demande. +_validator_v2 = None + + +def _get_validator_v2(): + """Lazy init du Validator V2. 
Active uniquement si flag ON.""" + global _validator_v2 + if _validator_v2 is not None: + return _validator_v2 + if os.environ.get("RPA_VALIDATOR_V2_ENABLED", "").lower() not in ("1", "true", "yes", "on"): + return None + try: + from core.validation import ( + OcrRoiChecker, + PixelDiffChecker, + Validator, + ) + pixel = PixelDiffChecker(_replay_verifier) + ocr_click = OcrRoiChecker(radius_px=80) + ocr_type = OcrRoiChecker(radius_px=120) + _validator_v2 = Validator( + checkers={ + "click": [ocr_click, pixel], + "double_click": [ocr_click, pixel], + "right_click": [ocr_click, pixel], + "type": [ocr_type, pixel], + }, + default_checkers=[pixel], + ) + logger.info("[VALIDATOR_V2] activé (flag RPA_VALIDATOR_V2_ENABLED=ON)") + except Exception as exc: + logger.warning(f"[VALIDATOR_V2] init impossible: {exc}") + _validator_v2 = None + return _validator_v2 + # Nombre maximum de retries par action avant de déclarer un échec MAX_RETRIES_PER_ACTION = 3 @@ -3607,6 +3643,10 @@ async def report_action_result(report: ReplayResultReport): # Skip aussi la vérification serveur si l'agent a déjà géré la popup skip_verify = skip_verify or agent_handled_popup verification = None + # [VALIDATOR_V2] override conditionnel — flag RPA_VALIDATOR_V2_ENABLED. + # Si verdict ≠ COMPLETE, on force result.success=False et on expose failure_category. 
+ validator_v2_result = None + validator_v2_failure_category = None if report.success and screenshot_after and not skip_verify: # Utiliser le screenshot_before envoyé par l'agent (Critic fiable) # Fallback sur le dernier screenshot stocké côté serveur @@ -3617,7 +3657,35 @@ async def report_action_result(report: ReplayResultReport): result_dict = { "success": report.success, "error": report.error, + "actual_position": report.actual_position, } + # === Validator V2 (P0 MVP) — derrière flag, sinon no-op === + v2 = _get_validator_v2() + if v2 is not None: + try: + validator_v2_result = v2.validate( + action=action_dict, + result=result_dict, + screenshot_before=screenshot_before, + screenshot_after=screenshot_after, + context={}, + ) + from core.validation import Verdict as _V2Verdict + if validator_v2_result.verdict != _V2Verdict.COMPLETE: + validator_v2_failure_category = ( + validator_v2_result.failure_category.value + if validator_v2_result.failure_category else None + ) + report.success = False + logger.info( + f"[VALIDATOR_V2] override success→False action={action_id} " + f"verdict={validator_v2_result.verdict.value} " + f"conf={validator_v2_result.confidence:.2f} " + f"failure_category={validator_v2_failure_category} " + f"reason={validator_v2_result.reasoning[:120]}" + ) + except Exception as exc: + logger.warning(f"[VALIDATOR_V2] échec: {exc}") # Utiliser le Critic sémantique si l'action a un expected_result expected_result = (original_action or {}).get("expected_result", "") action_intention = (original_action or {}).get("intention", "") @@ -3686,6 +3754,8 @@ async def report_action_result(report: ReplayResultReport): "actual_position": report.actual_position, "retry_count": retry_count, "verification": verification.to_dict() if verification else None, + "validator_v2": validator_v2_result.to_dict() if validator_v2_result else None, + "failure_category": validator_v2_failure_category, "resolution_method": report.resolution_method, "resolution_score": 
report.resolution_score, "resolution_elapsed_ms": report.resolution_elapsed_ms, diff --git a/core/validation/__init__.py b/core/validation/__init__.py new file mode 100644 index 000000000..6fa2209f9 --- /dev/null +++ b/core/validation/__init__.py @@ -0,0 +1,31 @@ +"""core.validation — Validator V2 (MVP P0). + +Pattern Planner-Actor-Validator (cf. SPEC_VALIDATOR_MATRICE.md). +Donne un verdict structuré (Verdict / FailureCategory) sur l'effet d'une action +en agrégeant plusieurs Checkers spécialisés. + +Périmètre P0 : +- PixelDiffChecker (wrapper ReplayVerifier existant) +- OcrRoiChecker (ROI 80px autour du clic, détecte WRONG_APPLICATION = bug step 10) +- Validator orchestrateur (dispatch action_type → checkers + agrégation conf) + +Flag d'activation : variable d'env RPA_VALIDATOR_V2_ENABLED=true (OFF par défaut). +""" + +from core.validation.result import ( + FailureCategory, + ValidationResult, + Verdict, +) +from core.validation.pixel_diff_checker import PixelDiffChecker +from core.validation.ocr_roi_checker import OcrRoiChecker +from core.validation.orchestrator import Validator + +__all__ = [ + "Validator", + "Verdict", + "FailureCategory", + "ValidationResult", + "PixelDiffChecker", + "OcrRoiChecker", +] diff --git a/core/validation/ocr_roi_checker.py b/core/validation/ocr_roi_checker.py new file mode 100644 index 000000000..e466d1af4 --- /dev/null +++ b/core/validation/ocr_roi_checker.py @@ -0,0 +1,171 @@ +"""OcrRoiChecker — ROI 80px (ou 120 px pour type) autour du clic. + +Détecte WRONG_APPLICATION (bug step 10) si un token suspect navigateur/système +apparaît dans la ROI alors qu'on attendait un label métier. 
+""" +from __future__ import annotations + +import time +import unicodedata +from typing import Any, Callable, Dict, Optional + +from core.validation.result import FailureCategory, ValidationResult, Verdict + + +def _strip_accents(s: str) -> str: + return "".join( + c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c) + ).lower().strip() + + +class OcrRoiChecker: + name = "ocr_roi" + budget_ms = 200.0 + + SUSPECT_TOKENS = ( + "edge", "chrome", "firefox", "mozilla", "opera", + "http", "https", "www.", + ".com", ".fr", ".org", ".net", ".html", + "favoris", "favorite", "bookmark", + "barre d'adresse", "address bar", + "nouvel onglet", "new tab", + "securite windows", "windows security", + "user account control", "controle de compte", + "explorateur de fichiers", "file explorer", + ) + + def __init__( + self, + ocr_fn: Optional[Callable] = None, + radius_px: int = 80, + suspect_min_confidence: float = 0.85, + expected_min_confidence: float = 0.90, + ): + self._ocr = ocr_fn # callable(PIL.Image) -> str ; lazy via TitleVerifier si None + self._radius = radius_px + self._suspect_conf = suspect_min_confidence + self._expected_conf = expected_min_confidence + + def _ensure_ocr(self) -> Optional[Callable]: + if self._ocr is not None: + return self._ocr + try: + from core.grounding.title_verifier import TitleVerifier + tv = TitleVerifier() + self._ocr = tv._get_ocr() + except Exception: + self._ocr = None + return self._ocr + + def check( + self, + action: Dict[str, Any], + result: Dict[str, Any], + screenshot_before: Optional[str], + screenshot_after: Optional[str], + context: Dict[str, Any], + ) -> ValidationResult: + t0 = time.time() + target_spec = action.get("target_spec") or {} + expected_text = ( + action.get("by_text") + or target_spec.get("by_text") + or context.get("expected_text") + or "" + ) + actual_pos = result.get("actual_position") or {} + x_pct = actual_pos.get("x_pct") or action.get("x_pct") or target_spec.get("x_pct") + y_pct = 
actual_pos.get("y_pct") or action.get("y_pct") or target_spec.get("y_pct") + + if not screenshot_after or x_pct is None or y_pct is None or not expected_text: + return ValidationResult( + verdict=Verdict.CONTINUE, confidence=0.2, + check_used=self.name, elapsed_ms=(time.time() - t0) * 1000, + reasoning="ROI indéfinie (coords ou expected_text manquants)", + ) + + try: + from agent_v0.server_v1.replay_verifier import ReplayVerifier + img = ReplayVerifier()._load_single_image(screenshot_after) + except Exception as exc: + return ValidationResult( + verdict=Verdict.CONTINUE, confidence=0.1, + check_used=self.name, elapsed_ms=(time.time() - t0) * 1000, + reasoning=f"Chargement image impossible: {exc}", + ) + + w, h = img.size + cx, cy = int(float(x_pct) * w), int(float(y_pct) * h) + r = self._radius + bbox = (max(0, cx - r), max(0, cy - r), min(w, cx + r), min(h, cy + r)) + roi = img.crop(bbox) + + ocr_fn = self._ensure_ocr() + if ocr_fn is None: + return ValidationResult( + verdict=Verdict.CONTINUE, confidence=0.1, + check_used=self.name, elapsed_ms=(time.time() - t0) * 1000, + reasoning="OCR indisponible (EasyOCR/docTR non chargés)", + ) + + try: + raw_text = ocr_fn(roi) or "" + except Exception as exc: + return ValidationResult( + verdict=Verdict.CONTINUE, confidence=0.1, + check_used=self.name, elapsed_ms=(time.time() - t0) * 1000, + reasoning=f"OCR erreur: {exc}", + ) + + text_norm = _strip_accents(raw_text) + expected_norm = _strip_accents(expected_text) + elapsed_ms = (time.time() - t0) * 1000 + evidence = { + "roi_text": raw_text[:200], + "roi_bbox": list(bbox), + "expected": expected_text, + } + + # Priorité absolue : token suspect → WRONG_APPLICATION (bug step 10 / dialog perdu) + for suspect in self.SUSPECT_TOKENS: + if suspect in text_norm and suspect not in expected_norm: + return ValidationResult( + verdict=Verdict.TERMINATE, confidence=self._suspect_conf, + check_used=self.name, elapsed_ms=elapsed_ms, + failure_category=FailureCategory.WRONG_APPLICATION, 
+ reasoning=( + f"Token suspect '{suspect}' dans ROI clic " + f"(attendu '{expected_text[:40]}') — cible hors-app" + ), + raw_evidence=evidence, + ) + + # Match exact normalisé + if expected_norm and expected_norm in text_norm: + return ValidationResult( + verdict=Verdict.COMPLETE, confidence=self._expected_conf, + check_used=self.name, elapsed_ms=elapsed_ms, + reasoning=f"Texte '{expected_text[:40]}' trouvé dans ROI", + raw_evidence=evidence, + ) + + # Match partiel mot-à-mot + toks = [t for t in expected_norm.split() if len(t) > 2] + if toks: + hits = sum(1 for tok in toks if tok in text_norm) + ratio = hits / len(toks) + if ratio >= 0.5: + return ValidationResult( + verdict=Verdict.COMPLETE, confidence=0.6 + 0.3 * ratio, + check_used=self.name, elapsed_ms=elapsed_ms, + reasoning=f"Match partiel {hits}/{len(toks)} tokens", + raw_evidence=evidence, + ) + + return ValidationResult( + verdict=Verdict.CONTINUE, confidence=0.4, + check_used=self.name, elapsed_ms=elapsed_ms, + failure_category=FailureCategory.OCR_TEXT_MISSING, + reasoning=f"Texte '{expected_text[:40]}' non trouvé dans ROI", + raw_evidence=evidence, + ) diff --git a/core/validation/orchestrator.py b/core/validation/orchestrator.py new file mode 100644 index 000000000..e123c2a2c --- /dev/null +++ b/core/validation/orchestrator.py @@ -0,0 +1,79 @@ +"""Validator orchestrator — dispatch action_type → checkers + agrégation. + +Règles d'agrégation (cf. 
import logging
import numbers
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# core/validation/result.py — Verdict, FailureCategory, ValidationResult
# ---------------------------------------------------------------------------

class Verdict(str, Enum):
    """The three possible verdicts of a check."""
    COMPLETE = "complete"    # the action had the intended effect
    CONTINUE = "continue"    # effect not visible yet -> recheck / wait
    TERMINATE = "terminate"  # unrecoverable failure -> supervised pause


class FailureCategory(str, Enum):
    """Classification attached to a non-successful check."""
    WRONG_TARGET = "wrong_target"
    WRONG_APPLICATION = "wrong_application"  # click landed outside the app
    NO_VISUAL_CHANGE = "no_visual_change"
    UNEXPECTED_DIALOG = "unexpected_dialog"
    OCR_TEXT_MISSING = "ocr_text_missing"
    SCHEMA_INVALID = "schema_invalid"
    UI_LOADING = "ui_loading"
    UNKNOWN = "unknown"


def _json_safe(value: Any) -> Any:
    """Return a copy of *value* built only from JSON-encodable types.

    Enums become their value, mappings get string keys, tuples and sets
    become lists, numeric types registered with the ``numbers`` ABCs become
    ``int``/``float``, and anything else falls back to ``str(value)``.
    """
    if isinstance(value, Enum):
        return _json_safe(value.value)
    if value is None or isinstance(value, (str, bool, int, float)):
        return value
    if isinstance(value, dict):
        return {str(k): _json_safe(v) for k, v in value.items()}
    if isinstance(value, (list, tuple, set, frozenset)):
        return [_json_safe(v) for v in value]
    if isinstance(value, numbers.Integral):
        return int(value)
    if isinstance(value, numbers.Real):
        return float(value)
    return str(value)


@dataclass
class ValidationResult:
    """Outcome of one check; ``to_dict`` yields a JSON-serialisable view."""
    verdict: Verdict
    confidence: float
    check_used: str   # name of the checker that produced the result
    elapsed_ms: float
    reasoning: str = ""
    failure_category: Optional[FailureCategory] = None
    raw_evidence: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain dict that ``json.dumps`` accepts.

        ``confidence`` and ``elapsed_ms`` are coerced to ``float`` before
        rounding and ``raw_evidence`` is sanitised, so values supplied by
        checkers (tuples, sets, non-builtin numeric scalars) cannot break
        serialisation.
        """
        return {
            "verdict": self.verdict.value,
            "confidence": round(float(self.confidence), 3),
            "check_used": self.check_used,
            "elapsed_ms": round(float(self.elapsed_ms), 1),
            "reasoning": self.reasoning,
            "failure_category": (
                self.failure_category.value if self.failure_category else None
            ),
            "raw_evidence": _json_safe(self.raw_evidence),
        }


# ---------------------------------------------------------------------------
# core/validation/pixel_diff_checker.py — PixelDiffChecker
# ---------------------------------------------------------------------------

class PixelDiffChecker:
    """Fast pre-filter wrapping ``verify_action`` of the injected verifier.

    If the screen did not change at all, the action probably failed.
    """

    name = "pixel_diff"
    budget_ms = 15.0

    def __init__(self, replay_verifier):
        """Keep a reference to the verifier whose ``verify_action`` is called."""
        self._rv = replay_verifier

    def check(
        self,
        action: Dict[str, Any],
        result: Dict[str, Any],
        screenshot_before: Optional[str],
        screenshot_after: Optional[str],
        context: Dict[str, Any],
    ) -> ValidationResult:
        """Map the verifier's suggestion onto a ValidationResult.

        - suggestion "continue" with changes detected -> COMPLETE;
        - suggestion "retry" -> CONTINUE / NO_VISUAL_CHANGE, confidence
          lowered by 0.2 with a floor of 0.4;
        - anything else, or a verifier exception -> low-confidence CONTINUE.
        """
        t0 = time.perf_counter()  # monotonic clock for the duration
        try:
            pr = self._rv.verify_action(
                action=action,
                result=result,
                screenshot_before=screenshot_before,
                screenshot_after=screenshot_after,
            )
        except Exception as exc:
            return ValidationResult(
                verdict=Verdict.CONTINUE,
                confidence=0.1,
                check_used=self.name,
                elapsed_ms=(time.perf_counter() - t0) * 1000,
                reasoning=f"PixelDiff erreur: {exc}",
            )
        elapsed = (time.perf_counter() - t0) * 1000

        if pr.suggestion == "continue" and pr.changes_detected:
            verdict, conf, fc = Verdict.COMPLETE, pr.confidence, None
        elif pr.suggestion == "retry":
            verdict = Verdict.CONTINUE
            conf = max(0.4, pr.confidence - 0.2)
            fc = FailureCategory.NO_VISUAL_CHANGE
        else:
            verdict, conf, fc = Verdict.CONTINUE, 0.3, None

        return ValidationResult(
            verdict=verdict,
            confidence=conf,
            check_used=self.name,
            elapsed_ms=elapsed,
            reasoning=pr.detail,
            failure_category=fc,
            raw_evidence={
                "change_area_pct": pr.change_area_pct,
                "local_change_pct": pr.local_change_pct,
            },
        )


# ---------------------------------------------------------------------------
# core/validation/orchestrator.py — Validator
# ---------------------------------------------------------------------------

class Validator:
    """Dispatch an action to its checkers and aggregate their results.

    Aggregation rules, in order:
    1. a TERMINATE with confidence >= ``terminate_confidence`` is returned
       immediately;
    2. a COMPLETE with confidence >= ``accept_confidence`` is returned
       immediately;
    3. otherwise the most confident COMPLETE is returned if any, else the
       most confident result overall;
    4. if no checker produced a result, a neutral CONTINUE is returned.
    """

    def __init__(
        self,
        checkers: Dict[str, List[Any]],
        default_checkers: Optional[List[Any]] = None,
        accept_confidence: float = 0.70,
        terminate_confidence: float = 0.85,
    ):
        """Register checkers per action type plus an optional fallback list."""
        self._checkers = checkers
        self._default = default_checkers or []
        self._accept = accept_confidence
        self._terminate_conf = terminate_confidence

    def validate(
        self,
        action: Dict[str, Any],
        result: Dict[str, Any],
        screenshot_before: Optional[str] = None,
        screenshot_after: Optional[str] = None,
        context: Optional[Dict[str, Any]] = None,
    ) -> ValidationResult:
        """Run the checkers registered for ``action["type"]`` and aggregate.

        A checker that raises is logged and skipped; it never aborts the
        validation.
        """
        ctx = context or {}
        action_type = action.get("type", "")
        candidates = self._checkers.get(action_type) or self._default

        results: List[ValidationResult] = []
        crashed: List[str] = []
        for checker in candidates:
            checker_name = str(getattr(checker, "name", checker))
            try:
                res = checker.check(
                    action, result, screenshot_before, screenshot_after, ctx
                )
            except Exception as exc:
                logger.warning(
                    "[VALIDATOR] checker %s a planté: %s", checker_name, exc,
                )
                crashed.append(checker_name)
                continue
            results.append(res)
            logger.info(
                "[VALIDATOR] check=%s verdict=%s conf=%.2f elapsed=%.0fms",
                res.check_used, res.verdict.value, res.confidence, res.elapsed_ms,
            )
            # Rule 1: high-confidence TERMINATE short-circuits.
            if res.verdict == Verdict.TERMINATE and res.confidence >= self._terminate_conf:
                return res
            # Rule 2: high-confidence COMPLETE short-circuits.
            if res.verdict == Verdict.COMPLETE and res.confidence >= self._accept:
                return res

        # Rule 3: nothing conclusive. Prefer a COMPLETE, else the most confident.
        if results:
            completes = [r for r in results if r.verdict == Verdict.COMPLETE]
            return max(completes or results, key=lambda r: r.confidence)

        # Rule 4: no result at all. Say whether checkers were missing or crashed.
        if crashed:
            reasoning = (
                f"Tous les checkers ont échoué pour action_type='{action_type}' "
                f"({', '.join(crashed)})"
            )
        else:
            reasoning = f"Aucun checker pour action_type='{action_type}'"
        return ValidationResult(
            verdict=Verdict.CONTINUE, confidence=0.3,
            check_used="no_checker", elapsed_ms=0.0,
            reasoning=reasoning,
        )
"""Offline reproduction of the bug replay_sess_4c38dbb8 / act_raw_6c1432b3.

The agent reports success=True after clicking the "Enregistrer" button of the
"Enregistrer sous" dialog, but the active window after the click is
"rpa_vision : Explorateur de fichiers".

The validator must return failure_category=WRONG_APPLICATION through
OcrRoiChecker (suspect token "explorateur de fichiers" in the ROI).

Fixture strategy:
- screenshots are synthetic 1920x1080 images with one line of text drawn near
  the centre;
- the action is a click on "Enregistrer" at (x_pct=0.5289, y_pct=0.7913);
- OCR is injected: a fake callable returns the text expected for the
  scenario, so the pixel content of the screenshots is not actually read.
"""
import base64
import io

import pytest

pytestmark = [pytest.mark.integration]


def _png_b64(img) -> str:
    """Encode a PIL image as a base64 PNG string."""
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("ascii")


def _make_screenshot(text: str, color=(245, 245, 245), size=(1920, 1080)):
    """Build a synthetic screenshot with *text* drawn near its centre."""
    from PIL import Image, ImageDraw
    img = Image.new("RGB", size, color=color)
    draw = ImageDraw.Draw(img)
    cx, cy = size[0] // 2, size[1] // 2
    draw.text((cx - 200, cy - 8), text, fill=(0, 0, 0))
    return img


@pytest.fixture
def bug_step10_fixtures():
    """Return (before, after, action, result) for act_raw_6c1432b3.

    No real OCR is involved: each test injects its own fake OCR callable.
    """
    before = _png_b64(_make_screenshot("Enregistrer sous"))
    after = _png_b64(_make_screenshot("rpa_vision : Explorateur de fichiers"))

    action = {
        "type": "click",
        "action_id": "act_raw_6c1432b3",
        "by_text": "Enregistrer",
        "target_spec": {
            "by_text": "Enregistrer",
            "window_title": "Enregistrer sous",
        },
        # Normalised position where the "Enregistrer" button was expected.
        "x_pct": 0.5289,
        "y_pct": 0.7913,
    }
    # The agent reports success=True: that is the bug being reproduced.
    result = {
        "success": True,
        "actual_position": {"x_pct": 0.5289, "y_pct": 0.7913},
    }
    return before, after, action, result


def test_validator_detects_wrong_application_on_act_raw_6c1432b3(bug_step10_fixtures):
    """The validator returns WRONG_APPLICATION despite the client's success=True."""
    from core.validation import OcrRoiChecker, Validator, Verdict, FailureCategory

    before, after, action, result = bug_step10_fixtures

    def fake_ocr(crop):
        # Simulate OCR reading the title of the window that took focus
        # after the click; the crop content itself is ignored.
        return "rpa_vision : Explorateur de fichiers"

    ocr_click = OcrRoiChecker(ocr_fn=fake_ocr, radius_px=80)
    # Only the OCR checker is wired here, so the verdict under test cannot be
    # masked by another checker.
    validator = Validator(checkers={"click": [ocr_click]})

    vr = validator.validate(
        action=action,
        result=result,
        screenshot_before=before,
        screenshot_after=after,
        context={},
    )

    assert vr.verdict == Verdict.TERMINATE, (
        f"Verdict attendu TERMINATE, obtenu {vr.verdict} (reasoning={vr.reasoning})"
    )
    assert vr.failure_category == FailureCategory.WRONG_APPLICATION
    assert vr.confidence >= 0.85
    assert "explorateur" in vr.reasoning.lower() or "explorateur" in vr.raw_evidence.get("roi_text", "").lower()


def test_validator_complete_when_correct_window_active(bug_step10_fixtures):
    """Sanity check: when OCR reads 'Enregistrer' in the ROI, the verdict is COMPLETE."""
    from core.validation import OcrRoiChecker, Validator, Verdict

    before, _after_bad, action, result = bug_step10_fixtures
    after_good = _png_b64(_make_screenshot("Document enregistre - Bloc-notes"))

    def fake_ocr(crop):
        return "Bouton Enregistrer cliqué — Bloc-notes"

    validator = Validator(
        checkers={"click": [OcrRoiChecker(ocr_fn=fake_ocr, radius_px=80)]},
    )
    vr = validator.validate(
        action=action,
        result=result,
        screenshot_before=before,
        screenshot_after=after_good,
        context={},
    )
    assert vr.verdict == Verdict.COMPLETE
    assert vr.failure_category is None
"""Unit tests for Validator V2 (P0 MVP).

See SPEC_VALIDATOR_MATRICE.md and AXE_B2_DEEP_VALIDATOR.md.
"""
import base64
import io
from pathlib import Path

import pytest

pytestmark = pytest.mark.unit


def _png_b64(img) -> str:
    """Serialise a PIL image to a base64-encoded PNG string."""
    stream = io.BytesIO()
    img.save(stream, format="PNG")
    payload = stream.getvalue()
    return base64.b64encode(payload).decode("ascii")


def _make_image(text: str = "", color=(255, 255, 255), size=(800, 600)):
    """Build a PIL image, optionally with *text* drawn around its centre."""
    from PIL import Image, ImageDraw
    canvas = Image.new("RGB", size, color=color)
    if not text:
        return canvas
    width, height = size
    ImageDraw.Draw(canvas).text((width // 2 - 100, height // 2), text, fill=(0, 0, 0))
    return canvas


# ----------------------------------------------------------------------------
# PixelDiffChecker
# ----------------------------------------------------------------------------

class _FakeVerifierResult:
    """Stand-in exposing the attributes PixelDiffChecker reads."""

    def __init__(self, suggestion, changes_detected, confidence, detail="",
                 change_area_pct=0.0, local_change_pct=0.0):
        self.suggestion, self.changes_detected = suggestion, changes_detected
        self.confidence, self.detail = confidence, detail
        self.change_area_pct, self.local_change_pct = change_area_pct, local_change_pct


class _FakeReplayVerifier:
    """Verifier stub that always returns the result given at construction."""

    def __init__(self, result):
        self._r = result

    def verify_action(self, action, result, screenshot_before, screenshot_after):
        return self._r


def _run_pixel_diff(verifier):
    """Run PixelDiffChecker with *verifier* on a dummy click."""
    from core.validation import PixelDiffChecker
    return PixelDiffChecker(verifier).check(
        {"type": "click"}, {"success": True}, "x", "y", {}
    )


def test_pixel_diff_complete_when_changes_detected():
    from core.validation import Verdict
    outcome = _FakeVerifierResult(
        suggestion="continue", changes_detected=True, confidence=0.85,
        detail="pixels changés",
    )
    verdict = _run_pixel_diff(_FakeReplayVerifier(outcome))
    assert verdict.verdict == Verdict.COMPLETE
    assert verdict.confidence == pytest.approx(0.85)
    assert verdict.check_used == "pixel_diff"


def test_pixel_diff_continue_when_retry_suggested():
    from core.validation import Verdict, FailureCategory
    outcome = _FakeVerifierResult(
        suggestion="retry", changes_detected=False, confidence=0.7,
        detail="aucun changement",
    )
    verdict = _run_pixel_diff(_FakeReplayVerifier(outcome))
    assert verdict.verdict == Verdict.CONTINUE
    assert verdict.failure_category == FailureCategory.NO_VISUAL_CHANGE


def test_pixel_diff_handles_internal_exception():
    from core.validation import Verdict

    class _BadVerifier:
        def verify_action(self, **kw):
            raise RuntimeError("boom")

    verdict = _run_pixel_diff(_BadVerifier())
    assert verdict.verdict == Verdict.CONTINUE
    assert "boom" in verdict.reasoning


# ----------------------------------------------------------------------------
# OcrRoiChecker, with an injected ocr_fn (no real OCR backend here)
# ----------------------------------------------------------------------------

def _run_ocr_roi(ocr_text, action=None, result=None):
    """Run OcrRoiChecker on a blank image with a fake OCR returning *ocr_text*."""
    from core.validation import OcrRoiChecker
    if action is None:
        action = {"type": "click", "by_text": "Enregistrer", "x_pct": 0.5, "y_pct": 0.5}
    if result is None:
        result = {"success": True}
    checker = OcrRoiChecker(ocr_fn=lambda _roi: ocr_text)
    return checker.check(
        action=action,
        result=result,
        screenshot_before=None,
        screenshot_after=_png_b64(_make_image()),
        context={},
    )


def test_ocr_roi_detects_wrong_application_suspect_token():
    """A suspect token in the ROI while 'Enregistrer' was expected -> WRONG_APPLICATION."""
    from core.validation import Verdict, FailureCategory
    outcome = _run_ocr_roi(
        "rpa_vision : Explorateur de fichiers",
        result={"success": True, "actual_position": {"x_pct": 0.5, "y_pct": 0.5}},
    )
    assert outcome.verdict == Verdict.TERMINATE
    assert outcome.failure_category == FailureCategory.WRONG_APPLICATION
    assert outcome.confidence >= 0.85


def test_ocr_roi_complete_when_expected_text_in_roi():
    from core.validation import Verdict
    outcome = _run_ocr_roi("Bouton Enregistrer actif")
    assert outcome.verdict == Verdict.COMPLETE
    assert outcome.confidence >= 0.85


def test_ocr_roi_ocr_text_missing_when_no_match():
    from core.validation import Verdict, FailureCategory
    outcome = _run_ocr_roi("texte sans rapport")
    assert outcome.verdict == Verdict.CONTINUE
    assert outcome.failure_category == FailureCategory.OCR_TEXT_MISSING


def test_ocr_roi_missing_coords_returns_continue():
    from core.validation import Verdict
    # The action carries no coordinates at all.
    outcome = _run_ocr_roi("", action={"type": "click", "by_text": "Enregistrer"})
    assert outcome.verdict == Verdict.CONTINUE
    assert "ROI indéfinie" in outcome.reasoning


# ----------------------------------------------------------------------------
# Validator orchestrator
# ----------------------------------------------------------------------------

class _FakeChecker:
    """Checker stub returning a fixed ValidationResult."""

    def __init__(self, name, verdict, conf, failure_category=None):
        from core.validation.result import ValidationResult
        self.name = name
        self._res = ValidationResult(
            verdict=verdict, confidence=conf, check_used=name, elapsed_ms=1.0,
            failure_category=failure_category, reasoning=f"fake {name}",
        )

    def check(self, action, result, sb, sa, ctx):
        return self._res


def test_validator_terminate_high_conf_short_circuits():
    from core.validation import Validator, Verdict, FailureCategory
    first = _FakeChecker("ocr_roi", Verdict.TERMINATE, 0.9,
                         FailureCategory.WRONG_APPLICATION)
    second = _FakeChecker("pixel", Verdict.COMPLETE, 0.99)
    validator = Validator(checkers={"click": [first, second]})
    outcome = validator.validate({"type": "click"}, {"success": True})
    assert outcome.verdict == Verdict.TERMINATE
    assert outcome.failure_category == FailureCategory.WRONG_APPLICATION
    assert outcome.check_used == "ocr_roi"


def test_validator_complete_high_conf_short_circuits():
    from core.validation import Validator, Verdict
    chain = [
        _FakeChecker("ocr_roi", Verdict.COMPLETE, 0.95),
        _FakeChecker("pixel", Verdict.CONTINUE, 0.3),
    ]
    outcome = Validator(checkers={"click": chain}).validate(
        {"type": "click"}, {"success": True}
    )
    assert outcome.verdict == Verdict.COMPLETE
    assert outcome.check_used == "ocr_roi"


def test_validator_falls_back_to_default_checkers():
    from core.validation import Validator, Verdict
    fallback = _FakeChecker("default", Verdict.COMPLETE, 0.8)
    validator = Validator(checkers={}, default_checkers=[fallback])
    outcome = validator.validate({"type": "unknown_action"}, {"success": True})
    assert outcome.check_used == "default"
    assert outcome.verdict == Verdict.COMPLETE


def test_validator_no_checker_returns_neutral_continue():
    from core.validation import Validator, Verdict
    outcome = Validator(checkers={}).validate({"type": "click"}, {"success": True})
    assert outcome.verdict == Verdict.CONTINUE
    assert outcome.check_used == "no_checker"


def test_validator_skips_checker_that_raises():
    from core.validation import Validator, Verdict

    class _Boom:
        name = "boom"

        def check(self, *a, **kw):
            raise RuntimeError("crash")

    survivor = _FakeChecker("ok", Verdict.COMPLETE, 0.9)
    validator = Validator(checkers={"click": [_Boom(), survivor]})
    outcome = validator.validate({"type": "click"}, {"success": True})
    assert outcome.check_used == "ok"


# ----------------------------------------------------------------------------
# ValidationResult serialisation
# ----------------------------------------------------------------------------

def test_validation_result_to_dict_is_json_serializable():
    import json
    from core.validation import FailureCategory, Verdict
    from core.validation.result import ValidationResult
    sample = ValidationResult(
        verdict=Verdict.TERMINATE, confidence=0.88,
        check_used="ocr_roi", elapsed_ms=42.7,
        reasoning="trop long" * 5,
        failure_category=FailureCategory.WRONG_APPLICATION,
        raw_evidence={"roi_text": "abc"},
    )
    as_dict = sample.to_dict()
    encoded = json.dumps(as_dict)  # must not raise
    assert "wrong_application" in encoded
    assert as_dict["verdict"] == "terminate"