Files
rpa_vision_v3/tests/unit/test_replay_critic.py
Dom 99041f0117 feat: pipeline complet MACRO/MÉSO/MICRO — Critic, Observer, Policy, Recovery, Learning, Audit Trail, TaskPlanner
Architecture 3 niveaux implémentée et testée (137 tests unitaires + 21 visuels) :

MÉSO (acteur intelligent) :
- P0 Critic : vérification sémantique post-action via gemma4 (replay_verifier.py)
- P1 Observer : pré-analyse écran avant chaque action (api_stream.py /pre_analyze)
- P2 Grounding/Policy : séparation localisation (grounding.py) et décision (policy.py)
- P3 Recovery : rollback automatique Ctrl+Z/Escape/Alt+F4 (recovery.py)
- P4 Learning : apprentissage runtime avec boucle de consolidation (replay_learner.py)

MACRO (planificateur) :
- TaskPlanner : comprend les ordres en langage naturel via gemma4 (task_planner.py)
- Contexte métier TIM/CIM-10 pour les hôpitaux (domain_context.py)
- Endpoint POST /api/v1/task pour l'exécution par instruction

Traçabilité :
- Audit trail complet avec 18 champs par action (audit_trail.py)
- Endpoints GET /audit/history, /audit/summary, /audit/export (CSV)

Grounding :
- Fix parsing bbox_2d qwen2.5vl (pixels relatifs, pas grille 1000x1000)
- Benchmarks visuels sur captures réelles (3 approches : baseline, zoom, Citrix)
- Reproductibilité validée : variance < 0.008 sur 10 itérations

Sécurité :
- Tokens de production retirés du code source → .env.local
- Secret key aléatoire si non configuré
- Suppression logs qui leakent les tokens

Résultats : 80% de replay (vs 12.5% avant), 100% détection visuelle Citrix JPEG Q20

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 21:03:25 +02:00

442 lines
17 KiB
Python

"""
Tests unitaires pour le Critic (ReplayVerifier.verify_with_critic)
et l'enrichissement des actions avec intentions.
Vérifie les FONCTIONNALITÉS, pas juste la non-régression :
1. Le Critic fusionne correctement pixel + sémantique
2. La matrice de décision (4 cas) est correcte
3. L'enrichissement intentions parse bien les réponses gemma4
4. Les fallbacks fonctionnent quand le VLM est indisponible
"""
import base64
import io
import json
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch, Mock
import pytest
_ROOT = str(Path(__file__).resolve().parents[2])
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
from agent_v0.server_v1.replay_verifier import ReplayVerifier, VerificationResult
# =========================================================================
# Fixtures
# =========================================================================
def _make_screenshot_b64(width=100, height=100, color=(128, 128, 128)):
    """Build a dummy solid-colour screenshot and return it as base64 text.

    The image is an RGB canvas of ``width`` x ``height`` pixels filled with
    ``color``, saved as JPEG at quality 50 and then base64-encoded.
    """
    # Pillow is imported inside the helper, exactly as before, so it is only
    # loaded when a screenshot is actually built.
    from PIL import Image

    canvas = Image.new("RGB", (width, height), color)
    with io.BytesIO() as jpeg_stream:
        canvas.save(jpeg_stream, format="JPEG", quality=50)
        jpeg_bytes = jpeg_stream.getvalue()
    encoded = base64.b64encode(jpeg_bytes)
    return encoded.decode()
@pytest.fixture
def verifier():
    """Provide a default-constructed ``ReplayVerifier`` to the requesting test."""
    replay_verifier = ReplayVerifier()
    return replay_verifier
@pytest.fixture
def screenshot_gray():
    """Base64 JPEG of a 100x100 image filled with mid-gray (128, 128, 128)."""
    return _make_screenshot_b64(width=100, height=100, color=(128, 128, 128))
@pytest.fixture
def screenshot_white():
    """Base64 JPEG of a 100x100 image filled with white (255, 255, 255)."""
    return _make_screenshot_b64(width=100, height=100, color=(255, 255, 255))
# =========================================================================
# Tests VerificationResult — nouveaux champs sémantiques
# =========================================================================
class TestVerificationResult:
    """Checks which semantic fields ``VerificationResult.to_dict`` exposes."""

    def test_to_dict_sans_semantique(self):
        """Without semantic data, the dict carries no ``semantic_verified`` key."""
        outcome = VerificationResult(
            verified=True,
            confidence=0.8,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="continue",
            detail="test",
        )
        payload = outcome.to_dict()
        assert payload["verified"] is True
        assert payload["confidence"] == 0.8
        assert "semantic_verified" not in payload

    def test_to_dict_avec_semantique(self):
        """With semantic data, the three ``semantic_*`` values appear in the dict."""
        semantic_fields = {
            "semantic_verified": True,
            "semantic_detail": "Bouton visible",
            "semantic_elapsed_ms": 1500.0,
        }
        outcome = VerificationResult(
            verified=True,
            confidence=0.9,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="continue",
            detail="test",
            **semantic_fields,
        )
        payload = outcome.to_dict()
        assert payload["semantic_verified"] is True
        assert payload["semantic_detail"] == "Bouton visible"
        assert payload["semantic_elapsed_ms"] == 1500.0

    def test_to_dict_semantique_false(self):
        """A ``semantic_verified`` of False must still be present in the dict."""
        outcome = VerificationResult(
            verified=False,
            confidence=0.7,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="retry",
            semantic_verified=False,
            semantic_detail="Mauvais écran",
            semantic_elapsed_ms=2000.0,
        )
        payload = outcome.to_dict()
        # Identity check: the key must hold the boolean False, not merely be missing.
        assert payload["semantic_verified"] is False
# =========================================================================
# Tests verify_with_critic — matrice de décision
# =========================================================================
class TestVerifyWithCritic:
    """Decision-matrix tests for ``ReplayVerifier.verify_with_critic``.

    Tests that need a semantic verdict replace ``ReplayVerifier._verify_semantic``
    with a mock, so no real model is contacted.
    """

    def test_sans_expected_result_retourne_pixel_seul(self, verifier, screenshot_gray):
        """With an empty ``expected_result`` no semantic verdict is attached."""
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test"},
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_gray,
            expected_result="",  # nothing expected
        )
        # Pixel-only outcome: the semantic field stays unset.
        assert result.semantic_verified is None

    def test_sans_screenshots_pas_de_semantique(self, verifier):
        """Without screenshots the action is accepted, but with low confidence."""
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test"},
            result={"success": True},
            screenshot_before=None,
            screenshot_after=None,
            expected_result="Le fichier est ouvert",
        )
        assert result.verified is True
        assert result.confidence < 0.5

    @patch("agent_v0.server_v1.replay_verifier.ReplayVerifier._verify_semantic")
    def test_pixel_pas_change_et_expected_result_skip_vlm(
        self, mock_semantic, verifier, screenshot_gray,
    ):
        """Identical screenshots with an expected result: retry, semantic check skipped."""
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test", "x_pct": 0.5, "y_pct": 0.5},
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_gray,  # same image, so no pixel change
            expected_result="Le menu s'est ouvert",
        )
        assert result.verified is False
        assert result.suggestion == "retry"
        assert result.semantic_verified is None
        # ``semantic_verified is None`` is also what the fallback test below sees
        # when the semantic check runs and returns None, so it cannot prove the
        # check was skipped. Assert directly that it was never invoked.
        mock_semantic.assert_not_called()

    @patch("agent_v0.server_v1.replay_verifier.ReplayVerifier._verify_semantic")
    def test_pixel_ok_semantic_ok(
        self, mock_semantic, verifier, screenshot_gray, screenshot_white,
    ):
        """Pixel change plus a positive semantic verdict: verified, confidence >= 0.7."""
        mock_semantic.return_value = {
            "verified": True,
            "detail": "Le menu est bien ouvert",
            "elapsed_ms": 2000.0,
        }
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test"},
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_white,  # different image, so a change is detected
            expected_result="Le menu s'est ouvert",
        )
        assert result.verified is True
        assert result.semantic_verified is True
        assert result.confidence >= 0.7
        assert "Critic OK" in result.detail

    @patch("agent_v0.server_v1.replay_verifier.ReplayVerifier._verify_semantic")
    def test_pixel_ok_semantic_non(
        self, mock_semantic, verifier, screenshot_gray, screenshot_white,
    ):
        """Pixel change plus a negative semantic verdict: unexpected change, retry."""
        mock_semantic.return_value = {
            "verified": False,
            "detail": "Une erreur est apparue au lieu du menu",
            "elapsed_ms": 2500.0,
        }
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test"},
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_white,
            expected_result="Le menu s'est ouvert",
        )
        assert result.verified is False
        assert result.semantic_verified is False
        assert result.suggestion == "retry"
        assert "Critic NON" in result.detail

    @patch("agent_v0.server_v1.replay_verifier.ReplayVerifier._verify_semantic")
    def test_vlm_indisponible_fallback_pixel(
        self, mock_semantic, verifier, screenshot_gray, screenshot_white,
    ):
        """When the semantic check returns None, the pixel verdict is used alone."""
        mock_semantic.return_value = None  # simulates an unavailable model
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test"},
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_white,
            expected_result="Le menu s'est ouvert",
        )
        # Pixel fallback: the gray-to-white change is enough to verify.
        assert result.verified is True
        assert result.semantic_verified is None
# =========================================================================
# Tests _verify_semantic — parsing de la réponse VLM
# =========================================================================
class TestVerifySemantic:
    """Tests how ``ReplayVerifier._verify_semantic`` interprets the model reply.

    ``requests.post`` is patched in every test that would otherwise perform
    an HTTP call.
    """

    @staticmethod
    def _chat_response(content):
        """Return a mock HTTP response whose JSON body carries ``content`` as the message."""
        response = MagicMock()
        response.ok = True
        response.json.return_value = {"message": {"content": content}}
        return response

    @patch("requests.post")
    def test_parse_verdict_oui(self, mock_post, verifier, screenshot_white):
        """A ``VERDICT: OUI`` reply yields ``verified`` True and keeps the reason."""
        mock_post.return_value = self._chat_response(
            "VERDICT: OUI\nRAISON: Le fichier est bien ouvert"
        )
        outcome = verifier._verify_semantic(
            screenshot_before=screenshot_white,
            screenshot_after=screenshot_white,
            expected_result="Le fichier est ouvert",
        )
        assert outcome is not None
        assert outcome["verified"] is True
        assert "ouvert" in outcome["detail"]

    @patch("requests.post")
    def test_parse_verdict_non(self, mock_post, verifier, screenshot_white):
        """A ``VERDICT: NON`` reply yields ``verified`` False."""
        mock_post.return_value = self._chat_response(
            "VERDICT: NON\nRAISON: L'écran n'a pas changé"
        )
        outcome = verifier._verify_semantic(
            screenshot_before=screenshot_white,
            screenshot_after=screenshot_white,
            expected_result="Le menu s'est ouvert",
        )
        assert outcome is not None
        assert outcome["verified"] is False

    @patch("requests.post")
    def test_vlm_timeout_retourne_none(self, mock_post, verifier, screenshot_white):
        """A request timeout makes ``_verify_semantic`` return None."""
        # Only ``requests.post`` is patched, so the exception class is the real one.
        from requests import Timeout as RequestsTimeout

        mock_post.side_effect = RequestsTimeout("timeout")
        outcome = verifier._verify_semantic(
            screenshot_before=screenshot_white,
            screenshot_after=screenshot_white,
            expected_result="Le fichier est ouvert",
        )
        assert outcome is None

    def test_sans_screenshot_after_retourne_none(self, verifier):
        """With no screenshots at all, ``_verify_semantic`` returns None."""
        outcome = verifier._verify_semantic(
            screenshot_before=None,
            screenshot_after=None,
            expected_result="Le fichier est ouvert",
        )
        assert outcome is None
# =========================================================================
# Tests _merge_results — matrice pixel x sémantique
# =========================================================================
class TestMergeResults:
    """Covers the four pixel x semantic combinations of ``_merge_results``."""

    @staticmethod
    def _pixel_changed():
        """Pixel verdict for a detected change (verified, 5% area, 'continue')."""
        return VerificationResult(
            verified=True,
            confidence=0.7,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="continue",
        )

    @staticmethod
    def _pixel_unchanged(change_area_pct):
        """Pixel verdict for no detected change (not verified, 'retry')."""
        return VerificationResult(
            verified=False,
            confidence=0.5,
            changes_detected=False,
            change_area_pct=change_area_pct,
            suggestion="retry",
        )

    def test_pixel_ok_sem_ok(self, verifier):
        """Pixel OK + semantic OK: verified with confidence of at least 0.7."""
        verdict = {"verified": True, "detail": "OK", "elapsed_ms": 1000}
        merged = verifier._merge_results(self._pixel_changed(), verdict)
        assert merged.verified is True
        assert merged.semantic_verified is True
        assert merged.confidence >= 0.7

    def test_pixel_ok_sem_non(self, verifier):
        """Pixel OK + semantic NO: unexpected change, so retry."""
        verdict = {"verified": False, "detail": "Erreur popup", "elapsed_ms": 2000}
        merged = verifier._merge_results(self._pixel_changed(), verdict)
        assert merged.verified is False
        assert merged.semantic_verified is False
        assert merged.suggestion == "retry"

    def test_pixel_non_sem_ok(self, verifier):
        """Pixel unchanged + semantic OK: subtle state, so continue."""
        verdict = {"verified": True, "detail": "Onglet déjà actif", "elapsed_ms": 1500}
        merged = verifier._merge_results(self._pixel_unchanged(0.1), verdict)
        assert merged.verified is True
        assert merged.semantic_verified is True
        assert merged.suggestion == "continue"

    def test_pixel_non_sem_non(self, verifier):
        """Pixel unchanged + semantic NO: failure, reported with confidence >= 0.7."""
        verdict = {"verified": False, "detail": "Rien ne s'est passé", "elapsed_ms": 3000}
        merged = verifier._merge_results(self._pixel_unchanged(0.0), verdict)
        assert merged.verified is False
        assert merged.semantic_verified is False
        # Both signals agree on the failure, hence the high confidence.
        assert merged.confidence >= 0.7
# =========================================================================
# Tests enrichissement intentions (stream_processor)
# =========================================================================
class TestEnrichActionsWithIntentions:
    """Tests for ``stream_processor._enrich_actions_with_intentions``.

    ``requests.get`` and ``requests.post`` are patched so that no real HTTP
    call is made. Each test runs the enrichment against a throw-away
    directory.
    """

    @staticmethod
    def _run_in_temp_dir(enrich, actions):
        """Call ``enrich(actions, work_dir)`` with a temporary working directory.

        ``work_dir`` contains an empty ``shots`` sub-directory. The context
        manager removes the whole tree when the call returns or raises.
        ``actions`` is mutated in place by ``enrich``, so callers can assert
        on it afterwards.
        """
        import tempfile

        with tempfile.TemporaryDirectory() as tmp:
            work_dir = Path(tmp)
            (work_dir / "shots").mkdir()
            enrich(actions, work_dir)

    @patch("requests.post")
    @patch("requests.get")
    def test_enrichissement_parse_reponse_gemma4(self, mock_get, mock_post):
        """A structured reply is parsed into intention / before / after fields."""
        from agent_v0.server_v1.stream_processor import _enrich_actions_with_intentions

        # The GET probe succeeds, so the model is considered available.
        mock_tags_resp = MagicMock()
        mock_tags_resp.ok = True
        mock_get.return_value = mock_tags_resp

        mock_chat_resp = MagicMock()
        mock_chat_resp.ok = True
        mock_chat_resp.json.return_value = {
            "message": {
                "content": (
                    "INTENTION: Ouvrir le fichier client dans le logiciel\n"
                    "AVANT: Le logiciel est ouvert sur la page d'accueil\n"
                    "APRÈS: Le fichier client est affiché dans la fenêtre"
                )
            }
        }
        mock_post.return_value = mock_chat_resp

        actions = [
            {
                "type": "click",
                "action_id": "act_001",
                "target_spec": {"by_text": "Ouvrir", "window_title": "Logiciel"},
            },
            {
                "type": "wait",
                "action_id": "act_002",
                "duration_ms": 1000,
            },
        ]

        self._run_in_temp_dir(_enrich_actions_with_intentions, actions)

        # The click action must be enriched.
        assert actions[0].get("intention") == "Ouvrir le fichier client dans le logiciel"
        assert actions[0].get("expected_state") == "Le logiciel est ouvert sur la page d'accueil"
        assert actions[0].get("expected_result") == "Le fichier client est affiché dans la fenêtre"
        # expected_state must also be copied into target_spec.
        assert actions[0]["target_spec"]["expected_state"] == "Le logiciel est ouvert sur la page d'accueil"
        # The wait action must NOT be enriched.
        assert "intention" not in actions[1]

    @patch("requests.get")
    def test_gemma4_indisponible_pas_de_crash(self, mock_get):
        """If the model endpoint is unreachable, enrichment is silently skipped."""
        from agent_v0.server_v1.stream_processor import _enrich_actions_with_intentions

        # Only ``requests.get`` is patched, so the exception class is the real one.
        import requests

        # Raise the exception requests itself raises on a refused connection.
        # It is not a subclass of the builtin ConnectionError, so the builtin
        # would exercise a path production never takes.
        mock_get.side_effect = requests.ConnectionError("gemma4 down")

        actions = [
            {"type": "click", "action_id": "act_001", "target_spec": {"by_text": "OK"}},
        ]

        self._run_in_temp_dir(_enrich_actions_with_intentions, actions)

        # No crash and no intention added.
        assert "intention" not in actions[0]

    @patch("requests.post")
    @patch("requests.get")
    def test_reponse_gemma4_malformee(self, mock_get, mock_post):
        """An unstructured reply causes neither a crash nor an enrichment."""
        from agent_v0.server_v1.stream_processor import _enrich_actions_with_intentions

        mock_tags = MagicMock()
        mock_tags.ok = True
        mock_get.return_value = mock_tags

        mock_resp = MagicMock()
        mock_resp.ok = True
        mock_resp.json.return_value = {
            "message": {"content": "Je ne comprends pas cette demande."}
        }
        mock_post.return_value = mock_resp

        actions = [
            {"type": "click", "action_id": "act_001", "target_spec": {"by_text": "OK"}},
        ]

        self._run_in_temp_dir(_enrich_actions_with_intentions, actions)

        # No crash, but no intention either.
        assert "intention" not in actions[0]