Architecture 3 niveaux implémentée et testée (137 tests unitaires + 21 visuels) :

MÉSO (acteur intelligent) :
- P0 Critic : vérification sémantique post-action via gemma4 (replay_verifier.py)
- P1 Observer : pré-analyse écran avant chaque action (api_stream.py /pre_analyze)
- P2 Grounding/Policy : séparation localisation (grounding.py) et décision (policy.py)
- P3 Recovery : rollback automatique Ctrl+Z/Escape/Alt+F4 (recovery.py)
- P4 Learning : apprentissage runtime avec boucle de consolidation (replay_learner.py)

MACRO (planificateur) :
- TaskPlanner : comprend les ordres en langage naturel via gemma4 (task_planner.py)
- Contexte métier TIM/CIM-10 pour les hôpitaux (domain_context.py)
- Endpoint POST /api/v1/task pour l'exécution par instruction

Traçabilité :
- Audit trail complet avec 18 champs par action (audit_trail.py)
- Endpoints GET /audit/history, /audit/summary, /audit/export (CSV)

Grounding :
- Fix parsing bbox_2d qwen2.5vl (pixels relatifs, pas grille 1000x1000)
- Benchmarks visuels sur captures réelles (3 approches : baseline, zoom, Citrix)
- Reproductibilité validée : variance < 0.008 sur 10 itérations

Sécurité :
- Tokens de production retirés du code source → .env.local
- Secret key aléatoire si non configuré
- Suppression logs qui leakent les tokens

Résultats : 80% de replay (vs 12.5% avant), 100% détection visuelle Citrix JPEG Q20

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
442 lines
17 KiB
Python
442 lines
17 KiB
Python
"""
Unit tests for the Critic (ReplayVerifier.verify_with_critic)
and for the enrichment of actions with intentions.

They check FEATURES, not merely non-regression:
1. The Critic correctly merges pixel + semantic results
2. The decision matrix (4 cases) is correct
3. Intention enrichment correctly parses gemma4 responses
4. Fallbacks work when the VLM is unavailable
"""
|
|
|
|
import base64
|
|
import io
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch, Mock
|
|
|
|
import pytest
|
|
|
|
# Put the directory two levels above this file's folder on sys.path (once) so
# that the ``agent_v0`` package imported below can be resolved.
# NOTE(review): presumably this is the repository root - confirm the layout.
_ROOT = str(Path(__file__).resolve().parents[2])
if _ROOT not in sys.path:
    sys.path.insert(0, _ROOT)
|
|
|
|
from agent_v0.server_v1.replay_verifier import ReplayVerifier, VerificationResult
|
|
|
|
|
|
# =========================================================================
|
|
# Fixtures
|
|
# =========================================================================
|
|
|
|
|
|
def _make_screenshot_b64(width=100, height=100, color=(128, 128, 128)):
    """Build a dummy screenshot and return it as a base64-encoded JPEG string.

    The image is a solid ``color`` rectangle of ``width`` x ``height`` pixels,
    saved as JPEG with quality 50 before being base64-encoded.
    """
    # Pillow is imported lazily so it is only needed when a screenshot is built.
    from PIL import Image

    solid = Image.new("RGB", (width, height), color)
    with io.BytesIO() as jpeg_stream:
        solid.save(jpeg_stream, format="JPEG", quality=50)
        jpeg_bytes = jpeg_stream.getvalue()
    return base64.b64encode(jpeg_bytes).decode()
|
|
|
|
|
|
@pytest.fixture
def verifier():
    """Provide a ReplayVerifier constructed with no arguments."""
    instance = ReplayVerifier()
    return instance
|
|
|
|
|
|
@pytest.fixture
def screenshot_gray():
    """Provide a 100x100 base64 JPEG filled with RGB (128, 128, 128)."""
    mid_gray = (128, 128, 128)
    return _make_screenshot_b64(width=100, height=100, color=mid_gray)
|
|
|
|
|
|
@pytest.fixture
def screenshot_white():
    """Provide a 100x100 base64 JPEG filled with RGB (255, 255, 255)."""
    pure_white = (255, 255, 255)
    return _make_screenshot_b64(width=100, height=100, color=pure_white)
|
|
|
|
|
|
# =========================================================================
|
|
# Tests VerificationResult — nouveaux champs sémantiques
|
|
# =========================================================================
|
|
|
|
|
|
class TestVerificationResult:
    """Serialisation of VerificationResult.to_dict(), focused on the semantic_* fields."""

    def test_to_dict_sans_semantique(self):
        """Without semantic fields, ``semantic_verified`` is absent from the dict."""
        pixel_only = VerificationResult(
            verified=True,
            confidence=0.8,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="continue",
            detail="test",
        )
        payload = pixel_only.to_dict()
        assert "semantic_verified" not in payload
        assert payload["verified"] is True
        assert payload["confidence"] == 0.8

    def test_to_dict_avec_semantique(self):
        """When semantic fields are set, all three are exported unchanged."""
        with_semantic = VerificationResult(
            verified=True,
            confidence=0.9,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="continue",
            detail="test",
            semantic_verified=True,
            semantic_detail="Bouton visible",
            semantic_elapsed_ms=1500.0,
        )
        payload = with_semantic.to_dict()
        assert payload["semantic_verified"] is True
        assert payload["semantic_detail"] == "Bouton visible"
        assert payload["semantic_elapsed_ms"] == 1500.0

    def test_to_dict_semantique_false(self):
        """A negative verdict (``semantic_verified=False``) must still be exported."""
        rejected = VerificationResult(
            verified=False,
            confidence=0.7,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="retry",
            semantic_verified=False,
            semantic_detail="Mauvais écran",
            semantic_elapsed_ms=2000.0,
        )
        payload = rejected.to_dict()
        assert payload["semantic_verified"] is False
|
|
|
|
|
|
# =========================================================================
|
|
# Tests verify_with_critic — matrice de décision
|
|
# =========================================================================
|
|
|
|
|
|
class TestVerifyWithCritic:
    """Checks of ReplayVerifier.verify_with_critic across pixel and semantic outcomes."""

    def test_sans_expected_result_retourne_pixel_seul(self, verifier, screenshot_gray):
        """With an empty expected_result, the result carries no semantic verdict."""
        click = {"type": "click", "action_id": "test"}
        outcome = verifier.verify_with_critic(
            action=click,
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_gray,
            expected_result="",  # no expectation supplied
        )
        # Pixel-only path: the semantic field stays unset.
        assert outcome.semantic_verified is None

    def test_sans_screenshots_pas_de_semantique(self, verifier):
        """Without screenshots the action is accepted, but with low confidence."""
        click = {"type": "click", "action_id": "test"}
        outcome = verifier.verify_with_critic(
            action=click,
            result={"success": True},
            screenshot_before=None,
            screenshot_after=None,
            expected_result="Le fichier est ouvert",
        )
        assert outcome.verified is True
        assert outcome.confidence < 0.5

    def test_pixel_pas_change_et_expected_result_skip_vlm(
        self, verifier, screenshot_gray,
    ):
        """Identical before/after screenshots with an expectation yield a retry."""
        click_at_center = {
            "type": "click",
            "action_id": "test",
            "x_pct": 0.5,
            "y_pct": 0.5,
        }
        outcome = verifier.verify_with_critic(
            action=click_at_center,
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_gray,  # same image, so no pixel change
            expected_result="Le menu s'est ouvert",
        )
        assert outcome.verified is False
        assert outcome.suggestion == "retry"
        # NOTE(review): this only shows that no semantic verdict was attached;
        # it does not prove the VLM was never contacted - consider mocking
        # _verify_semantic and asserting it was not called.
        assert outcome.semantic_verified is None

    def test_pixel_ok_semantic_ok(self, verifier, screenshot_gray, screenshot_white):
        """Pixel change plus a positive semantic verdict is verified with confidence >= 0.7."""
        semantic_yes = {
            "verified": True,
            "detail": "Le menu est bien ouvert",
            "elapsed_ms": 2000.0,
        }
        with patch.object(ReplayVerifier, "_verify_semantic", return_value=semantic_yes):
            outcome = verifier.verify_with_critic(
                action={"type": "click", "action_id": "test"},
                result={"success": True},
                screenshot_before=screenshot_gray,
                screenshot_after=screenshot_white,  # different image, change detected
                expected_result="Le menu s'est ouvert",
            )
        assert outcome.verified is True
        assert outcome.semantic_verified is True
        assert outcome.confidence >= 0.7
        assert "Critic OK" in outcome.detail

    def test_pixel_ok_semantic_non(self, verifier, screenshot_gray, screenshot_white):
        """Pixel change plus a negative semantic verdict is rejected with a retry."""
        semantic_no = {
            "verified": False,
            "detail": "Une erreur est apparue au lieu du menu",
            "elapsed_ms": 2500.0,
        }
        with patch.object(ReplayVerifier, "_verify_semantic", return_value=semantic_no):
            outcome = verifier.verify_with_critic(
                action={"type": "click", "action_id": "test"},
                result={"success": True},
                screenshot_before=screenshot_gray,
                screenshot_after=screenshot_white,
                expected_result="Le menu s'est ouvert",
            )
        assert outcome.verified is False
        assert outcome.semantic_verified is False
        assert outcome.suggestion == "retry"
        assert "Critic NON" in outcome.detail

    def test_vlm_indisponible_fallback_pixel(
        self, verifier, screenshot_gray, screenshot_white,
    ):
        """When _verify_semantic returns None, the pixel verdict alone is used."""
        # None stands for "VLM down" in this test.
        with patch.object(ReplayVerifier, "_verify_semantic", return_value=None):
            outcome = verifier.verify_with_critic(
                action={"type": "click", "action_id": "test"},
                result={"success": True},
                screenshot_before=screenshot_gray,
                screenshot_after=screenshot_white,
                expected_result="Le menu s'est ouvert",
            )
        # The pixel change is still detected, and no semantic verdict is attached.
        assert outcome.verified is True
        assert outcome.semantic_verified is None
|
|
|
|
|
|
# =========================================================================
|
|
# Tests _verify_semantic — parsing de la réponse VLM
|
|
# =========================================================================
|
|
|
|
|
|
class TestVerifySemantic:
    """Parsing of the VLM reply by ReplayVerifier._verify_semantic."""

    @staticmethod
    def _chat_reply(text):
        """Return a mock HTTP response (ok=True) whose JSON message content is ``text``."""
        reply = MagicMock()
        reply.ok = True
        reply.json.return_value = {"message": {"content": text}}
        return reply

    def test_parse_verdict_oui(self, verifier, screenshot_white):
        """A ``VERDICT: OUI`` reply is parsed as verified, with the reason kept in detail."""
        reply = self._chat_reply("VERDICT: OUI\nRAISON: Le fichier est bien ouvert")
        with patch("requests.post", return_value=reply):
            verdict = verifier._verify_semantic(
                screenshot_before=screenshot_white,
                screenshot_after=screenshot_white,
                expected_result="Le fichier est ouvert",
            )
        assert verdict is not None
        assert verdict["verified"] is True
        assert "ouvert" in verdict["detail"]

    def test_parse_verdict_non(self, verifier, screenshot_white):
        """A ``VERDICT: NON`` reply is parsed as not verified."""
        reply = self._chat_reply("VERDICT: NON\nRAISON: L'écran n'a pas changé")
        with patch("requests.post", return_value=reply):
            verdict = verifier._verify_semantic(
                screenshot_before=screenshot_white,
                screenshot_after=screenshot_white,
                expected_result="Le menu s'est ouvert",
            )
        assert verdict is not None
        assert verdict["verified"] is False

    def test_vlm_timeout_retourne_none(self, verifier, screenshot_white):
        """A requests.Timeout raised by the POST results in None (graceful fallback)."""
        import requests

        with patch("requests.post", side_effect=requests.Timeout("timeout")):
            verdict = verifier._verify_semantic(
                screenshot_before=screenshot_white,
                screenshot_after=screenshot_white,
                expected_result="Le fichier est ouvert",
            )
        assert verdict is None

    def test_sans_screenshot_after_retourne_none(self, verifier):
        """With no screenshots at all, no semantic check is possible and None is returned."""
        verdict = verifier._verify_semantic(
            screenshot_before=None,
            screenshot_after=None,
            expected_result="Le fichier est ouvert",
        )
        assert verdict is None
|
|
|
|
|
|
# =========================================================================
|
|
# Tests _merge_results — matrice pixel x sémantique
|
|
# =========================================================================
|
|
|
|
|
|
class TestMergeResults:
    """The four pixel x semantic combinations handled by ReplayVerifier._merge_results."""

    @staticmethod
    def _pixel_changed():
        """Pixel verdict used for the 'pixel OK' cases: change detected, continue."""
        return VerificationResult(
            verified=True,
            confidence=0.7,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="continue",
        )

    @staticmethod
    def _pixel_unchanged(area_pct):
        """Pixel verdict used for the 'pixel NON' cases: no change detected, retry."""
        return VerificationResult(
            verified=False,
            confidence=0.5,
            changes_detected=False,
            change_area_pct=area_pct,
            suggestion="retry",
        )

    def test_pixel_ok_sem_ok(self, verifier):
        """Pixel OK + semantic OK: verified, confidence at least 0.7."""
        verdict = {"verified": True, "detail": "OK", "elapsed_ms": 1000}
        merged = verifier._merge_results(self._pixel_changed(), verdict)
        assert merged.verified is True
        assert merged.semantic_verified is True
        assert merged.confidence >= 0.7

    def test_pixel_ok_sem_non(self, verifier):
        """Pixel OK + semantic NON: unexpected change, so retry."""
        verdict = {"verified": False, "detail": "Erreur popup", "elapsed_ms": 2000}
        merged = verifier._merge_results(self._pixel_changed(), verdict)
        assert merged.verified is False
        assert merged.semantic_verified is False
        assert merged.suggestion == "retry"

    def test_pixel_non_sem_ok(self, verifier):
        """Pixel unchanged + semantic OK: subtle state, so continue."""
        verdict = {"verified": True, "detail": "Onglet déjà actif", "elapsed_ms": 1500}
        merged = verifier._merge_results(self._pixel_unchanged(0.1), verdict)
        assert merged.verified is True
        assert merged.semantic_verified is True
        assert merged.suggestion == "continue"

    def test_pixel_non_sem_non(self, verifier):
        """Pixel unchanged + semantic NON: complete failure, reported with confidence >= 0.7."""
        verdict = {"verified": False, "detail": "Rien ne s'est passé", "elapsed_ms": 3000}
        merged = verifier._merge_results(self._pixel_unchanged(0.0), verdict)
        assert merged.verified is False
        assert merged.semantic_verified is False
        # High confidence that the action failed.
        assert merged.confidence >= 0.7
|
|
|
|
|
|
# =========================================================================
|
|
# Tests enrichissement intentions (stream_processor)
|
|
# =========================================================================
|
|
|
|
|
|
class TestEnrichActionsWithIntentions:
    """Tests for stream_processor._enrich_actions_with_intentions with gemma4 mocked.

    ``requests.get`` and ``requests.post`` are patched in each test, so no real
    HTTP call is intended. Every test inspects the ``actions`` list after the
    call, i.e. the enrichment is expected to annotate the dicts in place.
    """

    @staticmethod
    def _enrich_in_temp_session(actions):
        """Run the enrichment on ``actions`` inside a throw-away session directory.

        The directory gets an empty ``shots`` sub-folder before the call and is
        removed by the context manager afterwards, including when the call raises.
        """
        # Local imports, as in the original tests: the module under test is only
        # loaded once requests is already patched by the calling test.
        import tempfile

        from agent_v0.server_v1.stream_processor import _enrich_actions_with_intentions

        with tempfile.TemporaryDirectory() as raw_dir:
            session_dir = Path(raw_dir)
            (session_dir / "shots").mkdir()
            _enrich_actions_with_intentions(actions, session_dir)

    @patch("requests.post")
    @patch("requests.get")
    def test_enrichissement_parse_reponse_gemma4(self, mock_get, mock_post):
        """A structured gemma4 reply is parsed into intention / before / after."""
        # GET answers ok=True so that gemma4 is treated as available.
        mock_tags_resp = MagicMock()
        mock_tags_resp.ok = True
        mock_get.return_value = mock_tags_resp

        mock_chat_resp = MagicMock()
        mock_chat_resp.ok = True
        mock_chat_resp.json.return_value = {
            "message": {
                "content": (
                    "INTENTION: Ouvrir le fichier client dans le logiciel\n"
                    "AVANT: Le logiciel est ouvert sur la page d'accueil\n"
                    "APRÈS: Le fichier client est affiché dans la fenêtre"
                )
            }
        }
        mock_post.return_value = mock_chat_resp

        actions = [
            {
                "type": "click",
                "action_id": "act_001",
                "target_spec": {"by_text": "Ouvrir", "window_title": "Logiciel"},
            },
            {
                "type": "wait",
                "action_id": "act_002",
                "duration_ms": 1000,
            },
        ]

        self._enrich_in_temp_session(actions)

        # The click action must be enriched.
        assert actions[0].get("intention") == "Ouvrir le fichier client dans le logiciel"
        assert actions[0].get("expected_state") == "Le logiciel est ouvert sur la page d'accueil"
        assert actions[0].get("expected_result") == "Le fichier client est affiché dans la fenêtre"
        # expected_state must also be copied into target_spec (for the Observer).
        assert actions[0]["target_spec"]["expected_state"] == "Le logiciel est ouvert sur la page d'accueil"

        # The wait action must NOT be enriched.
        assert "intention" not in actions[1]

    @patch("requests.get")
    def test_gemma4_indisponible_pas_de_crash(self, mock_get):
        """If gemma4 is down, enrichment is silently skipped."""
        # NOTE(review): this is the builtin ConnectionError, not
        # requests.ConnectionError - confirm that is the intended failure.
        mock_get.side_effect = ConnectionError("gemma4 down")

        actions = [
            {"type": "click", "action_id": "act_001", "target_spec": {"by_text": "OK"}},
        ]

        self._enrich_in_temp_session(actions)

        # No crash, and no intention added.
        assert "intention" not in actions[0]

    @patch("requests.post")
    @patch("requests.get")
    def test_reponse_gemma4_malformee(self, mock_get, mock_post):
        """If gemma4 replies with unstructured text, nothing crashes."""
        mock_tags = MagicMock()
        mock_tags.ok = True
        mock_get.return_value = mock_tags

        mock_resp = MagicMock()
        mock_resp.ok = True
        mock_resp.json.return_value = {
            "message": {"content": "Je ne comprends pas cette demande."}
        }
        mock_post.return_value = mock_resp

        actions = [
            {"type": "click", "action_id": "act_001", "target_spec": {"by_text": "OK"}},
        ]

        self._enrich_in_temp_session(actions)

        # No crash, but no intention either.
        assert "intention" not in actions[0]
|