feat: pipeline complet MACRO/MÉSO/MICRO — Critic, Observer, Policy, Recovery, Learning, Audit Trail, TaskPlanner
Architecture 3 niveaux implémentée et testée (137 tests unitaires + 21 visuels) : MÉSO (acteur intelligent) : - P0 Critic : vérification sémantique post-action via gemma4 (replay_verifier.py) - P1 Observer : pré-analyse écran avant chaque action (api_stream.py /pre_analyze) - P2 Grounding/Policy : séparation localisation (grounding.py) et décision (policy.py) - P3 Recovery : rollback automatique Ctrl+Z/Escape/Alt+F4 (recovery.py) - P4 Learning : apprentissage runtime avec boucle de consolidation (replay_learner.py) MACRO (planificateur) : - TaskPlanner : comprend les ordres en langage naturel via gemma4 (task_planner.py) - Contexte métier TIM/CIM-10 pour les hôpitaux (domain_context.py) - Endpoint POST /api/v1/task pour l'exécution par instruction Traçabilité : - Audit trail complet avec 18 champs par action (audit_trail.py) - Endpoints GET /audit/history, /audit/summary, /audit/export (CSV) Grounding : - Fix parsing bbox_2d qwen2.5vl (pixels relatifs, pas grille 1000x1000) - Benchmarks visuels sur captures réelles (3 approches : baseline, zoom, Citrix) - Reproductibilité validée : variance < 0.008 sur 10 itérations Sécurité : - Tokens de production retirés du code source → .env.local - Secret key aléatoire si non configuré - Suppression logs qui leakent les tokens Résultats : 80% de replay (vs 12.5% avant), 100% détection visuelle Citrix JPEG Q20 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
441
tests/unit/test_replay_critic.py
Normal file
441
tests/unit/test_replay_critic.py
Normal file
@@ -0,0 +1,441 @@
|
||||
"""
|
||||
Tests unitaires pour le Critic (ReplayVerifier.verify_with_critic)
|
||||
et l'enrichissement des actions avec intentions.
|
||||
|
||||
Vérifie les FONCTIONNALITÉS, pas juste la non-régression :
|
||||
1. Le Critic fusionne correctement pixel + sémantique
|
||||
2. La matrice de décision (4 cas) est correcte
|
||||
3. L'enrichissement intentions parse bien les réponses gemma4
|
||||
4. Les fallbacks fonctionnent quand le VLM est indisponible
|
||||
"""
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch, Mock
|
||||
|
||||
import pytest
|
||||
|
||||
_ROOT = str(Path(__file__).resolve().parents[2])
|
||||
if _ROOT not in sys.path:
|
||||
sys.path.insert(0, _ROOT)
|
||||
|
||||
from agent_v0.server_v1.replay_verifier import ReplayVerifier, VerificationResult
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Fixtures
|
||||
# =========================================================================
|
||||
|
||||
|
||||
def _make_screenshot_b64(width=100, height=100, color=(128, 128, 128)):
    """Build a solid-colour JPEG and return it as a base64 text string.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.
        color: RGB triple used to fill the whole image.

    Returns:
        The JPEG bytes (saved with quality 50), base64-encoded and decoded
        to ``str``.
    """
    # Imported lazily, as in the original helper, so Pillow is only needed
    # when a screenshot is actually built.
    from PIL import Image

    with io.BytesIO() as jpeg_buffer:
        Image.new("RGB", (width, height), color).save(
            jpeg_buffer, format="JPEG", quality=50
        )
        raw_jpeg = jpeg_buffer.getvalue()
    return base64.b64encode(raw_jpeg).decode()
|
||||
|
||||
|
||||
@pytest.fixture
def verifier():
    """Provide a ReplayVerifier created with no constructor arguments."""
    replay_verifier = ReplayVerifier()
    return replay_verifier
|
||||
|
||||
|
||||
@pytest.fixture
def screenshot_gray():
    """Provide a 100x100 uniform gray (128, 128, 128) base64 JPEG screenshot."""
    return _make_screenshot_b64(width=100, height=100, color=(128, 128, 128))
|
||||
|
||||
|
||||
@pytest.fixture
def screenshot_white():
    """Provide a 100x100 uniform white (255, 255, 255) base64 JPEG screenshot."""
    return _make_screenshot_b64(width=100, height=100, color=(255, 255, 255))
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Tests VerificationResult — nouveaux champs sémantiques
|
||||
# =========================================================================
|
||||
|
||||
|
||||
class TestVerificationResult:
    """Checks on VerificationResult.to_dict() and its optional semantic fields."""

    def test_to_dict_sans_semantique(self):
        """Without a semantic check, ``semantic_verified`` is absent from the dict."""
        outcome = VerificationResult(
            verified=True,
            confidence=0.8,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="continue",
            detail="test",
        )
        payload = outcome.to_dict()
        assert "semantic_verified" not in payload
        assert payload["verified"] is True
        assert payload["confidence"] == 0.8

    def test_to_dict_avec_semantique(self):
        """With a semantic check, the three ``semantic_*`` fields are exported."""
        outcome = VerificationResult(
            verified=True,
            confidence=0.9,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="continue",
            detail="test",
            semantic_verified=True,
            semantic_detail="Bouton visible",
            semantic_elapsed_ms=1500.0,
        )
        payload = outcome.to_dict()
        assert payload["semantic_verified"] is True
        assert payload["semantic_detail"] == "Bouton visible"
        assert payload["semantic_elapsed_ms"] == 1500.0

    def test_to_dict_semantique_false(self):
        """A ``semantic_verified=False`` value must still appear in the dict."""
        outcome = VerificationResult(
            verified=False,
            confidence=0.7,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="retry",
            semantic_verified=False,
            semantic_detail="Mauvais écran",
            semantic_elapsed_ms=2000.0,
        )
        payload = outcome.to_dict()
        assert payload["semantic_verified"] is False
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Tests verify_with_critic — matrice de décision
|
||||
# =========================================================================
|
||||
|
||||
|
||||
class TestVerifyWithCritic:
    """Decision matrix of ReplayVerifier.verify_with_critic.

    Each test feeds a before/after screenshot pair plus an optional
    ``expected_result`` and checks how the pixel comparison and the
    (mocked) semantic verdict are combined.
    """

    def test_sans_expected_result_retourne_pixel_seul(self, verifier, screenshot_gray):
        """With an empty expected_result, no semantic verdict is attached."""
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test"},
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_gray,
            expected_result="",  # nothing expected
        )
        # Pixel-only outcome: the semantic field stays unset.
        assert result.semantic_verified is None

    def test_sans_screenshots_pas_de_semantique(self, verifier):
        """Without screenshots no semantic verification is possible."""
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test"},
            result={"success": True},
            screenshot_before=None,
            screenshot_after=None,
            expected_result="Le fichier est ouvert",
        )
        # No screenshots: verified, but with low confidence.
        assert result.verified is True
        assert result.confidence < 0.5

    @patch("agent_v0.server_v1.replay_verifier.ReplayVerifier._verify_semantic")
    def test_pixel_pas_change_et_expected_result_skip_vlm(
        self, mock_semantic, verifier, screenshot_gray,
    ):
        """Identical pixels + expected_result: the VLM is skipped and a retry is suggested.

        ``_verify_semantic`` is patched so the test cannot reach a real VLM
        endpoint, and so the "not called" claim is asserted directly instead
        of being inferred from ``semantic_verified`` alone.
        """
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test", "x_pct": 0.5, "y_pct": 0.5},
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_gray,  # same image, so no change at all
            expected_result="Le menu s'est ouvert",
        )
        # No pixel change: retry, and the VLM must not have been consulted.
        assert result.verified is False
        assert result.suggestion == "retry"
        assert result.semantic_verified is None
        mock_semantic.assert_not_called()

    @patch("agent_v0.server_v1.replay_verifier.ReplayVerifier._verify_semantic")
    def test_pixel_ok_semantic_ok(
        self, mock_semantic, verifier, screenshot_gray, screenshot_white,
    ):
        """Pixel change + positive semantic verdict: verified, confidence >= 0.7."""
        mock_semantic.return_value = {
            "verified": True,
            "detail": "Le menu est bien ouvert",
            "elapsed_ms": 2000.0,
        }
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test"},
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_white,  # different image, so a change is detected
            expected_result="Le menu s'est ouvert",
        )
        assert result.verified is True
        assert result.semantic_verified is True
        assert result.confidence >= 0.7
        assert "Critic OK" in result.detail

    @patch("agent_v0.server_v1.replay_verifier.ReplayVerifier._verify_semantic")
    def test_pixel_ok_semantic_non(
        self, mock_semantic, verifier, screenshot_gray, screenshot_white,
    ):
        """Pixel change + negative semantic verdict: unexpected change, retry."""
        mock_semantic.return_value = {
            "verified": False,
            "detail": "Une erreur est apparue au lieu du menu",
            "elapsed_ms": 2500.0,
        }
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test"},
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_white,
            expected_result="Le menu s'est ouvert",
        )
        assert result.verified is False
        assert result.semantic_verified is False
        assert result.suggestion == "retry"
        assert "Critic NON" in result.detail

    @patch("agent_v0.server_v1.replay_verifier.ReplayVerifier._verify_semantic")
    def test_vlm_indisponible_fallback_pixel(
        self, mock_semantic, verifier, screenshot_gray, screenshot_white,
    ):
        """Semantic check returning None (VLM unavailable): fall back to pixels."""
        mock_semantic.return_value = None  # simulates the VLM being down
        result = verifier.verify_with_critic(
            action={"type": "click", "action_id": "test"},
            result={"success": True},
            screenshot_before=screenshot_gray,
            screenshot_after=screenshot_white,
            expected_result="Le menu s'est ouvert",
        )
        # Pixel fallback: the change is still detected, no semantic verdict.
        assert result.verified is True
        assert result.semantic_verified is None
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Tests _verify_semantic — parsing de la réponse VLM
|
||||
# =========================================================================
|
||||
|
||||
|
||||
class TestVerifySemantic:
    """Parsing of the VLM reply by ReplayVerifier._verify_semantic."""

    @staticmethod
    def _chat_reply(text):
        """Return a mocked HTTP response whose JSON message content is *text*."""
        reply = MagicMock()
        reply.ok = True
        reply.json.return_value = {"message": {"content": text}}
        return reply

    @patch("requests.post")
    def test_parse_verdict_oui(self, mock_post, verifier, screenshot_white):
        """A ``VERDICT: OUI`` reply is parsed as verified, with its reason kept."""
        mock_post.return_value = self._chat_reply(
            "VERDICT: OUI\nRAISON: Le fichier est bien ouvert"
        )
        verdict = verifier._verify_semantic(
            screenshot_before=screenshot_white,
            screenshot_after=screenshot_white,
            expected_result="Le fichier est ouvert",
        )
        assert verdict is not None
        assert verdict["verified"] is True
        assert "ouvert" in verdict["detail"]

    @patch("requests.post")
    def test_parse_verdict_non(self, mock_post, verifier, screenshot_white):
        """A ``VERDICT: NON`` reply is parsed as not verified."""
        mock_post.return_value = self._chat_reply(
            "VERDICT: NON\nRAISON: L'écran n'a pas changé"
        )
        verdict = verifier._verify_semantic(
            screenshot_before=screenshot_white,
            screenshot_after=screenshot_white,
            expected_result="Le menu s'est ouvert",
        )
        assert verdict is not None
        assert verdict["verified"] is False

    @patch("requests.post")
    def test_vlm_timeout_retourne_none(self, mock_post, verifier, screenshot_white):
        """A request timeout yields None (graceful fallback)."""
        # Only requests.post is patched; the exception class is the real one.
        import requests as _real_requests

        mock_post.side_effect = _real_requests.Timeout("timeout")
        verdict = verifier._verify_semantic(
            screenshot_before=screenshot_white,
            screenshot_after=screenshot_white,
            expected_result="Le fichier est ouvert",
        )
        assert verdict is None

    def test_sans_screenshot_after_retourne_none(self, verifier):
        """Without screenshots, no verification is attempted and None is returned."""
        verdict = verifier._verify_semantic(
            screenshot_before=None,
            screenshot_after=None,
            expected_result="Le fichier est ouvert",
        )
        assert verdict is None
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Tests _merge_results — matrice pixel x sémantique
|
||||
# =========================================================================
|
||||
|
||||
|
||||
class TestMergeResults:
    """The four pixel x semantic combinations handled by ReplayVerifier._merge_results."""

    def test_pixel_ok_sem_ok(self, verifier):
        """Pixel OK + semantic OK: verified, confidence at least 0.7."""
        pixel_outcome = VerificationResult(
            verified=True,
            confidence=0.7,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="continue",
        )
        semantic_outcome = {"verified": True, "detail": "OK", "elapsed_ms": 1000}
        merged = verifier._merge_results(pixel_outcome, semantic_outcome)
        assert merged.verified is True
        assert merged.semantic_verified is True
        assert merged.confidence >= 0.7

    def test_pixel_ok_sem_non(self, verifier):
        """Pixel OK + semantic NO: unexpected change, so retry."""
        pixel_outcome = VerificationResult(
            verified=True,
            confidence=0.7,
            changes_detected=True,
            change_area_pct=5.0,
            suggestion="continue",
        )
        semantic_outcome = {
            "verified": False,
            "detail": "Erreur popup",
            "elapsed_ms": 2000,
        }
        merged = verifier._merge_results(pixel_outcome, semantic_outcome)
        assert merged.verified is False
        assert merged.semantic_verified is False
        assert merged.suggestion == "retry"

    def test_pixel_non_sem_ok(self, verifier):
        """Pixels unchanged + semantic OK: subtle state, so continue."""
        pixel_outcome = VerificationResult(
            verified=False,
            confidence=0.5,
            changes_detected=False,
            change_area_pct=0.1,
            suggestion="retry",
        )
        semantic_outcome = {
            "verified": True,
            "detail": "Onglet déjà actif",
            "elapsed_ms": 1500,
        }
        merged = verifier._merge_results(pixel_outcome, semantic_outcome)
        assert merged.verified is True
        assert merged.semantic_verified is True
        assert merged.suggestion == "continue"

    def test_pixel_non_sem_non(self, verifier):
        """Pixels unchanged + semantic NO: complete failure, reported with confidence >= 0.7."""
        pixel_outcome = VerificationResult(
            verified=False,
            confidence=0.5,
            changes_detected=False,
            change_area_pct=0.0,
            suggestion="retry",
        )
        semantic_outcome = {
            "verified": False,
            "detail": "Rien ne s'est passé",
            "elapsed_ms": 3000,
        }
        merged = verifier._merge_results(pixel_outcome, semantic_outcome)
        assert merged.verified is False
        assert merged.semantic_verified is False
        assert merged.confidence >= 0.7  # high confidence in the failure
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Tests enrichissement intentions (stream_processor)
|
||||
# =========================================================================
|
||||
|
||||
|
||||
class TestEnrichActionsWithIntentions:
    """Tests for stream_processor._enrich_actions_with_intentions.

    ``requests.get`` / ``requests.post`` are patched in every test, so the
    canned responses configured on the mocks stand in for the HTTP replies.
    """

    @staticmethod
    def _enrich_in_temp_dir(actions):
        """Run the enrichment on *actions* inside a throwaway directory.

        The directory contains an empty ``shots`` sub-directory and is removed
        on exit by the context manager, even when the call raises.
        *actions* is passed through as-is, so any change the enrichment makes
        to it is visible to the caller.
        """
        import tempfile

        # Imported here, as in the original tests, so the import happens
        # while the test's patches are active.
        from agent_v0.server_v1.stream_processor import _enrich_actions_with_intentions

        with tempfile.TemporaryDirectory() as tmp_name:
            work_dir = Path(tmp_name)
            (work_dir / "shots").mkdir()
            _enrich_actions_with_intentions(actions, work_dir)

    @patch("requests.post")
    @patch("requests.get")
    def test_enrichissement_parse_reponse_gemma4(self, mock_get, mock_post):
        """A structured reply is parsed into intention / expected_state / expected_result."""
        # requests.get answers OK (original comment: gemma4 is available).
        mock_tags_resp = MagicMock()
        mock_tags_resp.ok = True
        mock_get.return_value = mock_tags_resp

        mock_chat_resp = MagicMock()
        mock_chat_resp.ok = True
        mock_chat_resp.json.return_value = {
            "message": {
                "content": (
                    "INTENTION: Ouvrir le fichier client dans le logiciel\n"
                    "AVANT: Le logiciel est ouvert sur la page d'accueil\n"
                    "APRÈS: Le fichier client est affiché dans la fenêtre"
                )
            }
        }
        mock_post.return_value = mock_chat_resp

        actions = [
            {
                "type": "click",
                "action_id": "act_001",
                "target_spec": {"by_text": "Ouvrir", "window_title": "Logiciel"},
            },
            {
                "type": "wait",
                "action_id": "act_002",
                "duration_ms": 1000,
            },
        ]

        self._enrich_in_temp_dir(actions)

        # The click action must be enriched.
        click_action = actions[0]
        assert click_action.get("intention") == "Ouvrir le fichier client dans le logiciel"
        assert click_action.get("expected_state") == "Le logiciel est ouvert sur la page d'accueil"
        assert click_action.get("expected_result") == "Le fichier client est affiché dans la fenêtre"
        # expected_state must also be copied into target_spec (for the Observer).
        assert (
            click_action["target_spec"]["expected_state"]
            == "Le logiciel est ouvert sur la page d'accueil"
        )

        # The wait action must NOT be enriched.
        assert "intention" not in actions[1]

    @patch("requests.get")
    def test_gemma4_indisponible_pas_de_crash(self, mock_get):
        """If requests.get raises ConnectionError, enrichment is skipped without crashing."""
        mock_get.side_effect = ConnectionError("gemma4 down")

        actions = [
            {"type": "click", "action_id": "act_001", "target_spec": {"by_text": "OK"}},
        ]

        self._enrich_in_temp_dir(actions)

        # No crash and no intention added.
        assert "intention" not in actions[0]

    @patch("requests.post")
    @patch("requests.get")
    def test_reponse_gemma4_malformee(self, mock_get, mock_post):
        """An unstructured free-text reply does not crash and adds no intention."""
        mock_tags = MagicMock()
        mock_tags.ok = True
        mock_get.return_value = mock_tags

        mock_resp = MagicMock()
        mock_resp.ok = True
        mock_resp.json.return_value = {
            "message": {"content": "Je ne comprends pas cette demande."}
        }
        mock_post.return_value = mock_resp

        actions = [
            {"type": "click", "action_id": "act_001", "target_spec": {"by_text": "OK"}},
        ]

        self._enrich_in_temp_dir(actions)

        # No crash, but no intention either.
        assert "intention" not in actions[0]
|
||||
Reference in New Issue
Block a user