feat(scoring): EdgeScorer utilise la vraie source_similarity (Lot B)
Avant : source_similarity=1.0 hardcodé dans _check_preconditions
-> la contrainte EdgeConstraints.min_source_similarity était
silencieusement désactivée. Un edge passait toujours.
Après : propagation ExecutionLoop -> workflow_pipeline -> EdgeScorer
- select_best/rank/score_edge/_check_preconditions acceptent
source_similarity: float (kwargs-only)
- get_next_action() le propage
- execution_loop passe la confidence issue de match_current_state
La contrainte min_source_similarity est opérationnelle pour la
première fois. Preuve concrète par test_min_source_similarity_fail
et test_low_similarity_blocks_edge (edge rejeté si sim < seuil).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
337
tests/unit/test_edge_scorer.py
Normal file
337
tests/unit/test_edge_scorer.py
Normal file
@@ -0,0 +1,337 @@
|
||||
"""
|
||||
Tests unitaires de l'EdgeScorer (C3).
|
||||
|
||||
Couvre :
|
||||
- Filtre dur : pre_conditions échouent → edge rejeté
|
||||
- Ranking : edge avec success_rate le plus élevé gagne
|
||||
- Tiebreak sur success_rate
|
||||
- Retour None si aucun edge valide
|
||||
- Target match via ui_elements
|
||||
- Mode legacy strategy="first"
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import pytest
|
||||
|
||||
from core.models.screen_state import (
|
||||
ContextLevel,
|
||||
EmbeddingRef,
|
||||
PerceptionLevel,
|
||||
RawLevel,
|
||||
ScreenState,
|
||||
WindowContext,
|
||||
)
|
||||
from core.models.ui_element import UIElement, UIElementEmbeddings, VisualFeatures
|
||||
from core.models.base_models import BBox
|
||||
from core.models.workflow_graph import (
|
||||
Action,
|
||||
EdgeConstraints,
|
||||
EdgeStats,
|
||||
PostConditions,
|
||||
TargetSpec,
|
||||
WorkflowEdge,
|
||||
)
|
||||
from core.pipeline.edge_scorer import EdgeScorer
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_edge(
    edge_id: str,
    by_text: str | None = None,
    by_role: str | None = None,
    success_rate: float | None = None,
    execution_count: int = 0,
    last_executed: datetime | None = None,
    required_window_title: str | None = None,
    required_app_name: str | None = None,
    min_source_similarity: float = 0.80,
) -> WorkflowEdge:
    """Build a ``WorkflowEdge`` from node "n1" to node "n2" for the tests.

    The edge carries a ``mouse_click`` action whose target is described by
    ``by_text`` / ``by_role``, default ``PostConditions`` and constraints
    built from the window title, app name and ``min_source_similarity``
    arguments (a missing title or app name is passed as an empty string).

    Execution statistics are only filled in when ``success_rate`` is given
    AND ``execution_count`` is strictly positive; otherwise the edge keeps a
    default ``EdgeStats()``.

    NOTE(review): the original indentation was lost in extraction;
    ``last_executed`` is assumed to be assigned together with the other
    statistics — confirm against the real file.
    """
    edge_stats = EdgeStats()
    has_history = success_rate is not None and execution_count > 0
    if has_history:
        edge_stats.execution_count = execution_count
        # Successes are derived from the rate, rounded to the nearest integer.
        edge_stats.success_count = int(round(success_rate * execution_count))
        edge_stats.failure_count = execution_count - edge_stats.success_count
        edge_stats.last_executed = last_executed

    click = Action(
        type="mouse_click",
        target=TargetSpec(by_text=by_text, by_role=by_role),
    )
    edge_constraints = EdgeConstraints(
        required_window_title=required_window_title or "",
        required_app_name=required_app_name or "",
        min_source_similarity=min_source_similarity,
    )

    return WorkflowEdge(
        edge_id=edge_id,
        from_node="n1",
        to_node="n2",
        action=click,
        constraints=edge_constraints,
        post_conditions=PostConditions(),
        stats=edge_stats,
    )
|
||||
|
||||
|
||||
def _make_ui_element(
    element_id: str, label: str, role: str = "button", type_: str = "button"
) -> UIElement:
    """Build a ``UIElement`` with fixed geometry and appearance for the tests.

    Only the identifier, label, role and type vary. The element always has a
    100x30 bounding box at the origin with its center at (50, 15), default
    embeddings, a black medium rectangle without icon as visual features, and
    0.9 for both the label confidence and the overall confidence.
    """
    box = BBox(x=0, y=0, width=100, height=30)
    default_embeddings = UIElementEmbeddings()
    look = VisualFeatures(
        dominant_color="#000",
        has_icon=False,
        shape="rectangle",
        size_category="medium",
    )
    return UIElement(
        element_id=element_id,
        type=type_,
        role=role,
        bbox=box,
        center=(50, 15),
        label=label,
        label_confidence=0.9,
        embeddings=default_embeddings,
        visual_features=look,
        confidence=0.9,
    )
|
||||
|
||||
|
||||
def _make_state(
    window_title: str = "Firefox",
    app_name: str = "firefox",
    detected_text: list[str] | None = None,
    ui_elements: list[UIElement] | None = None,
) -> ScreenState:
    """Build a minimal ``ScreenState`` for the tests.

    The state is stamped with the current time, uses fixed identifiers
    ("s1" / "sess"), a 1920x1080 window context, an empty raw screenshot
    description and a 512-dimension placeholder embedding reference.
    ``detected_text`` and ``ui_elements`` fall back to empty lists when
    omitted (or empty).
    """
    now = datetime.now()
    window_ctx = WindowContext(
        app_name=app_name,
        window_title=window_title,
        screen_resolution=[1920, 1080],
    )
    raw_level = RawLevel(screenshot_path="", capture_method="t", file_size_bytes=0)
    perception_level = PerceptionLevel(
        embedding=EmbeddingRef(provider="t", vector_id="v", dimensions=512),
        detected_text=detected_text or [],
        text_detection_method="none",
        confidence_avg=0.0,
    )
    return ScreenState(
        screen_state_id="s1",
        timestamp=now,
        session_id="sess",
        window=window_ctx,
        raw=raw_level,
        perception=perception_level,
        context=ContextLevel(),
        ui_elements=ui_elements or [],
    )
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Tests
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestEdgeScorerBasic:
    """Baseline behaviour of ``EdgeScorer.select_best``."""

    def test_returns_none_on_empty(self):
        """An empty candidate list yields no selection."""
        assert EdgeScorer().select_best([]) is None

    def test_single_edge_returned_when_no_constraints(self):
        """A lone edge without window/app requirements is selected."""
        edge = _make_edge("e1")
        state = _make_state()
        assert EdgeScorer().select_best([edge], screen_state=state) == edge

    def test_strategy_first_returns_first_edge(self):
        """``strategy="first"`` returns the first edge of the list.

        The second edge has the higher success rate, so a ranking-based
        selection would be expected to prefer it; the legacy strategy must
        not.
        """
        e1 = _make_edge("e1", success_rate=0.1, execution_count=10)
        e2 = _make_edge("e2", success_rate=0.9, execution_count=10)
        state = _make_state()
        result = EdgeScorer().select_best(
            [e1, e2], screen_state=state, strategy="first"
        )
        # Guard first so a regression fails on an assertion rather than on an
        # AttributeError raised by ``None.edge_id``.
        assert result is not None
        assert result.edge_id == "e1"
|
||||
|
||||
|
||||
class TestEdgeScorerFilter:
    """Hard filtering on the window-title and app-name constraints."""

    def test_rejects_edge_with_wrong_window(self):
        """An edge requiring a different window title must be rejected."""
        chrome_only = _make_edge("e1", required_window_title="Chrome")
        firefox_state = _make_state(window_title="Firefox")
        scorer = EdgeScorer()
        selected = scorer.select_best([chrome_only], screen_state=firefox_state)
        assert selected is None

    def test_rejects_edge_with_wrong_app(self):
        """An edge requiring a different application name must be rejected."""
        chrome_only = _make_edge("e1", required_app_name="chrome")
        firefox_state = _make_state(app_name="firefox")
        scorer = EdgeScorer()
        selected = scorer.select_best([chrome_only], screen_state=firefox_state)
        assert selected is None

    def test_keeps_valid_edge_when_one_rejected(self):
        """Simple case: two edges, only one of them valid.

        The valid edge requires "Firefox" while the window is titled
        "Firefox Browser", and it is still expected to be selected.
        """
        candidates = [
            _make_edge("e_bad", required_window_title="NopeApp"),
            _make_edge("e_ok", required_window_title="Firefox"),
        ]
        current = _make_state(window_title="Firefox Browser")
        selected = EdgeScorer().select_best(candidates, screen_state=current)
        assert selected is not None
        assert selected.edge_id == "e_ok"
|
||||
|
||||
|
||||
class TestEdgeScorerRanking:
    """Ordering of valid edges by success rate, target match and recency."""

    def test_higher_success_rate_wins(self):
        """Two valid edges: the one with the better success_rate wins."""
        e_low = _make_edge("e_low", success_rate=0.20, execution_count=20)
        e_high = _make_edge("e_high", success_rate=0.95, execution_count=20)
        state = _make_state()
        result = EdgeScorer().select_best([e_low, e_high], screen_state=state)
        # Guard first so a regression fails on an assertion rather than on an
        # AttributeError raised by ``None.edge_id``.
        assert result is not None
        assert result.edge_id == "e_high"

    def test_rank_returns_sorted_by_score(self):
        """``rank`` returns the scored edges best-first."""
        e1 = _make_edge("e1", success_rate=0.3, execution_count=10)
        e2 = _make_edge("e2", success_rate=0.9, execution_count=10)
        e3 = _make_edge("e3", success_rate=0.6, execution_count=10)
        state = _make_state()
        ranked = EdgeScorer().rank([e1, e2, e3], screen_state=state)
        ids = [s.edge.edge_id for s in ranked]
        assert ids == ["e2", "e3", "e1"]

    def test_target_match_boost(self):
        """An edge matching a UI element beats one without any match."""
        e_match = _make_edge("e_match", by_text="Submit")
        e_no_match = _make_edge("e_no_match", by_text="DoesNotExist")
        ui = _make_ui_element("btn1", label="Submit")
        state = _make_state(ui_elements=[ui])

        ranked = EdgeScorer().rank([e_no_match, e_match], screen_state=state)
        assert ranked[0].edge.edge_id == "e_match"
        assert ranked[0].target_match > ranked[1].target_match

    def test_recency_bonus_for_recent_execution(self):
        """With equal success rates, the more recently executed edge wins."""
        # One reference instant so both timestamps share the same base.
        now = datetime.now()
        recent = _make_edge(
            "recent",
            success_rate=0.5,
            execution_count=10,
            last_executed=now - timedelta(hours=1),
        )
        old = _make_edge(
            "old",
            success_rate=0.5,
            execution_count=10,
            last_executed=now - timedelta(days=30),
        )
        scorer = EdgeScorer()
        state = _make_state()
        ranked = scorer.rank([old, recent], screen_state=state)
        # Same success_rate, recency decides → "recent" wins.
        assert ranked[0].edge.edge_id == "recent"
|
||||
|
||||
|
||||
class TestEdgeScorerNoValidEdge:
    """Behaviour when no candidate can be validated against the screen."""

    def test_all_edges_rejected_returns_none(self):
        """Every edge requires another window title → nothing is selected."""
        candidates = [
            _make_edge("e1", required_window_title="AppA"),
            _make_edge("e2", required_window_title="AppB"),
        ]
        current = _make_state(window_title="AppC")
        selected = EdgeScorer().select_best(candidates, screen_state=current)
        assert selected is None

    def test_no_screen_state_does_not_filter(self):
        """Without a ScreenState the pre_conditions cannot be evaluated → let the edge through."""
        strict_edge = _make_edge("e1", required_window_title="StrictApp")
        selected = EdgeScorer().select_best([strict_edge], screen_state=None)
        assert selected is not None
|
||||
|
||||
|
||||
class TestEdgeScorerSourceSimilarity:
    """Lot B — the `min_source_similarity` constraint is effective again."""

    def test_min_source_similarity_pass(self):
        """Edge accepted when source_similarity >= min_source_similarity."""
        candidate = _make_edge("e1", min_source_similarity=0.80)
        current = _make_state()
        scorer = EdgeScorer()
        selected = scorer.select_best(
            [candidate], screen_state=current, source_similarity=0.90
        )
        assert selected is not None
        assert selected.edge_id == "e1"

    def test_min_source_similarity_fail(self):
        """Edge rejected when source_similarity < min_source_similarity.

        This test shows concretely that the filter is no longer silently
        disabled (before Lot B it always received a hard-coded 1.0).
        """
        candidate = _make_edge("e1", min_source_similarity=0.80)
        current = _make_state()
        scorer = EdgeScorer()
        selected = scorer.select_best(
            [candidate], screen_state=current, source_similarity=0.50
        )
        assert selected is None

    def test_min_source_similarity_default_is_pass_through(self):
        """Default source_similarity=1.0 → no edge is rejected on that ground."""
        candidate = _make_edge("e1", min_source_similarity=0.99)
        current = _make_state()
        # No source_similarity supplied → default 1.0 → edge accepted.
        selected = EdgeScorer().select_best([candidate], screen_state=current)
        assert selected is not None

    def test_tiebreak_unchanged_with_similarity(self):
        """With an acceptable similarity on both sides, the success_rate
        tiebreak is unchanged (no regression of the existing behaviour)."""
        weak = _make_edge(
            "e_low",
            success_rate=0.20,
            execution_count=20,
            min_source_similarity=0.70,
        )
        strong = _make_edge(
            "e_high",
            success_rate=0.95,
            execution_count=20,
            min_source_similarity=0.70,
        )
        current = _make_state()
        scored = EdgeScorer().rank(
            [weak, strong], screen_state=current, source_similarity=0.85
        )
        # Both pass the filter; e_high wins on success_rate.
        best, runner_up = scored[0], scored[1]
        assert best.edge.edge_id == "e_high"
        assert best.passed_preconditions is True
        assert runner_up.passed_preconditions is True

    def test_similarity_filters_before_ranking(self):
        """Of two edges, the one whose min_source_similarity is violated is
        rejected even though it has the better success_rate."""
        strict_but_successful = _make_edge(
            "e_strict_high",
            success_rate=0.95,
            execution_count=20,
            min_source_similarity=0.90,
        )
        loose_but_weak = _make_edge(
            "e_loose_low",
            success_rate=0.30,
            execution_count=20,
            min_source_similarity=0.50,
        )
        current = _make_state()
        # Source similarity 0.70 → e_strict_high rejected, e_loose_low accepted.
        selected = EdgeScorer().select_best(
            [strict_but_successful, loose_but_weak],
            screen_state=current,
            source_similarity=0.70,
        )
        assert selected is not None
        assert selected.edge_id == "e_loose_low"

    def test_score_edge_exposes_precondition_reason(self):
        """For telemetry: the failure reason mentions the similarity."""
        candidate = _make_edge("e1", min_source_similarity=0.80)
        current = _make_state()
        outcome = EdgeScorer().score_edge(
            candidate, screen_state=current, source_similarity=0.40
        )
        assert outcome.passed_preconditions is False
        # Accept both the accented and unaccented spelling; the leading
        # letter is left out so that its case does not matter.
        assert any(
            fragment in outcome.precondition_reason
            for fragment in ("imilarité", "imilarite")
        )
|
||||
Reference in New Issue
Block a user