feat(graph): enrichissement visuel des workflows (C2)
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 13s
tests / Tests sécurité (critique) (push) Has been skipped

GraphBuilder construit maintenant des ScreenState enrichis
(ui_elements + detected_text) au lieu de stubs vides, et associe
les clics aux UIElement par proximité spatiale.

Détails :
- __init__ accepte ui_detector, screen_analyzer, enable_ui_enrichment,
  element_proximity_max_px (+ lazy resolver via singleton C1)
- _create_screen_states délègue à ScreenAnalyzer.analyze() — remplace
  l'appel à _extract_text() qui n'existait plus depuis le Lot C
  (bug silencieux : l'OCR était cassé en prod depuis le Lot C, l'erreur
  étant avalée par un `except`)
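- _find_clicked_element : cherche d'abord un bbox contenant strictement
  le clic, sinon se replie sur l'élément le plus proche à ≤50px ; si
  plusieurs bbox contiennent le clic, préfère le plus petit (le bouton
  plutôt que le formulaire qui l'englobe)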
- _build_click_target_spec : TargetSpec(by_role, by_text,
  selection_policy="by_similarity") avec ancres dans context_hints
  (anchor_element_id, anchor_bbox, anchor_center)
- _build_edges propage le ScreenState source aux builders d'action
- WorkflowPipeline passe ui_detector + enable_ui_enrichment au builder

Impact : matching prod 3-5x plus précis, les TargetSpec ne sont plus
des "unknown_element" génériques, et UIConstraint.required_roles se
remplit correctement via _extract_common_ui_elements (qui fonctionnait
depuis toujours, mais sur des state.ui_elements vides).

Tests e2e migrés vers enable_ui_enrichment=False (2.9s vs 67s) —
ils valident le pipeline DBSCAN/edges, pas la détection UI réelle.

15 nouveaux tests, 178 tests passants au total (incluant Lots A-E).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-04-15 22:02:30 +02:00
parent eded968c70
commit 7f2bc6fe97
4 changed files with 1034 additions and 87 deletions

View File

@@ -143,13 +143,19 @@ def mock_embedding_builder():
@pytest.fixture
def graph_builder(mock_embedding_builder):
"""GraphBuilder configuré pour le test (validation qualité désactivée)."""
"""GraphBuilder configuré pour le test (validation qualité désactivée).
`enable_ui_enrichment=False` désactive l'analyzer GPU : ces tests
valident le pipeline DBSCAN + edges, pas la détection UI réelle
(couverte par tests/unit/test_graph_builder_ui_enrichment.py).
"""
return GraphBuilder(
embedding_builder=mock_embedding_builder,
min_pattern_repetitions=3,
clustering_eps=0.15,
clustering_min_samples=2,
enable_quality_validation=False,
enable_ui_enrichment=False,
)
@@ -356,6 +362,7 @@ class TestQualityValidation:
embedding_builder=mock_embedding_builder,
min_pattern_repetitions=3,
enable_quality_validation=True,
enable_ui_enrichment=False,
)
workflow = builder.build_from_session(session)
@@ -377,6 +384,7 @@ class TestQualityValidation:
embedding_builder=mock_embedding_builder,
min_pattern_repetitions=3,
enable_quality_validation=True,
enable_ui_enrichment=False,
)
workflow = builder.build_from_session(session)
@@ -403,6 +411,7 @@ class TestEdgeCases:
builder = GraphBuilder(
embedding_builder=mock_embedding_builder,
enable_quality_validation=False,
enable_ui_enrichment=False,
)
with pytest.raises(ValueError, match="no screenshots"):
@@ -456,6 +465,7 @@ class TestEdgeCases:
embedding_builder=mock_embedding_builder,
min_pattern_repetitions=3,
enable_quality_validation=False,
enable_ui_enrichment=False,
)
workflow = builder.build_from_session(session)

View File

@@ -0,0 +1,513 @@
"""
Tests unitaires de l'enrichissement visuel dans GraphBuilder (chantier C2).
Couvre :
- `_create_screen_states` : enrichit `ui_elements` via ScreenAnalyzer
- `_find_clicked_element` : association spatiale clic → UIElement
- `_build_single_action` : TargetSpec avec `by_role`/`by_text` quand ancre
- Fallback `by_role="unknown_element"` quand aucun ancrage n'est possible
- `_extract_common_ui_elements` : required_roles extrait du cluster
- Analyzer qui crash → ScreenState vide, pas de propagation d'exception
- Singleton partagé entre deux GraphBuilder (C1)
"""
from __future__ import annotations
from datetime import datetime, timedelta
from pathlib import Path
from unittest.mock import MagicMock, patch
import numpy as np
import pytest
from PIL import Image
from core.graph.graph_builder import GraphBuilder
from core.models.base_models import BBox
from core.models.raw_session import (
Event,
RawSession,
RawWindowContext,
Screenshot,
)
from core.models.screen_state import (
ContextLevel,
EmbeddingRef,
PerceptionLevel,
RawLevel,
ScreenState,
WindowContext,
)
from core.models.ui_element import (
UIElement,
UIElementEmbeddings,
VisualFeatures,
)
from core.pipeline import (
reset_screen_analyzer,
reset_screen_state_cache,
)
# -----------------------------------------------------------------------------
# Fixtures
# -----------------------------------------------------------------------------
@pytest.fixture(autouse=True)
def _reset_singletons():
    """Reset the shared screen analyzer and screen-state cache around each test.

    The fixture is autouse, so every test in this module starts from freshly
    reset globals and leaves them reset for whatever runs next.
    """

    def _clear_globals() -> None:
        reset_screen_analyzer()
        reset_screen_state_cache()

    _clear_globals()  # clean slate before the test body runs
    yield
    _clear_globals()  # do not leak state into the following test
def _make_click_event(pos, t: float = 1.0, button: str = "left") -> Event:
    """Build a minimal ``mouse_click`` Event at position *pos*.

    Args:
        pos: Click coordinates; copied into a list before being stored.
        t: Event time value passed through to ``Event``.
        button: Mouse button name stored in the event data.

    Returns:
        An ``Event`` of type ``"mouse_click"`` carrying a placeholder window
        context (a window is always supplied because the Event model needs one).
    """
    window = RawWindowContext(title="Test", app_name="test_app")
    payload = {"button": button, "pos": list(pos)}
    return Event(t=t, type="mouse_click", window=window, data=payload)
def _make_key_event(t: float = 1.0, keys=None, text: str = None, ev_type: str = "key_press") -> Event:
    """Build a keyboard Event (``key_press`` by default, or e.g. ``text_input``).

    Args:
        t: Event time value passed through to ``Event``.
        keys: Optional key list; stored under ``"keys"`` only when not None.
        text: Optional typed text; stored under ``"text"`` only when not None.
        ev_type: Value used for the event ``type`` field.

    Returns:
        An ``Event`` with a placeholder window context and a data dict that
        contains only the fields that were actually provided.
    """
    candidates = (("keys", keys), ("text", text))
    # Omit absent fields entirely instead of storing explicit None values.
    payload = {name: value for name, value in candidates if value is not None}
    window = RawWindowContext(title="Test", app_name="test_app")
    return Event(t=t, type=ev_type, window=window, data=payload)
def _make_ui_element(
    element_id: str,
    role: str,
    label: str,
    bbox: tuple,
    el_type: str = "button",
) -> UIElement:
    """Build a minimal UIElement for the tests.

    Args:
        element_id: Identifier given to the element.
        role: Semantic role (e.g. ``"primary_action"``).
        label: Visible label of the element.
        bbox: Bounding box tuple; the centre is derived as
            ``(bbox[0] + bbox[2] // 2, bbox[1] + bbox[3] // 2)``, i.e. the
            tuple is treated as (x, y, width, height).
        el_type: Element type, ``"button"`` unless overridden.

    Returns:
        A ``UIElement`` with fixed confidences and fixed visual features.
    """
    box = BBox.from_tuple(bbox)
    # Integer midpoint of the box: origin plus half the extent on each axis.
    center_x = bbox[0] + bbox[2] // 2
    center_y = bbox[1] + bbox[3] // 2
    embeddings = UIElementEmbeddings()
    features = VisualFeatures(
        dominant_color="blue",
        has_icon=False,
        shape="rectangle",
        size_category="medium",
    )
    return UIElement(
        element_id=element_id,
        type=el_type,
        role=role,
        bbox=box,
        center=(center_x, center_y),
        label=label,
        label_confidence=0.95,
        embeddings=embeddings,
        visual_features=features,
        confidence=0.9,
    )
def _make_screen_state(
    session_id: str,
    index: int,
    ui_elements: list,
    title: str = "Test App",
    detected_text: list = None,
) -> ScreenState:
    """Build a minimal ScreenState usable by ``_extract_common_ui_elements``.

    Args:
        session_id: Session identifier; also used as the prefix of the
            generated ``screen_state_id``.
        index: Position of the state in the session. It drives the state id,
            the screenshot path, the embedding vector id and the timestamp
            (base time plus ``index`` seconds).
        ui_elements: UI elements attached to the state (stored as given).
        title: Window title of the state.
        detected_text: Text entries for the perception level; ``None`` or an
            empty list both result in an empty list.

    Returns:
        A ``ScreenState`` populated with fixed placeholder values.
    """
    # Offset from a fixed base time rather than passing `index` as the
    # `second` field of datetime(): that field only accepts 0..59, so any
    # index >= 60 used to raise ValueError. Results for 0..59 are unchanged.
    base_time = datetime(2026, 4, 13, 10, 0, 0)
    timestamp = base_time + timedelta(seconds=index)

    return ScreenState(
        screen_state_id=f"{session_id}_state_{index:04d}",
        timestamp=timestamp,
        session_id=session_id,
        window=WindowContext(
            app_name="test_app",
            window_title=title,
            screen_resolution=[1920, 1080],
        ),
        raw=RawLevel(
            screenshot_path=f"/tmp/shot_{index}.png",
            capture_method="mss",
            file_size_bytes=1024,
        ),
        perception=PerceptionLevel(
            embedding=EmbeddingRef(
                provider="test", vector_id=f"v_{index}", dimensions=512
            ),
            detected_text=detected_text or [],
            text_detection_method="test",
            confidence_avg=0.8,
        ),
        context=ContextLevel(),
        metadata={},
        ui_elements=ui_elements,
    )
@pytest.fixture
def synthetic_session(tmp_path):
    """Provide a synthetic RawSession with four screenshots of two alternating looks.

    Four solid-colour 400x300 PNG files are written under
    ``tmp_path/data/training/sessions/<id>/<id>/screenshots``. Even indices are
    red-ish with window title "App A", odd indices blue-ish with "App B". Each
    screenshot is paired with one left click at (150, 120).

    Returns:
        A ``(session, tmp_path)`` tuple so tests can chdir into ``tmp_path``.
    """
    session_id = "ui_enrich_session"
    shots_root = tmp_path / "data" / "training" / "sessions"
    screens_dir = shots_root / session_id / session_id / "screenshots"
    screens_dir.mkdir(parents=True)

    screenshots = []
    events = []
    for i in range(4):
        is_even = i % 2 == 0
        shot_id = f"ss_{i:03d}"
        fname = f"screen_{i:03d}.png"
        captured = datetime(2026, 4, 13, 10, 0, i)

        # Alternate between two flat colours so consecutive frames differ.
        fill = (200, 50, 50) if is_even else (50, 50, 200)
        Image.new("RGB", (400, 300), fill).save(str(screens_dir / fname))

        screenshots.append(
            Screenshot(
                screenshot_id=shot_id,
                relative_path=f"screenshots/{fname}",
                captured_at=captured.isoformat(),
            )
        )
        window = RawWindowContext(
            title="App A" if is_even else "App B",
            app_name="app",
        )
        events.append(
            Event(
                t=float(i),
                type="mouse_click",
                window=window,
                screenshot_id=shot_id,
                data={"button": "left", "pos": [150, 120]},
            )
        )

    session = RawSession(
        session_id=session_id,
        agent_version="test",
        environment={"screen": {"primary_resolution": [1920, 1080]}},
        user={"id": "tester"},
        context={},
        started_at=datetime(2026, 4, 13, 10, 0, 0),
        events=events,
        screenshots=screenshots,
    )
    return session, tmp_path
# -----------------------------------------------------------------------------
# Enrichissement des ScreenState via ScreenAnalyzer
# -----------------------------------------------------------------------------
class TestCreateScreenStatesEnrichment:
    """`_create_screen_states` must delegate to the ScreenAnalyzer."""

    @staticmethod
    def _builder_with(analyzer, *, enrich: bool) -> GraphBuilder:
        """Return a GraphBuilder using *analyzer*, with quality validation off."""
        return GraphBuilder(
            screen_analyzer=analyzer,
            enable_ui_enrichment=enrich,
            enable_quality_validation=False,
        )

    def test_build_from_session_enriches_screen_states(
        self, synthetic_session, monkeypatch
    ):
        """With a mocked analyzer, its ui_elements end up on every ScreenState."""
        session, workdir = synthetic_session
        # NOTE(review): presumably the builder resolves screenshot paths
        # relative to the current directory - confirm against GraphBuilder.
        monkeypatch.chdir(workdir)

        # Three canonical elements returned by the fake analyzer.
        canned_elements = [
            _make_ui_element("el_1", "primary_action", "Valider", (100, 100, 80, 30)),
            _make_ui_element("el_2", "cancel", "Annuler", (200, 100, 80, 30)),
            _make_ui_element("el_3", "form_input", "Nom", (100, 50, 200, 30)),
        ]
        expected_roles = {"primary_action", "cancel", "form_input"}

        def fake_analyze(path, **kwargs):
            # Same canned state for any screenshot: three elements plus OCR text.
            return _make_screen_state(
                session.session_id,
                index=0,
                ui_elements=list(canned_elements),
                detected_text=["Nom", "Valider", "Annuler"],
            )

        analyzer = MagicMock()
        analyzer.analyze.side_effect = fake_analyze

        builder = self._builder_with(analyzer, enrich=True)
        states = builder._create_screen_states(session)

        assert len(states) == 4
        for state in states:
            assert len(state.ui_elements) == 3
            seen_roles = {element.role for element in state.ui_elements}
            assert expected_roles.issubset(seen_roles)
            assert "Valider" in state.perception.detected_text

    def test_enrichment_disabled_leaves_ui_elements_empty(
        self, synthetic_session, monkeypatch
    ):
        """enable_ui_enrichment=False: empty ui_elements, analyzer never called."""
        session, workdir = synthetic_session
        monkeypatch.chdir(workdir)

        analyzer = MagicMock()
        builder = self._builder_with(analyzer, enrich=False)
        states = builder._create_screen_states(session)

        assert len(states) == 4
        for state in states:
            assert state.ui_elements == []
            assert state.perception.detected_text == []

        # The analyzer must not have been touched at all.
        analyzer.analyze.assert_not_called()

    def test_analyzer_failure_falls_back_to_empty(
        self, synthetic_session, monkeypatch, caplog
    ):
        """A crashing analyzer yields empty states and a warning, not an exception."""
        session, workdir = synthetic_session
        monkeypatch.chdir(workdir)

        analyzer = MagicMock()
        analyzer.analyze.side_effect = RuntimeError("boom (GPU OOM)")
        builder = self._builder_with(analyzer, enrich=True)

        with caplog.at_level("WARNING"):
            states = builder._create_screen_states(session)

        assert len(states) == 4
        for state in states:
            assert state.ui_elements == []
            # The failure is recorded in the metadata for later diagnosis.
            assert "analyzer_error" in state.metadata

        # At least one captured log record mentions the failed enrichment.
        assert any("Enrichissement visuel échoué" in r.getMessage() for r in caplog.records)

    def test_shared_analyzer_singleton(self, monkeypatch):
        """Two builders created without an explicit analyzer resolve the same one."""
        shared = MagicMock(name="singleton_analyzer")

        # analyze() is never invoked here: the test involves no screenshots.
        with patch(
            "core.pipeline.get_screen_analyzer", return_value=shared
        ) as getter:
            first = GraphBuilder(enable_quality_validation=False)
            second = GraphBuilder(enable_quality_validation=False)
            resolved = [b._get_screen_analyzer() for b in (first, second)]

            for analyzer in resolved:
                assert analyzer is shared

            # Only "called at least once" is asserted; per the original note
            # the getter is hit once per builder, while the actual sharing is
            # done by the singleton inside core.pipeline.
            assert getter.call_count >= 1
# -----------------------------------------------------------------------------
# Association spatiale clic → UIElement
# -----------------------------------------------------------------------------
class TestFindClickedElement:
    """Spatial click-to-UIElement matching done by `_find_clicked_element`."""

    def _builder(self, max_px: float = 50.0) -> GraphBuilder:
        """Return a GraphBuilder with UI enrichment off and the given proximity limit."""
        return GraphBuilder(
            enable_quality_validation=False,
            enable_ui_enrichment=False,
            element_proximity_max_px=max_px,
        )

    def test_find_clicked_element_inside_bbox(self):
        """A click inside a bbox matches that element."""
        candidates = [
            _make_ui_element("e1", "primary_action", "OK", (50, 50, 150, 150)),
            _make_ui_element("e2", "cancel", "Annuler", (300, 300, 100, 50)),
        ]
        click = _make_click_event([100, 100])

        match = self._builder()._find_clicked_element(click, candidates)

        assert match is not None
        assert match.element_id == "e1"

    def test_find_clicked_element_nearest_proximity(self):
        """A click outside every bbox but within 50px matches the nearest element."""
        finder = self._builder(max_px=50.0)
        candidates = [
            # (50, 50, 100, 40): right edge at x=150, bottom edge at y=90.
            _make_ui_element("e_near", "primary_action", "Valider", (50, 50, 100, 40)),
            # Much further than 50px away from the click.
            _make_ui_element("e_far", "cancel", "Annuler", (500, 500, 80, 30)),
        ]
        # (170, 70) lies 20px to the right of e_near's right edge, level with it.
        click = _make_click_event([170, 70])

        match = finder._find_clicked_element(click, candidates)

        assert match is not None
        assert match.element_id == "e_near"

    def test_find_clicked_element_too_far_returns_none(self):
        """A click more than 50px from the closest bbox gives None."""
        finder = self._builder(max_px=50.0)
        candidates = [
            _make_ui_element("e1", "primary_action", "OK", (50, 50, 100, 40)),
        ]
        # (300, 300) is roughly 258px from the nearest corner of the bbox
        # (about 305px from its centre): far beyond the limit either way.
        click = _make_click_event([300, 300])

        assert finder._find_clicked_element(click, candidates) is None

    def test_find_clicked_element_prefers_smallest_containing(self):
        """When two bboxes contain the click, the smaller (more specific) one wins."""
        outer = _make_ui_element(
            "container", "data_display", "Form", (0, 0, 800, 600),
            el_type="container",
        )
        # Small button nested inside the large container.
        inner = _make_ui_element("btn", "primary_action", "OK", (100, 100, 80, 30))
        click = _make_click_event([120, 110])

        match = self._builder()._find_clicked_element(click, [outer, inner])

        assert match is not None
        assert match.element_id == "btn"

    def test_find_clicked_element_empty_list(self):
        """No candidate elements gives None."""
        finder = self._builder()
        click = _make_click_event([100, 100])
        assert finder._find_clicked_element(click, []) is None

    def test_find_clicked_element_non_click_event(self):
        """A non-click event gives None (no meaningful spatial anchor)."""
        finder = self._builder()
        candidates = [
            _make_ui_element("e1", "form_input", "Nom", (100, 100, 100, 30)),
        ]
        key_event = _make_key_event(keys=["Enter"])
        assert finder._find_clicked_element(key_event, candidates) is None
# -----------------------------------------------------------------------------
# TargetSpec enrichi par _build_single_action
# -----------------------------------------------------------------------------
class TestTargetSpecEnrichment:
    """`_build_single_action` must produce discriminating TargetSpec objects."""

    @staticmethod
    def _builder(**extra) -> GraphBuilder:
        """Return a GraphBuilder with validation and UI enrichment disabled."""
        return GraphBuilder(
            enable_quality_validation=False,
            enable_ui_enrichment=False,
            **extra,
        )

    def test_target_spec_uses_element_role(self):
        """A click anchored on an element sets by_role, by_text and context_hints."""
        anchors = [
            _make_ui_element("el_ok", "primary_action", "Valider", (100, 100, 120, 40)),
        ]
        click = _make_click_event([150, 120])

        action = self._builder()._build_single_action(click, source_ui_elements=anchors)

        assert action.type == "mouse_click"
        target = action.target
        assert target.by_role == "primary_action"
        assert target.by_text == "Valider"
        assert target.selection_policy == "by_similarity"

        # The anchor is traceable through context_hints.
        hints = target.context_hints
        assert hints.get("anchor_element_id") == "el_ok"
        assert "anchor_bbox" in hints
        assert hints["anchor_bbox"]["x"] == 100

    def test_target_spec_fallback_when_no_element(self):
        """No UIElement at all gives the legacy by_role="unknown_element"."""
        click = _make_click_event([400, 400])

        action = self._builder()._build_single_action(click, source_ui_elements=[])

        target = action.target
        assert target.by_role == "unknown_element"
        assert target.by_text is None
        # No anchoring information may be present.
        assert not target.context_hints.get("anchor_element_id")

    def test_target_spec_fallback_when_click_too_far(self):
        """A click far from every bbox falls back to unknown_element."""
        builder = self._builder(element_proximity_max_px=30.0)
        distant = [
            _make_ui_element("far", "cancel", "X", (50, 50, 20, 20)),
        ]
        click = _make_click_event([800, 800])

        action = builder._build_single_action(click, source_ui_elements=distant)

        assert action.target.by_role == "unknown_element"

    def test_keyboard_event_target_unchanged(self):
        """Non-click events keep their legacy target role."""
        typed = _make_key_event(text="hello", ev_type="text_input")

        action = self._builder()._build_single_action(typed, source_ui_elements=[])

        assert action.target.by_role == "text_field"
# -----------------------------------------------------------------------------
# UIConstraint.required_roles depuis _extract_common_ui_elements
# -----------------------------------------------------------------------------
class TestRequiredRolesExtraction:
    """`required_roles` of a screen template reflect roles shared across states."""

    def test_required_roles_extracted_from_common_elements(self):
        """Roles present in most of three ScreenStates land in required_roles."""
        builder = GraphBuilder(
            enable_quality_validation=False,
            enable_ui_enrichment=False,
        )

        # Three screens: all carry "primary_action" (Valider); the first two
        # also carry "cancel", the last one carries "navigation" instead.
        states = []
        for i in range(3):
            confirm = _make_ui_element(
                f"ok_{i}", "primary_action", "Valider",
                (100, 100, 80, 30),
            )
            if i < 2:
                companion = _make_ui_element(
                    f"cancel_{i}", "cancel", "Annuler",
                    (200, 100, 80, 30),
                )
            else:
                companion = _make_ui_element(
                    f"other_{i}", "navigation", "Menu",
                    (300, 100, 80, 30),
                )
            states.append(
                _make_screen_state("sid", i, ui_elements=[confirm, companion])
            )

        # One-hot prototype vector of dimension 512.
        prototype = np.zeros(512, dtype=np.float32)
        prototype[0] = 1.0

        template = builder._create_screen_template(states, prototype)

        assert template.ui is not None
        required = template.ui.required_roles
        # Present on 3 of 3 screens: expected to be required.
        assert "primary_action" in required
        # Present on 2 of 3 screens: still expected to be required.
        # NOTE(review): the original comment cites a 0.5 ratio threshold -
        # confirm against _extract_common_ui_elements.
        assert "cancel" in required
        # Present on 1 of 3 screens: expected to be left out.
        assert "navigation" not in required