feat: câblage complet V4 — stratégie UIA + surface profile

Pipeline V4 câblé de bout en bout :
  RawTrace (avec uia_snapshot) → IRBuilder → Action._enrichment
  WorkflowIR → ExecutionCompiler (avec SurfaceProfile) → ExecutionPlan
  ExecutionPlan → runner → target_spec (avec uia_target + resolve_order)

ResolutionStrategy étendu :
- Champs UIA : uia_name, uia_control_type, uia_automation_id, uia_parent_path
- Champs DOM : dom_selector, dom_xpath, dom_url_pattern (préparation web)

ExecutionCompiler.compile(surface_profile=...) :
- Timeouts/retries tirés du profil (citrix=15s/3x, web=5s/1x, natif=8s/2x)
- UIA primaire seulement si surface=WINDOWS_NATIVE et uia_available
- Citrix ignore UIA même si snapshot présent (UIA ne marche pas dans Citrix)

IRBuilder lit evt['uia_snapshot'] et le stocke dans action._enrichment
(à remplir par l'agent Windows pendant l'enregistrement via lea_uia.exe)

execution_plan_runner propage uia_target et dom_target dans target_spec
pour que l'agent Windows puisse les consommer au runtime.

11 tests de câblage E2E :
- Profils (Citrix/web/natif) imposent bien les timeouts
- Stratégie UIA créée quand snapshot+surface OK
- Stratégie UIA bloquée sur Citrix
- IRBuilder propage uia_snapshot
- Runner produit target_spec avec uia_target + resolve_order=['uia', 'ocr', 'vlm']

496 tests au total, 0 régression.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-04-10 11:02:51 +02:00
parent ac9c207474
commit 332366b58c
5 changed files with 480 additions and 14 deletions

View File

@@ -114,6 +114,8 @@ def _strategy_to_target_spec(
by_text_candidate = ""
anchor_candidate = ""
vlm_candidate = ""
uia_data: Dict[str, Any] = {}
dom_data: Dict[str, Any] = {}
resolve_order: List[str] = []
seen_methods: set = set()
@@ -129,6 +131,19 @@ def _strategy_to_target_spec(
by_text_candidate = strat.target_text
elif strat.method == "vlm" and strat.vlm_description and not vlm_candidate:
vlm_candidate = strat.vlm_description
elif strat.method == "uia" and strat.uia_name and not uia_data:
uia_data = {
"name": strat.uia_name,
"control_type": strat.uia_control_type,
"automation_id": strat.uia_automation_id,
"parent_path": strat.uia_parent_path,
}
elif strat.method == "dom" and strat.dom_selector and not dom_data:
dom_data = {
"selector": strat.dom_selector,
"xpath": strat.dom_xpath,
"url_pattern": strat.dom_url_pattern,
}
# Construire l'ordre des méthodes (dans l'ordre primaire → fallbacks)
if strat.method and strat.method not in seen_methods:
@@ -145,6 +160,14 @@ def _strategy_to_target_spec(
# L'intention métier devient le prompt VLM de dernier recours
spec["vlm_description"] = intent
# Données UIA — consommées par l'agent Windows via lea_uia.exe
if uia_data:
spec["uia_target"] = uia_data
# Données DOM — consommées par l'agent Windows via CDP (futur)
if dom_data:
spec["dom_target"] = dom_data
# Ordre de résolution pré-compilé — c'est LA pièce centrale du V4
if resolve_order:
spec["resolve_order"] = resolve_order

View File

@@ -61,6 +61,7 @@ class ExecutionCompiler:
target_machine: str = "",
target_resolution: str = "1280x800",
params: Optional[Dict[str, str]] = None,
surface_profile=None,
) -> ExecutionPlan:
"""Compiler un WorkflowIR en ExecutionPlan.
@@ -69,6 +70,8 @@ class ExecutionCompiler:
target_machine: Machine cible (pour adapter les stratégies)
target_resolution: Résolution de la machine cible
params: Variables à substituer
surface_profile: SurfaceProfile optionnel pour adapter les paramètres.
Si fourni, timeouts/seuils/retries sont tirés du profil.
"""
t_start = time.time()
@@ -88,7 +91,7 @@ class ExecutionCompiler:
# Compiler chaque étape
for step in ir.steps:
nodes = self._compile_step(step, ir, learned_strategies)
nodes = self._compile_step(step, ir, learned_strategies, surface_profile)
plan.nodes.extend(nodes)
# Statistiques de compilation
@@ -124,6 +127,7 @@ class ExecutionCompiler:
step: Step,
ir: WorkflowIR,
learned: Dict[str, str],
surface_profile=None,
) -> List[ExecutionNode]:
"""Compiler une étape en nœuds d'exécution."""
nodes = []
@@ -135,6 +139,7 @@ class ExecutionCompiler:
action_index=i,
ir=ir,
learned=learned,
surface_profile=surface_profile,
)
nodes.append(node)
@@ -147,6 +152,7 @@ class ExecutionCompiler:
action_index: int,
ir: WorkflowIR,
learned: Dict[str, str],
surface_profile=None,
) -> ExecutionNode:
"""Compiler une action en nœud d'exécution avec stratégie de résolution."""
@@ -158,13 +164,20 @@ class ExecutionCompiler:
is_optional=step.is_optional,
)
# Paramètres par défaut, surchargés par le surface_profile si fourni
default_click_timeout = 10000
default_click_retries = 2
if surface_profile is not None:
default_click_timeout = getattr(surface_profile, "timeout_click_ms", 10000)
default_click_retries = getattr(surface_profile, "max_retries", 2)
if action.type == "click":
# Compiler les stratégies de résolution pour ce clic
node.strategy_primary, node.strategy_fallbacks = self._compile_click_resolution(
action, step, learned,
action, step, learned, surface_profile,
)
node.timeout_ms = 10000
node.max_retries = 2
node.timeout_ms = default_click_timeout
node.max_retries = default_click_retries
node.recovery_action = "escape"
# Condition de succès basée sur la postcondition
@@ -205,16 +218,22 @@ class ExecutionCompiler:
action: Action,
step: Step,
learned: Dict[str, str],
surface_profile=None,
) -> tuple:
"""Compiler les stratégies de résolution pour un clic.
Utilise les données d'enrichissement visuel (action._enrichment) si
disponibles (crop anchor, description VLM, window_capture).
disponibles :
- by_text (OCR)
- anchor_image_base64 (template)
- vlm_description (VLM)
- uia_snapshot (UIA sur Windows natif)
Ordre de priorité :
1. OCR exact (si by_text disponible) — 100ms, pixel-perfect
2. Template matching (si anchor_image_base64) — 10ms
3. VLM (vlm_description) — 2-5s, exception handler
Ordre de priorité (variable selon la surface) :
1. UIA (si snapshot dispo ET, avec profil : surface native ET uia_available ; sans profil : snapshot seul, l'agent tranche au runtime) — 10-20ms
2. OCR exact (si texte visible) — 100-200ms
3. Template matching (si crop) — 10ms
4. VLM — exception handler
Le learning peut réordonner si une stratégie a mieux marché avant.
"""
@@ -227,6 +246,7 @@ class ExecutionCompiler:
anchor_b64 = enrichment.get("anchor_image_base64", "")
vlm_desc_from_enrich = enrichment.get("vlm_description", "")
window_title = enrichment.get("window_title", "")
uia_snapshot = enrichment.get("uia_snapshot") or {}
# Source de texte : enrichissement > anchor_hint > target
target_text = by_text_from_enrich or action.anchor_hint or action.target
@@ -236,6 +256,33 @@ class ExecutionCompiler:
learned_method = learned.get(target_text, "")
# Est-ce qu'on est sur une surface où UIA est activable ?
uia_eligible = False
if surface_profile is not None:
from .surface_classifier import SurfaceType
surface_type = getattr(surface_profile, "surface_type", None)
uia_available = getattr(surface_profile, "uia_available", False)
uia_eligible = (
uia_available
and surface_type == SurfaceType.WINDOWS_NATIVE
)
else:
# Sans profil explicite, on active UIA si le snapshot est présent
# (l'agent décidera au runtime s'il peut l'utiliser)
uia_eligible = bool(uia_snapshot)
# Stratégie UIA — la plus rapide et la plus précise sur Windows natif
if uia_snapshot and uia_snapshot.get("name") and uia_eligible:
uia_strategy = ResolutionStrategy(
method="uia",
uia_name=uia_snapshot.get("name", ""),
uia_control_type=uia_snapshot.get("control_type", ""),
uia_automation_id=uia_snapshot.get("automation_id", ""),
uia_parent_path=uia_snapshot.get("parent_path", []),
threshold=0.95,
)
primary = uia_strategy
# Stratégie OCR — le texte visible est la meilleure ancre
if target_text:
ocr_strategy = ResolutionStrategy(
@@ -243,7 +290,10 @@ class ExecutionCompiler:
target_text=target_text,
threshold=0.7,
)
if not learned_method or learned_method in ("ocr", "som_text_match", "hybrid_text_direct", "v4_ocr"):
if primary is None and (
not learned_method
or learned_method in ("ocr", "som_text_match", "hybrid_text_direct", "v4_ocr")
):
primary = ocr_strategy
else:
fallbacks.append(ocr_strategy)
@@ -256,9 +306,9 @@ class ExecutionCompiler:
anchor_b64=anchor_b64,
threshold=0.85,
)
if learned_method in ("anchor_template", "template_matching", "v4_template"):
if primary:
fallbacks.insert(0, primary)
if primary is None and learned_method in (
"anchor_template", "template_matching", "v4_template"
):
primary = template_strategy
else:
fallbacks.append(template_strategy)

View File

@@ -29,7 +29,7 @@ class ResolutionStrategy:
Pré-compilée — le runtime n'a pas besoin du VLM pour résoudre.
"""
method: str # "ocr", "template", "position", "vlm"
method: str # "uia", "ocr", "template", "position", "vlm", "dom"
target_text: str = "" # Texte à chercher (pour OCR)
anchor_b64: str = "" # Crop de référence (pour template matching)
zone: Dict[str, float] = field(default_factory=dict) # Zone de recherche {x_min, y_min, x_max, y_max}
@@ -37,6 +37,20 @@ class ResolutionStrategy:
vlm_description: str = "" # Description VLM (dernier recours)
threshold: float = 0.8 # Seuil de confiance
# Stratégie UIA (Windows UI Automation)
# Utilisée quand l'enregistrement a capturé un snapshot UIA au moment du clic.
# Au replay, l'agent Windows appelle lea_uia.exe find --name ... pour retrouver
# l'élément par son chemin logique (100% fiable sur Windows natif).
uia_name: str = "" # Name property de l'élément
uia_control_type: str = "" # ControlType (Button, Edit, MenuItem, ...)
uia_automation_id: str = "" # AutomationId (optionnel)
uia_parent_path: List[Dict[str, str]] = field(default_factory=list)
# Stratégie DOM (web avec CDP activé) — préparation pour plus tard
dom_selector: str = "" # CSS selector
dom_xpath: str = "" # XPath
dom_url_pattern: str = "" # Pattern URL à matcher
def to_dict(self) -> Dict[str, Any]:
d = {"method": self.method}
if self.target_text:
@@ -49,6 +63,20 @@ class ResolutionStrategy:
d["position_hint"] = self.position_hint
if self.vlm_description:
d["vlm_description"] = self.vlm_description
if self.uia_name:
d["uia_name"] = self.uia_name
if self.uia_control_type:
d["uia_control_type"] = self.uia_control_type
if self.uia_automation_id:
d["uia_automation_id"] = self.uia_automation_id
if self.uia_parent_path:
d["uia_parent_path"] = self.uia_parent_path
if self.dom_selector:
d["dom_selector"] = self.dom_selector
if self.dom_xpath:
d["dom_xpath"] = self.dom_xpath
if self.dom_url_pattern:
d["dom_url_pattern"] = self.dom_url_pattern
d["threshold"] = self.threshold
return d

View File

@@ -269,6 +269,22 @@ class IRBuilder:
# (utilisé par l'ExecutionCompiler pour construire les stratégies)
action._enrichment = enrichment
# Lire le snapshot UIA si l'agent Windows l'a capturé.
# Format attendu dans l'événement :
# evt["uia_snapshot"] = {
# "name": "Enregistrer",
# "control_type": "bouton",
# "automation_id": "btnSave",
# "parent_path": [{"name": "...", "control_type": "..."}],
# }
# Si présent, il est fusionné dans _enrichment pour que
# l'ExecutionCompiler puisse créer une stratégie UIA prioritaire.
uia_snapshot = evt.get("uia_snapshot")
if uia_snapshot and isinstance(uia_snapshot, dict):
if not hasattr(action, "_enrichment") or action._enrichment is None:
action._enrichment = {}
action._enrichment["uia_snapshot"] = uia_snapshot
return action
elif evt_type == "text_input":

View File

@@ -0,0 +1,349 @@
"""
Tests de câblage complet V4 :
- SurfaceClassifier + ExecutionCompiler : paramètres adaptés par surface
- IRBuilder lit uia_snapshot depuis les événements
- ExecutionCompiler crée une stratégie UIA quand dispo
- execution_plan_runner propage uia_target dans target_spec
- Pipeline E2E : RawTrace (avec UIA) → WorkflowIR → Plan → action runtime
"""
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
_ROOT = str(Path(__file__).resolve().parents[2])
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
from core.workflow.workflow_ir import WorkflowIR, Step, Action
from core.workflow.execution_plan import ExecutionPlan, ExecutionNode, ResolutionStrategy
from core.workflow.execution_compiler import ExecutionCompiler
from core.workflow.surface_classifier import SurfaceClassifier, SurfaceProfile, SurfaceType
from core.workflow.ir_builder import IRBuilder
from agent_v0.server_v1.execution_plan_runner import (
execution_node_to_action,
execution_plan_to_actions,
_strategy_to_target_spec,
)
# =========================================================================
# ExecutionCompiler avec SurfaceProfile
# =========================================================================
class TestCompilerWithSurfaceProfile:
    """Click nodes must take their timeout and retry budget from the SurfaceProfile."""

    @staticmethod
    def _compile_click_node(target, anchor_hint, profile=None):
        """Compile a one-step workflow holding a single click and return its click node.

        When ``profile`` is None the compiler is invoked without the
        ``surface_profile`` argument, so the compiler defaults are exercised.
        """
        workflow = WorkflowIR.new("Test")
        click_spec = {"type": "click", "target": target, "anchor_hint": anchor_hint}
        workflow.add_step("Clic", actions=[click_spec])

        compiler = ExecutionCompiler()
        if profile is None:
            plan = compiler.compile(workflow)
        else:
            plan = compiler.compile(workflow, surface_profile=profile)

        click_nodes = [node for node in plan.nodes if node.action_type == "click"]
        return click_nodes[0]

    def test_profil_citrix_impose_timeouts_longs(self):
        """Citrix profile: long click timeout, three retries."""
        citrix_profile = SurfaceProfile(
            surface_type=SurfaceType.CITRIX,
            timeout_click_ms=15000,
            max_retries=3,
            ocr_threshold=0.65,
        )

        node = self._compile_click_node("Bouton", "OK", citrix_profile)

        assert node.timeout_ms == 15000
        assert node.max_retries == 3

    def test_profil_web_impose_timeouts_courts(self):
        """Web profile: short click timeout, a single retry."""
        web_profile = SurfaceProfile(
            surface_type=SurfaceType.WEB_LOCAL,
            timeout_click_ms=5000,
            max_retries=1,
        )

        node = self._compile_click_node("X", "Login", web_profile)

        assert node.timeout_ms == 5000
        assert node.max_retries == 1

    def test_sans_profil_utilise_defauts(self):
        """Without a surface_profile the compiler falls back to its defaults."""
        node = self._compile_click_node("X", "Y")

        assert node.timeout_ms == 10000  # compiler default
        assert node.max_retries == 2  # compiler default
# =========================================================================
# Stratégie UIA dans la compilation
# =========================================================================
class TestUiaStrategyCompilation:
    """Which strategy becomes primary, depending on the UIA snapshot and the surface."""

    @staticmethod
    def _compile_click(ir, profile):
        """Compile ``ir`` under ``profile`` and return the first click node of the plan."""
        plan = ExecutionCompiler().compile(ir, surface_profile=profile)
        click_nodes = [node for node in plan.nodes if node.action_type == "click"]
        return click_nodes[0]

    def _make_ir_with_uia(self):
        """Build a WorkflowIR whose single click action carries a uia_snapshot."""
        snapshot = {
            "name": "Enregistrer",
            "control_type": "bouton",
            "automation_id": "btnSave",
            "parent_path": [
                {"name": "Bloc-notes", "control_type": "fenêtre"},
                {"name": "Fichier", "control_type": "menu"},
            ],
        }
        click = Action(type="click", target="Bloc-notes", anchor_hint="Enregistrer")
        # Simulate what the enrichment phase would have attached, UIA included.
        click._enrichment = {
            "by_text": "Enregistrer",
            "anchor_image_base64": "fake_crop_data",
            "vlm_description": "Le bouton Enregistrer du menu Fichier",
            "uia_snapshot": snapshot,
        }

        ir = WorkflowIR.new("Test UIA")
        ir.steps.append(Step(step_id="s1", intent="Sauvegarder", actions=[click]))
        return ir

    def test_uia_strategie_creee_si_surface_windows(self):
        """Native Windows with UIA available: the UIA strategy is primary."""
        native_profile = SurfaceProfile(
            surface_type=SurfaceType.WINDOWS_NATIVE,
            uia_available=True,
        )

        click = self._compile_click(self._make_ir_with_uia(), native_profile)
        primary = click.strategy_primary

        assert primary is not None
        assert primary.method == "uia"
        assert primary.uia_name == "Enregistrer"
        assert primary.uia_control_type == "bouton"

    def test_uia_desactive_sur_citrix(self):
        """On Citrix, UIA is ignored even when a snapshot is present."""
        citrix_profile = SurfaceProfile(
            surface_type=SurfaceType.CITRIX,
            uia_available=False,
        )

        click = self._compile_click(self._make_ir_with_uia(), citrix_profile)

        assert click.strategy_primary.method != "uia"
        # Text is available, so OCR takes the primary slot.
        assert click.strategy_primary.method == "ocr"

    def test_uia_fallback_sur_ocr_si_uia_manquant(self):
        """No uia_snapshot on the action: OCR is primary."""
        click_action = Action(type="click", target="Fichier", anchor_hint="Fichier")
        click_action._enrichment = {
            "by_text": "Fichier",
            "vlm_description": "Menu Fichier",
        }
        ir = WorkflowIR.new("Test")
        ir.steps.append(Step(step_id="s1", intent="Ouvrir menu", actions=[click_action]))

        native_profile = SurfaceProfile(
            surface_type=SurfaceType.WINDOWS_NATIVE,
            uia_available=True,
        )

        click = self._compile_click(ir, native_profile)

        assert click.strategy_primary.method == "ocr"
# =========================================================================
# IRBuilder lit uia_snapshot depuis les événements
# =========================================================================
class TestIRBuilderLitUiaSnapshot:
    """IRBuilder must carry an event's ``uia_snapshot`` into ``Action._enrichment``."""

    def test_ir_builder_propage_uia_snapshot(self):
        """A click event with uia_snapshot yields an action whose _enrichment holds it."""
        events = [
            {
                "event": {
                    "type": "mouse_click",
                    "pos": [500, 300],
                    "window": {"title": "Bloc-notes"},
                    "timestamp": 100.0,
                    "uia_snapshot": {
                        "name": "Enregistrer",
                        "control_type": "bouton",
                        "automation_id": "btnSave",
                        "parent_path": [{"name": "Fichier", "control_type": "menu"}],
                    },
                }
            }
        ]

        # NOTE(review): port 99999 is presumably chosen so that no model server
        # can be reached during the test — confirm against IRBuilder.
        builder = IRBuilder(gemma4_port="99999")
        ir = builder.build(events, name="Test")

        # Take the FIRST click across all steps. The previous nested loop only
        # broke out of the inner loop, so a click in a later step would silently
        # replace the one under test.
        click_actions = [
            action
            for step in ir.steps
            for action in step.actions
            if action.type == "click"
        ]
        assert click_actions, "IRBuilder produced no click action"
        found_action = click_actions[0]

        enrichment = getattr(found_action, "_enrichment", None) or {}
        assert "uia_snapshot" in enrichment
        assert enrichment["uia_snapshot"]["name"] == "Enregistrer"
        assert enrichment["uia_snapshot"]["control_type"] == "bouton"
# =========================================================================
# execution_plan_runner propage uia_target dans target_spec
# =========================================================================
class TestUiaTargetPropagation:
    """The runner must surface UIA strategy data as ``uia_target`` in target_spec."""

    def test_strategy_uia_produit_uia_target(self):
        """A primary UIA strategy produces a target_spec containing uia_target."""
        uia_primary = ResolutionStrategy(
            method="uia",
            uia_name="Enregistrer",
            uia_control_type="bouton",
            uia_automation_id="btnSave",
            uia_parent_path=[{"name": "Fichier", "control_type": "menu"}],
        )
        ocr_fallback = ResolutionStrategy(method="ocr", target_text="Enregistrer")
        vlm_fallback = ResolutionStrategy(method="vlm", vlm_description="bouton Enregistrer")

        spec = _strategy_to_target_spec(uia_primary, [ocr_fallback, vlm_fallback])

        assert "uia_target" in spec
        uia_target = spec["uia_target"]
        assert uia_target["name"] == "Enregistrer"
        assert uia_target["control_type"] == "bouton"
        assert uia_target["automation_id"] == "btnSave"

        order = spec["resolve_order"]
        assert order[0] == "uia"
        assert "ocr" in order
        assert "vlm" in order

    def test_pas_de_uia_target_si_pas_de_stratégie(self):
        """No UIA strategy: neither uia_target nor 'uia' in resolve_order."""
        ocr_only = ResolutionStrategy(method="ocr", target_text="test")

        spec = _strategy_to_target_spec(ocr_only, [])

        assert "uia_target" not in spec
        assert "uia" not in spec.get("resolve_order", [])

    def test_execution_node_to_action_avec_uia(self):
        """An ExecutionNode with a UIA strategy converts into a complete action."""
        uia_primary = ResolutionStrategy(
            method="uia",
            uia_name="Enregistrer",
            uia_control_type="bouton",
        )
        ocr_fallback = ResolutionStrategy(method="ocr", target_text="Enregistrer")
        node = ExecutionNode(
            node_id="n1",
            action_type="click",
            intent="Cliquer Enregistrer",
            strategy_primary=uia_primary,
            strategy_fallbacks=[ocr_fallback],
        )

        action = execution_node_to_action(node)

        assert action is not None
        assert action["type"] == "click"
        assert "uia_target" in action["target_spec"]
        assert action["target_spec"]["uia_target"]["name"] == "Enregistrer"
        assert action["target_spec"]["resolve_order"] == ["uia", "ocr"]
# =========================================================================
# Pipeline E2E : événement avec UIA → action runtime avec uia_target
# =========================================================================
class TestPipelineE2EUia:
    """End-to-end: recorded event with UIA data through to the runtime action."""

    def test_pipeline_complet_uia(self):
        """RawTrace (with uia_snapshot) -> WorkflowIR -> ExecutionPlan -> runtime action."""
        # One simulated click recorded on a native Windows application.
        recorded_click = {
            "type": "mouse_click",
            "pos": [500, 300],
            "window": {"title": "Bloc-notes"},
            "timestamp": 100.0,
            "uia_snapshot": {
                "name": "Enregistrer",
                "control_type": "bouton",
                "automation_id": "btnSave",
                "parent_path": [
                    {"name": "Bloc-notes", "control_type": "fenêtre"},
                ],
            },
        }
        events = [{"event": recorded_click}]

        # Stage 1: raw events -> WorkflowIR.
        ir = IRBuilder(gemma4_port="99999").build(events, name="Test E2E UIA")

        # Stage 2: WorkflowIR -> ExecutionPlan under a native-Windows profile.
        native_profile = SurfaceProfile(
            surface_type=SurfaceType.WINDOWS_NATIVE,
            uia_available=True,
            timeout_click_ms=8000,
            max_retries=2,
        )
        plan = ExecutionCompiler().compile(ir, surface_profile=native_profile)

        # Stage 3: ExecutionPlan -> runtime actions.
        actions = execution_plan_to_actions(plan)

        # The single resulting click must carry every piece of UIA data.
        click_actions = [entry for entry in actions if entry["type"] == "click"]
        assert len(click_actions) == 1
        action = click_actions[0]

        assert "target_spec" in action
        spec = action["target_spec"]

        assert "resolve_order" in spec
        assert spec["resolve_order"][0] == "uia"

        assert "uia_target" in spec
        assert spec["uia_target"]["name"] == "Enregistrer"
        assert spec["uia_target"]["control_type"] == "bouton"

        assert action.get("timeout_ms") == 8000
        assert action.get("max_retries") == 2