diff --git a/agent_v0/server_v1/execution_plan_runner.py b/agent_v0/server_v1/execution_plan_runner.py index a0be8eac7..bb02e3a26 100644 --- a/agent_v0/server_v1/execution_plan_runner.py +++ b/agent_v0/server_v1/execution_plan_runner.py @@ -114,6 +114,8 @@ def _strategy_to_target_spec( by_text_candidate = "" anchor_candidate = "" vlm_candidate = "" + uia_data: Dict[str, Any] = {} + dom_data: Dict[str, Any] = {} resolve_order: List[str] = [] seen_methods: set = set() @@ -129,6 +131,19 @@ def _strategy_to_target_spec( by_text_candidate = strat.target_text elif strat.method == "vlm" and strat.vlm_description and not vlm_candidate: vlm_candidate = strat.vlm_description + elif strat.method == "uia" and strat.uia_name and not uia_data: + uia_data = { + "name": strat.uia_name, + "control_type": strat.uia_control_type, + "automation_id": strat.uia_automation_id, + "parent_path": strat.uia_parent_path, + } + elif strat.method == "dom" and strat.dom_selector and not dom_data: + dom_data = { + "selector": strat.dom_selector, + "xpath": strat.dom_xpath, + "url_pattern": strat.dom_url_pattern, + } # Construire l'ordre des méthodes (dans l'ordre primaire → fallbacks) if strat.method and strat.method not in seen_methods: @@ -145,6 +160,14 @@ def _strategy_to_target_spec( # L'intention métier devient le prompt VLM de dernier recours spec["vlm_description"] = intent + # Données UIA — consommées par l'agent Windows via lea_uia.exe + if uia_data: + spec["uia_target"] = uia_data + + # Données DOM — consommées par l'agent Windows via CDP (futur) + if dom_data: + spec["dom_target"] = dom_data + # Ordre de résolution pré-compilé — c'est LA pièce centrale du V4 if resolve_order: spec["resolve_order"] = resolve_order diff --git a/core/workflow/execution_compiler.py b/core/workflow/execution_compiler.py index 11640a8fa..685c3834e 100644 --- a/core/workflow/execution_compiler.py +++ b/core/workflow/execution_compiler.py @@ -61,6 +61,7 @@ class ExecutionCompiler: target_machine: str = "", target_resolution: str = "1280x800", params: Optional[Dict[str, str]] = None, + surface_profile=None, ) -> ExecutionPlan: """Compiler un WorkflowIR en ExecutionPlan. @@ -69,6 +70,8 @@ class ExecutionCompiler: target_machine: Machine cible (pour adapter les stratégies) target_resolution: Résolution de la machine cible params: Variables à substituer + surface_profile: SurfaceProfile optionnel pour adapter les paramètres. + Si fourni, timeouts/seuils/retries sont tirés du profil. """ t_start = time.time() @@ -88,7 +91,7 @@ class ExecutionCompiler: # Compiler chaque étape for step in ir.steps: - nodes = self._compile_step(step, ir, learned_strategies) + nodes = self._compile_step(step, ir, learned_strategies, surface_profile) plan.nodes.extend(nodes) # Statistiques de compilation @@ -124,6 +127,7 @@ class ExecutionCompiler: step: Step, ir: WorkflowIR, learned: Dict[str, str], + surface_profile=None, ) -> List[ExecutionNode]: """Compiler une étape en nœuds d'exécution.""" nodes = [] @@ -135,6 +139,7 @@ class ExecutionCompiler: action_index=i, ir=ir, learned=learned, + surface_profile=surface_profile, ) nodes.append(node) @@ -147,6 +152,7 @@ class ExecutionCompiler: action_index: int, ir: WorkflowIR, learned: Dict[str, str], + surface_profile=None, ) -> ExecutionNode: """Compiler une action en nœud d'exécution avec stratégie de résolution.""" @@ -158,13 +164,20 @@ class ExecutionCompiler: is_optional=step.is_optional, ) + # Paramètres par défaut, surchargés par le surface_profile si fourni + default_click_timeout = 10000 + default_click_retries = 2 + if surface_profile is not None: + default_click_timeout = getattr(surface_profile, "timeout_click_ms", 10000) + default_click_retries = getattr(surface_profile, "max_retries", 2) + if action.type == "click": # Compiler les stratégies de résolution pour ce clic node.strategy_primary, node.strategy_fallbacks = self._compile_click_resolution( - action, step, learned, + action, step, learned, surface_profile, ) - node.timeout_ms = 10000 - node.max_retries = 2 + node.timeout_ms = default_click_timeout + node.max_retries = default_click_retries node.recovery_action = "escape" # Condition de succès basée sur la postcondition @@ -205,16 +218,22 @@ class ExecutionCompiler: action: Action, step: Step, learned: Dict[str, str], + surface_profile=None, ) -> tuple: """Compiler les stratégies de résolution pour un clic. Utilise les données d'enrichissement visuel (action._enrichment) si - disponibles (crop anchor, description VLM, window_capture). + disponibles : + - by_text (OCR) + - anchor_image_base64 (template) + - vlm_description (VLM) + - uia_snapshot (UIA sur Windows natif) - Ordre de priorité : - 1. OCR exact (si by_text disponible) — 100ms, pixel-perfect - 2. Template matching (si anchor_image_base64) — 10ms - 3. VLM (vlm_description) — 2-5s, exception handler + Ordre de priorité (variable selon la surface) : + 1. UIA (si snapshot dispo ET surface native ET helper dispo) — 10-20ms + 2. OCR exact (si texte visible) — 100-200ms + 3. Template matching (si crop) — 10ms + 4. VLM — exception handler Le learning peut réordonner si une stratégie a mieux marché avant. """ @@ -227,6 +246,7 @@ class ExecutionCompiler: anchor_b64 = enrichment.get("anchor_image_base64", "") vlm_desc_from_enrich = enrichment.get("vlm_description", "") window_title = enrichment.get("window_title", "") + uia_snapshot = enrichment.get("uia_snapshot") or {} # Source de texte : enrichissement > anchor_hint > target target_text = by_text_from_enrich or action.anchor_hint or action.target @@ -236,6 +256,33 @@ class ExecutionCompiler: learned_method = learned.get(target_text, "") + # Est-ce qu'on est sur une surface où UIA est activable ? + uia_eligible = False + if surface_profile is not None: + from .surface_classifier import SurfaceType + surface_type = getattr(surface_profile, "surface_type", None) + uia_available = getattr(surface_profile, "uia_available", False) + uia_eligible = ( + uia_available + and surface_type == SurfaceType.WINDOWS_NATIVE + ) + else: + # Sans profil explicite, on active UIA si le snapshot est présent + # (l'agent décidera au runtime s'il peut l'utiliser) + uia_eligible = bool(uia_snapshot) + + # Stratégie UIA — la plus rapide et la plus précise sur Windows natif + if uia_snapshot and uia_snapshot.get("name") and uia_eligible: + uia_strategy = ResolutionStrategy( + method="uia", + uia_name=uia_snapshot.get("name", ""), + uia_control_type=uia_snapshot.get("control_type", ""), + uia_automation_id=uia_snapshot.get("automation_id", ""), + uia_parent_path=uia_snapshot.get("parent_path", []), + threshold=0.95, + ) + primary = uia_strategy + # Stratégie OCR — le texte visible est la meilleure ancre if target_text: ocr_strategy = ResolutionStrategy( @@ -243,7 +290,10 @@ class ExecutionCompiler: target_text=target_text, threshold=0.7, ) - if not learned_method or learned_method in ("ocr", "som_text_match", "hybrid_text_direct", "v4_ocr"): + if primary is None and ( + not learned_method + or learned_method in ("ocr", "som_text_match", "hybrid_text_direct", "v4_ocr") + ): primary = ocr_strategy else: fallbacks.append(ocr_strategy) @@ -256,9 +306,9 @@ class ExecutionCompiler: anchor_b64=anchor_b64, threshold=0.85, ) - if learned_method in ("anchor_template", "template_matching", "v4_template"): - if primary: - fallbacks.insert(0, primary) + if primary is None and learned_method in ( + "anchor_template", "template_matching", "v4_template" + ): primary = template_strategy else: fallbacks.append(template_strategy) diff --git a/core/workflow/execution_plan.py b/core/workflow/execution_plan.py index 038a8ff46..83b4c5b29 100644 --- a/core/workflow/execution_plan.py +++ b/core/workflow/execution_plan.py @@ -29,7 +29,7 @@ class ResolutionStrategy: Pré-compilée — le runtime n'a pas besoin du VLM pour résoudre. """ - method: str # "ocr", "template", "position", "vlm" + method: str # "uia", "ocr", "template", "position", "vlm", "dom" target_text: str = "" # Texte à chercher (pour OCR) anchor_b64: str = "" # Crop de référence (pour template matching) zone: Dict[str, float] = field(default_factory=dict) # Zone de recherche {x_min, y_min, x_max, y_max} @@ -37,6 +37,20 @@ class ResolutionStrategy: vlm_description: str = "" # Description VLM (dernier recours) threshold: float = 0.8 # Seuil de confiance + # Stratégie UIA (Windows UI Automation) + # Utilisée quand l'enregistrement a capturé un snapshot UIA au moment du clic. + # Au replay, l'agent Windows appelle lea_uia.exe find --name ... pour retrouver + # l'élément par son chemin logique (100% fiable sur Windows natif). + uia_name: str = "" # Name property de l'élément + uia_control_type: str = "" # ControlType (Button, Edit, MenuItem, ...) + uia_automation_id: str = "" # AutomationId (optionnel) + uia_parent_path: List[Dict[str, str]] = field(default_factory=list) + + # Stratégie DOM (web avec CDP activé) — préparation pour plus tard + dom_selector: str = "" # CSS selector + dom_xpath: str = "" # XPath + dom_url_pattern: str = "" # Pattern URL à matcher + def to_dict(self) -> Dict[str, Any]: d = {"method": self.method} if self.target_text: @@ -49,6 +63,20 @@ class ResolutionStrategy: d["position_hint"] = self.position_hint if self.vlm_description: d["vlm_description"] = self.vlm_description + if self.uia_name: + d["uia_name"] = self.uia_name + if self.uia_control_type: + d["uia_control_type"] = self.uia_control_type + if self.uia_automation_id: + d["uia_automation_id"] = self.uia_automation_id + if self.uia_parent_path: + d["uia_parent_path"] = self.uia_parent_path + if self.dom_selector: + d["dom_selector"] = self.dom_selector + if self.dom_xpath: + d["dom_xpath"] = self.dom_xpath + if self.dom_url_pattern: + d["dom_url_pattern"] = self.dom_url_pattern d["threshold"] = self.threshold return d diff --git a/core/workflow/ir_builder.py b/core/workflow/ir_builder.py index 637378d75..7fc15d5b3 100644 --- a/core/workflow/ir_builder.py +++ b/core/workflow/ir_builder.py @@ -269,6 +269,22 @@ class IRBuilder: # (utilisé par l'ExecutionCompiler pour construire les stratégies) action._enrichment = enrichment + # Lire le snapshot UIA si l'agent Windows l'a capturé. + # Format attendu dans l'événement : + # evt["uia_snapshot"] = { + # "name": "Enregistrer", + # "control_type": "bouton", + # "automation_id": "btnSave", + # "parent_path": [{"name": "...", "control_type": "..."}], + # } + # Si présent, il est fusionné dans _enrichment pour que + # l'ExecutionCompiler puisse créer une stratégie UIA prioritaire. + uia_snapshot = evt.get("uia_snapshot") + if uia_snapshot and isinstance(uia_snapshot, dict): + if not hasattr(action, "_enrichment") or action._enrichment is None: + action._enrichment = {} + action._enrichment["uia_snapshot"] = uia_snapshot + return action elif evt_type == "text_input": diff --git a/tests/unit/test_v4_wiring.py b/tests/unit/test_v4_wiring.py new file mode 100644 index 000000000..95fdc4102 --- /dev/null +++ b/tests/unit/test_v4_wiring.py @@ -0,0 +1,349 @@ +""" +Tests de câblage complet V4 : +- SurfaceClassifier + ExecutionCompiler : paramètres adaptés par surface +- IRBuilder lit uia_snapshot depuis les événements +- ExecutionCompiler crée une stratégie UIA quand dispo +- execution_plan_runner propage uia_target dans target_spec +- Pipeline E2E : RawTrace (avec UIA) → WorkflowIR → Plan → action runtime +""" + +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest + +_ROOT = str(Path(__file__).resolve().parents[2]) +if _ROOT not in sys.path: + sys.path.insert(0, _ROOT) + +from core.workflow.workflow_ir import WorkflowIR, Step, Action +from core.workflow.execution_plan import ExecutionPlan, ExecutionNode, ResolutionStrategy +from core.workflow.execution_compiler import ExecutionCompiler +from core.workflow.surface_classifier import SurfaceClassifier, SurfaceProfile, SurfaceType +from core.workflow.ir_builder import IRBuilder +from agent_v0.server_v1.execution_plan_runner import ( + execution_node_to_action, + execution_plan_to_actions, + _strategy_to_target_spec, +) + + +# ========================================================================= +# ExecutionCompiler avec SurfaceProfile +# ========================================================================= + + +class TestCompilerWithSurfaceProfile: + + def test_profil_citrix_impose_timeouts_longs(self): + """Profil Citrix → timeouts longs, retries 3x.""" + ir = WorkflowIR.new("Test") + ir.add_step("Clic", actions=[{"type": "click", "target": "Bouton", "anchor_hint": "OK"}]) + + profile = SurfaceProfile( + surface_type=SurfaceType.CITRIX, + timeout_click_ms=15000, + max_retries=3, + ocr_threshold=0.65, + ) + + compiler = ExecutionCompiler() + plan = compiler.compile(ir, surface_profile=profile) + + click_node = [n for n in plan.nodes if n.action_type == "click"][0] + assert click_node.timeout_ms == 15000 + assert click_node.max_retries == 3 + + def test_profil_web_impose_timeouts_courts(self): + """Profil web → timeouts courts, 1 retry.""" + ir = WorkflowIR.new("Test") + ir.add_step("Clic", actions=[{"type": "click", "target": "X", "anchor_hint": "Login"}]) + + profile = SurfaceProfile( + surface_type=SurfaceType.WEB_LOCAL, + timeout_click_ms=5000, + max_retries=1, + ) + + compiler = ExecutionCompiler() + plan = compiler.compile(ir, surface_profile=profile) + + click_node = [n for n in plan.nodes if n.action_type == "click"][0] + assert click_node.timeout_ms == 5000 + assert click_node.max_retries == 1 + + def test_sans_profil_utilise_defauts(self): + """Sans surface_profile, comportement par défaut.""" + ir = WorkflowIR.new("Test") + ir.add_step("Clic", actions=[{"type": "click", "target": "X", "anchor_hint": "Y"}]) + + compiler = ExecutionCompiler() + plan = compiler.compile(ir) + + click_node = [n for n in plan.nodes if n.action_type == "click"][0] + assert click_node.timeout_ms == 10000 # Défaut + assert click_node.max_retries == 2 # Défaut + + +# ========================================================================= +# Stratégie UIA dans la compilation +# ========================================================================= + + +class TestUiaStrategyCompilation: + + def _make_ir_with_uia(self): + """Créer un WorkflowIR avec une action portant un uia_snapshot.""" + ir = WorkflowIR.new("Test UIA") + action = Action( + type="click", + target="Bloc-notes", + anchor_hint="Enregistrer", + ) + # Simuler l'enrichissement avec UIA + action._enrichment = { + "by_text": "Enregistrer", + "anchor_image_base64": "fake_crop_data", + "vlm_description": "Le bouton Enregistrer du menu Fichier", + "uia_snapshot": { + "name": "Enregistrer", + "control_type": "bouton", + "automation_id": "btnSave", + "parent_path": [ + {"name": "Bloc-notes", "control_type": "fenêtre"}, + {"name": "Fichier", "control_type": "menu"}, + ], + }, + } + step = Step(step_id="s1", intent="Sauvegarder", actions=[action]) + ir.steps.append(step) + return ir + + def test_uia_strategie_creee_si_surface_windows(self): + """Sur Windows natif avec UIA dispo, la stratégie UIA est primaire.""" + ir = self._make_ir_with_uia() + profile = SurfaceProfile( + surface_type=SurfaceType.WINDOWS_NATIVE, + uia_available=True, + ) + + compiler = ExecutionCompiler() + plan = compiler.compile(ir, surface_profile=profile) + + click = [n for n in plan.nodes if n.action_type == "click"][0] + assert click.strategy_primary is not None + assert click.strategy_primary.method == "uia" + assert click.strategy_primary.uia_name == "Enregistrer" + assert click.strategy_primary.uia_control_type == "bouton" + + def test_uia_desactive_sur_citrix(self): + """Sur Citrix, UIA est ignoré même si snapshot présent.""" + ir = self._make_ir_with_uia() + profile = SurfaceProfile( + surface_type=SurfaceType.CITRIX, + uia_available=False, + ) + + compiler = ExecutionCompiler() + plan = compiler.compile(ir, surface_profile=profile) + + click = [n for n in plan.nodes if n.action_type == "click"][0] + assert click.strategy_primary.method != "uia" + # OCR est la primaire (texte dispo) + assert click.strategy_primary.method == "ocr" + + def test_uia_fallback_sur_ocr_si_uia_manquant(self): + """Sans uia_snapshot, OCR primaire.""" + ir = WorkflowIR.new("Test") + action = Action( + type="click", + target="Fichier", + anchor_hint="Fichier", + ) + action._enrichment = { + "by_text": "Fichier", + "vlm_description": "Menu Fichier", + } + step = Step(step_id="s1", intent="Ouvrir menu", actions=[action]) + ir.steps.append(step) + + profile = SurfaceProfile( + surface_type=SurfaceType.WINDOWS_NATIVE, + uia_available=True, + ) + + compiler = ExecutionCompiler() + plan = compiler.compile(ir, surface_profile=profile) + + click = [n for n in plan.nodes if n.action_type == "click"][0] + assert click.strategy_primary.method == "ocr" + + +# ========================================================================= +# IRBuilder lit uia_snapshot depuis les événements +# ========================================================================= + + +class TestIRBuilderLitUiaSnapshot: + + def test_ir_builder_propage_uia_snapshot(self): + """Un event avec uia_snapshot → Action._enrichment contient uia_snapshot.""" + events = [ + { + "event": { + "type": "mouse_click", + "pos": [500, 300], + "window": {"title": "Bloc-notes"}, + "timestamp": 100.0, + "uia_snapshot": { + "name": "Enregistrer", + "control_type": "bouton", + "automation_id": "btnSave", + "parent_path": [{"name": "Fichier", "control_type": "menu"}], + }, + } + } + ] + + builder = IRBuilder(gemma4_port="99999") + ir = builder.build(events, name="Test") + + # Parcourir les steps pour trouver le clic + found_action = None + for step in ir.steps: + for action in step.actions: + if action.type == "click": + found_action = action + break + + assert found_action is not None + enrichment = getattr(found_action, "_enrichment", None) or {} + assert "uia_snapshot" in enrichment + assert enrichment["uia_snapshot"]["name"] == "Enregistrer" + assert enrichment["uia_snapshot"]["control_type"] == "bouton" + + +# ========================================================================= +# execution_plan_runner propage uia_target dans target_spec +# ========================================================================= + + +class TestUiaTargetPropagation: + + def test_strategy_uia_produit_uia_target(self): + """Une stratégie UIA primaire → target_spec contient uia_target.""" + primary = ResolutionStrategy( + method="uia", + uia_name="Enregistrer", + uia_control_type="bouton", + uia_automation_id="btnSave", + uia_parent_path=[{"name": "Fichier", "control_type": "menu"}], + ) + fallbacks = [ + ResolutionStrategy(method="ocr", target_text="Enregistrer"), + ResolutionStrategy(method="vlm", vlm_description="bouton Enregistrer"), + ] + + spec = _strategy_to_target_spec(primary, fallbacks) + + assert "uia_target" in spec + assert spec["uia_target"]["name"] == "Enregistrer" + assert spec["uia_target"]["control_type"] == "bouton" + assert spec["uia_target"]["automation_id"] == "btnSave" + assert spec["resolve_order"][0] == "uia" + assert "ocr" in spec["resolve_order"] + assert "vlm" in spec["resolve_order"] + + def test_pas_de_uia_target_si_pas_de_stratégie(self): + """Sans stratégie UIA → pas de uia_target.""" + primary = ResolutionStrategy(method="ocr", target_text="test") + spec = _strategy_to_target_spec(primary, []) + + assert "uia_target" not in spec + assert "uia" not in spec.get("resolve_order", []) + + def test_execution_node_to_action_avec_uia(self): + """Un ExecutionNode avec stratégie UIA produit une action complète.""" + node = ExecutionNode( + node_id="n1", + action_type="click", + intent="Cliquer Enregistrer", + strategy_primary=ResolutionStrategy( + method="uia", + uia_name="Enregistrer", + uia_control_type="bouton", + ), + strategy_fallbacks=[ + ResolutionStrategy(method="ocr", target_text="Enregistrer"), + ], + ) + + action = execution_node_to_action(node) + assert action is not None + assert action["type"] == "click" + assert "uia_target" in action["target_spec"] + assert action["target_spec"]["uia_target"]["name"] == "Enregistrer" + assert action["target_spec"]["resolve_order"] == ["uia", "ocr"] + + +# ========================================================================= +# Pipeline E2E : événement avec UIA → action runtime avec uia_target +# ========================================================================= + + +class TestPipelineE2EUia: + + def test_pipeline_complet_uia(self): + """RawTrace (avec uia_snapshot) → WorkflowIR → Plan → action runtime.""" + # Événements simulés d'un enregistrement sur Windows natif + events = [ + { + "event": { + "type": "mouse_click", + "pos": [500, 300], + "window": {"title": "Bloc-notes"}, + "timestamp": 100.0, + "uia_snapshot": { + "name": "Enregistrer", + "control_type": "bouton", + "automation_id": "btnSave", + "parent_path": [ + {"name": "Bloc-notes", "control_type": "fenêtre"}, + ], + }, + } + } + ] + + # Pipeline complet + builder = IRBuilder(gemma4_port="99999") + ir = builder.build(events, name="Test E2E UIA") + + profile = SurfaceProfile( + surface_type=SurfaceType.WINDOWS_NATIVE, + uia_available=True, + timeout_click_ms=8000, + max_retries=2, + ) + + compiler = ExecutionCompiler() + plan = compiler.compile(ir, surface_profile=profile) + + actions = execution_plan_to_actions(plan) + + # Vérifier que l'action finale a toutes les données UIA + click_actions = [a for a in actions if a["type"] == "click"] + assert len(click_actions) == 1 + + action = click_actions[0] + assert "target_spec" in action + spec = action["target_spec"] + + assert "resolve_order" in spec + assert spec["resolve_order"][0] == "uia" + assert "uia_target" in spec + assert spec["uia_target"]["name"] == "Enregistrer" + assert spec["uia_target"]["control_type"] == "bouton" + assert action.get("timeout_ms") == 8000 + assert action.get("max_retries") == 2