diff --git a/agent_v0/agent_v1/core/captor.py b/agent_v0/agent_v1/core/captor.py index 4dcd9392f..77d3b99dc 100644 --- a/agent_v0/agent_v1/core/captor.py +++ b/agent_v0/agent_v1/core/captor.py @@ -178,8 +178,41 @@ class EventCaptorV1: "timestamp": now, } self._inject_screen_metadata(event) + # Capturer le snapshot UIA à la position du clic (si helper dispo) + # Non-bloquant : si UIA échoue, l'event est enrichi uniquement + # des données vision comme aujourd'hui. + self._inject_uia_snapshot(event, x, y) self.on_event(event) + def _inject_uia_snapshot(self, event: dict, x: int, y: int) -> None: + """Ajouter un uia_snapshot à l'événement si le helper UIA est dispo. + + Appelle lea_uia.exe query --x N --y N en ~10-20ms. + Fallback silencieux si le helper n'est pas dispo ou échoue. + """ + try: + from .uia_helper import get_shared_helper + helper = get_shared_helper() + if not helper.available: + return + element = helper.query_at(int(x), int(y), with_parents=True) + if element is None: + return + event["uia_snapshot"] = { + "name": element.name, + "control_type": element.control_type, + "class_name": element.class_name, + "automation_id": element.automation_id, + "bounding_rect": list(element.bounding_rect), + "is_enabled": element.is_enabled, + "is_offscreen": element.is_offscreen, + "parent_path": element.parent_path, + } + except Exception as e: + # Non bloquant — on continue sans UIA + import logging + logging.getLogger(__name__).debug(f"UIA snapshot skip: {e}") + def _on_scroll(self, x, y, dx, dy): event = { "type": "mouse_scroll", diff --git a/agent_v0/agent_v1/core/executor.py b/agent_v0/agent_v1/core/executor.py index c50256b58..f6d14dc46 100644 --- a/agent_v0/agent_v1/core/executor.py +++ b/agent_v0/agent_v1/core/executor.py @@ -264,6 +264,59 @@ class ActionExecutorV1: logger.warning(f"Acteur gemma4 indisponible : {e}") return "EXECUTER" + # ========================================================================= + # UIA local — résolution via lea_uia.exe 
(helper Rust) + # ========================================================================= + + def _resolve_via_uia_local( + self, uia_target: dict, screen_width: int, screen_height: int, + ): + """Résoudre une cible via UIA local (lea_uia.exe). + + Le plan V4 contient un uia_target (nom, control_type, parent_path). + On appelle le helper Rust qui interroge UIAutomationCore.dll et + retourne les coordonnées pixel-perfect de l'élément. + + Retourne (x_pct, y_pct) si trouvé, None sinon. + Le fallback vers le grounding serveur est géré par l'appelant. + """ + try: + from .uia_helper import get_shared_helper + helper = get_shared_helper() + if not helper.available: + return None + + name = uia_target.get("name", "") + control_type = uia_target.get("control_type", "") or None + automation_id = uia_target.get("automation_id", "") or None + + if not name: + return None + + element = helper.find_by_name( + name=name, + control_type=control_type, + automation_id=automation_id, + timeout_ms=1500, + ) + if element is None or not element.is_clickable(): + return None + + cx, cy = element.center() + if screen_width <= 0 or screen_height <= 0: + return None + + x_pct = cx / screen_width + y_pct = cy / screen_height + if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0): + return None + + return (x_pct, y_pct) + + except Exception as e: + logger.debug(f"UIA local resolve erreur : {e}") + return None + # ========================================================================= # Observer — pré-analyse écran avant chaque action # ========================================================================= @@ -506,8 +559,29 @@ class ActionExecutorV1: return result # EXECUTER → continuer normalement - if visual_mode and target_spec and server_url: - # ── GROUNDING : localisation pure via GroundingEngine ── + # ── V4 : UIA local (si resolve_order l'indique et helper dispo) ── + # Court-circuite le grounding serveur pour les clicks sur Windows natif. 
+ # 10-20ms au lieu de 2-5s pour un clic — c'est le cœur du V4. + uia_resolved = False + if visual_mode and target_spec and action_type == "click": + resolve_order = target_spec.get("resolve_order", []) + uia_target = target_spec.get("uia_target") + if resolve_order and resolve_order[0] == "uia" and uia_target: + uia_coords = self._resolve_via_uia_local(uia_target, width, height) + if uia_coords: + x_pct, y_pct = uia_coords + result["visual_resolved"] = True + result["resolution_method"] = "v4_uia_local" + result["resolution_score"] = 0.95 + uia_resolved = True + print(f" [UIA] résolu en local: ({x_pct:.4f}, {y_pct:.4f})") + logger.info( + f"V4 UIA local OK : {uia_target.get('name', '?')} " + f"→ ({x_pct:.4f}, {y_pct:.4f})" + ) + + if not uia_resolved and visual_mode and target_spec and server_url: + # ── GROUNDING : localisation pure via GroundingEngine (fallback) ── from .grounding import GroundingEngine grounding = GroundingEngine(self) grounding_result = grounding.locate( diff --git a/agent_v0/agent_v1/core/uia_helper.py b/agent_v0/agent_v1/core/uia_helper.py new file mode 100644 index 000000000..80061f853 --- /dev/null +++ b/agent_v0/agent_v1/core/uia_helper.py @@ -0,0 +1,278 @@ +# core/workflow/uia_helper.py +""" +UIAHelper — Wrapper Python pour lea_uia.exe (helper Rust UI Automation). + +Expose une API Python simple pour interroger UIA via le binaire Rust. +Communique via subprocess + stdin/stdout JSON. + +Pourquoi un helper Rust ? +- 5-10x plus rapide que pywinauto (10-20ms vs 50-200ms) +- Binaire standalone ~500 Ko, aucune dépendance runtime +- Pas de problèmes de threading COM en Python +- Crash-safe (le crash du helper n'affecte pas l'agent Python) + +Architecture : + Python executor + ↓ subprocess.run + lea_uia.exe query --x 812 --y 436 + ↓ UIA API Windows + JSON response + ↓ stdout + Python executor parse JSON + +Si lea_uia.exe n'est pas disponible (Linux, binaire absent, crash) : +toutes les méthodes retournent None → fallback vision automatique. 
import json
import logging
import os
import platform
import subprocess
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)

# Default wall-clock limit for one helper invocation (seconds).
_DEFAULT_TIMEOUT = 5.0

# Head-room added on top of a search's own --timeout-ms so the subprocess is
# not killed before the helper has had a chance to report its result.
_SEARCH_TIMEOUT_MARGIN = 1.0


def _coerce_rect(raw: Any) -> Tuple[int, int, int, int]:
    """Normalise a raw rectangle into a 4-tuple of ints, or zeros if unusable."""
    if isinstance(raw, (list, tuple)) and len(raw) >= 4:
        try:
            x1, y1, x2, y2 = (int(v) for v in raw[:4])
        except (TypeError, ValueError, OverflowError):
            return (0, 0, 0, 0)
        return (x1, y1, x2, y2)
    return (0, 0, 0, 0)


@dataclass
class UiaElement:
    """Python-side representation of one UIA element.

    ``bounding_rect`` is ``(x1, y1, x2, y2)`` in pixels. The defaults
    describe a non-clickable element (disabled, off-screen, empty rect).
    """
    name: str = ""
    control_type: str = ""
    class_name: str = ""
    automation_id: str = ""
    bounding_rect: Tuple[int, int, int, int] = (0, 0, 0, 0)
    is_enabled: bool = False
    is_offscreen: bool = True
    parent_path: List[Dict[str, str]] = field(default_factory=list)
    process_name: str = ""

    def center(self) -> Tuple[int, int]:
        """Return the centre of the bounding rectangle (pixels)."""
        x1, y1, x2, y2 = self.bounding_rect
        return ((x1 + x2) // 2, (y1 + y2) // 2)

    def width(self) -> int:
        """Return the rectangle width (``x2 - x1``)."""
        return self.bounding_rect[2] - self.bounding_rect[0]

    def height(self) -> int:
        """Return the rectangle height (``y2 - y1``)."""
        return self.bounding_rect[3] - self.bounding_rect[1]

    def is_clickable(self) -> bool:
        """Return True when the element is enabled, on-screen and non-empty."""
        return (
            self.is_enabled
            and not self.is_offscreen
            and self.width() > 0
            and self.height() > 0
        )

    def path_signature(self) -> str:
        """Return a ``Type[Name] > Type[Name]`` signature of the parent chain.

        Parents without a name are skipped; a parent without a control type
        contributes an empty type rather than raising.
        """
        parts = [
            f"{p.get('control_type', '')}[{p['name']}]"
            for p in self.parent_path
            if p.get("name")
        ]
        parts.append(f"{self.control_type}[{self.name}]")
        return " > ".join(parts)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly dict (the rectangle becomes a list)."""
        return {
            "name": self.name,
            "control_type": self.control_type,
            "class_name": self.class_name,
            "automation_id": self.automation_id,
            "bounding_rect": list(self.bounding_rect),
            "is_enabled": self.is_enabled,
            "is_offscreen": self.is_offscreen,
            "parent_path": self.parent_path,
            "process_name": self.process_name,
        }

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "UiaElement":
        """Build an element from a decoded JSON object.

        Missing or ``null`` fields fall back to the dataclass defaults, and
        the rectangle may be given as a list or a tuple.
        """
        offscreen = d.get("is_offscreen")
        return cls(
            name=d.get("name") or "",
            control_type=d.get("control_type") or "",
            class_name=d.get("class_name") or "",
            automation_id=d.get("automation_id") or "",
            bounding_rect=_coerce_rect(d.get("bounding_rect")),
            is_enabled=bool(d.get("is_enabled", False)),
            # Unknown visibility is treated as off-screen (not clickable).
            is_offscreen=True if offscreen is None else bool(offscreen),
            parent_path=list(d.get("parent_path") or []),
            process_name=d.get("process_name") or "",
        )


class UIAHelper:
    """Python wrapper around the ``lea_uia.exe`` command-line helper.

    Every query spawns the helper as a subprocess and decodes the JSON it
    prints on stdout. All query methods return ``None`` instead of raising
    when the helper is unavailable or misbehaves.
    """

    def __init__(self, helper_path: str = "", timeout: float = _DEFAULT_TIMEOUT):
        """Store the helper location and timeout and compute availability.

        Args:
            helper_path: Explicit path to the binary; when empty, the
                standard locations are searched.
            timeout: Default subprocess timeout in seconds.
        """
        self._helper_path = helper_path or self._find_helper()
        self._timeout = timeout
        self._available = self._check_available()

    def _find_helper(self) -> str:
        """Return the absolute path of the first existing candidate, or ""."""
        candidates = [
            r"C:\Lea\helpers\lea_uia.exe",
            os.path.join(os.path.dirname(__file__), "..", "..",
                         "agent_rust", "lea_uia", "target",
                         "x86_64-pc-windows-gnu", "release", "lea_uia.exe"),
            "./helpers/lea_uia.exe",
            "lea_uia.exe",
        ]
        for path in candidates:
            if os.path.isfile(path):
                return os.path.abspath(path)
        return ""

    def _check_available(self) -> bool:
        """Return True when running on Windows and the binary file exists.

        No health probe is executed here; use :meth:`health` for that.
        """
        if platform.system() != "Windows":
            logger.debug("UIAHelper: Linux/Mac — helper désactivé")
            return False
        if not self._helper_path:
            logger.debug("UIAHelper: lea_uia.exe introuvable")
            return False
        if not os.path.isfile(self._helper_path):
            logger.debug(f"UIAHelper: chemin invalide {self._helper_path}")
            return False
        return True

    @property
    def available(self) -> bool:
        """Whether the helper can be invoked."""
        return self._available

    @property
    def helper_path(self) -> str:
        """Path of the helper binary ("" when none was found)."""
        return self._helper_path

    def _run(
        self, args: List[str], timeout: Optional[float] = None,
    ) -> Optional[Dict[str, Any]]:
        """Run the helper with *args* and return its decoded JSON object.

        Args:
            args: Command-line arguments passed after the binary path.
            timeout: Subprocess timeout in seconds; defaults to the
                instance timeout.

        Returns:
            The decoded JSON object, or ``None`` when the helper is
            unavailable, exits non-zero, times out, prints nothing, or
            prints something that is not a JSON object.
        """
        if not self._available:
            return None
        limit = self._timeout if timeout is None else timeout
        try:
            result = subprocess.run(
                [self._helper_path] + args,
                capture_output=True,
                text=True,
                timeout=limit,
                encoding="utf-8",
                errors="replace",
            )
            if result.returncode != 0:
                logger.debug(
                    f"UIAHelper: exit code {result.returncode}, "
                    f"stderr: {result.stderr[:200]}"
                )
                return None
            output = result.stdout.strip()
            if not output:
                return None
            payload = json.loads(output)
            # Callers use .get(); anything but an object is a protocol error.
            if not isinstance(payload, dict):
                logger.debug(
                    f"UIAHelper: JSON inattendu ({type(payload).__name__})"
                )
                return None
            return payload
        except subprocess.TimeoutExpired:
            logger.debug(f"UIAHelper: timeout ({limit}s) sur {args}")
            return None
        except json.JSONDecodeError as e:
            logger.debug(f"UIAHelper: JSON invalide — {e}")
            return None
        except Exception as e:
            logger.debug(f"UIAHelper: erreur {e}")
            return None

    @staticmethod
    def _element_from_response(data: Any) -> Optional[UiaElement]:
        """Extract the element from a helper response, or None if absent."""
        if not isinstance(data, dict) or data.get("status") != "ok":
            return None
        elem_data = data.get("element")
        if not elem_data or not isinstance(elem_data, dict):
            return None
        return UiaElement.from_dict(elem_data)

    def health(self) -> bool:
        """Return True when the helper answers ``health`` with status "ok"."""
        data = self._run(["health"])
        return data is not None and data.get("status") == "ok"

    def query_at(
        self,
        x: int,
        y: int,
        with_parents: bool = True,
    ) -> Optional[UiaElement]:
        """Return the UIA element at a screen position.

        Args:
            x, y: Absolute pixel coordinates.
            with_parents: Include the parent hierarchy in the result.

        Returns:
            The element, or ``None`` when nothing was found or the helper
            is unavailable.
        """
        args = ["query", "--x", str(x), "--y", str(y)]
        if not with_parents:
            args.append("--with-parents=false")
        return self._element_from_response(self._run(args))

    def find_by_name(
        self,
        name: str,
        control_type: Optional[str] = None,
        automation_id: Optional[str] = None,
        window: Optional[str] = None,
        timeout_ms: int = 2000,
    ) -> Optional[UiaElement]:
        """Search for an element by name, with optional filters.

        Args:
            name: Element name to look for.
            control_type: Control type filter (Button, Edit, MenuItem...).
            automation_id: Automation id filter.
            window: Restrict the search to one window.
            timeout_ms: Search timeout handed to the helper, in milliseconds.

        Returns:
            The element, or ``None`` when nothing was found or the helper
            is unavailable.
        """
        args = ["find", "--name", name, "--timeout-ms", str(timeout_ms)]
        if control_type:
            args.extend(["--control-type", control_type])
        if automation_id:
            args.extend(["--automation-id", automation_id])
        if window:
            args.extend(["--window", window])

        # The process must outlive the search it was asked to perform,
        # otherwise long searches are killed before they can answer.
        limit = max(self._timeout, timeout_ms / 1000.0 + _SEARCH_TIMEOUT_MARGIN)
        return self._element_from_response(self._run(args, timeout=limit))

    def capture_focused(self, max_depth: int = 3) -> Optional[UiaElement]:
        """Return the focused element as reported by ``capture --max-depth``."""
        data = self._run(["capture", "--max-depth", str(max_depth)])
        return self._element_from_response(data)


# Lazily created process-wide instance.
_SHARED_HELPER: Optional[UIAHelper] = None


def get_shared_helper() -> UIAHelper:
    """Return the shared UIAHelper, creating it on first use."""
    global _SHARED_HELPER
    if _SHARED_HELPER is None:
        _SHARED_HELPER = UIAHelper()
    return _SHARED_HELPER