feat: premier replay E2E + mode apprentissage supervisé

Premier replay fonctionnel de bout en bout (Bloc-notes, Chrome). Corrections critiques : - Fix double-lancement agent (Lea.bat start /b + verrou PID) - Sérialisation replay (threading.Lock dans poll_and_execute) - Garde UIA bbox >50% écran (rejet conteneurs "Bureau") - Filtre fenêtres bruit système (systray overflow) - Auto-nettoyage replays bloqués (paused_need_help) Cascade visuelle complète dans session_cleaner : - UIA local (10ms) → template matching (100ms) → serveur docTR/VLM - Nettoyage bureau pré-replay (clic "Afficher le bureau") - Crops 80x80 + vlm_description pour chaque clic Grounding contraint à la fenêtre active : - Capture croppée à la fenêtre au lieu de l'écran entier - Conversion coordonnées fenêtre → écran - Élimine les faux positifs taskbar/systray Mode apprentissage supervisé (SUPERVISE → capture humaine) : - Léa passe en mode capture quand elle est perdue - Capture mini-workflow humain (clics + frappes + combos) - Fin par Ctrl+Shift+L ou timeout inactivité 10s - Correction stockée dans target_memory.db via serveur Deploy Windows complet (grounding.py, policy.py, uia_helper.py). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 07:42:50 +02:00
parent 816b37af98
commit 33c198b827
12 changed files with 1561 additions and 60 deletions
--- a/agent_v0/agent_v1/core/executor.py
+++ b/agent_v0/agent_v1/core/executor.py
@@ -17,6 +17,7 @@ import base64
 import hashlib
 import io
 import os
 import threading
 import time
 import logging
@@ -72,6 +73,12 @@ class ActionExecutorV1:
        # different de celui qui utilise l'instance).
        self._sct = None
        self.running = True
        # ── Verrou de sérialisation replay ──
        # Garantit qu'UNE SEULE action de replay s'exécute à la fois.
        # Sans ce lock, deux threads (polling main.py + lea_ui) peuvent
        # consommer deux actions simultanément → race condition + mss
        # thread-unsafe retourne des résolutions fantômes (1024x768).
        self._replay_lock = threading.Lock()
        # Backoff exponentiel pour le polling replay (evite de marteler le serveur)
        self._poll_backoff = 1.0       # Delai actuel (secondes)
        self._poll_backoff_min = 1.0   # Delai minimal (reset apres succes)
@@ -340,6 +347,25 @@ class ActionExecutorV1:
                            )
                            return None
            # ── GARDE : rejeter les éléments géants (conteneurs) ──
            # Un élément qui couvre >50% de l'écran est un conteneur
            # (Bureau, Rechercher, liste), pas un bouton cliquable.
            # Cliquer au centre d'un conteneur = clic aveugle.
            elem_w = element.width()
            elem_h = element.height()
            screen_area = screen_width * screen_height
            elem_area = elem_w * elem_h
            if screen_area > 0 and elem_area / screen_area > 0.5:
                logger.warning(
                    f"UIA REJET : '{name}' couvre {elem_area / screen_area * 100:.0f}% "
                    f"de l'écran ({elem_w}x{elem_h}) — conteneur, pas un élément cliquable"
                )
                print(
                    f"    [UIA] REJET — '{name}' trop grand "
                    f"({elem_w}x{elem_h}, {elem_area / screen_area * 100:.0f}% écran)"
                )
                return None
            cx, cy = element.center()
            if screen_width <= 0 or screen_height <= 0:
                return None
@@ -499,10 +525,25 @@ class ActionExecutorV1:
            "visual_resolved": False,
        }
        # ── Délai inter-actions (anti race condition mss) ──
        wait_before = action.get("wait_before", 0.5)
        if wait_before > 0:
            time.sleep(wait_before)
        try:
            monitor = self.sct.monitors[1]
            width, height = monitor["width"], monitor["height"]
            # ── Diagnostic résolution ──
            logger.info(
                f"[REPLAY] Action {action_id} ({action_type}) — "
                f"écran replay: {width}x{height}, "
                f"x_pct={action.get('x_pct', 0):.4f}, "
                f"y_pct={action.get('y_pct', 0):.4f} "
                f"→ pixel ({int(action.get('x_pct', 0) * width)}, "
                f"{int(action.get('y_pct', 0) * height)})"
            )
            # Resolution visuelle des coordonnees si demande
            x_pct = action.get("x_pct", 0.0)
            y_pct = action.get("y_pct", 0.0)
@@ -526,7 +567,7 @@ class ActionExecutorV1:
                )
                if expected_title and expected_title != "unknown_window":
                    from ..window_info_crossplatform import get_active_window_info
-                    from ..ui.messages import est_fenetre_lea
+                    from ..ui.messages import est_fenetre_lea, est_fenetre_bruit
                    # Polling court pour laisser le temps à la fenêtre de
                    # se stabiliser (évite les faux négatifs sur transitions
@@ -544,8 +585,9 @@ class ActionExecutorV1:
                            time.sleep(0.3)
                            continue
-                        # Si on tombe sur unknown_window → on attend aussi
+                        # Bruit système (systray overflow, taskbar, etc.)
-                        if not current_title or current_title == "unknown_window":
+                        # → on attend que la vraie fenêtre reprenne le focus
                        if est_fenetre_bruit(current_title):
                            time.sleep(0.3)
                            continue
@@ -686,8 +728,8 @@ class ActionExecutorV1:
            if action_type == "click":
                # Si visual_mode est activé, le resolve DOIT réussir.
-                # Pas de fallback blind — on arrête le replay si la cible
+                # Pas de fallback blind — Léa VOIT l'écran et CHERCHE
-                # n'est pas trouvée visuellement. C'est un RPA VISUEL.
+                # l'élément. Si toute la cascade échoue → pause supervisée.
                if visual_mode and not result.get("visual_resolved"):
                    # ── Policy : décider quoi faire quand grounding échoue ──
                    from .policy import PolicyEngine, Decision
@@ -709,7 +751,6 @@ class ActionExecutorV1:
                    )
                    if policy_decision.decision == Decision.RETRY:
                        # Re-tenter le grounding après correction (popup fermée, etc.)
                        resolved2 = self._resolve_target_visual(
                            server_url, target_spec, x_pct, y_pct, width, height
                        )
@@ -719,7 +760,6 @@ class ActionExecutorV1:
                            result["visual_resolved"] = True
                            print(f"    [POLICY] Re-resolve OK après {policy_decision.action_taken}")
                        else:
                            # Re-resolve échoué — SUPERVISE (rendre la main)
                            result["success"] = False
                            result["error"] = "target_not_found"
                            result["target_description"] = target_desc
@@ -746,18 +786,55 @@ class ActionExecutorV1:
                        )
                        return result
-                    else:  # SUPERVISE ou CONTINUE
+                    else:  # SUPERVISE → mode apprentissage
-                        result["success"] = False
+                        # Léa est perdue. Au lieu de s'arrêter, elle
-                        result["error"] = "target_not_found"
+                        # passe en mode capture et enregistre ce que
-                        result["target_description"] = target_desc
+                        # l'humain fait (mini-workflow de correction).
-                        result["target_spec"] = target_spec
+                        try:
-                        result["screenshot"] = self._capture_screenshot_b64()
+                            self.notifier.replay_target_not_found(
-                        result["warning"] = "visual_resolve_failed"
+                                target_desc,
-                        self.notifier.replay_target_not_found(
+                                target_spec.get("window_title", ""),
-                            target_desc,
+                            )
-                            target_spec.get("window_title", ""),
+                        except Exception:
                            pass
                        human_actions = self._capture_human_correction(
                            timeout_s=120,
                        )
-                        return result
+                        if human_actions:
                            # L'humain a montré un mini-workflow
                            result["success"] = True
                            result["resolution_method"] = "human_supervised"
                            result["warning"] = "human_supervised"
                            # Stocker le dernier clic comme position résolue
                            last_click = None
                            for ha in reversed(human_actions):
                                if ha.get("type") == "click":
                                    last_click = ha
                                    break
                            if last_click:
                                result["actual_position"] = {
                                    "x_pct": last_click["x_pct"],
                                    "y_pct": last_click["y_pct"],
                                }
                            # Envoyer toute la correction au serveur
                            result["correction"] = {
                                "actions": human_actions,
                                "action_count": len(human_actions),
                                "last_click": last_click,
                            }
                            logger.info(
                                f"[APPRENTISSAGE] Correction reçue : "
                                f"{len(human_actions)} actions — je m'en souviendrai."
                            )
                        else:
                            # Timeout — l'humain n'a pas répondu
                            result["success"] = False
                            result["error"] = "target_not_found"
                            result["target_description"] = target_desc
                            result["target_spec"] = target_spec
                            result["screenshot"] = self._capture_screenshot_b64()
                            result["warning"] = "visual_resolve_failed"
                real_x = int(x_pct * width)
                real_y = int(y_pct * height)
@@ -1417,15 +1494,24 @@ Example: x_pct=0.50, y_pct=0.30"""
        2. Execute l'action (clic, texte, etc.)
        3. POST /replay/result avec le resultat + screenshot
-        Args:
+        Sérialisé par _replay_lock — une seule action à la fois.
-            session_id: Identifiant de la session courante
+        Sans ce lock, deux threads concurrents consomment deux actions
-            server_url: URL de base du serveur streaming
+        et mss retourne des résolutions fantômes (thread-unsafe).
            machine_id: Identifiant de la machine (pour le replay multi-machine)
        Retourne True si une action a ete executee, False sinon.
        IMPORTANT: Si une action est recue, le resultat est TOUJOURS rapporte
        au serveur (meme en cas d'erreur d'execution).
        """
        # Sérialisation stricte : si un autre thread exécute déjà une
        # action, on abandonne ce poll immédiatement (pas de file d'attente).
        if not self._replay_lock.acquire(blocking=False):
            return False
        try:
            return self._poll_and_execute_inner(session_id, server_url, machine_id)
        finally:
            self._replay_lock.release()
    def _poll_and_execute_inner(self, session_id: str, server_url: str, machine_id: str) -> bool:
        """Implémentation interne de poll_and_execute (protégée par _replay_lock)."""
        import requests
        replay_next_url = f"{server_url}/traces/stream/replay/next"
@@ -1499,11 +1585,14 @@ Example: x_pct=0.50, y_pct=0.30"""
            print(f">>> ERREUR EXECUTION : {e}")
            logger.error(f"Erreur execute_replay_action: {e}")
            import traceback
            tb_str = traceback.format_exc()
            traceback.print_exc()
            result = {
                "action_id": action_id,
                "success": False,
-                "error": f"Exception executor: {e}",
+                # Inclure le traceback complet pour diagnostiquer
                # les crashes côté agent depuis les logs serveur
                "error": f"{e}\n---TRACEBACK---\n{tb_str[-500:]}",
                "screenshot": None,
            }
@@ -1525,6 +1614,8 @@ Example: x_pct=0.50, y_pct=0.30"""
            # Champs enrichis pour target_not_found (pause supervisée)
            "target_description": result.get("target_description"),
            "target_spec": result.get("target_spec"),
            # Correction humaine (mode apprentissage supervisé)
            "correction": result.get("correction"),
        }
        try:
            resp2 = requests.post(
@@ -2007,6 +2098,159 @@ Example: x_pct=0.50, y_pct=0.30"""
        logger.debug(f"Texte saisi char-by-char ({len(text)} chars)")
    # =========================================================================
    # Mode apprentissage — l'humain montre, Léa apprend
    # =========================================================================
    # Hotkey pour signaler la fin de la correction humaine
    _LEARNING_DONE_HOTKEY = {Key.ctrl_l, Key.shift, KeyCode.from_char("l")}
    def _capture_human_correction(self, timeout_s: float = 120.0) -> list[dict]:
        """Record the mini-workflow a human performs to correct a failed step.

        Mouse and keyboard listeners are started and every left/right click,
        character key and Enter key is appended to the returned list until one
        of the following happens:

        - the end-of-correction hotkey (Ctrl+Shift+L) is pressed;
        - at least one action was recorded and nothing new happened for
          ``INACTIVITY_TIMEOUT`` seconds;
        - ``timeout_s`` seconds have elapsed since the capture started.

        Both listeners are always stopped before returning, even if the wait
        loop raises.

        Args:
            timeout_s: Global upper bound, in seconds, on the capture duration.

        Returns:
            The captured actions, in the order they were recorded. Clicks are
            dicts with ``type="click"``, ``x_pct``/``y_pct`` (pointer position
            divided by the size of monitor 1 of ``self.sct``), ``button``,
            ``timestamp`` and, when the UIA helper returns an element,
            ``uia_snapshot``. Character keys are ``type="type"`` entries and
            Enter is a ``type="key_combo"`` entry. The list is empty when no
            action was recorded before the capture ended.
        """
        from pynput.keyboard import Listener as KeyboardListener
        from pynput.mouse import Listener as MouseListener

        INACTIVITY_TIMEOUT = 10.0  # seconds

        done_event = threading.Event()
        actions: list[dict] = []
        # One-element list so the listener callbacks can update the value.
        last_action_time = [time.time()]
        keys_pressed: set = set()

        monitor = self.sct.monitors[1]
        # Never divide by zero if the monitor reports a degenerate size.
        screen_w = max(1, monitor["width"])
        screen_h = max(1, monitor["height"])

        def _canonical(key):
            # Normalise a key so that left/right modifiers and shifted or
            # control-modified letters compare equal to the hotkey members.
            # Falls back to the raw key when the listener has no canonical().
            normalise = getattr(kbd_listener, "canonical", None)
            return normalise(key) if normalise else key

        def _on_click(x, y, button, pressed):
            if done_event.is_set():
                return False  # returning False stops the listener
            if pressed and button.name in ("left", "right"):
                action = {
                    "type": "click",
                    "x_pct": round(x / screen_w, 6),
                    "y_pct": round(y / screen_h, 6),
                    "button": button.name,
                    "timestamp": time.time(),
                }
                # Best-effort UIA snapshot of the element under the pointer;
                # a failure here must never lose the click itself.
                try:
                    from .uia_helper import get_shared_helper
                    helper = get_shared_helper()
                    if helper.available:
                        elem = helper.query_at(int(x), int(y), with_parents=True)
                        if elem:
                            action["uia_snapshot"] = elem.to_dict()
                except Exception as e:
                    logger.debug(f"[APPRENTISSAGE] Snapshot UIA indisponible : {e}")
                actions.append(action)
                last_action_time[0] = time.time()
                logger.info(f"[APPRENTISSAGE] Clic ({x}, {y}) bouton={button.name}")

        def _on_key_press(key):
            if done_event.is_set():
                return False
            keys_pressed.add(_canonical(key))
            # End-of-correction hotkey (Ctrl+Shift+L)
            if hotkey.issubset(keys_pressed):
                logger.info("[APPRENTISSAGE] Hotkey Ctrl+Shift+L — fin de correction")
                print("    [APPRENTISSAGE] Ctrl+Shift+L reçu — merci !")
                done_event.set()
                return False

        def _on_key_release(key):
            keys_pressed.discard(_canonical(key))
            if done_event.is_set():
                return False
            # Record text keystrokes (not bare modifiers) and Enter.
            if hasattr(key, "char") and key.char:
                actions.append({
                    "type": "type",
                    "text": key.char,
                    "timestamp": time.time(),
                })
                last_action_time[0] = time.time()
            elif key == Key.enter:
                actions.append({
                    "type": "key_combo",
                    "keys": ["enter"],
                    "timestamp": time.time(),
                })
                last_action_time[0] = time.time()

        mouse_listener = MouseListener(on_click=_on_click)
        kbd_listener = KeyboardListener(
            on_press=_on_key_press, on_release=_on_key_release,
        )
        # Computed before the listeners start, so callbacks always see it.
        hotkey = {_canonical(k) for k in self._LEARNING_DONE_HOTKEY}

        mouse_listener.start()
        try:
            kbd_listener.start()

            logger.info(
                f"[APPRENTISSAGE] Mode capture activé (timeout={timeout_s}s, "
                f"inactivité={INACTIVITY_TIMEOUT}s, hotkey=Ctrl+Shift+L)"
            )
            print(
                f"    [APPRENTISSAGE] Montre-moi comment faire.\n"
                f"    Quand tu as fini → Ctrl+Shift+L\n"
                f"    (ou j'attends {INACTIVITY_TIMEOUT}s sans action)"
            )

            # Wait for: hotkey OR inactivity OR global timeout
            start = time.time()
            while not done_event.is_set():
                elapsed = time.time() - start
                if elapsed > timeout_s:
                    logger.info("[APPRENTISSAGE] Timeout global")
                    break

                # Inactivity only counts once the human has done something.
                if actions and (time.time() - last_action_time[0]) > INACTIVITY_TIMEOUT:
                    logger.info(
                        f"[APPRENTISSAGE] Inactivité {INACTIVITY_TIMEOUT}s — "
                        f"fin automatique ({len(actions)} actions)"
                    )
                    print(f"    [APPRENTISSAGE] Pas d'action depuis {INACTIVITY_TIMEOUT}s — je reprends.")
                    break

                time.sleep(0.2)
        finally:
            # Always release the input hooks, whatever ended the capture.
            mouse_listener.stop()
            kbd_listener.stop()

        logger.info(f"[APPRENTISSAGE] {len(actions)} actions capturées")
        print(f"    [APPRENTISSAGE] {len(actions)} actions capturées — merci !")
        return actions
    def _capture_crop_at(self, x: int, y: int, size: int = 80) -> str:
        """Capturer un crop carré autour d'une position."""
        try:
            from PIL import Image
            with mss.mss() as local_sct:
                monitor = local_sct.monitors[1]
                raw = local_sct.grab(monitor)
            img = Image.frombytes("RGB", raw.size, raw.bgra, "raw", "BGRX")
            half = size // 2
            left = max(0, x - half)
            top = max(0, y - half)
            right = min(img.width, x + half)
            bottom = min(img.height, y + half)
            crop = img.crop((left, top, right, bottom))
            buffer = io.BytesIO()
            crop.save(buffer, format="JPEG", quality=85)
            return base64.b64encode(buffer.getvalue()).decode("utf-8")
        except Exception:
            return ""
    def _click(self, pos, button_name):
        """Deplacer la souris via courbe de Bézier puis cliquer.
--- a/agent_v0/agent_v1/core/grounding.py
+++ b/agent_v0/agent_v1/core/grounding.py
@@ -15,6 +15,7 @@ Ref: docs/PLAN_ACTEUR_V1.md — Architecture MICRO (grounding + exécution)
 """
 import base64
 import io
 import logging
 import os
 import time
@@ -126,19 +127,62 @@ class GroundingEngine:
                )
        t_start = time.time()
-        screenshot_b64 = self._executor._capture_screenshot_b64(max_width=0, quality=75)
+
        # ── Capture contrainte à la fenêtre active ──
        # Le grounding ne voit QUE la fenêtre attendue — pas la taskbar,
        # pas le systray, pas les autres apps. Comme un humain qui regarde
        # l'application sur laquelle il travaille.
        window_rect = None
        try:
            from ..window_info_crossplatform import get_active_window_rect
            win_info = get_active_window_rect()
            if win_info and win_info.get("rect"):
                r = win_info["rect"]  # [left, top, right, bottom]
                # Validation : fenêtre visible et pas minuscule
                w = r[2] - r[0]
                h = r[3] - r[1]
                if w > 50 and h > 50:
                    window_rect = {
                        "left": max(0, r[0]),
                        "top": max(0, r[1]),
                        "width": min(w, screen_width),
                        "height": min(h, screen_height),
                    }
                    logger.info(
                        f"Grounding contraint à la fenêtre : "
                        f"{window_rect['width']}x{window_rect['height']} "
                        f"à ({window_rect['left']}, {window_rect['top']})"
                    )
        except Exception as e:
            logger.debug(f"Pas de window rect disponible : {e}")
        screenshot_b64 = self._capture_window_or_screen(window_rect)
        if not screenshot_b64:
            return GroundingResult(
                found=False, detail="Capture screenshot échouée",
                elapsed_ms=(time.time() - t_start) * 1000,
            )
        # Dimensions de la zone capturée (fenêtre ou écran entier)
        cap_w = window_rect["width"] if window_rect else screen_width
        cap_h = window_rect["height"] if window_rect else screen_height
        for strategy in strategies:
            result = self._try_strategy(
                strategy, server_url, screenshot_b64, target_spec,
-                fallback_x, fallback_y, screen_width, screen_height,
+                fallback_x, fallback_y, cap_w, cap_h,
            )
            if result.found:
                # ── Conversion coords fenêtre → coords écran ──
                if window_rect:
                    # Le grounding a retourné des coords relatives à la fenêtre
                    # On les convertit en coords relatives à l'écran entier
                    abs_x = window_rect["left"] + result.x_pct * cap_w
                    abs_y = window_rect["top"] + result.y_pct * cap_h
                    result.x_pct = abs_x / screen_width
                    result.y_pct = abs_y / screen_height
                    result.detail = f"{result.detail} [fenêtre {cap_w}x{cap_h}]"
                result.elapsed_ms = (time.time() - t_start) * 1000
                return result
@@ -148,6 +192,39 @@ class GroundingEngine:
            elapsed_ms=(time.time() - t_start) * 1000,
        )
    def _capture_window_or_screen(self, window_rect: Optional[Dict]) -> str:
        """Capture the given window region, or the whole screen.

        Args:
            window_rect: Mapping with ``left``, ``top``, ``width`` and
                ``height`` keys describing the region to grab, or ``None`` to
                grab monitor 1 in full.

        Returns:
            The capture as a base64-encoded JPEG (quality 75).

            If the capture fails and ``window_rect`` is ``None``, the
            executor's own full-screen capture is returned instead. If it
            fails while a ``window_rect`` was requested, an empty string is
            returned. A full-screen image would be interpreted by the caller
            as window-relative and yield wrong click coordinates.
        """
        try:
            from PIL import Image
            import mss as mss_lib

            with mss_lib.mss() as local_sct:
                if window_rect:
                    # Grab only the window area
                    region = {
                        "left": window_rect["left"],
                        "top": window_rect["top"],
                        "width": window_rect["width"],
                        "height": window_rect["height"],
                    }
                else:
                    # Full-screen fallback
                    region = local_sct.monitors[1]
                raw = local_sct.grab(region)

            img = Image.frombytes("RGB", raw.size, raw.bgra, "raw", "BGRX")
            buffer = io.BytesIO()
            img.save(buffer, format="JPEG", quality=75)
            return base64.b64encode(buffer.getvalue()).decode("utf-8")
        except Exception as e:
            logger.warning(f"Capture échouée : {e}")
            if window_rect:
                # Do not substitute a full-screen image for a window crop:
                # the coordinate frames differ. Report the failure instead.
                return ""
            # Fall back to the executor's existing full-screen capture
            return self._executor._capture_screenshot_b64(max_width=0, quality=75)
    def _try_strategy(
        self,
        strategy: str,
--- a/agent_v0/agent_v1/ui/messages.py
+++ b/agent_v0/agent_v1/ui/messages.py
@@ -568,6 +568,35 @@ def est_fenetre_lea(titre_fenetre: str) -> bool:
    return any(re.search(motif, titre_lower) for motif in _MOTIFS_FENETRE_LEA_REGEX)
 # Windows system "noise" windows to ignore in the pre-action checks.
 # They are not application windows: they grab the focus unpredictably.
 # Every entry is matched as a lowercase substring of the window title, so
 # entries must be specific enough not to hit ordinary application titles
 # (a bare "overflow" would also match "... - Stack Overflow - Chrome").
 _FENETRES_BRUIT_SYSTEME = (
     "fenêtre de dépassement de capacité",
     "tray overflow",          # English systray overflow window
     "overflow window",
     "notification overflow",
     "overflow notification",
     "program manager",
     "barre des tâches",
     "task bar",
     "cortana",
     "action center",
     "centre de notifications",
 )


 def est_fenetre_bruit(titre_fenetre: str) -> bool:
     """Tell whether a window title denotes Windows system noise.

     Args:
         titre_fenetre: Title of the window to classify. May be empty or None.

     Returns:
         True when the title is empty, equals ``"unknown_window"`` (ignoring
         case and surrounding whitespace) or contains one of the
         ``_FENETRES_BRUIT_SYSTEME`` phrases; False otherwise.
     """
     if not titre_fenetre:
         return True  # no title = noise
     titre_lower = titre_fenetre.lower().strip()
     if titre_lower == "unknown_window":
         return True
     return any(p in titre_lower for p in _FENETRES_BRUIT_SYSTEME)
 # Conservé pour rétro-compatibilité avec le code qui listait MOTIFS_FENETRE_LEA
 MOTIFS_FENETRE_LEA = (
    "léa",
--- a/agent_v0/deploy/windows_client/agent_v1/core/executor.py
+++ b/agent_v0/deploy/windows_client/agent_v1/core/executor.py
@@ -17,6 +17,7 @@ import base64
 import hashlib
 import io
 import os
 import threading
 import time
 import logging
@@ -72,6 +73,8 @@ class ActionExecutorV1:
        # different de celui qui utilise l'instance).
        self._sct = None
        self.running = True
        # Verrou de sérialisation — une seule action replay à la fois
        self._replay_lock = threading.Lock()
        # Backoff exponentiel pour le polling replay (evite de marteler le serveur)
        self._poll_backoff = 1.0       # Delai actuel (secondes)
        self._poll_backoff_min = 1.0   # Delai minimal (reset apres succes)
@@ -241,6 +244,107 @@ class ActionExecutorV1:
            logger.warning(f"Acteur gemma4 indisponible : {e}")
            return "EXECUTER"
    # =========================================================================
    # UIA local — résolution via lea_uia.exe (helper Rust)
    # =========================================================================
    def _resolve_via_uia_local(
        self, uia_target: dict, screen_width: int, screen_height: int,
    ):
        """Locate a click target through the shared local UIA helper.

        ``uia_target`` provides ``name`` (required) and optionally
        ``control_type``, ``automation_id`` and ``parent_path``. The element
        is looked up by name with a 1500 ms timeout and accepted only if all
        of the following hold:

        - it is reported as clickable;
        - when both the expected and the found parent paths contain a window
          entry with a name, the two names are equal or one contains the
          other (case-insensitive);
        - its area does not exceed 50% of the screen area;
        - its centre, divided by the screen size, lies within [0, 1].

        Returns:
            ``(x_pct, y_pct)`` of the element centre relative to the screen,
            or ``None`` if the helper is unavailable, the element is missing
            or rejected, or any error occurs.
        """
        window_types = ("fenêtre", "window")

        def _root_window_name(path):
            # Stripped name of the first window-typed entry, None if absent.
            return next(
                (
                    node.get("name", "").strip()
                    for node in path
                    if node.get("control_type", "").lower() in window_types
                ),
                None,
            )

        try:
            from .uia_helper import get_shared_helper

            uia = get_shared_helper()
            if not uia.available:
                return None

            name = uia_target.get("name", "")
            wanted_type = uia_target.get("control_type", "") or None
            wanted_auto_id = uia_target.get("automation_id", "") or None
            wanted_parents = uia_target.get("parent_path", []) or []
            if not name:
                return None

            element = uia.find_by_name(
                name=name,
                control_type=wanted_type,
                automation_id=wanted_auto_id,
                timeout_ms=1500,
            )
            if element is None or not element.is_clickable():
                logger.debug(f"UIA: '{name}' non trouvé ou non cliquable")
                return None

            # ── Parent window check ──
            # Only enforced when the recorded path names a window and the
            # found element also exposes one.
            expected_root = _root_window_name(wanted_parents) if wanted_parents else None
            if expected_root:
                found_root = _root_window_name(element.parent_path)
                if found_root and expected_root != found_root:
                    expected_lc = expected_root.lower()
                    found_lc = found_root.lower()
                    if not (expected_lc in found_lc or found_lc in expected_lc):
                        logger.warning(
                            f"UIA REJET : '{name}' trouvé dans '{found_root}' "
                            f"mais attendu dans '{expected_root}'"
                        )
                        print(
                            f"    [UIA] REJET — '{name}' trouvé dans mauvaise fenêtre "
                            f"({found_root} ≠ {expected_root})"
                        )
                        return None

            # ── Guard: reject giant elements (containers) ──
            elem_w = element.width()
            elem_h = element.height()
            screen_area = screen_width * screen_height
            elem_area = elem_w * elem_h
            if screen_area > 0 and elem_area / screen_area > 0.5:
                logger.warning(
                    f"UIA REJET : '{name}' couvre {elem_area / screen_area * 100:.0f}% "
                    f"de l'écran ({elem_w}x{elem_h}) — conteneur, pas un élément cliquable"
                )
                print(
                    f"    [UIA] REJET — '{name}' trop grand "
                    f"({elem_w}x{elem_h}, {elem_area / screen_area * 100:.0f}% écran)"
                )
                return None

            cx, cy = element.center()
            if screen_width <= 0 or screen_height <= 0:
                return None

            x_pct = cx / screen_width
            y_pct = cy / screen_height
            if all(0.0 <= ratio <= 1.0 for ratio in (x_pct, y_pct)):
                return (x_pct, y_pct)
            return None

        except Exception as e:
            logger.debug(f"UIA local resolve erreur : {e}")
            return None
    # =========================================================================
    # Observer — pré-analyse écran avant chaque action
    # =========================================================================
@@ -385,6 +489,11 @@ class ActionExecutorV1:
            "visual_resolved": False,
        }
        # ── Délai inter-actions (anti race condition mss) ──
        wait_before = action.get("wait_before", 0.5)
        if wait_before > 0:
            time.sleep(wait_before)
        try:
            monitor = self.sct.monitors[1]
            width, height = monitor["width"], monitor["height"]
@@ -393,6 +502,14 @@ class ActionExecutorV1:
            x_pct = action.get("x_pct", 0.0)
            y_pct = action.get("y_pct", 0.0)
            # ── Diagnostic résolution ──
            logger.info(
                f"[REPLAY] Action {action_id} ({action_type}) — "
                f"écran replay: {width}x{height}, "
                f"x_pct={x_pct:.4f}, y_pct={y_pct:.4f} "
                f"→ pixel ({int(x_pct * width)}, {int(y_pct * height)})"
            )
            # Extraire le nom de l'application depuis un titre de fenêtre
            def _app_name(title):
                for sep in [" – ", " - ", " — "]:
@@ -477,8 +594,27 @@ class ActionExecutorV1:
                            return result
                        # EXECUTER → continuer normalement
-            if visual_mode and target_spec and server_url:
+            # ── UIA local : résolution rapide via lea_uia.exe ──
-                # ── GROUNDING : localisation pure via GroundingEngine ──
+            uia_resolved = False
            if visual_mode and target_spec and action_type == "click":
                resolve_order = target_spec.get("resolve_order", [])
                uia_target = target_spec.get("uia_target")
                if resolve_order and resolve_order[0] == "uia" and uia_target:
                    uia_coords = self._resolve_via_uia_local(uia_target, width, height)
                    if uia_coords:
                        x_pct, y_pct = uia_coords
                        result["visual_resolved"] = True
                        result["resolution_method"] = "uia_local"
                        result["resolution_score"] = 0.95
                        uia_resolved = True
                        print(f"    [UIA] résolu en local: ({x_pct:.4f}, {y_pct:.4f})")
                        logger.info(
                            f"UIA local OK : {uia_target.get('name', '?')} "
                            f"→ ({x_pct:.4f}, {y_pct:.4f})"
                        )
            if not uia_resolved and visual_mode and target_spec and server_url:
                # ── GROUNDING : localisation pure via GroundingEngine (fallback) ──
                from .grounding import GroundingEngine
                grounding = GroundingEngine(self)
                grounding_result = grounding.locate(
@@ -510,8 +646,8 @@ class ActionExecutorV1:
            if action_type == "click":
                # Si visual_mode est activé, le resolve DOIT réussir.
-                # Pas de fallback blind — on arrête le replay si la cible
+                # Pas de fallback blind — Léa VOIT l'écran et CHERCHE
-                # n'est pas trouvée visuellement. C'est un RPA VISUEL.
+                # l'élément. Si toute la cascade échoue → pause supervisée.
                if visual_mode and not result.get("visual_resolved"):
                    # ── Policy : décider quoi faire quand grounding échoue ──
                    from .policy import PolicyEngine, Decision
@@ -533,7 +669,6 @@ class ActionExecutorV1:
                    )
                    if policy_decision.decision == Decision.RETRY:
                        # Re-tenter le grounding après correction (popup fermée, etc.)
                        resolved2 = self._resolve_target_visual(
                            server_url, target_spec, x_pct, y_pct, width, height
                        )
@@ -543,7 +678,6 @@ class ActionExecutorV1:
                            result["visual_resolved"] = True
                            print(f"    [POLICY] Re-resolve OK après {policy_decision.action_taken}")
                        else:
                            # Re-resolve échoué — SUPERVISE (rendre la main)
                            result["success"] = False
                            result["error"] = "target_not_found"
                            result["target_description"] = target_desc
@@ -1200,20 +1334,17 @@ Example: x_pct=0.50, y_pct=0.30"""
    def poll_and_execute(self, session_id: str, server_url: str, machine_id: str = "default") -> bool:
        """Poll the server for the next replay action and execute it.

        Calls are serialized through ``_replay_lock``: only one replay action
        runs at a time. The lock is taken without blocking, so a call made
        while another one is in progress returns False immediately instead
        of waiting.

        The actual work (fetching the action, executing it, reporting the
        result) is delegated to ``_poll_and_execute_inner``.

        Args:
            session_id: Identifier of the current session.
            server_url: Base URL of the streaming server.
            machine_id: Identifier of the machine (multi-machine replay).

        Returns:
            Whatever ``_poll_and_execute_inner`` returns, or False when the
            lock is already held by another caller.
        """
        if self._replay_lock.acquire(blocking=False):
            try:
                return self._poll_and_execute_inner(session_id, server_url, machine_id)
            finally:
                # Always hand the lock back, even when the inner call raises.
                self._replay_lock.release()
        # Another thread is already replaying an action.
        return False
    def _poll_and_execute_inner(self, session_id: str, server_url: str, machine_id: str) -> bool:
        """Implémentation interne de poll_and_execute (protégée par _replay_lock)."""
        import requests
        replay_next_url = f"{server_url}/traces/stream/replay/next"
--- a/agent_v0/deploy/windows_client/agent_v1/core/grounding.py
+++ b/agent_v0/deploy/windows_client/agent_v1/core/grounding.py
@@ -0,0 +1,214 @@
 # agent_v1/core/grounding.py
 """
 Module Grounding — localisation pure d'éléments UI sur l'écran.
 Responsabilité unique : "Trouve l'élément X sur l'écran et retourne ses coordonnées."
 Ne prend AUCUNE décision. Si l'élément n'est pas trouvé → retourne NOT_FOUND.
 Stratégies disponibles (cascade configurable) :
 1. Serveur SomEngine + VLM (GPU distant)
 2. Template matching local (CPU, ~10ms)
 3. VLM local direct (CPU/GPU local)
 Séparé de Policy (qui décide quoi faire quand grounding échoue).
 Ref: docs/PLAN_ACTEUR_V1.md — Architecture MICRO (grounding + exécution)
 """
 import base64
 import logging
 import os
 import time
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
 logger = logging.getLogger(__name__)
@dataclass
class GroundingResult:
    """Outcome of one attempt to locate a UI element on screen."""

    found: bool                     # True when the element was located
    x_pct: float = 0.0             # X position as a fraction of the screen (0.0-1.0)
    y_pct: float = 0.0             # Y position as a fraction of the screen (0.0-1.0)
    method: str = ""               # Resolver that produced the result (server_som, anchor_template, vlm_direct...)
    score: float = 0.0             # Confidence (0.0-1.0)
    elapsed_ms: float = 0.0        # Resolution time in milliseconds
    detail: str = ""               # Extra information (matched label, failure reason)
    raw: Optional[Dict] = None     # Raw resolver payload, kept for debugging

    def to_dict(self) -> Dict[str, Any]:
        """Return a summary dict; ``raw`` is left out and numbers are rounded."""
        exported = ("found", "x_pct", "y_pct", "method", "score", "elapsed_ms", "detail")
        summary: Dict[str, Any] = {key: getattr(self, key) for key in exported}
        # Re-assigning existing keys keeps the insertion order above.
        summary["score"] = round(self.score, 3)
        summary["elapsed_ms"] = round(self.elapsed_ms, 1)
        return summary
 # Shared "not found" result. It is an ordinary GroundingResult instance,
 # so callers should treat it as read-only.
 NOT_FOUND = GroundingResult(found=False, detail="Aucune méthode n'a trouvé l'élément")
 class GroundingEngine:
    """Visual locator for UI elements.

    Wraps the executor's existing resolvers (server, template matching,
    local VLM) behind a single cascade. It only locates elements; deciding
    what to do when nothing is found is PolicyEngine's job.

    Usage:
        engine = GroundingEngine(executor)
        result = engine.locate(
            server_url, target_spec, fallback_x, fallback_y, screen_w, screen_h
        )
        if result.found:
            click(result.x_pct, result.y_pct)
    """

    # Cascade order used when the caller does not pass ``strategies``.
    _DEFAULT_STRATEGIES = ("server", "template", "vlm_local")

    # Resolver method name (as found in target_spec["_learned_strategy"])
    # -> cascade strategy that runs this resolver.
    _LEARNED_TO_STRATEGY = {
        "som_text_match": "server",
        "grounding_vlm": "server",
        "server_som": "server",
        "anchor_template": "template",
        "template_matching": "template",
        "hybrid_text_direct": "vlm_local",
        "hybrid_vlm_text": "vlm_local",
        "vlm_direct": "vlm_local",
    }

    def __init__(self, executor):
        """
        Args:
            executor: ActionExecutorV1 — provides the existing resolver methods.
        """
        self._executor = executor

    def locate(
        self,
        server_url: str,
        target_spec: Dict[str, Any],
        fallback_x: float,
        fallback_y: float,
        screen_width: int,
        screen_height: int,
        strategies: Optional[List[str]] = None,
    ) -> GroundingResult:
        """Locate a UI element on screen.

        Runs the strategies in order and returns as soon as one of them
        finds the element.

        Args:
            server_url: Server URL, used by the "server" strategy.
            target_spec: Target specification (by_text, anchor_image_base64,
                vlm_description, optional _learned_strategy...).
            fallback_x, fallback_y: Recorded coordinates, forwarded to the
                server resolver.
            screen_width, screen_height: Screen resolution in pixels.
            strategies: Ordered strategies to try. Defaults to
                ["server", "template", "vlm_local"]. The caller's list is
                never modified.

        Returns:
            A GroundingResult with found=True and coordinates, or a
            found=False result describing the failure.
        """
        if strategies is None:
            strategies = list(self._DEFAULT_STRATEGIES)

        # Learning loop: if a resolver is known to work for this target,
        # try the strategy that runs it first.
        learned = target_spec.get("_learned_strategy", "")
        if learned:
            preferred = self._LEARNED_TO_STRATEGY.get(learned, "")
            if preferred and preferred in strategies:
                strategies = [preferred] + [s for s in strategies if s != preferred]
                logger.info(
                    f"Grounding: stratégie réordonnée par l'apprentissage → "
                    f"{strategies} (learned={learned})"
                )

        # perf_counter is monotonic, so the elapsed time cannot be skewed
        # by a system clock adjustment.
        t_start = time.perf_counter()

        def elapsed_ms() -> float:
            return (time.perf_counter() - t_start) * 1000

        screenshot_b64 = self._executor._capture_screenshot_b64(max_width=0, quality=75)
        if not screenshot_b64:
            return GroundingResult(
                found=False, detail="Capture screenshot échouée",
                elapsed_ms=elapsed_ms(),
            )

        for strategy in strategies:
            result = self._try_strategy(
                strategy, server_url, screenshot_b64, target_spec,
                fallback_x, fallback_y, screen_width, screen_height,
            )
            if result.found:
                result.elapsed_ms = elapsed_ms()
                return result

        return GroundingResult(
            found=False,
            detail=f"Toutes les stratégies ont échoué ({', '.join(strategies)})",
            elapsed_ms=elapsed_ms(),
        )

    def _try_strategy(
        self,
        strategy: str,
        server_url: str,
        screenshot_b64: str,
        target_spec: Dict[str, Any],
        fallback_x: float,
        fallback_y: float,
        screen_width: int,
        screen_height: int,
    ) -> GroundingResult:
        """Run a single grounding strategy and wrap its outcome.

        A strategy whose prerequisites are missing (no server URL, no anchor
        image, no text/description) and an unknown strategy name both yield
        a found=False result.
        """
        executor = self._executor
        hit: Optional[GroundingResult] = None

        if strategy == "server" and server_url:
            raw = executor._server_resolve_target(
                server_url, screenshot_b64, target_spec,
                fallback_x, fallback_y, screen_width, screen_height,
            )
            hit = self._to_result(raw, default_method="server")
        elif strategy == "template":
            anchor_b64 = target_spec.get("anchor_image_base64", "")
            if anchor_b64:
                raw = executor._template_match_anchor(
                    screenshot_b64, anchor_b64, screen_width, screen_height,
                )
                hit = self._to_result(
                    raw, forced_method="anchor_template", with_label=False,
                )
        elif strategy == "vlm_local":
            by_text = target_spec.get("by_text", "")
            vlm_desc = target_spec.get("vlm_description", "")
            if vlm_desc or by_text:
                raw = executor._hybrid_vlm_resolve(
                    screenshot_b64, target_spec, screen_width, screen_height,
                )
                hit = self._to_result(raw, default_method="vlm_local")

        if hit is not None:
            return hit
        return GroundingResult(found=False, method=strategy, detail=f"{strategy}: pas trouvé")

    @staticmethod
    def _to_result(
        raw: Optional[Dict[str, Any]],
        default_method: str = "",
        forced_method: str = "",
        with_label: bool = True,
    ) -> Optional[GroundingResult]:
        """Convert a raw resolver payload into a found GroundingResult.

        Returns None when the payload is empty, not flagged "resolved", or
        flagged resolved but missing coordinates (instead of raising
        KeyError on a malformed payload).
        """
        if not raw or not raw.get("resolved"):
            return None
        x_pct = raw.get("x_pct")
        y_pct = raw.get("y_pct")
        if x_pct is None or y_pct is None:
            logger.warning(
                "Grounding: résultat 'resolved' sans coordonnées, ignoré (%s)",
                forced_method or raw.get("method") or default_method,
            )
            return None

        detail = ""
        if with_label:
            # "matched_element" may be absent or explicitly null.
            matched = raw.get("matched_element")
            if isinstance(matched, dict):
                detail = matched.get("label") or ""

        return GroundingResult(
            found=True,
            x_pct=x_pct,
            y_pct=y_pct,
            method=forced_method or raw.get("method") or default_method,
            score=raw.get("score") or 0.0,
            detail=detail,
            raw=raw,
        )
--- a/agent_v0/deploy/windows_client/agent_v1/core/policy.py
+++ b/agent_v0/deploy/windows_client/agent_v1/core/policy.py
@@ -0,0 +1,152 @@
 # agent_v1/core/policy.py
 """
 Module Policy — décisions intelligentes quand le grounding échoue.
 Responsabilité unique : "Le Grounding dit NOT_FOUND. Que fait-on ?"
 Ne localise AUCUN élément — c'est le rôle du Grounding.
 Décisions possibles :
 - RETRY : re-tenter le grounding (après popup fermée, par exemple)
 - SKIP : l'action n'est plus nécessaire (état déjà atteint)
 - ABORT : arrêter le workflow (état incohérent)
 - SUPERVISE : rendre la main à l'utilisateur
 Séparé de Grounding (qui localise les éléments).
 Ref: docs/PLAN_ACTEUR_V1.md — Architecture MÉSO (acteur intelligent)
 """
 import logging
 import os
 import time
 from dataclasses import dataclass
 from enum import Enum
 from typing import Any, Dict, Optional
 logger = logging.getLogger(__name__)
 class Decision(Enum):
    """Possible decisions when grounding fails."""
    RETRY = "retry"             # Try again (after a correction: popup closed, navigation...)
    SKIP = "skip"               # Action not needed (state already reached)
    ABORT = "abort"             # Stop the workflow (inconsistent state)
    SUPERVISE = "supervise"     # Hand control back to the user (Léa says "I'm stuck")
    CONTINUE = "continue"       # Carry on despite the failure (non-critical action)
@dataclass
class PolicyDecision:
    """Outcome of a Policy decision."""

    decision: Decision
    reason: str                     # Human-readable explanation of the decision
    action_taken: str = ""          # Corrective action performed, if any (e.g. "popup_closed")
    elapsed_ms: float = 0.0         # Time spent deciding, in milliseconds

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain dict: the enum as its value, elapsed time rounded to 0.1 ms."""
        return dict(
            decision=self.decision.value,
            reason=self.reason,
            action_taken=self.action_taken,
            elapsed_ms=round(self.elapsed_ms, 1),
        )
 class PolicyEngine:
    """Decides what to do when grounding fails.

    Decision cascade:
    1. First attempt and a popup could be closed -> RETRY
    2. Retries exhausted -> ask the actor -> SKIP / ABORT / SUPERVISE
    3. Otherwise -> RETRY

    Usage:
        policy = PolicyEngine(executor)
        decision = policy.decide(action, target_spec, retry_count=0)
        if decision.decision == Decision.RETRY:
            # run grounding again
        elif decision.decision == Decision.SKIP:
            # mark as successful and move on
    """

    def __init__(self, executor):
        self._executor = executor

    def decide(
        self,
        action: Dict[str, Any],
        target_spec: Dict[str, Any],
        retry_count: int = 0,
        max_retries: int = 1,
    ) -> PolicyDecision:
        """Decide what to do after a grounding failure.

        Args:
            action: The action that failed.
            target_spec: The target that could not be found.
            retry_count: Number of retries already performed.
            max_retries: Maximum number of retries allowed.

        Returns:
            A PolicyDecision carrying the decision, its reason and the time
            spent deciding.
        """
        started = time.time()

        # Step 1: on the very first attempt, try to close a popup.
        if retry_count == 0 and self._try_close_popup():
            return PolicyDecision(
                decision=Decision.RETRY,
                reason="Popup détectée et fermée, re-tentative",
                action_taken="popup_closed",
                elapsed_ms=(time.time() - started) * 1000,
            )

        # Step 3 (checked early as a guard): retries still available.
        if not (retry_count >= max_retries):
            return PolicyDecision(
                decision=Decision.RETRY,
                reason=f"Retry {retry_count + 1}/{max_retries}",
                elapsed_ms=(time.time() - started) * 1000,
            )

        # Step 2: retries exhausted, let the actor decide.
        verdict = self._ask_actor(action, target_spec)
        known_verdicts = (
            ("PASSER", Decision.SKIP, "Acteur gemma4 : l'état est déjà atteint"),
            ("STOPPER", Decision.ABORT, "Acteur gemma4 : état incohérent, arrêt"),
        )
        for keyword, outcome, why in known_verdicts:
            if verdict == keyword:
                return PolicyDecision(
                    decision=outcome,
                    reason=why,
                    elapsed_ms=(time.time() - started) * 1000,
                )

        # "EXECUTER" or any unknown answer -> supervised pause.
        return PolicyDecision(
            decision=Decision.SUPERVISE,
            reason=f"Acteur gemma4 : {verdict}, pause supervisée",
            elapsed_ms=(time.time() - started) * 1000,
        )

    def _try_close_popup(self) -> bool:
        """Ask the executor's popup handler to close a popup; False on error."""
        try:
            handled = self._executor._handle_popup_vlm()
        except Exception as e:
            logger.debug(f"Policy: popup handler échoué : {e}")
            return False
        return handled

    def _ask_actor(self, action: Dict, target_spec: Dict) -> str:
        """Ask the executor's actor for a verdict (PASSER/EXECUTER/STOPPER).

        Falls back to "EXECUTER" when the actor call raises, which `decide`
        turns into a supervised pause.
        """
        try:
            verdict = self._executor._actor_decide(action, target_spec)
        except Exception as e:
            logger.debug(f"Policy: acteur gemma4 échoué : {e}")
            return "EXECUTER"  # Fallback -> supervised
        return verdict
--- a/agent_v0/deploy/windows_client/agent_v1/core/uia_helper.py
+++ b/agent_v0/deploy/windows_client/agent_v1/core/uia_helper.py
@@ -0,0 +1,294 @@
 # agent_v1/core/uia_helper.py
 """
 UIAHelper — Wrapper Python pour lea_uia.exe (helper Rust UI Automation).
 Expose une API Python simple pour interroger UIA via le binaire Rust.
 Communique via subprocess + stdin/stdout JSON.
 Pourquoi un helper Rust ?
 - 5-10x plus rapide que pywinauto (10-20ms vs 50-200ms)
 - Binaire standalone ~500 Ko, aucune dépendance runtime
 - Pas de problèmes de threading COM en Python
 - Crash-safe (le crash du helper n'affecte pas l'agent Python)
 Architecture :
    Python executor
        ↓ subprocess.run
    lea_uia.exe query --x 812 --y 436
        ↓ UIA API Windows
    JSON response
        ↓ stdout
    Python executor parse JSON
 Si lea_uia.exe n'est pas disponible (Linux, binaire absent, crash) :
 toutes les méthodes retournent None → fallback vision automatique.
 """
 import json
 import logging
 import os
 import platform
 import subprocess
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Tuple
 logger = logging.getLogger(__name__)
 # Default timeout for UIA calls (in seconds)
 _DEFAULT_TIMEOUT = 5.0
 # Hide the console window when spawning lea_uia.exe on Windows.
 # Without this flag, every call (on each user click during recording)
 # briefly shows a black cmd window on screen, which slows the mouse
 # down and pollutes the captured screenshots (the VLM can "see" the
 # lea_uia.exe path as clicked text).
 #
 # The value 0x08000000 is CREATE_NO_WINDOW as defined by the Windows
 # API. On Linux/Mac the value is 0 and `creationflags` has no effect.
 # getattr() covers the case where Python already exposes the constant
 # on Windows.
 if platform.system() == "Windows":
    _SUBPROCESS_CREATION_FLAGS = getattr(subprocess, "CREATE_NO_WINDOW", 0x08000000)
 else:
    _SUBPROCESS_CREATION_FLAGS = 0
@dataclass
class UiaElement:
    """Python representation of a UIA element.

    ``bounding_rect`` is read as (x1, y1, x2, y2) in pixels; the defaults
    describe an unknown element (disabled, off-screen, empty rectangle).
    """

    name: str = ""
    control_type: str = ""
    class_name: str = ""
    automation_id: str = ""
    bounding_rect: Tuple[int, int, int, int] = (0, 0, 0, 0)
    is_enabled: bool = False
    is_offscreen: bool = True
    parent_path: List[Dict[str, str]] = field(default_factory=list)
    process_name: str = ""

    def center(self) -> Tuple[int, int]:
        """Return the center of the rectangle (pixels)."""
        x1, y1, x2, y2 = self.bounding_rect
        return ((x1 + x2) // 2, (y1 + y2) // 2)

    def width(self) -> int:
        """Return the rectangle width in pixels."""
        return self.bounding_rect[2] - self.bounding_rect[0]

    def height(self) -> int:
        """Return the rectangle height in pixels."""
        return self.bounding_rect[3] - self.bounding_rect[1]

    def is_clickable(self) -> bool:
        """Return True when the element is enabled, on screen and non-empty."""
        return (
            self.is_enabled
            and not self.is_offscreen
            and self.width() > 0
            and self.height() > 0
        )

    def path_signature(self) -> str:
        """Return "Type[Name] > Type[Name] > ..." for named parents + self.

        Parents without a name are skipped; a parent without a
        ``control_type`` contributes an empty type instead of raising.
        """
        parts = [
            f"{p.get('control_type', '')}[{p['name']}]"
            for p in self.parent_path
            if isinstance(p, dict) and p.get("name")
        ]
        parts.append(f"{self.control_type}[{self.name}]")
        return " > ".join(parts)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly dict (the rectangle becomes a list)."""
        return {
            "name": self.name,
            "control_type": self.control_type,
            "class_name": self.class_name,
            "automation_id": self.automation_id,
            "bounding_rect": list(self.bounding_rect),
            "is_enabled": self.is_enabled,
            "is_offscreen": self.is_offscreen,
            "parent_path": self.parent_path,
            "process_name": self.process_name,
        }

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "UiaElement":
        """Build an element from a dict such as the one `to_dict` produces.

        Missing or null text fields become "", a missing/short/non-sequence
        ``bounding_rect`` becomes (0, 0, 0, 0) and a missing or null
        ``parent_path`` becomes an empty list.
        """
        rect = d.get("bounding_rect")
        # Accept tuples as well as lists so in-memory dicts round-trip too.
        if isinstance(rect, (list, tuple)) and len(rect) >= 4:
            rect = tuple(rect[:4])
        else:
            rect = (0, 0, 0, 0)

        parents = d.get("parent_path")
        if not isinstance(parents, list):
            parents = []

        def text(key: str) -> str:
            # A JSON null must not leak into a str field.
            return d.get(key) or ""

        return cls(
            name=text("name"),
            control_type=text("control_type"),
            class_name=text("class_name"),
            automation_id=text("automation_id"),
            bounding_rect=rect,
            is_enabled=d.get("is_enabled", False),
            is_offscreen=d.get("is_offscreen", True),
            parent_path=parents,
            process_name=text("process_name"),
        )
 class UIAHelper:
    """Python wrapper around lea_uia.exe.

    Each query spawns the helper binary and parses the JSON object it
    prints on stdout. When the helper is unavailable every query method
    returns None (``health`` returns False).
    """

    def __init__(self, helper_path: str = "", timeout: float = _DEFAULT_TIMEOUT):
        """
        Args:
            helper_path: Path to lea_uia.exe; searched in the standard
                locations when empty.
            timeout: Per-call subprocess timeout, in seconds.
        """
        self._helper_path = helper_path or self._find_helper()
        self._timeout = timeout
        self._available = self._check_available()

    def _find_helper(self) -> str:
        """Return the absolute path of the first existing candidate, or ""."""
        candidates = [
            r"C:\Lea\helpers\lea_uia.exe",
            os.path.join(os.path.dirname(__file__), "..", "..",
                         "agent_rust", "lea_uia", "target",
                         "x86_64-pc-windows-gnu", "release", "lea_uia.exe"),
            "./helpers/lea_uia.exe",
            "lea_uia.exe",
        ]
        for path in candidates:
            if os.path.isfile(path):
                return os.path.abspath(path)
        return ""

    def _check_available(self) -> bool:
        """Check that the helper can be used (Windows + existing binary).

        No health probe is run here; call `health()` for that.
        """
        if platform.system() != "Windows":
            logger.debug("UIAHelper: Linux/Mac — helper désactivé")
            return False
        if not self._helper_path:
            logger.debug("UIAHelper: lea_uia.exe introuvable")
            return False
        if not os.path.isfile(self._helper_path):
            logger.debug(f"UIAHelper: chemin invalide {self._helper_path}")
            return False
        return True

    @property
    def available(self) -> bool:
        """True when the helper binary can be used on this machine."""
        return self._available

    @property
    def helper_path(self) -> str:
        """Resolved path of the helper binary ("" when not found)."""
        return self._helper_path

    def _run(self, args: List[str]) -> Optional[Dict[str, Any]]:
        """Run lea_uia.exe with ``args`` and return its parsed JSON object.

        Returns None when the helper is unavailable, exits with a non-zero
        code, times out, prints nothing, or prints something that is not a
        JSON object. Never raises.
        """
        if not self._available:
            return None
        try:
            result = subprocess.run(
                [self._helper_path] + args,
                capture_output=True,
                text=True,
                timeout=self._timeout,
                encoding="utf-8",
                errors="replace",
                creationflags=_SUBPROCESS_CREATION_FLAGS,
            )
            if result.returncode != 0:
                logger.debug(
                    f"UIAHelper: exit code {result.returncode}, "
                    f"stderr: {result.stderr[:200]}"
                )
                return None
            output = result.stdout.strip()
            if not output:
                return None
            payload = json.loads(output)
            # Callers use .get(); anything but a JSON object is unusable.
            if not isinstance(payload, dict):
                logger.debug(
                    f"UIAHelper: réponse JSON inattendue ({type(payload).__name__})"
                )
                return None
            return payload
        except subprocess.TimeoutExpired:
            logger.debug(f"UIAHelper: timeout ({self._timeout}s) sur {args}")
            return None
        except json.JSONDecodeError as e:
            logger.debug(f"UIAHelper: JSON invalide — {e}")
            return None
        except Exception as e:
            # Best effort by design: a helper failure must not break the agent.
            logger.debug(f"UIAHelper: erreur {e}")
            return None

    @staticmethod
    def _parse_element(data: Optional[Dict[str, Any]]) -> Optional[UiaElement]:
        """Extract the element of a helper response, or None.

        None is returned when the response is missing, its status is not
        "ok", or its "element" entry is absent or not an object.
        """
        if not data or data.get("status") != "ok":
            return None
        elem_data = data.get("element")
        if not elem_data or not isinstance(elem_data, dict):
            return None
        return UiaElement.from_dict(elem_data)

    def health(self) -> bool:
        """Return True when the helper answers with status "ok"."""
        data = self._run(["health"])
        return data is not None and data.get("status") == "ok"

    def query_at(
        self,
        x: int,
        y: int,
        with_parents: bool = True,
    ) -> Optional[UiaElement]:
        """Return the UIA element at a screen position.

        Args:
            x, y: Absolute pixel coordinates.
            with_parents: Include the parent hierarchy.

        Returns:
            The UiaElement, or None (no element, or UIA unavailable).
        """
        args = ["query", "--x", str(x), "--y", str(y)]
        if not with_parents:
            args.append("--with-parents=false")
        return self._parse_element(self._run(args))

    def find_by_name(
        self,
        name: str,
        control_type: Optional[str] = None,
        automation_id: Optional[str] = None,
        window: Optional[str] = None,
        timeout_ms: int = 2000,
    ) -> Optional[UiaElement]:
        """Search an element by name, with optional filters.

        Args:
            name: Element name passed to the helper's ``--name`` option.
            control_type: Control type (Button, Edit, MenuItem...).
            automation_id: Automation ID.
            window: Restrict the search to one window.
            timeout_ms: Search timeout forwarded to the helper, in
                milliseconds. The subprocess itself is still bounded by
                the instance timeout.

        Returns:
            The UiaElement, or None when nothing usable came back.
        """
        args = ["find", "--name", name, "--timeout-ms", str(timeout_ms)]
        if control_type:
            args.extend(["--control-type", control_type])
        if automation_id:
            args.extend(["--automation-id", automation_id])
        if window:
            args.extend(["--window", window])
        return self._parse_element(self._run(args))

    def capture_focused(self, max_depth: int = 3) -> Optional[UiaElement]:
        """Capture the focused element and its context (``--max-depth``)."""
        return self._parse_element(self._run(["capture", "--max-depth", str(max_depth)]))
 # Instance globale partagée (singleton léger)
 _SHARED_HELPER: Optional[UIAHelper] = None
 def get_shared_helper() -> UIAHelper:
    """Return the module-wide UIAHelper, creating it on first use."""
    global _SHARED_HELPER
    if _SHARED_HELPER is not None:
        return _SHARED_HELPER
    _SHARED_HELPER = UIAHelper()
    return _SHARED_HELPER
--- a/agent_v0/run_agent_v1.py
+++ b/agent_v0/run_agent_v1.py
@@ -1,12 +1,97 @@
 # run_agent_v1.py
 import sys
 import os
 import atexit
 # Add this script's directory to sys.path so that module imports work
 current_dir = os.path.dirname(os.path.abspath(__file__))
 if current_dir not in sys.path:
    sys.path.append(current_dir)
 # ---------------------------------------------------------------
 # PID lock — prevents several instances from being started.
 # Even if Lea.bat is double-clicked or launched twice,
 # only one agent runs at a time (defense-in-depth).
 # ---------------------------------------------------------------
 LOCK_FILE = os.path.join(current_dir, "lea_agent.lock")
 def _pid_is_alive(pid: int) -> bool:
    """Return True when a process with this PID is currently running.

    Works on Windows and Unix. A process that exists but belongs to
    another user counts as alive. Non-positive and out-of-range PIDs are
    never alive (on Unix, ``os.kill(0, 0)`` would target the caller's
    whole process group rather than a single process).
    """
    if pid <= 0:
        return False

    if sys.platform == "win32":
        try:
            import ctypes
            from ctypes import wintypes

            PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
            ERROR_ACCESS_DENIED = 5
            STILL_ACTIVE = 259

            # Private WinDLL instance: the prototypes set below do not
            # affect other users of ctypes.windll.kernel32.
            kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)  # type: ignore[attr-defined]
            kernel32.OpenProcess.restype = wintypes.HANDLE
            kernel32.OpenProcess.argtypes = [wintypes.DWORD, wintypes.BOOL, wintypes.DWORD]
            kernel32.GetExitCodeProcess.restype = wintypes.BOOL
            kernel32.GetExitCodeProcess.argtypes = [
                wintypes.HANDLE, ctypes.POINTER(wintypes.DWORD),
            ]
            kernel32.CloseHandle.restype = wintypes.BOOL
            kernel32.CloseHandle.argtypes = [wintypes.HANDLE]

            handle = kernel32.OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, False, pid)
            if not handle:
                # Access denied means the process exists but is protected.
                return ctypes.get_last_error() == ERROR_ACCESS_DENIED
            try:
                # A handle can still be opened on a process that already
                # exited, so the exit code is what tells whether it runs.
                exit_code = wintypes.DWORD()
                if not kernel32.GetExitCodeProcess(handle, ctypes.byref(exit_code)):
                    return True  # cannot tell: keep the lock holder safe
                return exit_code.value == STILL_ACTIVE
            finally:
                kernel32.CloseHandle(handle)
        except Exception:
            # Fallback: tasklist
            try:
                import subprocess
                result = subprocess.run(
                    ["tasklist", "/FI", f"PID eq {pid}", "/NH"],
                    capture_output=True, text=True, timeout=5,
                )
                # Compare whole columns, not substrings of other numbers.
                return str(pid) in result.stdout.split()
            except Exception:
                return False

    # Unix/Linux — signal 0 performs the existence/permission check only.
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists; we are just not allowed to signal it.
        return True
    except (OSError, OverflowError):
        # OverflowError: the value does not fit in a pid (corrupt lock file).
        return False
    return True
 def _acquire_lock() -> bool:
    """Try to take the PID lock; return False if another instance is running.

    The lock file is (re)written with the current PID when it is absent,
    unreadable, or names this process or a dead one. A failure to write
    the file is ignored and the function still returns True.
    """
    own_pid = os.getpid()

    if os.path.isfile(LOCK_FILE):
        try:
            with open(LOCK_FILE, "r", encoding="utf-8") as handle:
                recorded_pid = int(handle.read().strip())
            # Is the process named in the lock file still alive?
            held_by_other = recorded_pid != own_pid and _pid_is_alive(recorded_pid)
        except (ValueError, OSError):
            # Corrupt or unreadable file: it gets overwritten below.
            held_by_other = False
        if held_by_other:
            return False  # Another instance is already running

    # Record our own PID.
    try:
        with open(LOCK_FILE, "w", encoding="utf-8") as handle:
            handle.write(str(own_pid))
    except OSError:
        pass  # Not blocking: carry on without a lock
    return True
 def _release_lock():
    """Delete the lock file at shutdown, but only if it holds our own PID."""
    try:
        if not os.path.isfile(LOCK_FILE):
            return
        with open(LOCK_FILE, "r", encoding="utf-8") as handle:
            owner_pid = int(handle.read().strip())
        # Leave a lock that belongs to another process untouched.
        if owner_pid != os.getpid():
            return
        os.remove(LOCK_FILE)
    except (ValueError, OSError):
        return
 # Check the lock BEFORE any heavy initialization
 if not _acquire_lock():
    # Another Léa instance is already running — exit silently
    sys.exit(0)
 atexit.register(_release_lock)
 # Charger config.txt et .env comme variables d'environnement
 # (équivalent du `set` dans Lea.bat, mais fonctionne aussi sans le .bat)
 for config_file in ("config.txt", ".env"):
@@ -32,7 +117,7 @@ logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
 )
-logging.info("=== Agent V1 démarrage — config chargée ===")
+logging.info("=== Agent V1 démarrage — config chargée (PID %d) ===", os.getpid())
 logging.info("RPA_SERVER_URL=%s", os.environ.get("RPA_SERVER_URL", "(non défini)"))
 logging.info("RPA_SERVER_HOST=%s", os.environ.get("RPA_SERVER_HOST", "(non défini)"))
 logging.info("RPA_API_TOKEN=%s", os.environ.get("RPA_API_TOKEN", "(non défini)")[:8] + "...")
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -488,6 +488,8 @@ class ReplayResultReport(BaseModel):
    # Champs enrichis pour target_not_found (pause supervisée)
    target_description: Optional[str] = None  # Description humaine de la cible
    target_spec: Optional[Dict[str, Any]] = None  # Spec complete de la cible
    # Correction humaine (mode apprentissage supervisé)
    correction: Optional[Dict[str, Any]] = None  # {x_pct, y_pct, uia_snapshot, crop_b64}
 class ErrorCallbackConfig(BaseModel):
@@ -1883,6 +1885,26 @@ async def start_raw_replay(request: RawReplayRequest):
    resolved_machine_id = target_machine_id or (session_obj.machine_id if session_obj else "default")
    with _replay_lock:
        # ── Nettoyage : annuler les replays bloqués pour cette machine ──
        # Un replay en paused_need_help bloque tous les suivants.
        # Quand on lance un nouveau replay, les anciens sont obsolètes.
        stale_ids = [
            rid for rid, state in _replay_states.items()
            if state.get("machine_id") == resolved_machine_id
            and state["status"] in ("paused_need_help", "running")
        ]
        for rid in stale_ids:
            old_state = _replay_states[rid]
            old_sid = old_state.get("session_id", "")
            old_state["status"] = "cancelled"
            # Vider la queue associée
            if old_sid in _replay_queues:
                _replay_queues.pop(old_sid, None)
            logger.info(
                f"Replay {rid} annulé (remplacé par {replay_id}) — "
                f"était {old_state.get('completed_actions', 0)}/{old_state.get('total_actions', 0)}"
            )
        _replay_queues[session_id] = list(actions)
        _replay_states[replay_id] = _create_replay_state(
            replay_id=replay_id,
@@ -3032,6 +3054,26 @@ async def report_action_result(report: ReplayResultReport):
    except Exception as e:
        logger.debug(f"Learning: échec enregistrement: {e}")
    # === Correction humaine (mode apprentissage supervisé) ===
    # L'humain a montré à Léa où cliquer. On stocke cette correction
    # dans target_memory pour que la prochaine fois, Léa sache toute seule.
    if report.correction and original_action:
        try:
            corr = report.correction
            target_spec = original_action.get("target_spec", {})
            logger.info(
                f"[APPRENTISSAGE] Correction humaine reçue : "
                f"({corr.get('x_pct', 0):.4f}, {corr.get('y_pct', 0):.4f}) "
                f"pour '{target_spec.get('by_text', '?')}'"
            )
            _replay_learner.record_human_correction(
                session_id=session_id,
                action=original_action,
                correction=corr,
            )
        except Exception as e:
            logger.warning(f"Learning: échec stockage correction humaine: {e}")
    # === Audit Trail : traçabilité complète pour conformité hospitalière ===
    try:
        _action = original_action or {"action_id": action_id, "type": "unknown"}
--- a/agent_v0/server_v1/replay_learner.py
+++ b/agent_v0/server_v1/replay_learner.py
@@ -175,6 +175,55 @@ class ReplayLearner:
        self.record(outcome)
    def record_human_correction(
        self,
        session_id: str,
        action: Dict[str, Any],
        correction: Dict[str, Any],
    ) -> None:
        """Record a human correction (supervised learning mode).

        The operator showed where to click. The event is always appended to
        the learning log through ``self.record``; the corrected position is
        additionally stored in the target memory store, but only when the
        correction carries usable coordinates.

        Args:
            session_id: Session the corrected action belongs to.
            action: The original replay action; its ``target_spec`` supplies
                ``by_text`` and ``window_title``.
            correction: Correction payload; ``x_pct`` and ``y_pct`` are read.

        Failures while writing to the target memory store are logged, not
        raised.
        """
        # `or {}` also covers an explicit None value, which the default
        # argument of dict.get does not.
        target_spec = action.get("target_spec") or {}
        by_text = target_spec.get("by_text", "")
        window_title = target_spec.get("window_title", "")

        # Append to the learning JSONL.
        outcome = ActionOutcome(
            session_id=session_id,
            action_id=action.get("action_id", ""),
            action_type="click",
            target_description=by_text,
            window_title=window_title,
            resolution_method="human_supervised",
            resolution_score=1.0,  # Maximum confidence — a human showed it
            success=True,
        )
        self.record(outcome)

        # Never persist a made-up position: a missing or non-numeric
        # coordinate used to be stored as (0.0, 0.0) with score 1.0.
        try:
            x_pct = float(correction["x_pct"])
            y_pct = float(correction["y_pct"])
        except (KeyError, TypeError, ValueError):
            logger.warning(
                "[APPRENTISSAGE] Correction sans coordonnées valides pour '%s' — "
                "non stockée dans target_memory",
                by_text,
            )
            return

        # Store in target_memory.db for future lookups.
        try:
            from .replay_memory import get_target_memory_store
            store = get_target_memory_store()
            if store:
                store.record_success(
                    screen_signature="human_correction",
                    target_spec=target_spec,
                    resolved_position={"x_pct": x_pct, "y_pct": y_pct},
                    method="human_supervised",
                    score=1.0,
                )
                logger.info(
                    f"[APPRENTISSAGE] Correction stockée dans target_memory : "
                    f"'{by_text}' → ({x_pct:.4f}, {y_pct:.4f})"
                )
        except Exception as e:
            logger.warning(f"Learning: échec stockage target_memory: {e}")
    def query_similar(
        self,
        target_description: str = "",
--- a/deploy/lea_package/Lea.bat
+++ b/deploy/lea_package/Lea.bat
@@ -51,10 +51,14 @@ echo  Pour arreter Lea : clic droit sur l'icone ^> "Quitter Lea"
 echo  Vous pouvez fermer cette fenetre.
 echo.
-.venv\Scripts\pythonw.exe run_agent_v1.py
+start "" /b .venv\Scripts\pythonw.exe run_agent_v1.py
 :: Attendre 3s puis verifier que Lea tourne
 timeout /t 3 >nul
 tasklist /FI "IMAGENAME eq pythonw.exe" /NH 2>nul | findstr /I "pythonw" >nul
 if errorlevel 1 (
    echo.
-    echo  Lea a rencontre un probleme au demarrage.
+    echo  Lea n'a pas demarre correctement.
    echo  Tentative avec affichage des erreurs...
    echo.
    .venv\Scripts\python.exe run_agent_v1.py
--- a/tools/session_cleaner.py
+++ b/tools/session_cleaner.py
@@ -875,17 +875,174 @@ def _find_session_dir(machine_id: str, session_id: str) -> Optional[Path]:
    return None
 def _load_crop_as_base64(session_dir: Path, screenshot_id: str) -> str:
    """Return the click crop of a screenshot as a base64 ASCII string.

    The file read is ``<session_dir>/shots/<screenshot_id>_crop.png``. Per the
    recording notes, this crop (80x80 around the click) is the anchor image
    used for template matching.

    Returns:
        The base64 text, or "" when ``screenshot_id`` is empty, the file does
        not exist, or reading/encoding fails.
    """
    encoded = ""
    if screenshot_id:
        crop_file = session_dir / "shots" / f"{screenshot_id}_crop.png"
        if crop_file.is_file():
            try:
                import base64
                encoded = base64.b64encode(crop_file.read_bytes()).decode("ascii")
            except Exception:
                encoded = ""
    return encoded
 def _build_vlm_description(
    uia_snapshot: Dict[str, Any], window_info: Dict[str, Any],
 ) -> str:
    """Build a natural-language description of the target for the VLM.

    The sentence is assembled from the UIA control type, the UIA name and the
    window title, in that order; any empty piece is left out. A window title
    equal to "unknown_window" is treated as absent.

    Args:
        uia_snapshot: UIA data of the clicked element (``name``,
            ``control_type``). ``None`` is treated as empty.
        window_info: Window data (``title``). ``None`` is treated as empty.

    Returns:
        The description, or "" when no piece is available.
    """
    # Guard both inputs the same way: the original only tolerated a missing
    # window_info and raised AttributeError on a None uia_snapshot.
    snapshot = uia_snapshot or {}
    window = window_info or {}

    name = snapshot.get("name", "")
    control_type = snapshot.get("control_type", "")
    window_title = window.get("title", "")

    parts = []
    if control_type:
        parts.append(f"le {control_type}")
    if name:
        parts.append(f"'{name}'")
    if window_title and window_title != "unknown_window":
        parts.append(f"dans la fenetre '{window_title}'")
    # Joining an empty list already yields "".
    return " ".join(parts)
 def _build_full_target_spec(
    event: Dict[str, Any], session_dir: Path,
 ) -> Dict[str, Any]:
    """Build the target_spec that drives the visual resolution cascade.

    Uses what was captured with the event:
    - ``uia_snapshot`` (name / automation_id) enables the "uia" step,
    - the screenshot crop, when it can be loaded, enables "template",
    - "server" then "vlm_local" are always appended as fallbacks.

    Args:
        event: Raw recorded event (``uia_snapshot``, ``window``,
            ``screenshot_id`` are read).
        session_dir: Session directory holding the screenshot crops.

    Returns:
        A dict with ``resolve_order`` and ``window_title``, plus
        ``uia_target``, ``anchor_image_base64``, ``by_text`` and
        ``vlm_description`` when the corresponding data is available.
    """
    snapshot = event.get("uia_snapshot", {}) or {}
    window = event.get("window", {}) or {}

    name = snapshot.get("name", "")
    control_type = snapshot.get("control_type", "")
    automation_id = snapshot.get("automation_id", "")
    parent_path = snapshot.get("parent_path", [])
    window_title = window.get("title", "")

    # UIA is usable as soon as the element has a name or an automation id.
    has_uia = bool(name or automation_id)
    # Template matching needs the crop recorded for this screenshot.
    anchor_b64 = _load_crop_as_base64(session_dir, event.get("screenshot_id", ""))

    # Cascade order: fast local methods first, server and local VLM last.
    candidates = (
        ("uia", has_uia),
        ("template", bool(anchor_b64)),
        ("server", True),
        ("vlm_local", True),
    )
    resolve_order = [method for method, available in candidates if available]

    # NOTE(review): this guard looks unreachable — "server" and "vlm_local"
    # are always present — so callers never receive {} from here. Confirm
    # whether a no-data event should really still go through the cascade.
    if not resolve_order:
        return {}

    target_spec: Dict[str, Any] = {
        "resolve_order": resolve_order,
        "window_title": window_title,
    }

    uia_target = None
    if has_uia:
        uia_target = {
            "name": name,
            "control_type": control_type,
            "automation_id": automation_id,
            "parent_path": parent_path,
        }
    vlm_desc = _build_vlm_description(snapshot, window)

    # Optional entries, added in a fixed order and only when non-empty.
    optional_entries = (
        ("uia_target", uia_target),
        ("anchor_image_base64", anchor_b64),
        ("by_text", name),
        ("vlm_description", vlm_desc),
    )
    for key, value in optional_entries:
        if value:
            target_spec[key] = value

    return target_spec
 def _build_desktop_cleanup_actions(screen_w: int, screen_h: int) -> List[Dict[str, Any]]:
    """Build the desktop clean-up actions that run BEFORE the replay.

    Emits a left click two pixels in from the bottom-right corner of the
    screen, followed by a one-second wait. Per the original notes, on
    Windows 11 that corner is the 'Show desktop' strip of the taskbar, so the
    click minimises every window without injecting a keyboard shortcut.

    Args:
        screen_w: Recorded screen width in pixels.
        screen_h: Recorded screen height in pixels.

    Returns:
        The two setup actions (both flagged with ``_setup_action``), or an
        empty list when the screen size is unknown (a dimension <= 0): the
        corner cannot be located and dividing by it would raise
        ZeroDivisionError.
    """
    if screen_w <= 0 or screen_h <= 0:
        return []

    # Second-to-last pixel on each axis, expressed as a fraction of the screen.
    x_pct = round((screen_w - 2) / screen_w, 6)
    y_pct = round((screen_h - 2) / screen_h, 6)

    return [
        {
            "action_id": f"act_setup_desktop_{uuid.uuid4().hex[:6]}",
            "type": "click",
            "x_pct": x_pct,
            "y_pct": y_pct,
            "button": "left",
            "visual_mode": False,  # fixed position, no grounding needed
            "wait_before": 0.3,
            "_setup_action": True,  # marker to tell it apart from real clicks
        },
        {
            "action_id": f"act_setup_wait_{uuid.uuid4().hex[:6]}",
            "type": "wait",
            "duration_ms": 1000,
            "wait_before": 0,
            "_setup_action": True,
        },
    ]
 def _simple_build_replay(events: List[Dict[str, Any]], session_dir: Path) -> List[Dict[str, Any]]:
-    """Construire un replay simplifie sans dependre de stream_processor.
+    """Construire un replay visuel depuis les evenements bruts.
-    Convertit les evenements bruts en actions normalisees simples :
+    Chaque clic est enrichi avec un target_spec complet qui alimente
-    - mouse_click -> action click (coordonnees en pixels)
+    la cascade de resolution du GroundingEngine :
-    - text_input / type -> action type
+      UIA local (10ms) → template matching (100ms) → serveur docTR/VLM (2-5s)
    - key_combo / key_press -> action key_combo
-    C'est un fallback pour quand build_replay_from_raw_events n'est pas disponible.
+    Les coordonnees x_pct/y_pct sont incluses comme hint de derniere chance.
-    Les coordonnees ne sont PAS converties en pourcentages (le serveur les accepte
+    Lea ne clique pas en aveugle — elle VOIT l'ecran et CHERCHE l'element.
-    aussi en pixels).
+
    Le replay commence par un nettoyage du bureau (clic 'Afficher le bureau')
    pour partir d'un etat propre — exactement comme un humain.
    """
    actions: List[Dict[str, Any]] = []
    click_count = 0
@@ -900,6 +1057,9 @@ def _simple_build_replay(events: List[Dict[str, Any]], session_dir: Path) -> Lis
            screen_w, screen_h = int(res[0]), int(res[1])
            break
    # ── Étape 0 : nettoyer le bureau ──
    actions.extend(_build_desktop_cleanup_actions(screen_w, screen_h))
    for ev in events:
        inner = ev.get("event", {})
        etype = inner.get("type", "")
@@ -913,15 +1073,35 @@ def _simple_build_replay(events: List[Dict[str, Any]], session_dir: Path) -> Lis
            pos = inner.get("pos", [0, 0])
            click_count += 1
            x_pct = round(pos[0] / screen_w, 6) if screen_w else 0.0
            y_pct = round(pos[1] / screen_h, 6) if screen_h else 0.0
            action = {
                "action_id": action_id,
                "type": "click",
-                "x_pct": round(pos[0] / screen_w, 6) if screen_w else 0.0,
+                "x_pct": x_pct,
-                "y_pct": round(pos[1] / screen_h, 6) if screen_h else 0.0,
+                "y_pct": y_pct,
                "button": inner.get("button", "left"),
                "visual_mode": False,  # pas d'enrichissement → coords brutes
                "wait_before": 0.5,
            }
            # Enrichir avec la cascade visuelle complete
            target_spec = _build_full_target_spec(inner, session_dir)
            if target_spec:
                action["visual_mode"] = True
                action["target_spec"] = target_spec
                uia_name = inner.get("uia_snapshot", {}).get("name", "?")
                methods = target_spec.get("resolve_order", [])
                logger.info(
                    "Action %s enrichie [%s] : '%s' (%s)",
                    action_id, "+".join(methods), uia_name,
                    inner.get("uia_snapshot", {}).get("control_type", "?"),
                )
            else:
                # Pas de donnee visuelle du tout → coords brutes en dernier recours
                action["visual_mode"] = False
                logger.warning("Action %s : aucune donnee visuelle, coords brutes", action_id)
            actions.append(action)
        elif etype in ("text_input", "type"):