feat: premier replay E2E + mode apprentissage supervisé
Premier replay fonctionnel de bout en bout (Bloc-notes, Chrome). Corrections critiques : - Fix double-lancement agent (Lea.bat start /b + verrou PID) - Sérialisation replay (threading.Lock dans poll_and_execute) - Garde UIA bbox >50% écran (rejet conteneurs "Bureau") - Filtre fenêtres bruit système (systray overflow) - Auto-nettoyage replays bloqués (paused_need_help) Cascade visuelle complète dans session_cleaner : - UIA local (10ms) → template matching (100ms) → serveur docTR/VLM - Nettoyage bureau pré-replay (clic "Afficher le bureau") - Crops 80x80 + vlm_description pour chaque clic Grounding contraint à la fenêtre active : - Capture croppée à la fenêtre au lieu de l'écran entier - Conversion coordonnées fenêtre → écran - Élimine les faux positifs taskbar/systray Mode apprentissage supervisé (SUPERVISE → capture humaine) : - Léa passe en mode capture quand elle est perdue - Capture mini-workflow humain (clics + frappes + combos) - Fin par Ctrl+Shift+L ou timeout inactivité 10s - Correction stockée dans target_memory.db via serveur Deploy Windows complet (grounding.py, policy.py, uia_helper.py). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -17,6 +17,7 @@ import base64
|
|||||||
import hashlib
|
import hashlib
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
|
import threading
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
@@ -72,6 +73,12 @@ class ActionExecutorV1:
|
|||||||
# different de celui qui utilise l'instance).
|
# different de celui qui utilise l'instance).
|
||||||
self._sct = None
|
self._sct = None
|
||||||
self.running = True
|
self.running = True
|
||||||
|
# ── Verrou de sérialisation replay ──
|
||||||
|
# Garantit qu'UNE SEULE action de replay s'exécute à la fois.
|
||||||
|
# Sans ce lock, deux threads (polling main.py + lea_ui) peuvent
|
||||||
|
# consommer deux actions simultanément → race condition + mss
|
||||||
|
# thread-unsafe retourne des résolutions fantômes (1024x768).
|
||||||
|
self._replay_lock = threading.Lock()
|
||||||
# Backoff exponentiel pour le polling replay (evite de marteler le serveur)
|
# Backoff exponentiel pour le polling replay (evite de marteler le serveur)
|
||||||
self._poll_backoff = 1.0 # Delai actuel (secondes)
|
self._poll_backoff = 1.0 # Delai actuel (secondes)
|
||||||
self._poll_backoff_min = 1.0 # Delai minimal (reset apres succes)
|
self._poll_backoff_min = 1.0 # Delai minimal (reset apres succes)
|
||||||
@@ -340,6 +347,25 @@ class ActionExecutorV1:
|
|||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# ── GARDE : rejeter les éléments géants (conteneurs) ──
|
||||||
|
# Un élément qui couvre >50% de l'écran est un conteneur
|
||||||
|
# (Bureau, Rechercher, liste), pas un bouton cliquable.
|
||||||
|
# Cliquer au centre d'un conteneur = clic aveugle.
|
||||||
|
elem_w = element.width()
|
||||||
|
elem_h = element.height()
|
||||||
|
screen_area = screen_width * screen_height
|
||||||
|
elem_area = elem_w * elem_h
|
||||||
|
if screen_area > 0 and elem_area / screen_area > 0.5:
|
||||||
|
logger.warning(
|
||||||
|
f"UIA REJET : '{name}' couvre {elem_area / screen_area * 100:.0f}% "
|
||||||
|
f"de l'écran ({elem_w}x{elem_h}) — conteneur, pas un élément cliquable"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" [UIA] REJET — '{name}' trop grand "
|
||||||
|
f"({elem_w}x{elem_h}, {elem_area / screen_area * 100:.0f}% écran)"
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
cx, cy = element.center()
|
cx, cy = element.center()
|
||||||
if screen_width <= 0 or screen_height <= 0:
|
if screen_width <= 0 or screen_height <= 0:
|
||||||
return None
|
return None
|
||||||
@@ -499,10 +525,25 @@ class ActionExecutorV1:
|
|||||||
"visual_resolved": False,
|
"visual_resolved": False,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ── Délai inter-actions (anti race condition mss) ──
|
||||||
|
wait_before = action.get("wait_before", 0.5)
|
||||||
|
if wait_before > 0:
|
||||||
|
time.sleep(wait_before)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
monitor = self.sct.monitors[1]
|
monitor = self.sct.monitors[1]
|
||||||
width, height = monitor["width"], monitor["height"]
|
width, height = monitor["width"], monitor["height"]
|
||||||
|
|
||||||
|
# ── Diagnostic résolution ──
|
||||||
|
logger.info(
|
||||||
|
f"[REPLAY] Action {action_id} ({action_type}) — "
|
||||||
|
f"écran replay: {width}x{height}, "
|
||||||
|
f"x_pct={action.get('x_pct', 0):.4f}, "
|
||||||
|
f"y_pct={action.get('y_pct', 0):.4f} "
|
||||||
|
f"→ pixel ({int(action.get('x_pct', 0) * width)}, "
|
||||||
|
f"{int(action.get('y_pct', 0) * height)})"
|
||||||
|
)
|
||||||
|
|
||||||
# Resolution visuelle des coordonnees si demande
|
# Resolution visuelle des coordonnees si demande
|
||||||
x_pct = action.get("x_pct", 0.0)
|
x_pct = action.get("x_pct", 0.0)
|
||||||
y_pct = action.get("y_pct", 0.0)
|
y_pct = action.get("y_pct", 0.0)
|
||||||
@@ -526,7 +567,7 @@ class ActionExecutorV1:
|
|||||||
)
|
)
|
||||||
if expected_title and expected_title != "unknown_window":
|
if expected_title and expected_title != "unknown_window":
|
||||||
from ..window_info_crossplatform import get_active_window_info
|
from ..window_info_crossplatform import get_active_window_info
|
||||||
from ..ui.messages import est_fenetre_lea
|
from ..ui.messages import est_fenetre_lea, est_fenetre_bruit
|
||||||
|
|
||||||
# Polling court pour laisser le temps à la fenêtre de
|
# Polling court pour laisser le temps à la fenêtre de
|
||||||
# se stabiliser (évite les faux négatifs sur transitions
|
# se stabiliser (évite les faux négatifs sur transitions
|
||||||
@@ -544,8 +585,9 @@ class ActionExecutorV1:
|
|||||||
time.sleep(0.3)
|
time.sleep(0.3)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Si on tombe sur unknown_window → on attend aussi
|
# Bruit système (systray overflow, taskbar, etc.)
|
||||||
if not current_title or current_title == "unknown_window":
|
# → on attend que la vraie fenêtre reprenne le focus
|
||||||
|
if est_fenetre_bruit(current_title):
|
||||||
time.sleep(0.3)
|
time.sleep(0.3)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -686,8 +728,8 @@ class ActionExecutorV1:
|
|||||||
|
|
||||||
if action_type == "click":
|
if action_type == "click":
|
||||||
# Si visual_mode est activé, le resolve DOIT réussir.
|
# Si visual_mode est activé, le resolve DOIT réussir.
|
||||||
# Pas de fallback blind — on arrête le replay si la cible
|
# Pas de fallback blind — Léa VOIT l'écran et CHERCHE
|
||||||
# n'est pas trouvée visuellement. C'est un RPA VISUEL.
|
# l'élément. Si toute la cascade échoue → pause supervisée.
|
||||||
if visual_mode and not result.get("visual_resolved"):
|
if visual_mode and not result.get("visual_resolved"):
|
||||||
# ── Policy : décider quoi faire quand grounding échoue ──
|
# ── Policy : décider quoi faire quand grounding échoue ──
|
||||||
from .policy import PolicyEngine, Decision
|
from .policy import PolicyEngine, Decision
|
||||||
@@ -709,7 +751,6 @@ class ActionExecutorV1:
|
|||||||
)
|
)
|
||||||
|
|
||||||
if policy_decision.decision == Decision.RETRY:
|
if policy_decision.decision == Decision.RETRY:
|
||||||
# Re-tenter le grounding après correction (popup fermée, etc.)
|
|
||||||
resolved2 = self._resolve_target_visual(
|
resolved2 = self._resolve_target_visual(
|
||||||
server_url, target_spec, x_pct, y_pct, width, height
|
server_url, target_spec, x_pct, y_pct, width, height
|
||||||
)
|
)
|
||||||
@@ -719,7 +760,6 @@ class ActionExecutorV1:
|
|||||||
result["visual_resolved"] = True
|
result["visual_resolved"] = True
|
||||||
print(f" [POLICY] Re-resolve OK après {policy_decision.action_taken}")
|
print(f" [POLICY] Re-resolve OK après {policy_decision.action_taken}")
|
||||||
else:
|
else:
|
||||||
# Re-resolve échoué — SUPERVISE (rendre la main)
|
|
||||||
result["success"] = False
|
result["success"] = False
|
||||||
result["error"] = "target_not_found"
|
result["error"] = "target_not_found"
|
||||||
result["target_description"] = target_desc
|
result["target_description"] = target_desc
|
||||||
@@ -746,18 +786,55 @@ class ActionExecutorV1:
|
|||||||
)
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
else: # SUPERVISE ou CONTINUE
|
else: # SUPERVISE → mode apprentissage
|
||||||
result["success"] = False
|
# Léa est perdue. Au lieu de s'arrêter, elle
|
||||||
result["error"] = "target_not_found"
|
# passe en mode capture et enregistre ce que
|
||||||
result["target_description"] = target_desc
|
# l'humain fait (mini-workflow de correction).
|
||||||
result["target_spec"] = target_spec
|
try:
|
||||||
result["screenshot"] = self._capture_screenshot_b64()
|
self.notifier.replay_target_not_found(
|
||||||
result["warning"] = "visual_resolve_failed"
|
target_desc,
|
||||||
self.notifier.replay_target_not_found(
|
target_spec.get("window_title", ""),
|
||||||
target_desc,
|
)
|
||||||
target_spec.get("window_title", ""),
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
human_actions = self._capture_human_correction(
|
||||||
|
timeout_s=120,
|
||||||
)
|
)
|
||||||
return result
|
if human_actions:
|
||||||
|
# L'humain a montré un mini-workflow
|
||||||
|
result["success"] = True
|
||||||
|
result["resolution_method"] = "human_supervised"
|
||||||
|
result["warning"] = "human_supervised"
|
||||||
|
# Stocker le dernier clic comme position résolue
|
||||||
|
last_click = None
|
||||||
|
for ha in reversed(human_actions):
|
||||||
|
if ha.get("type") == "click":
|
||||||
|
last_click = ha
|
||||||
|
break
|
||||||
|
if last_click:
|
||||||
|
result["actual_position"] = {
|
||||||
|
"x_pct": last_click["x_pct"],
|
||||||
|
"y_pct": last_click["y_pct"],
|
||||||
|
}
|
||||||
|
# Envoyer toute la correction au serveur
|
||||||
|
result["correction"] = {
|
||||||
|
"actions": human_actions,
|
||||||
|
"action_count": len(human_actions),
|
||||||
|
"last_click": last_click,
|
||||||
|
}
|
||||||
|
logger.info(
|
||||||
|
f"[APPRENTISSAGE] Correction reçue : "
|
||||||
|
f"{len(human_actions)} actions — je m'en souviendrai."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Timeout — l'humain n'a pas répondu
|
||||||
|
result["success"] = False
|
||||||
|
result["error"] = "target_not_found"
|
||||||
|
result["target_description"] = target_desc
|
||||||
|
result["target_spec"] = target_spec
|
||||||
|
result["screenshot"] = self._capture_screenshot_b64()
|
||||||
|
result["warning"] = "visual_resolve_failed"
|
||||||
|
|
||||||
real_x = int(x_pct * width)
|
real_x = int(x_pct * width)
|
||||||
real_y = int(y_pct * height)
|
real_y = int(y_pct * height)
|
||||||
@@ -1417,15 +1494,24 @@ Example: x_pct=0.50, y_pct=0.30"""
|
|||||||
2. Execute l'action (clic, texte, etc.)
|
2. Execute l'action (clic, texte, etc.)
|
||||||
3. POST /replay/result avec le resultat + screenshot
|
3. POST /replay/result avec le resultat + screenshot
|
||||||
|
|
||||||
Args:
|
Sérialisé par _replay_lock — une seule action à la fois.
|
||||||
session_id: Identifiant de la session courante
|
Sans ce lock, deux threads concurrents consomment deux actions
|
||||||
server_url: URL de base du serveur streaming
|
et mss retourne des résolutions fantômes (thread-unsafe).
|
||||||
machine_id: Identifiant de la machine (pour le replay multi-machine)
|
|
||||||
|
|
||||||
Retourne True si une action a ete executee, False sinon.
|
Retourne True si une action a ete executee, False sinon.
|
||||||
IMPORTANT: Si une action est recue, le resultat est TOUJOURS rapporte
|
|
||||||
au serveur (meme en cas d'erreur d'execution).
|
|
||||||
"""
|
"""
|
||||||
|
# Sérialisation stricte : si un autre thread exécute déjà une
|
||||||
|
# action, on abandonne ce poll immédiatement (pas de file d'attente).
|
||||||
|
if not self._replay_lock.acquire(blocking=False):
|
||||||
|
return False
|
||||||
|
|
||||||
|
try:
|
||||||
|
return self._poll_and_execute_inner(session_id, server_url, machine_id)
|
||||||
|
finally:
|
||||||
|
self._replay_lock.release()
|
||||||
|
|
||||||
|
def _poll_and_execute_inner(self, session_id: str, server_url: str, machine_id: str) -> bool:
|
||||||
|
"""Implémentation interne de poll_and_execute (protégée par _replay_lock)."""
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
replay_next_url = f"{server_url}/traces/stream/replay/next"
|
replay_next_url = f"{server_url}/traces/stream/replay/next"
|
||||||
@@ -1499,11 +1585,14 @@ Example: x_pct=0.50, y_pct=0.30"""
|
|||||||
print(f">>> ERREUR EXECUTION : {e}")
|
print(f">>> ERREUR EXECUTION : {e}")
|
||||||
logger.error(f"Erreur execute_replay_action: {e}")
|
logger.error(f"Erreur execute_replay_action: {e}")
|
||||||
import traceback
|
import traceback
|
||||||
|
tb_str = traceback.format_exc()
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
result = {
|
result = {
|
||||||
"action_id": action_id,
|
"action_id": action_id,
|
||||||
"success": False,
|
"success": False,
|
||||||
"error": f"Exception executor: {e}",
|
# Inclure le traceback complet pour diagnostiquer
|
||||||
|
# les crashes côté agent depuis les logs serveur
|
||||||
|
"error": f"{e}\n---TRACEBACK---\n{tb_str[-500:]}",
|
||||||
"screenshot": None,
|
"screenshot": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1525,6 +1614,8 @@ Example: x_pct=0.50, y_pct=0.30"""
|
|||||||
# Champs enrichis pour target_not_found (pause supervisée)
|
# Champs enrichis pour target_not_found (pause supervisée)
|
||||||
"target_description": result.get("target_description"),
|
"target_description": result.get("target_description"),
|
||||||
"target_spec": result.get("target_spec"),
|
"target_spec": result.get("target_spec"),
|
||||||
|
# Correction humaine (mode apprentissage supervisé)
|
||||||
|
"correction": result.get("correction"),
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
resp2 = requests.post(
|
resp2 = requests.post(
|
||||||
@@ -2007,6 +2098,159 @@ Example: x_pct=0.50, y_pct=0.30"""
|
|||||||
|
|
||||||
logger.debug(f"Texte saisi char-by-char ({len(text)} chars)")
|
logger.debug(f"Texte saisi char-by-char ({len(text)} chars)")
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Mode apprentissage — l'humain montre, Léa apprend
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
# Hotkey pour signaler la fin de la correction humaine
|
||||||
|
_LEARNING_DONE_HOTKEY = {Key.ctrl_l, Key.shift, KeyCode.from_char("l")}
|
||||||
|
|
||||||
|
def _capture_human_correction(self, timeout_s: float = 120.0) -> list[dict]:
|
||||||
|
"""Capturer un mini-workflow de correction humaine.
|
||||||
|
|
||||||
|
Léa est perdue — elle passe en mode capture et enregistre
|
||||||
|
TOUTES les actions de l'humain (clics, frappes, combos)
|
||||||
|
jusqu'à ce que l'humain signale qu'il a fini :
|
||||||
|
- Ctrl+Shift+L (hotkey)
|
||||||
|
- Ou timeout d'inactivité (10s sans action)
|
||||||
|
- Ou timeout global (120s)
|
||||||
|
|
||||||
|
Retourne la liste des actions capturées (peut être vide si timeout).
|
||||||
|
C'est un mini-workflow, pas juste un clic.
|
||||||
|
"""
|
||||||
|
done_event = threading.Event()
|
||||||
|
actions: list[dict] = []
|
||||||
|
last_action_time = [time.time()]
|
||||||
|
keys_pressed: set = set()
|
||||||
|
INACTIVITY_TIMEOUT = 10.0 # secondes
|
||||||
|
|
||||||
|
monitor = self.sct.monitors[1]
|
||||||
|
screen_w, screen_h = monitor["width"], monitor["height"]
|
||||||
|
|
||||||
|
def _on_click(x, y, button, pressed):
|
||||||
|
if done_event.is_set():
|
||||||
|
return False
|
||||||
|
if pressed and button.name in ("left", "right"):
|
||||||
|
action = {
|
||||||
|
"type": "click",
|
||||||
|
"x_pct": round(x / screen_w, 6),
|
||||||
|
"y_pct": round(y / screen_h, 6),
|
||||||
|
"button": button.name,
|
||||||
|
"timestamp": time.time(),
|
||||||
|
}
|
||||||
|
# UIA snapshot
|
||||||
|
try:
|
||||||
|
from .uia_helper import get_shared_helper
|
||||||
|
helper = get_shared_helper()
|
||||||
|
if helper.available:
|
||||||
|
elem = helper.query_at(int(x), int(y), with_parents=True)
|
||||||
|
if elem:
|
||||||
|
action["uia_snapshot"] = elem.to_dict()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
actions.append(action)
|
||||||
|
last_action_time[0] = time.time()
|
||||||
|
logger.info(f"[APPRENTISSAGE] Clic ({x}, {y}) bouton={button.name}")
|
||||||
|
|
||||||
|
def _on_key_press(key):
|
||||||
|
if done_event.is_set():
|
||||||
|
return False
|
||||||
|
keys_pressed.add(key)
|
||||||
|
# Vérifier hotkey Ctrl+Shift+L
|
||||||
|
if self._LEARNING_DONE_HOTKEY.issubset(keys_pressed):
|
||||||
|
logger.info("[APPRENTISSAGE] Hotkey Ctrl+Shift+L — fin de correction")
|
||||||
|
print(" [APPRENTISSAGE] Ctrl+Shift+L reçu — merci !")
|
||||||
|
done_event.set()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _on_key_release(key):
|
||||||
|
keys_pressed.discard(key)
|
||||||
|
if done_event.is_set():
|
||||||
|
return False
|
||||||
|
# Capturer les frappes texte (pas les modifiers seuls)
|
||||||
|
if hasattr(key, "char") and key.char:
|
||||||
|
actions.append({
|
||||||
|
"type": "type",
|
||||||
|
"text": key.char,
|
||||||
|
"timestamp": time.time(),
|
||||||
|
})
|
||||||
|
last_action_time[0] = time.time()
|
||||||
|
elif key == Key.enter:
|
||||||
|
actions.append({
|
||||||
|
"type": "key_combo",
|
||||||
|
"keys": ["enter"],
|
||||||
|
"timestamp": time.time(),
|
||||||
|
})
|
||||||
|
last_action_time[0] = time.time()
|
||||||
|
|
||||||
|
from pynput.mouse import Listener as MouseListener
|
||||||
|
from pynput.keyboard import Listener as KeyboardListener
|
||||||
|
|
||||||
|
mouse_listener = MouseListener(on_click=_on_click)
|
||||||
|
kbd_listener = KeyboardListener(
|
||||||
|
on_press=_on_key_press, on_release=_on_key_release,
|
||||||
|
)
|
||||||
|
mouse_listener.start()
|
||||||
|
kbd_listener.start()
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"[APPRENTISSAGE] Mode capture activé (timeout={timeout_s}s, "
|
||||||
|
f"inactivité={INACTIVITY_TIMEOUT}s, hotkey=Ctrl+Shift+L)"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" [APPRENTISSAGE] Montre-moi comment faire.\n"
|
||||||
|
f" Quand tu as fini → Ctrl+Shift+L\n"
|
||||||
|
f" (ou j'attends {INACTIVITY_TIMEOUT}s sans action)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Attendre : hotkey OU inactivité OU timeout global
|
||||||
|
start = time.time()
|
||||||
|
while not done_event.is_set():
|
||||||
|
elapsed = time.time() - start
|
||||||
|
if elapsed > timeout_s:
|
||||||
|
logger.info("[APPRENTISSAGE] Timeout global")
|
||||||
|
break
|
||||||
|
# Timeout inactivité : si l'humain a fait au moins 1 action
|
||||||
|
# et n'a rien fait depuis INACTIVITY_TIMEOUT secondes
|
||||||
|
if actions and (time.time() - last_action_time[0]) > INACTIVITY_TIMEOUT:
|
||||||
|
logger.info(
|
||||||
|
f"[APPRENTISSAGE] Inactivité {INACTIVITY_TIMEOUT}s — "
|
||||||
|
f"fin automatique ({len(actions)} actions)"
|
||||||
|
)
|
||||||
|
print(f" [APPRENTISSAGE] Pas d'action depuis {INACTIVITY_TIMEOUT}s — je reprends.")
|
||||||
|
break
|
||||||
|
time.sleep(0.2)
|
||||||
|
|
||||||
|
mouse_listener.stop()
|
||||||
|
kbd_listener.stop()
|
||||||
|
|
||||||
|
logger.info(f"[APPRENTISSAGE] {len(actions)} actions capturées")
|
||||||
|
print(f" [APPRENTISSAGE] {len(actions)} actions capturées — merci !")
|
||||||
|
return actions
|
||||||
|
|
||||||
|
def _capture_crop_at(self, x: int, y: int, size: int = 80) -> str:
|
||||||
|
"""Capturer un crop carré autour d'une position."""
|
||||||
|
try:
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
with mss.mss() as local_sct:
|
||||||
|
monitor = local_sct.monitors[1]
|
||||||
|
raw = local_sct.grab(monitor)
|
||||||
|
img = Image.frombytes("RGB", raw.size, raw.bgra, "raw", "BGRX")
|
||||||
|
|
||||||
|
half = size // 2
|
||||||
|
left = max(0, x - half)
|
||||||
|
top = max(0, y - half)
|
||||||
|
right = min(img.width, x + half)
|
||||||
|
bottom = min(img.height, y + half)
|
||||||
|
crop = img.crop((left, top, right, bottom))
|
||||||
|
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
crop.save(buffer, format="JPEG", quality=85)
|
||||||
|
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
def _click(self, pos, button_name):
|
def _click(self, pos, button_name):
|
||||||
"""Deplacer la souris via courbe de Bézier puis cliquer.
|
"""Deplacer la souris via courbe de Bézier puis cliquer.
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ Ref: docs/PLAN_ACTEUR_V1.md — Architecture MICRO (grounding + exécution)
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
import io
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
@@ -126,19 +127,62 @@ class GroundingEngine:
|
|||||||
)
|
)
|
||||||
|
|
||||||
t_start = time.time()
|
t_start = time.time()
|
||||||
screenshot_b64 = self._executor._capture_screenshot_b64(max_width=0, quality=75)
|
|
||||||
|
# ── Capture contrainte à la fenêtre active ──
|
||||||
|
# Le grounding ne voit QUE la fenêtre attendue — pas la taskbar,
|
||||||
|
# pas le systray, pas les autres apps. Comme un humain qui regarde
|
||||||
|
# l'application sur laquelle il travaille.
|
||||||
|
window_rect = None
|
||||||
|
try:
|
||||||
|
from ..window_info_crossplatform import get_active_window_rect
|
||||||
|
win_info = get_active_window_rect()
|
||||||
|
if win_info and win_info.get("rect"):
|
||||||
|
r = win_info["rect"] # [left, top, right, bottom]
|
||||||
|
# Validation : fenêtre visible et pas minuscule
|
||||||
|
w = r[2] - r[0]
|
||||||
|
h = r[3] - r[1]
|
||||||
|
if w > 50 and h > 50:
|
||||||
|
window_rect = {
|
||||||
|
"left": max(0, r[0]),
|
||||||
|
"top": max(0, r[1]),
|
||||||
|
"width": min(w, screen_width),
|
||||||
|
"height": min(h, screen_height),
|
||||||
|
}
|
||||||
|
logger.info(
|
||||||
|
f"Grounding contraint à la fenêtre : "
|
||||||
|
f"{window_rect['width']}x{window_rect['height']} "
|
||||||
|
f"à ({window_rect['left']}, {window_rect['top']})"
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"Pas de window rect disponible : {e}")
|
||||||
|
|
||||||
|
screenshot_b64 = self._capture_window_or_screen(window_rect)
|
||||||
if not screenshot_b64:
|
if not screenshot_b64:
|
||||||
return GroundingResult(
|
return GroundingResult(
|
||||||
found=False, detail="Capture screenshot échouée",
|
found=False, detail="Capture screenshot échouée",
|
||||||
elapsed_ms=(time.time() - t_start) * 1000,
|
elapsed_ms=(time.time() - t_start) * 1000,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Dimensions de la zone capturée (fenêtre ou écran entier)
|
||||||
|
cap_w = window_rect["width"] if window_rect else screen_width
|
||||||
|
cap_h = window_rect["height"] if window_rect else screen_height
|
||||||
|
|
||||||
for strategy in strategies:
|
for strategy in strategies:
|
||||||
result = self._try_strategy(
|
result = self._try_strategy(
|
||||||
strategy, server_url, screenshot_b64, target_spec,
|
strategy, server_url, screenshot_b64, target_spec,
|
||||||
fallback_x, fallback_y, screen_width, screen_height,
|
fallback_x, fallback_y, cap_w, cap_h,
|
||||||
)
|
)
|
||||||
if result.found:
|
if result.found:
|
||||||
|
# ── Conversion coords fenêtre → coords écran ──
|
||||||
|
if window_rect:
|
||||||
|
# Le grounding a retourné des coords relatives à la fenêtre
|
||||||
|
# On les convertit en coords relatives à l'écran entier
|
||||||
|
abs_x = window_rect["left"] + result.x_pct * cap_w
|
||||||
|
abs_y = window_rect["top"] + result.y_pct * cap_h
|
||||||
|
result.x_pct = abs_x / screen_width
|
||||||
|
result.y_pct = abs_y / screen_height
|
||||||
|
result.detail = f"{result.detail} [fenêtre {cap_w}x{cap_h}]"
|
||||||
|
|
||||||
result.elapsed_ms = (time.time() - t_start) * 1000
|
result.elapsed_ms = (time.time() - t_start) * 1000
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@@ -148,6 +192,39 @@ class GroundingEngine:
|
|||||||
elapsed_ms=(time.time() - t_start) * 1000,
|
elapsed_ms=(time.time() - t_start) * 1000,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _capture_window_or_screen(self, window_rect: Optional[Dict]) -> str:
|
||||||
|
"""Capturer soit la fenêtre active (croppée), soit l'écran entier.
|
||||||
|
|
||||||
|
Si window_rect est fourni, capture uniquement cette zone.
|
||||||
|
Sinon, capture l'écran entier (fallback).
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from PIL import Image
|
||||||
|
import mss as mss_lib
|
||||||
|
|
||||||
|
with mss_lib.mss() as local_sct:
|
||||||
|
if window_rect:
|
||||||
|
# Capture de la zone fenêtre uniquement
|
||||||
|
region = {
|
||||||
|
"left": window_rect["left"],
|
||||||
|
"top": window_rect["top"],
|
||||||
|
"width": window_rect["width"],
|
||||||
|
"height": window_rect["height"],
|
||||||
|
}
|
||||||
|
raw = local_sct.grab(region)
|
||||||
|
else:
|
||||||
|
# Fallback écran entier
|
||||||
|
raw = local_sct.grab(local_sct.monitors[1])
|
||||||
|
|
||||||
|
img = Image.frombytes("RGB", raw.size, raw.bgra, "raw", "BGRX")
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
img.save(buffer, format="JPEG", quality=75)
|
||||||
|
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Capture échouée : {e}")
|
||||||
|
# Fallback sur la méthode existante de l'executor
|
||||||
|
return self._executor._capture_screenshot_b64(max_width=0, quality=75)
|
||||||
|
|
||||||
def _try_strategy(
|
def _try_strategy(
|
||||||
self,
|
self,
|
||||||
strategy: str,
|
strategy: str,
|
||||||
|
|||||||
@@ -568,6 +568,35 @@ def est_fenetre_lea(titre_fenetre: str) -> bool:
|
|||||||
return any(re.search(motif, titre_lower) for motif in _MOTIFS_FENETRE_LEA_REGEX)
|
return any(re.search(motif, titre_lower) for motif in _MOTIFS_FENETRE_LEA_REGEX)
|
||||||
|
|
||||||
|
|
||||||
|
# Fenêtres parasites Windows à ignorer dans les pré-vérifications.
|
||||||
|
# Ce ne sont pas des fenêtres applicatives — c'est du bruit système
|
||||||
|
# qui prend le focus de manière imprévisible.
|
||||||
|
_FENETRES_BRUIT_SYSTEME = (
|
||||||
|
"fenêtre de dépassement de capacité",
|
||||||
|
"overflow", # version anglaise systray
|
||||||
|
"program manager",
|
||||||
|
"barre des tâches",
|
||||||
|
"task bar",
|
||||||
|
"cortana",
|
||||||
|
"action center",
|
||||||
|
"centre de notifications",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def est_fenetre_bruit(titre_fenetre: str) -> bool:
|
||||||
|
"""Détecter si un titre de fenêtre est du bruit système Windows.
|
||||||
|
|
||||||
|
Ces fenêtres prennent le focus de manière imprévisible (systray overflow,
|
||||||
|
taskbar, Program Manager) et ne sont jamais la cible d'une action utilisateur.
|
||||||
|
"""
|
||||||
|
if not titre_fenetre:
|
||||||
|
return True # pas de titre = bruit
|
||||||
|
titre_lower = titre_fenetre.lower().strip()
|
||||||
|
if titre_lower == "unknown_window":
|
||||||
|
return True
|
||||||
|
return any(p in titre_lower for p in _FENETRES_BRUIT_SYSTEME)
|
||||||
|
|
||||||
|
|
||||||
# Conservé pour rétro-compatibilité avec le code qui listait MOTIFS_FENETRE_LEA
|
# Conservé pour rétro-compatibilité avec le code qui listait MOTIFS_FENETRE_LEA
|
||||||
MOTIFS_FENETRE_LEA = (
|
MOTIFS_FENETRE_LEA = (
|
||||||
"léa",
|
"léa",
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ import base64
|
|||||||
import hashlib
|
import hashlib
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
|
import threading
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
@@ -72,6 +73,8 @@ class ActionExecutorV1:
|
|||||||
# different de celui qui utilise l'instance).
|
# different de celui qui utilise l'instance).
|
||||||
self._sct = None
|
self._sct = None
|
||||||
self.running = True
|
self.running = True
|
||||||
|
# Verrou de sérialisation — une seule action replay à la fois
|
||||||
|
self._replay_lock = threading.Lock()
|
||||||
# Backoff exponentiel pour le polling replay (evite de marteler le serveur)
|
# Backoff exponentiel pour le polling replay (evite de marteler le serveur)
|
||||||
self._poll_backoff = 1.0 # Delai actuel (secondes)
|
self._poll_backoff = 1.0 # Delai actuel (secondes)
|
||||||
self._poll_backoff_min = 1.0 # Delai minimal (reset apres succes)
|
self._poll_backoff_min = 1.0 # Delai minimal (reset apres succes)
|
||||||
@@ -241,6 +244,107 @@ class ActionExecutorV1:
|
|||||||
logger.warning(f"Acteur gemma4 indisponible : {e}")
|
logger.warning(f"Acteur gemma4 indisponible : {e}")
|
||||||
return "EXECUTER"
|
return "EXECUTER"
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# UIA local — résolution via lea_uia.exe (helper Rust)
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
def _resolve_via_uia_local(
|
||||||
|
self, uia_target: dict, screen_width: int, screen_height: int,
|
||||||
|
):
|
||||||
|
"""Résoudre une cible via UIA local (lea_uia.exe).
|
||||||
|
|
||||||
|
Le plan contient un uia_target (nom, control_type, parent_path).
|
||||||
|
On appelle le helper Rust qui interroge UIAutomationCore.dll et
|
||||||
|
retourne les coordonnées pixel-perfect de l'élément.
|
||||||
|
|
||||||
|
STRICT : si l'élément trouvé n'appartient pas à la bonne fenêtre
|
||||||
|
parente (comparaison du parent_path), on REFUSE.
|
||||||
|
|
||||||
|
Retourne (x_pct, y_pct) si trouvé ET validé, None sinon.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from .uia_helper import get_shared_helper
|
||||||
|
helper = get_shared_helper()
|
||||||
|
if not helper.available:
|
||||||
|
return None
|
||||||
|
|
||||||
|
name = uia_target.get("name", "")
|
||||||
|
control_type = uia_target.get("control_type", "") or None
|
||||||
|
automation_id = uia_target.get("automation_id", "") or None
|
||||||
|
expected_parent_path = uia_target.get("parent_path", []) or []
|
||||||
|
|
||||||
|
if not name:
|
||||||
|
return None
|
||||||
|
|
||||||
|
element = helper.find_by_name(
|
||||||
|
name=name,
|
||||||
|
control_type=control_type,
|
||||||
|
automation_id=automation_id,
|
||||||
|
timeout_ms=1500,
|
||||||
|
)
|
||||||
|
if element is None or not element.is_clickable():
|
||||||
|
logger.debug(f"UIA: '{name}' non trouvé ou non cliquable")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# ── VÉRIFICATION STRICTE du parent_path ──
|
||||||
|
if expected_parent_path:
|
||||||
|
expected_root = None
|
||||||
|
for p in expected_parent_path:
|
||||||
|
if p.get("control_type", "").lower() in ("fenêtre", "window"):
|
||||||
|
expected_root = p.get("name", "").strip()
|
||||||
|
break
|
||||||
|
|
||||||
|
if expected_root:
|
||||||
|
found_root = None
|
||||||
|
for p in element.parent_path:
|
||||||
|
if p.get("control_type", "").lower() in ("fenêtre", "window"):
|
||||||
|
found_root = p.get("name", "").strip()
|
||||||
|
break
|
||||||
|
|
||||||
|
if found_root and expected_root != found_root:
|
||||||
|
if (expected_root.lower() not in found_root.lower()
|
||||||
|
and found_root.lower() not in expected_root.lower()):
|
||||||
|
logger.warning(
|
||||||
|
f"UIA REJET : '{name}' trouvé dans '{found_root}' "
|
||||||
|
f"mais attendu dans '{expected_root}'"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" [UIA] REJET — '{name}' trouvé dans mauvaise fenêtre "
|
||||||
|
f"({found_root} ≠ {expected_root})"
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# ── GARDE : rejeter les éléments géants (conteneurs) ──
|
||||||
|
elem_w = element.width()
|
||||||
|
elem_h = element.height()
|
||||||
|
screen_area = screen_width * screen_height
|
||||||
|
elem_area = elem_w * elem_h
|
||||||
|
if screen_area > 0 and elem_area / screen_area > 0.5:
|
||||||
|
logger.warning(
|
||||||
|
f"UIA REJET : '{name}' couvre {elem_area / screen_area * 100:.0f}% "
|
||||||
|
f"de l'écran ({elem_w}x{elem_h}) — conteneur, pas un élément cliquable"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" [UIA] REJET — '{name}' trop grand "
|
||||||
|
f"({elem_w}x{elem_h}, {elem_area / screen_area * 100:.0f}% écran)"
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
cx, cy = element.center()
|
||||||
|
if screen_width <= 0 or screen_height <= 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
x_pct = cx / screen_width
|
||||||
|
y_pct = cy / screen_height
|
||||||
|
if not (0.0 <= x_pct <= 1.0 and 0.0 <= y_pct <= 1.0):
|
||||||
|
return None
|
||||||
|
|
||||||
|
return (x_pct, y_pct)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"UIA local resolve erreur : {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Observer — pré-analyse écran avant chaque action
|
# Observer — pré-analyse écran avant chaque action
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
@@ -385,6 +489,11 @@ class ActionExecutorV1:
|
|||||||
"visual_resolved": False,
|
"visual_resolved": False,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ── Délai inter-actions (anti race condition mss) ──
|
||||||
|
wait_before = action.get("wait_before", 0.5)
|
||||||
|
if wait_before > 0:
|
||||||
|
time.sleep(wait_before)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
monitor = self.sct.monitors[1]
|
monitor = self.sct.monitors[1]
|
||||||
width, height = monitor["width"], monitor["height"]
|
width, height = monitor["width"], monitor["height"]
|
||||||
@@ -393,6 +502,14 @@ class ActionExecutorV1:
|
|||||||
x_pct = action.get("x_pct", 0.0)
|
x_pct = action.get("x_pct", 0.0)
|
||||||
y_pct = action.get("y_pct", 0.0)
|
y_pct = action.get("y_pct", 0.0)
|
||||||
|
|
||||||
|
# ── Diagnostic résolution ──
|
||||||
|
logger.info(
|
||||||
|
f"[REPLAY] Action {action_id} ({action_type}) — "
|
||||||
|
f"écran replay: {width}x{height}, "
|
||||||
|
f"x_pct={x_pct:.4f}, y_pct={y_pct:.4f} "
|
||||||
|
f"→ pixel ({int(x_pct * width)}, {int(y_pct * height)})"
|
||||||
|
)
|
||||||
|
|
||||||
# Extraire le nom de l'application depuis un titre de fenêtre
|
# Extraire le nom de l'application depuis un titre de fenêtre
|
||||||
def _app_name(title):
|
def _app_name(title):
|
||||||
for sep in [" – ", " - ", " — "]:
|
for sep in [" – ", " - ", " — "]:
|
||||||
@@ -477,8 +594,27 @@ class ActionExecutorV1:
|
|||||||
return result
|
return result
|
||||||
# EXECUTER → continuer normalement
|
# EXECUTER → continuer normalement
|
||||||
|
|
||||||
if visual_mode and target_spec and server_url:
|
# ── UIA local : résolution rapide via lea_uia.exe ──
|
||||||
# ── GROUNDING : localisation pure via GroundingEngine ──
|
uia_resolved = False
|
||||||
|
if visual_mode and target_spec and action_type == "click":
|
||||||
|
resolve_order = target_spec.get("resolve_order", [])
|
||||||
|
uia_target = target_spec.get("uia_target")
|
||||||
|
if resolve_order and resolve_order[0] == "uia" and uia_target:
|
||||||
|
uia_coords = self._resolve_via_uia_local(uia_target, width, height)
|
||||||
|
if uia_coords:
|
||||||
|
x_pct, y_pct = uia_coords
|
||||||
|
result["visual_resolved"] = True
|
||||||
|
result["resolution_method"] = "uia_local"
|
||||||
|
result["resolution_score"] = 0.95
|
||||||
|
uia_resolved = True
|
||||||
|
print(f" [UIA] résolu en local: ({x_pct:.4f}, {y_pct:.4f})")
|
||||||
|
logger.info(
|
||||||
|
f"UIA local OK : {uia_target.get('name', '?')} "
|
||||||
|
f"→ ({x_pct:.4f}, {y_pct:.4f})"
|
||||||
|
)
|
||||||
|
|
||||||
|
if not uia_resolved and visual_mode and target_spec and server_url:
|
||||||
|
# ── GROUNDING : localisation pure via GroundingEngine (fallback) ──
|
||||||
from .grounding import GroundingEngine
|
from .grounding import GroundingEngine
|
||||||
grounding = GroundingEngine(self)
|
grounding = GroundingEngine(self)
|
||||||
grounding_result = grounding.locate(
|
grounding_result = grounding.locate(
|
||||||
@@ -510,8 +646,8 @@ class ActionExecutorV1:
|
|||||||
|
|
||||||
if action_type == "click":
|
if action_type == "click":
|
||||||
# Si visual_mode est activé, le resolve DOIT réussir.
|
# Si visual_mode est activé, le resolve DOIT réussir.
|
||||||
# Pas de fallback blind — on arrête le replay si la cible
|
# Pas de fallback blind — Léa VOIT l'écran et CHERCHE
|
||||||
# n'est pas trouvée visuellement. C'est un RPA VISUEL.
|
# l'élément. Si toute la cascade échoue → pause supervisée.
|
||||||
if visual_mode and not result.get("visual_resolved"):
|
if visual_mode and not result.get("visual_resolved"):
|
||||||
# ── Policy : décider quoi faire quand grounding échoue ──
|
# ── Policy : décider quoi faire quand grounding échoue ──
|
||||||
from .policy import PolicyEngine, Decision
|
from .policy import PolicyEngine, Decision
|
||||||
@@ -533,7 +669,6 @@ class ActionExecutorV1:
|
|||||||
)
|
)
|
||||||
|
|
||||||
if policy_decision.decision == Decision.RETRY:
|
if policy_decision.decision == Decision.RETRY:
|
||||||
# Re-tenter le grounding après correction (popup fermée, etc.)
|
|
||||||
resolved2 = self._resolve_target_visual(
|
resolved2 = self._resolve_target_visual(
|
||||||
server_url, target_spec, x_pct, y_pct, width, height
|
server_url, target_spec, x_pct, y_pct, width, height
|
||||||
)
|
)
|
||||||
@@ -543,7 +678,6 @@ class ActionExecutorV1:
|
|||||||
result["visual_resolved"] = True
|
result["visual_resolved"] = True
|
||||||
print(f" [POLICY] Re-resolve OK après {policy_decision.action_taken}")
|
print(f" [POLICY] Re-resolve OK après {policy_decision.action_taken}")
|
||||||
else:
|
else:
|
||||||
# Re-resolve échoué — SUPERVISE (rendre la main)
|
|
||||||
result["success"] = False
|
result["success"] = False
|
||||||
result["error"] = "target_not_found"
|
result["error"] = "target_not_found"
|
||||||
result["target_description"] = target_desc
|
result["target_description"] = target_desc
|
||||||
@@ -1200,20 +1334,17 @@ Example: x_pct=0.50, y_pct=0.30"""
|
|||||||
def poll_and_execute(self, session_id: str, server_url: str, machine_id: str = "default") -> bool:
|
def poll_and_execute(self, session_id: str, server_url: str, machine_id: str = "default") -> bool:
|
||||||
"""
|
"""
|
||||||
Poll le serveur pour recuperer et executer la prochaine action.
|
Poll le serveur pour recuperer et executer la prochaine action.
|
||||||
|
Sérialisé par _replay_lock — une seule action à la fois.
|
||||||
1. GET /replay/next pour recuperer l'action
|
|
||||||
2. Execute l'action (clic, texte, etc.)
|
|
||||||
3. POST /replay/result avec le resultat + screenshot
|
|
||||||
|
|
||||||
Args:
|
|
||||||
session_id: Identifiant de la session courante
|
|
||||||
server_url: URL de base du serveur streaming
|
|
||||||
machine_id: Identifiant de la machine (pour le replay multi-machine)
|
|
||||||
|
|
||||||
Retourne True si une action a ete executee, False sinon.
|
|
||||||
IMPORTANT: Si une action est recue, le resultat est TOUJOURS rapporte
|
|
||||||
au serveur (meme en cas d'erreur d'execution).
|
|
||||||
"""
|
"""
|
||||||
|
if not self._replay_lock.acquire(blocking=False):
|
||||||
|
return False
|
||||||
|
try:
|
||||||
|
return self._poll_and_execute_inner(session_id, server_url, machine_id)
|
||||||
|
finally:
|
||||||
|
self._replay_lock.release()
|
||||||
|
|
||||||
|
def _poll_and_execute_inner(self, session_id: str, server_url: str, machine_id: str) -> bool:
|
||||||
|
"""Implémentation interne de poll_and_execute (protégée par _replay_lock)."""
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
replay_next_url = f"{server_url}/traces/stream/replay/next"
|
replay_next_url = f"{server_url}/traces/stream/replay/next"
|
||||||
|
|||||||
214
agent_v0/deploy/windows_client/agent_v1/core/grounding.py
Normal file
214
agent_v0/deploy/windows_client/agent_v1/core/grounding.py
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
# agent_v1/core/grounding.py
|
||||||
|
"""
|
||||||
|
Module Grounding — localisation pure d'éléments UI sur l'écran.
|
||||||
|
|
||||||
|
Responsabilité unique : "Trouve l'élément X sur l'écran et retourne ses coordonnées."
|
||||||
|
Ne prend AUCUNE décision. Si l'élément n'est pas trouvé → retourne NOT_FOUND.
|
||||||
|
|
||||||
|
Stratégies disponibles (cascade configurable) :
|
||||||
|
1. Serveur SomEngine + VLM (GPU distant)
|
||||||
|
2. Template matching local (CPU, ~10ms)
|
||||||
|
3. VLM local direct (CPU/GPU local)
|
||||||
|
|
||||||
|
Séparé de Policy (qui décide quoi faire quand grounding échoue).
|
||||||
|
Ref: docs/PLAN_ACTEUR_V1.md — Architecture MICRO (grounding + exécution)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class GroundingResult:
|
||||||
|
"""Résultat d'une tentative de localisation visuelle."""
|
||||||
|
found: bool # L'élément a été trouvé
|
||||||
|
x_pct: float = 0.0 # Position X en % (0.0-1.0)
|
||||||
|
y_pct: float = 0.0 # Position Y en % (0.0-1.0)
|
||||||
|
method: str = "" # Méthode utilisée (server_som, anchor_template, vlm_direct...)
|
||||||
|
score: float = 0.0 # Confiance (0.0-1.0)
|
||||||
|
elapsed_ms: float = 0.0 # Temps de résolution
|
||||||
|
detail: str = "" # Info supplémentaire (label trouvé, raison échec)
|
||||||
|
raw: Optional[Dict] = None # Données brutes du resolver (pour debug)
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"found": self.found,
|
||||||
|
"x_pct": self.x_pct,
|
||||||
|
"y_pct": self.y_pct,
|
||||||
|
"method": self.method,
|
||||||
|
"score": round(self.score, 3),
|
||||||
|
"elapsed_ms": round(self.elapsed_ms, 1),
|
||||||
|
"detail": self.detail,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Résultat singleton pour "pas trouvé"
|
||||||
|
NOT_FOUND = GroundingResult(found=False, detail="Aucune méthode n'a trouvé l'élément")
|
||||||
|
|
||||||
|
|
||||||
|
class GroundingEngine:
|
||||||
|
"""Moteur de localisation visuelle d'éléments UI.
|
||||||
|
|
||||||
|
Encapsule la cascade de résolution (serveur → template → VLM local)
|
||||||
|
avec une interface unifiée. Ne prend aucune décision — c'est le rôle
|
||||||
|
de PolicyEngine.
|
||||||
|
|
||||||
|
Usage :
|
||||||
|
engine = GroundingEngine(executor)
|
||||||
|
result = engine.locate(screenshot_b64, target_spec, screen_w, screen_h)
|
||||||
|
if result.found:
|
||||||
|
click(result.x_pct, result.y_pct)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, executor):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
executor: ActionExecutorV1 — fournit les méthodes de résolution existantes.
|
||||||
|
"""
|
||||||
|
self._executor = executor
|
||||||
|
|
||||||
|
def locate(
|
||||||
|
self,
|
||||||
|
server_url: str,
|
||||||
|
target_spec: Dict[str, Any],
|
||||||
|
fallback_x: float,
|
||||||
|
fallback_y: float,
|
||||||
|
screen_width: int,
|
||||||
|
screen_height: int,
|
||||||
|
strategies: Optional[List[str]] = None,
|
||||||
|
) -> GroundingResult:
|
||||||
|
"""Localiser un élément UI sur l'écran.
|
||||||
|
|
||||||
|
Exécute la cascade de stratégies dans l'ordre et retourne
|
||||||
|
dès qu'une stratégie trouve l'élément.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
server_url: URL du serveur (SomEngine + VLM GPU)
|
||||||
|
target_spec: Spécification de la cible (by_text, anchor, vlm_description...)
|
||||||
|
fallback_x, fallback_y: Coordonnées de fallback (enregistrement)
|
||||||
|
screen_width, screen_height: Résolution écran
|
||||||
|
strategies: Liste ordonnée de stratégies à essayer.
|
||||||
|
Par défaut : ["server", "template", "vlm_local"]
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
GroundingResult avec found=True et coordonnées, ou NOT_FOUND
|
||||||
|
"""
|
||||||
|
if strategies is None:
|
||||||
|
strategies = ["server", "template", "vlm_local"]
|
||||||
|
|
||||||
|
# ── Apprentissage : réordonner les stratégies selon l'historique ──
|
||||||
|
# Si le Learning sait quelle méthode marche pour cette cible,
|
||||||
|
# la mettre en premier. C'est la boucle d'apprentissage.
|
||||||
|
learned = target_spec.get("_learned_strategy", "")
|
||||||
|
if learned:
|
||||||
|
strategy_map = {
|
||||||
|
"som_text_match": "server",
|
||||||
|
"grounding_vlm": "server",
|
||||||
|
"server_som": "server",
|
||||||
|
"anchor_template": "template",
|
||||||
|
"template_matching": "template",
|
||||||
|
"hybrid_text_direct": "vlm_local",
|
||||||
|
"hybrid_vlm_text": "vlm_local",
|
||||||
|
"vlm_direct": "vlm_local",
|
||||||
|
}
|
||||||
|
preferred = strategy_map.get(learned, "")
|
||||||
|
if preferred and preferred in strategies:
|
||||||
|
strategies = [preferred] + [s for s in strategies if s != preferred]
|
||||||
|
logger.info(
|
||||||
|
f"Grounding: stratégie réordonnée par l'apprentissage → "
|
||||||
|
f"{strategies} (learned={learned})"
|
||||||
|
)
|
||||||
|
|
||||||
|
t_start = time.time()
|
||||||
|
screenshot_b64 = self._executor._capture_screenshot_b64(max_width=0, quality=75)
|
||||||
|
if not screenshot_b64:
|
||||||
|
return GroundingResult(
|
||||||
|
found=False, detail="Capture screenshot échouée",
|
||||||
|
elapsed_ms=(time.time() - t_start) * 1000,
|
||||||
|
)
|
||||||
|
|
||||||
|
for strategy in strategies:
|
||||||
|
result = self._try_strategy(
|
||||||
|
strategy, server_url, screenshot_b64, target_spec,
|
||||||
|
fallback_x, fallback_y, screen_width, screen_height,
|
||||||
|
)
|
||||||
|
if result.found:
|
||||||
|
result.elapsed_ms = (time.time() - t_start) * 1000
|
||||||
|
return result
|
||||||
|
|
||||||
|
return GroundingResult(
|
||||||
|
found=False,
|
||||||
|
detail=f"Toutes les stratégies ont échoué ({', '.join(strategies)})",
|
||||||
|
elapsed_ms=(time.time() - t_start) * 1000,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _try_strategy(
|
||||||
|
self,
|
||||||
|
strategy: str,
|
||||||
|
server_url: str,
|
||||||
|
screenshot_b64: str,
|
||||||
|
target_spec: Dict[str, Any],
|
||||||
|
fallback_x: float,
|
||||||
|
fallback_y: float,
|
||||||
|
screen_width: int,
|
||||||
|
screen_height: int,
|
||||||
|
) -> GroundingResult:
|
||||||
|
"""Essayer une stratégie de grounding unique."""
|
||||||
|
|
||||||
|
if strategy == "server" and server_url:
|
||||||
|
raw = self._executor._server_resolve_target(
|
||||||
|
server_url, screenshot_b64, target_spec,
|
||||||
|
fallback_x, fallback_y, screen_width, screen_height,
|
||||||
|
)
|
||||||
|
if raw and raw.get("resolved"):
|
||||||
|
return GroundingResult(
|
||||||
|
found=True,
|
||||||
|
x_pct=raw["x_pct"],
|
||||||
|
y_pct=raw["y_pct"],
|
||||||
|
method=raw.get("method", "server"),
|
||||||
|
score=raw.get("score", 0.0),
|
||||||
|
detail=raw.get("matched_element", {}).get("label", ""),
|
||||||
|
raw=raw,
|
||||||
|
)
|
||||||
|
|
||||||
|
elif strategy == "template":
|
||||||
|
anchor_b64 = target_spec.get("anchor_image_base64", "")
|
||||||
|
if anchor_b64:
|
||||||
|
raw = self._executor._template_match_anchor(
|
||||||
|
screenshot_b64, anchor_b64, screen_width, screen_height,
|
||||||
|
)
|
||||||
|
if raw and raw.get("resolved"):
|
||||||
|
return GroundingResult(
|
||||||
|
found=True,
|
||||||
|
x_pct=raw["x_pct"],
|
||||||
|
y_pct=raw["y_pct"],
|
||||||
|
method="anchor_template",
|
||||||
|
score=raw.get("score", 0.0),
|
||||||
|
raw=raw,
|
||||||
|
)
|
||||||
|
|
||||||
|
elif strategy == "vlm_local":
|
||||||
|
by_text = target_spec.get("by_text", "")
|
||||||
|
vlm_desc = target_spec.get("vlm_description", "")
|
||||||
|
if vlm_desc or by_text:
|
||||||
|
raw = self._executor._hybrid_vlm_resolve(
|
||||||
|
screenshot_b64, target_spec, screen_width, screen_height,
|
||||||
|
)
|
||||||
|
if raw and raw.get("resolved"):
|
||||||
|
return GroundingResult(
|
||||||
|
found=True,
|
||||||
|
x_pct=raw["x_pct"],
|
||||||
|
y_pct=raw["y_pct"],
|
||||||
|
method=raw.get("method", "vlm_local"),
|
||||||
|
score=raw.get("score", 0.0),
|
||||||
|
detail=raw.get("matched_element", {}).get("label", ""),
|
||||||
|
raw=raw,
|
||||||
|
)
|
||||||
|
|
||||||
|
return GroundingResult(found=False, method=strategy, detail=f"{strategy}: pas trouvé")
|
||||||
152
agent_v0/deploy/windows_client/agent_v1/core/policy.py
Normal file
152
agent_v0/deploy/windows_client/agent_v1/core/policy.py
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
# agent_v1/core/policy.py
|
||||||
|
"""
|
||||||
|
Module Policy — décisions intelligentes quand le grounding échoue.
|
||||||
|
|
||||||
|
Responsabilité unique : "Le Grounding dit NOT_FOUND. Que fait-on ?"
|
||||||
|
Ne localise AUCUN élément — c'est le rôle du Grounding.
|
||||||
|
|
||||||
|
Décisions possibles :
|
||||||
|
- RETRY : re-tenter le grounding (après popup fermée, par exemple)
|
||||||
|
- SKIP : l'action n'est plus nécessaire (état déjà atteint)
|
||||||
|
- ABORT : arrêter le workflow (état incohérent)
|
||||||
|
- SUPERVISE : rendre la main à l'utilisateur
|
||||||
|
|
||||||
|
Séparé de Grounding (qui localise les éléments).
|
||||||
|
Ref: docs/PLAN_ACTEUR_V1.md — Architecture MÉSO (acteur intelligent)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Decision(Enum):
|
||||||
|
"""Décisions possibles quand le grounding échoue."""
|
||||||
|
RETRY = "retry" # Re-tenter (après correction : popup fermée, navigation...)
|
||||||
|
SKIP = "skip" # Action inutile (état déjà atteint)
|
||||||
|
ABORT = "abort" # Arrêter le workflow (état incohérent)
|
||||||
|
SUPERVISE = "supervise" # Rendre la main à l'utilisateur (Léa dit "je bloque")
|
||||||
|
CONTINUE = "continue" # Continuer malgré l'échec (action non critique)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PolicyDecision:
|
||||||
|
"""Résultat d'une décision Policy."""
|
||||||
|
decision: Decision
|
||||||
|
reason: str # Explication de la décision
|
||||||
|
action_taken: str = "" # Action corrective effectuée (ex: "popup fermée")
|
||||||
|
elapsed_ms: float = 0.0
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"decision": self.decision.value,
|
||||||
|
"reason": self.reason,
|
||||||
|
"action_taken": self.action_taken,
|
||||||
|
"elapsed_ms": round(self.elapsed_ms, 1),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class PolicyEngine:
|
||||||
|
"""Moteur de décision quand le grounding échoue.
|
||||||
|
|
||||||
|
Cascade de décision :
|
||||||
|
1. Popup détectée ? → fermer et RETRY
|
||||||
|
2. Acteur gemma4 → SKIP / ABORT / SUPERVISE
|
||||||
|
3. Fallback → SUPERVISE (rendre la main)
|
||||||
|
|
||||||
|
Usage :
|
||||||
|
policy = PolicyEngine(executor)
|
||||||
|
decision = policy.decide(action, target_spec, grounding_result)
|
||||||
|
if decision.decision == Decision.RETRY:
|
||||||
|
# re-tenter le grounding
|
||||||
|
elif decision.decision == Decision.SKIP:
|
||||||
|
# marquer comme réussi, passer à la suite
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, executor):
|
||||||
|
self._executor = executor
|
||||||
|
|
||||||
|
def decide(
|
||||||
|
self,
|
||||||
|
action: Dict[str, Any],
|
||||||
|
target_spec: Dict[str, Any],
|
||||||
|
retry_count: int = 0,
|
||||||
|
max_retries: int = 1,
|
||||||
|
) -> PolicyDecision:
|
||||||
|
"""Décider quoi faire quand le grounding a échoué.
|
||||||
|
|
||||||
|
Cascade :
|
||||||
|
1. Si c'est le premier essai → tenter de fermer une popup → RETRY
|
||||||
|
2. Si retry déjà fait → demander à l'acteur gemma4
|
||||||
|
3. Selon gemma4 : SKIP, ABORT, ou SUPERVISE
|
||||||
|
|
||||||
|
Args:
|
||||||
|
action: L'action qui a échoué
|
||||||
|
target_spec: La cible non trouvée
|
||||||
|
retry_count: Nombre de retries déjà faits
|
||||||
|
max_retries: Maximum de retries autorisés
|
||||||
|
"""
|
||||||
|
t_start = time.time()
|
||||||
|
|
||||||
|
# ── Étape 1 : Tentative de fermeture popup (premier essai) ──
|
||||||
|
if retry_count == 0:
|
||||||
|
popup_handled = self._try_close_popup()
|
||||||
|
if popup_handled:
|
||||||
|
return PolicyDecision(
|
||||||
|
decision=Decision.RETRY,
|
||||||
|
reason="Popup détectée et fermée, re-tentative",
|
||||||
|
action_taken="popup_closed",
|
||||||
|
elapsed_ms=(time.time() - t_start) * 1000,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Étape 2 : Max retries atteint → acteur gemma4 ──
|
||||||
|
if retry_count >= max_retries:
|
||||||
|
actor_decision = self._ask_actor(action, target_spec)
|
||||||
|
|
||||||
|
if actor_decision == "PASSER":
|
||||||
|
return PolicyDecision(
|
||||||
|
decision=Decision.SKIP,
|
||||||
|
reason="Acteur gemma4 : l'état est déjà atteint",
|
||||||
|
elapsed_ms=(time.time() - t_start) * 1000,
|
||||||
|
)
|
||||||
|
elif actor_decision == "STOPPER":
|
||||||
|
return PolicyDecision(
|
||||||
|
decision=Decision.ABORT,
|
||||||
|
reason="Acteur gemma4 : état incohérent, arrêt",
|
||||||
|
elapsed_ms=(time.time() - t_start) * 1000,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# EXECUTER ou inconnu → pause supervisée
|
||||||
|
return PolicyDecision(
|
||||||
|
decision=Decision.SUPERVISE,
|
||||||
|
reason=f"Acteur gemma4 : {actor_decision}, pause supervisée",
|
||||||
|
elapsed_ms=(time.time() - t_start) * 1000,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Étape 3 : Encore des retries disponibles → RETRY ──
|
||||||
|
return PolicyDecision(
|
||||||
|
decision=Decision.RETRY,
|
||||||
|
reason=f"Retry {retry_count + 1}/{max_retries}",
|
||||||
|
elapsed_ms=(time.time() - t_start) * 1000,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _try_close_popup(self) -> bool:
|
||||||
|
"""Tenter de fermer une popup via le handler VLM existant."""
|
||||||
|
try:
|
||||||
|
return self._executor._handle_popup_vlm()
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"Policy: popup handler échoué : {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _ask_actor(self, action: Dict, target_spec: Dict) -> str:
|
||||||
|
"""Demander à gemma4 de décider (PASSER/EXECUTER/STOPPER)."""
|
||||||
|
try:
|
||||||
|
return self._executor._actor_decide(action, target_spec)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"Policy: acteur gemma4 échoué : {e}")
|
||||||
|
return "EXECUTER" # Fallback → supervisé
|
||||||
294
agent_v0/deploy/windows_client/agent_v1/core/uia_helper.py
Normal file
294
agent_v0/deploy/windows_client/agent_v1/core/uia_helper.py
Normal file
@@ -0,0 +1,294 @@
|
|||||||
|
# core/workflow/uia_helper.py
|
||||||
|
"""
|
||||||
|
UIAHelper — Wrapper Python pour lea_uia.exe (helper Rust UI Automation).
|
||||||
|
|
||||||
|
Expose une API Python simple pour interroger UIA via le binaire Rust.
|
||||||
|
Communique via subprocess + stdin/stdout JSON.
|
||||||
|
|
||||||
|
Pourquoi un helper Rust ?
|
||||||
|
- 5-10x plus rapide que pywinauto (10-20ms vs 50-200ms)
|
||||||
|
- Binaire standalone ~500 Ko, aucune dépendance runtime
|
||||||
|
- Pas de problèmes de threading COM en Python
|
||||||
|
- Crash-safe (le crash du helper n'affecte pas l'agent Python)
|
||||||
|
|
||||||
|
Architecture :
|
||||||
|
Python executor
|
||||||
|
↓ subprocess.run
|
||||||
|
lea_uia.exe query --x 812 --y 436
|
||||||
|
↓ UIA API Windows
|
||||||
|
JSON response
|
||||||
|
↓ stdout
|
||||||
|
Python executor parse JSON
|
||||||
|
|
||||||
|
Si lea_uia.exe n'est pas disponible (Linux, binaire absent, crash) :
|
||||||
|
toutes les méthodes retournent None → fallback vision automatique.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import subprocess
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Timeout par défaut pour les appels UIA (en secondes)
|
||||||
|
_DEFAULT_TIMEOUT = 5.0
|
||||||
|
|
||||||
|
# Masquer la fenêtre console lors du spawn de lea_uia.exe sur Windows.
|
||||||
|
# Sans ce flag, chaque appel (à chaque clic utilisateur pendant
|
||||||
|
# l'enregistrement) fait apparaître une fenêtre cmd noire brièvement
|
||||||
|
# visible à l'écran → ralentit la souris et pollue les screenshots
|
||||||
|
# capturés (le VLM peut "voir" le chemin lea_uia.exe comme texte cliqué).
|
||||||
|
#
|
||||||
|
# La valeur 0x08000000 correspond à CREATE_NO_WINDOW défini dans
|
||||||
|
# l'API Windows. Sur Linux/Mac, la valeur est 0 et `creationflags`
|
||||||
|
# est ignoré. getattr() gère le cas où Python expose déjà la constante
|
||||||
|
# sur Windows.
|
||||||
|
if platform.system() == "Windows":
|
||||||
|
_SUBPROCESS_CREATION_FLAGS = getattr(subprocess, "CREATE_NO_WINDOW", 0x08000000)
|
||||||
|
else:
|
||||||
|
_SUBPROCESS_CREATION_FLAGS = 0
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class UiaElement:
|
||||||
|
"""Représentation Python d'un élément UIA."""
|
||||||
|
name: str = ""
|
||||||
|
control_type: str = ""
|
||||||
|
class_name: str = ""
|
||||||
|
automation_id: str = ""
|
||||||
|
bounding_rect: Tuple[int, int, int, int] = (0, 0, 0, 0)
|
||||||
|
is_enabled: bool = False
|
||||||
|
is_offscreen: bool = True
|
||||||
|
parent_path: List[Dict[str, str]] = field(default_factory=list)
|
||||||
|
process_name: str = ""
|
||||||
|
|
||||||
|
def center(self) -> Tuple[int, int]:
|
||||||
|
"""Retourner le centre du rectangle (pixels)."""
|
||||||
|
x1, y1, x2, y2 = self.bounding_rect
|
||||||
|
return ((x1 + x2) // 2, (y1 + y2) // 2)
|
||||||
|
|
||||||
|
def width(self) -> int:
|
||||||
|
return self.bounding_rect[2] - self.bounding_rect[0]
|
||||||
|
|
||||||
|
def height(self) -> int:
|
||||||
|
return self.bounding_rect[3] - self.bounding_rect[1]
|
||||||
|
|
||||||
|
def is_clickable(self) -> bool:
|
||||||
|
"""Peut-on cliquer dessus ?"""
|
||||||
|
return (
|
||||||
|
self.is_enabled
|
||||||
|
and not self.is_offscreen
|
||||||
|
and self.width() > 0
|
||||||
|
and self.height() > 0
|
||||||
|
)
|
||||||
|
|
||||||
|
def path_signature(self) -> str:
|
||||||
|
"""Signature du chemin parent (pour retrouver l'élément)."""
|
||||||
|
parts = [f"{p['control_type']}[{p['name']}]" for p in self.parent_path if p.get("name")]
|
||||||
|
parts.append(f"{self.control_type}[{self.name}]")
|
||||||
|
return " > ".join(parts)
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"name": self.name,
|
||||||
|
"control_type": self.control_type,
|
||||||
|
"class_name": self.class_name,
|
||||||
|
"automation_id": self.automation_id,
|
||||||
|
"bounding_rect": list(self.bounding_rect),
|
||||||
|
"is_enabled": self.is_enabled,
|
||||||
|
"is_offscreen": self.is_offscreen,
|
||||||
|
"parent_path": self.parent_path,
|
||||||
|
"process_name": self.process_name,
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_dict(cls, d: Dict[str, Any]) -> "UiaElement":
|
||||||
|
rect = d.get("bounding_rect", [0, 0, 0, 0])
|
||||||
|
if isinstance(rect, list) and len(rect) >= 4:
|
||||||
|
rect = tuple(rect[:4])
|
||||||
|
else:
|
||||||
|
rect = (0, 0, 0, 0)
|
||||||
|
return cls(
|
||||||
|
name=d.get("name", ""),
|
||||||
|
control_type=d.get("control_type", ""),
|
||||||
|
class_name=d.get("class_name", ""),
|
||||||
|
automation_id=d.get("automation_id", ""),
|
||||||
|
bounding_rect=rect,
|
||||||
|
is_enabled=d.get("is_enabled", False),
|
||||||
|
is_offscreen=d.get("is_offscreen", True),
|
||||||
|
parent_path=d.get("parent_path", []),
|
||||||
|
process_name=d.get("process_name", ""),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class UIAHelper:
|
||||||
|
"""Wrapper Python pour lea_uia.exe."""
|
||||||
|
|
||||||
|
def __init__(self, helper_path: str = "", timeout: float = _DEFAULT_TIMEOUT):
|
||||||
|
self._helper_path = helper_path or self._find_helper()
|
||||||
|
self._timeout = timeout
|
||||||
|
self._available = self._check_available()
|
||||||
|
|
||||||
|
def _find_helper(self) -> str:
|
||||||
|
"""Trouver lea_uia.exe dans les emplacements standards."""
|
||||||
|
candidates = [
|
||||||
|
r"C:\Lea\helpers\lea_uia.exe",
|
||||||
|
os.path.join(os.path.dirname(__file__), "..", "..",
|
||||||
|
"agent_rust", "lea_uia", "target",
|
||||||
|
"x86_64-pc-windows-gnu", "release", "lea_uia.exe"),
|
||||||
|
"./helpers/lea_uia.exe",
|
||||||
|
"lea_uia.exe",
|
||||||
|
]
|
||||||
|
for path in candidates:
|
||||||
|
if os.path.isfile(path):
|
||||||
|
return os.path.abspath(path)
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _check_available(self) -> bool:
|
||||||
|
"""Vérifier que le helper est utilisable (Windows + binaire + health OK)."""
|
||||||
|
if platform.system() != "Windows":
|
||||||
|
logger.debug("UIAHelper: Linux/Mac — helper désactivé")
|
||||||
|
return False
|
||||||
|
if not self._helper_path:
|
||||||
|
logger.debug("UIAHelper: lea_uia.exe introuvable")
|
||||||
|
return False
|
||||||
|
if not os.path.isfile(self._helper_path):
|
||||||
|
logger.debug(f"UIAHelper: chemin invalide {self._helper_path}")
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
@property
|
||||||
|
def available(self) -> bool:
|
||||||
|
return self._available
|
||||||
|
|
||||||
|
@property
|
||||||
|
def helper_path(self) -> str:
|
||||||
|
return self._helper_path
|
||||||
|
|
||||||
|
def _run(self, args: List[str]) -> Optional[Dict[str, Any]]:
|
||||||
|
"""Exécuter lea_uia.exe avec les arguments et parser le JSON."""
|
||||||
|
if not self._available:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[self._helper_path] + args,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=self._timeout,
|
||||||
|
encoding="utf-8",
|
||||||
|
errors="replace",
|
||||||
|
creationflags=_SUBPROCESS_CREATION_FLAGS,
|
||||||
|
)
|
||||||
|
if result.returncode != 0:
|
||||||
|
logger.debug(
|
||||||
|
f"UIAHelper: exit code {result.returncode}, "
|
||||||
|
f"stderr: {result.stderr[:200]}"
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
output = result.stdout.strip()
|
||||||
|
if not output:
|
||||||
|
return None
|
||||||
|
return json.loads(output)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
logger.debug(f"UIAHelper: timeout ({self._timeout}s) sur {args}")
|
||||||
|
return None
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
logger.debug(f"UIAHelper: JSON invalide — {e}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"UIAHelper: erreur {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def health(self) -> bool:
|
||||||
|
"""Vérifier que UIA répond."""
|
||||||
|
data = self._run(["health"])
|
||||||
|
return data is not None and data.get("status") == "ok"
|
||||||
|
|
||||||
|
def query_at(
|
||||||
|
self,
|
||||||
|
x: int,
|
||||||
|
y: int,
|
||||||
|
with_parents: bool = True,
|
||||||
|
) -> Optional[UiaElement]:
|
||||||
|
"""Récupérer l'élément UIA à une position écran.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
x, y: Coordonnées pixel absolues
|
||||||
|
with_parents: Inclure la hiérarchie des parents
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
UiaElement si trouvé, None sinon (pas d'élément ou UIA indispo)
|
||||||
|
"""
|
||||||
|
args = ["query", "--x", str(x), "--y", str(y)]
|
||||||
|
if not with_parents:
|
||||||
|
args.append("--with-parents=false")
|
||||||
|
|
||||||
|
data = self._run(args)
|
||||||
|
if not data or data.get("status") != "ok":
|
||||||
|
return None
|
||||||
|
|
||||||
|
elem_data = data.get("element")
|
||||||
|
if not elem_data:
|
||||||
|
return None
|
||||||
|
return UiaElement.from_dict(elem_data)
|
||||||
|
|
||||||
|
def find_by_name(
|
||||||
|
self,
|
||||||
|
name: str,
|
||||||
|
control_type: Optional[str] = None,
|
||||||
|
automation_id: Optional[str] = None,
|
||||||
|
window: Optional[str] = None,
|
||||||
|
timeout_ms: int = 2000,
|
||||||
|
) -> Optional[UiaElement]:
|
||||||
|
"""Rechercher un élément par son nom (+ filtres optionnels).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: Nom exact de l'élément
|
||||||
|
control_type: Type de contrôle (Button, Edit, MenuItem...)
|
||||||
|
automation_id: ID d'automation
|
||||||
|
window: Restreindre à une fenêtre spécifique
|
||||||
|
timeout_ms: Timeout de recherche en millisecondes
|
||||||
|
"""
|
||||||
|
args = ["find", "--name", name, "--timeout-ms", str(timeout_ms)]
|
||||||
|
if control_type:
|
||||||
|
args.extend(["--control-type", control_type])
|
||||||
|
if automation_id:
|
||||||
|
args.extend(["--automation-id", automation_id])
|
||||||
|
if window:
|
||||||
|
args.extend(["--window", window])
|
||||||
|
|
||||||
|
data = self._run(args)
|
||||||
|
if not data or data.get("status") != "ok":
|
||||||
|
return None
|
||||||
|
|
||||||
|
elem_data = data.get("element")
|
||||||
|
if not elem_data:
|
||||||
|
return None
|
||||||
|
return UiaElement.from_dict(elem_data)
|
||||||
|
|
||||||
|
def capture_focused(self, max_depth: int = 3) -> Optional[UiaElement]:
|
||||||
|
"""Capturer l'élément ayant le focus + son contexte."""
|
||||||
|
data = self._run(["capture", "--max-depth", str(max_depth)])
|
||||||
|
if not data or data.get("status") != "ok":
|
||||||
|
return None
|
||||||
|
|
||||||
|
elem_data = data.get("element")
|
||||||
|
if not elem_data:
|
||||||
|
return None
|
||||||
|
return UiaElement.from_dict(elem_data)
|
||||||
|
|
||||||
|
|
||||||
|
# Instance globale partagée (singleton léger)
|
||||||
|
_SHARED_HELPER: Optional[UIAHelper] = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_shared_helper() -> UIAHelper:
|
||||||
|
"""Retourner une instance partagée de UIAHelper."""
|
||||||
|
global _SHARED_HELPER
|
||||||
|
if _SHARED_HELPER is None:
|
||||||
|
_SHARED_HELPER = UIAHelper()
|
||||||
|
return _SHARED_HELPER
|
||||||
@@ -1,12 +1,97 @@
|
|||||||
# run_agent_v1.py
|
# run_agent_v1.py
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import atexit
|
||||||
|
|
||||||
# Ajout du répertoire courant au PYTHONPATH pour permettre les imports de modules
|
# Ajout du répertoire courant au PYTHONPATH pour permettre les imports de modules
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
if current_dir not in sys.path:
|
if current_dir not in sys.path:
|
||||||
sys.path.append(current_dir)
|
sys.path.append(current_dir)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
# Verrou PID — empêche le lancement de plusieurs instances
|
||||||
|
# Même si Lea.bat est double-cliqué ou lancé deux fois,
|
||||||
|
# un seul agent tourne à la fois (defense-in-depth).
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
LOCK_FILE = os.path.join(current_dir, "lea_agent.lock")
|
||||||
|
|
||||||
|
|
||||||
|
def _pid_is_alive(pid: int) -> bool:
|
||||||
|
"""Vérifie si un processus avec ce PID existe encore (Windows + Unix)."""
|
||||||
|
if sys.platform == "win32":
|
||||||
|
try:
|
||||||
|
import ctypes
|
||||||
|
kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined]
|
||||||
|
PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
|
||||||
|
handle = kernel32.OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, False, pid)
|
||||||
|
if handle:
|
||||||
|
kernel32.CloseHandle(handle)
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
except Exception:
|
||||||
|
# Fallback : tasklist
|
||||||
|
try:
|
||||||
|
import subprocess
|
||||||
|
result = subprocess.run(
|
||||||
|
["tasklist", "/FI", f"PID eq {pid}", "/NH"],
|
||||||
|
capture_output=True, text=True, timeout=5,
|
||||||
|
)
|
||||||
|
return str(pid) in result.stdout
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
# Unix/Linux — os.kill(pid, 0) ne tue pas le process
|
||||||
|
try:
|
||||||
|
os.kill(pid, 0)
|
||||||
|
return True
|
||||||
|
except (OSError, ProcessLookupError):
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _acquire_lock() -> bool:
|
||||||
|
"""Tente d'acquérir le verrou PID. Retourne False si une autre instance tourne."""
|
||||||
|
my_pid = os.getpid()
|
||||||
|
|
||||||
|
# Lire le PID existant
|
||||||
|
if os.path.isfile(LOCK_FILE):
|
||||||
|
try:
|
||||||
|
with open(LOCK_FILE, "r", encoding="utf-8") as f:
|
||||||
|
old_pid = int(f.read().strip())
|
||||||
|
# Le PID dans le lock est-il encore vivant ?
|
||||||
|
if old_pid != my_pid and _pid_is_alive(old_pid):
|
||||||
|
return False # Une autre instance tourne déjà
|
||||||
|
except (ValueError, OSError):
|
||||||
|
pass # Fichier corrompu — on l'écrase
|
||||||
|
|
||||||
|
# Écrire notre PID
|
||||||
|
try:
|
||||||
|
with open(LOCK_FILE, "w", encoding="utf-8") as f:
|
||||||
|
f.write(str(my_pid))
|
||||||
|
except OSError:
|
||||||
|
pass # Pas bloquant — on continue sans lock
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _release_lock():
|
||||||
|
"""Supprime le fichier lock au shutdown."""
|
||||||
|
try:
|
||||||
|
if os.path.isfile(LOCK_FILE):
|
||||||
|
with open(LOCK_FILE, "r", encoding="utf-8") as f:
|
||||||
|
stored_pid = int(f.read().strip())
|
||||||
|
# Ne supprimer que si c'est bien NOTRE lock
|
||||||
|
if stored_pid == os.getpid():
|
||||||
|
os.remove(LOCK_FILE)
|
||||||
|
except (ValueError, OSError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# Vérification du lock AVANT toute initialisation lourde
|
||||||
|
if not _acquire_lock():
|
||||||
|
# Une autre instance de Léa tourne déjà — on quitte silencieusement
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
atexit.register(_release_lock)
|
||||||
|
|
||||||
# Charger config.txt et .env comme variables d'environnement
|
# Charger config.txt et .env comme variables d'environnement
|
||||||
# (équivalent du `set` dans Lea.bat, mais fonctionne aussi sans le .bat)
|
# (équivalent du `set` dans Lea.bat, mais fonctionne aussi sans le .bat)
|
||||||
for config_file in ("config.txt", ".env"):
|
for config_file in ("config.txt", ".env"):
|
||||||
@@ -32,7 +117,7 @@ logging.basicConfig(
|
|||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
|
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
|
||||||
)
|
)
|
||||||
logging.info("=== Agent V1 démarrage — config chargée ===")
|
logging.info("=== Agent V1 démarrage — config chargée (PID %d) ===", os.getpid())
|
||||||
logging.info("RPA_SERVER_URL=%s", os.environ.get("RPA_SERVER_URL", "(non défini)"))
|
logging.info("RPA_SERVER_URL=%s", os.environ.get("RPA_SERVER_URL", "(non défini)"))
|
||||||
logging.info("RPA_SERVER_HOST=%s", os.environ.get("RPA_SERVER_HOST", "(non défini)"))
|
logging.info("RPA_SERVER_HOST=%s", os.environ.get("RPA_SERVER_HOST", "(non défini)"))
|
||||||
logging.info("RPA_API_TOKEN=%s", os.environ.get("RPA_API_TOKEN", "(non défini)")[:8] + "...")
|
logging.info("RPA_API_TOKEN=%s", os.environ.get("RPA_API_TOKEN", "(non défini)")[:8] + "...")
|
||||||
|
|||||||
@@ -488,6 +488,8 @@ class ReplayResultReport(BaseModel):
|
|||||||
# Champs enrichis pour target_not_found (pause supervisée)
|
# Champs enrichis pour target_not_found (pause supervisée)
|
||||||
target_description: Optional[str] = None # Description humaine de la cible
|
target_description: Optional[str] = None # Description humaine de la cible
|
||||||
target_spec: Optional[Dict[str, Any]] = None # Spec complete de la cible
|
target_spec: Optional[Dict[str, Any]] = None # Spec complete de la cible
|
||||||
|
# Correction humaine (mode apprentissage supervisé)
|
||||||
|
correction: Optional[Dict[str, Any]] = None # {x_pct, y_pct, uia_snapshot, crop_b64}
|
||||||
|
|
||||||
|
|
||||||
class ErrorCallbackConfig(BaseModel):
|
class ErrorCallbackConfig(BaseModel):
|
||||||
@@ -1883,6 +1885,26 @@ async def start_raw_replay(request: RawReplayRequest):
|
|||||||
resolved_machine_id = target_machine_id or (session_obj.machine_id if session_obj else "default")
|
resolved_machine_id = target_machine_id or (session_obj.machine_id if session_obj else "default")
|
||||||
|
|
||||||
with _replay_lock:
|
with _replay_lock:
|
||||||
|
# ── Nettoyage : annuler les replays bloqués pour cette machine ──
|
||||||
|
# Un replay en paused_need_help bloque tous les suivants.
|
||||||
|
# Quand on lance un nouveau replay, les anciens sont obsolètes.
|
||||||
|
stale_ids = [
|
||||||
|
rid for rid, state in _replay_states.items()
|
||||||
|
if state.get("machine_id") == resolved_machine_id
|
||||||
|
and state["status"] in ("paused_need_help", "running")
|
||||||
|
]
|
||||||
|
for rid in stale_ids:
|
||||||
|
old_state = _replay_states[rid]
|
||||||
|
old_sid = old_state.get("session_id", "")
|
||||||
|
old_state["status"] = "cancelled"
|
||||||
|
# Vider la queue associée
|
||||||
|
if old_sid in _replay_queues:
|
||||||
|
_replay_queues.pop(old_sid, None)
|
||||||
|
logger.info(
|
||||||
|
f"Replay {rid} annulé (remplacé par {replay_id}) — "
|
||||||
|
f"était {old_state.get('completed_actions', 0)}/{old_state.get('total_actions', 0)}"
|
||||||
|
)
|
||||||
|
|
||||||
_replay_queues[session_id] = list(actions)
|
_replay_queues[session_id] = list(actions)
|
||||||
_replay_states[replay_id] = _create_replay_state(
|
_replay_states[replay_id] = _create_replay_state(
|
||||||
replay_id=replay_id,
|
replay_id=replay_id,
|
||||||
@@ -3032,6 +3054,26 @@ async def report_action_result(report: ReplayResultReport):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Learning: échec enregistrement: {e}")
|
logger.debug(f"Learning: échec enregistrement: {e}")
|
||||||
|
|
||||||
|
# === Correction humaine (mode apprentissage supervisé) ===
|
||||||
|
# L'humain a montré à Léa où cliquer. On stocke cette correction
|
||||||
|
# dans target_memory pour que la prochaine fois, Léa sache toute seule.
|
||||||
|
if report.correction and original_action:
|
||||||
|
try:
|
||||||
|
corr = report.correction
|
||||||
|
target_spec = original_action.get("target_spec", {})
|
||||||
|
logger.info(
|
||||||
|
f"[APPRENTISSAGE] Correction humaine reçue : "
|
||||||
|
f"({corr.get('x_pct', 0):.4f}, {corr.get('y_pct', 0):.4f}) "
|
||||||
|
f"pour '{target_spec.get('by_text', '?')}'"
|
||||||
|
)
|
||||||
|
_replay_learner.record_human_correction(
|
||||||
|
session_id=session_id,
|
||||||
|
action=original_action,
|
||||||
|
correction=corr,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Learning: échec stockage correction humaine: {e}")
|
||||||
|
|
||||||
# === Audit Trail : traçabilité complète pour conformité hospitalière ===
|
# === Audit Trail : traçabilité complète pour conformité hospitalière ===
|
||||||
try:
|
try:
|
||||||
_action = original_action or {"action_id": action_id, "type": "unknown"}
|
_action = original_action or {"action_id": action_id, "type": "unknown"}
|
||||||
|
|||||||
@@ -175,6 +175,55 @@ class ReplayLearner:
|
|||||||
|
|
||||||
self.record(outcome)
|
self.record(outcome)
|
||||||
|
|
||||||
|
def record_human_correction(
|
||||||
|
self,
|
||||||
|
session_id: str,
|
||||||
|
action: Dict[str, Any],
|
||||||
|
correction: Dict[str, Any],
|
||||||
|
) -> None:
|
||||||
|
"""Enregistrer une correction humaine (mode apprentissage supervisé).
|
||||||
|
|
||||||
|
L'humain a montré à Léa où cliquer. On stocke cette correction
|
||||||
|
dans target_memory.db pour que la prochaine fois, Léa sache.
|
||||||
|
"""
|
||||||
|
target_spec = action.get("target_spec", {})
|
||||||
|
by_text = target_spec.get("by_text", "")
|
||||||
|
window_title = target_spec.get("window_title", "")
|
||||||
|
x_pct = correction.get("x_pct", 0.0)
|
||||||
|
y_pct = correction.get("y_pct", 0.0)
|
||||||
|
|
||||||
|
# Enregistrer dans le JSONL d'apprentissage
|
||||||
|
outcome = ActionOutcome(
|
||||||
|
session_id=session_id,
|
||||||
|
action_id=action.get("action_id", ""),
|
||||||
|
action_type="click",
|
||||||
|
target_description=by_text,
|
||||||
|
window_title=window_title,
|
||||||
|
resolution_method="human_supervised",
|
||||||
|
resolution_score=1.0, # Confiance maximale — l'humain a montré
|
||||||
|
success=True,
|
||||||
|
)
|
||||||
|
self.record(outcome)
|
||||||
|
|
||||||
|
# Stocker dans target_memory.db pour le lookup futur
|
||||||
|
try:
|
||||||
|
from .replay_memory import get_target_memory_store
|
||||||
|
store = get_target_memory_store()
|
||||||
|
if store:
|
||||||
|
store.record_success(
|
||||||
|
screen_signature="human_correction",
|
||||||
|
target_spec=target_spec,
|
||||||
|
resolved_position={"x_pct": x_pct, "y_pct": y_pct},
|
||||||
|
method="human_supervised",
|
||||||
|
score=1.0,
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"[APPRENTISSAGE] Correction stockée dans target_memory : "
|
||||||
|
f"'{by_text}' → ({x_pct:.4f}, {y_pct:.4f})"
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Learning: échec stockage target_memory: {e}")
|
||||||
|
|
||||||
def query_similar(
|
def query_similar(
|
||||||
self,
|
self,
|
||||||
target_description: str = "",
|
target_description: str = "",
|
||||||
|
|||||||
@@ -51,10 +51,14 @@ echo Pour arreter Lea : clic droit sur l'icone ^> "Quitter Lea"
|
|||||||
echo Vous pouvez fermer cette fenetre.
|
echo Vous pouvez fermer cette fenetre.
|
||||||
echo.
|
echo.
|
||||||
|
|
||||||
.venv\Scripts\pythonw.exe run_agent_v1.py
|
start "" /b .venv\Scripts\pythonw.exe run_agent_v1.py
|
||||||
|
|
||||||
|
:: Attendre 3s puis verifier que Lea tourne
|
||||||
|
timeout /t 3 >nul
|
||||||
|
tasklist /FI "IMAGENAME eq pythonw.exe" /NH 2>nul | findstr /I "pythonw" >nul
|
||||||
if errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo.
|
echo.
|
||||||
echo Lea a rencontre un probleme au demarrage.
|
echo Lea n'a pas demarre correctement.
|
||||||
echo Tentative avec affichage des erreurs...
|
echo Tentative avec affichage des erreurs...
|
||||||
echo.
|
echo.
|
||||||
.venv\Scripts\python.exe run_agent_v1.py
|
.venv\Scripts\python.exe run_agent_v1.py
|
||||||
|
|||||||
@@ -875,17 +875,174 @@ def _find_session_dir(machine_id: str, session_id: str) -> Optional[Path]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _load_crop_as_base64(session_dir: Path, screenshot_id: str) -> str:
|
||||||
|
"""Charger un crop screenshot et le retourner en base64.
|
||||||
|
|
||||||
|
Le crop (80x80 autour du clic) sert d'ancre pour le template matching —
|
||||||
|
le GroundingEngine compare cette vignette a l'ecran actuel via OpenCV.
|
||||||
|
"""
|
||||||
|
if not screenshot_id:
|
||||||
|
return ""
|
||||||
|
crop_path = session_dir / "shots" / f"{screenshot_id}_crop.png"
|
||||||
|
if not crop_path.is_file():
|
||||||
|
return ""
|
||||||
|
try:
|
||||||
|
import base64
|
||||||
|
data = crop_path.read_bytes()
|
||||||
|
return base64.b64encode(data).decode("ascii")
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _build_vlm_description(
|
||||||
|
uia_snapshot: Dict[str, Any], window_info: Dict[str, Any],
|
||||||
|
) -> str:
|
||||||
|
"""Construire une description naturelle pour le VLM.
|
||||||
|
|
||||||
|
Le VLM recoit cette phrase + le screenshot actuel et doit localiser
|
||||||
|
l'element decrit. Plus la description est precise, meilleur le grounding.
|
||||||
|
"""
|
||||||
|
name = uia_snapshot.get("name", "")
|
||||||
|
control_type = uia_snapshot.get("control_type", "")
|
||||||
|
window_title = window_info.get("title", "") if window_info else ""
|
||||||
|
|
||||||
|
parts = []
|
||||||
|
if control_type:
|
||||||
|
parts.append(f"le {control_type}")
|
||||||
|
if name:
|
||||||
|
parts.append(f"'{name}'")
|
||||||
|
if window_title and window_title != "unknown_window":
|
||||||
|
parts.append(f"dans la fenetre '{window_title}'")
|
||||||
|
|
||||||
|
if parts:
|
||||||
|
return " ".join(parts)
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _build_full_target_spec(
|
||||||
|
event: Dict[str, Any], session_dir: Path,
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Construire un target_spec complet pour la cascade de resolution visuelle.
|
||||||
|
|
||||||
|
Exploite TOUTES les donnees capturees pendant l'enregistrement :
|
||||||
|
- uia_snapshot → resolution UIA locale (lea_uia.exe, 10-20ms)
|
||||||
|
- crop screenshot → template matching OpenCV (~100ms)
|
||||||
|
- nom UIA + window_title → OCR docTR + VLM grounding (1-5s)
|
||||||
|
|
||||||
|
La cascade : UIA → template → serveur (docTR+VLM) → VLM local.
|
||||||
|
Si tout echoue → pause supervisee (pas de clic aveugle).
|
||||||
|
"""
|
||||||
|
uia_snapshot = event.get("uia_snapshot", {})
|
||||||
|
window_info = event.get("window", {})
|
||||||
|
vision_info = event.get("vision_info", {})
|
||||||
|
screenshot_id = event.get("screenshot_id", "")
|
||||||
|
|
||||||
|
name = uia_snapshot.get("name", "") if uia_snapshot else ""
|
||||||
|
control_type = uia_snapshot.get("control_type", "") if uia_snapshot else ""
|
||||||
|
automation_id = uia_snapshot.get("automation_id", "") if uia_snapshot else ""
|
||||||
|
parent_path = uia_snapshot.get("parent_path", []) if uia_snapshot else []
|
||||||
|
window_title = window_info.get("title", "") if window_info else ""
|
||||||
|
|
||||||
|
# Cascade de resolution — UIA d'abord (rapide), puis vision
|
||||||
|
resolve_order = []
|
||||||
|
|
||||||
|
# UIA : disponible si on a un nom ou automation_id
|
||||||
|
has_uia = bool(name or automation_id)
|
||||||
|
if has_uia:
|
||||||
|
resolve_order.append("uia")
|
||||||
|
|
||||||
|
# Template matching : disponible si on a un crop
|
||||||
|
anchor_b64 = _load_crop_as_base64(session_dir, screenshot_id)
|
||||||
|
if anchor_b64:
|
||||||
|
resolve_order.append("template")
|
||||||
|
|
||||||
|
# Serveur (docTR OCR + SomEngine + VLM) : toujours en fallback
|
||||||
|
resolve_order.append("server")
|
||||||
|
|
||||||
|
# VLM local : dernier recours
|
||||||
|
resolve_order.append("vlm_local")
|
||||||
|
|
||||||
|
if not resolve_order:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
target_spec: Dict[str, Any] = {
|
||||||
|
"resolve_order": resolve_order,
|
||||||
|
"window_title": window_title,
|
||||||
|
}
|
||||||
|
|
||||||
|
# UIA target
|
||||||
|
if has_uia:
|
||||||
|
target_spec["uia_target"] = {
|
||||||
|
"name": name,
|
||||||
|
"control_type": control_type,
|
||||||
|
"automation_id": automation_id,
|
||||||
|
"parent_path": parent_path,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Anchor pour template matching
|
||||||
|
if anchor_b64:
|
||||||
|
target_spec["anchor_image_base64"] = anchor_b64
|
||||||
|
|
||||||
|
# Texte pour OCR (docTR sur le serveur)
|
||||||
|
if name:
|
||||||
|
target_spec["by_text"] = name
|
||||||
|
|
||||||
|
# Description VLM
|
||||||
|
vlm_desc = _build_vlm_description(uia_snapshot or {}, window_info or {})
|
||||||
|
if vlm_desc:
|
||||||
|
target_spec["vlm_description"] = vlm_desc
|
||||||
|
|
||||||
|
return target_spec
|
||||||
|
|
||||||
|
|
||||||
|
def _build_desktop_cleanup_actions(screen_w: int, screen_h: int) -> List[Dict[str, Any]]:
|
||||||
|
"""Construire les actions de nettoyage bureau AVANT le replay.
|
||||||
|
|
||||||
|
Sur Windows 11, un clic sur l'extreme droite de la barre des taches
|
||||||
|
(le pixel invisible 'Afficher le bureau') minimise toutes les fenetres.
|
||||||
|
C'est exactement ce qu'un humain ferait avant de commencer un travail :
|
||||||
|
repartir d'un bureau propre.
|
||||||
|
|
||||||
|
100% visuel — pas de raccourci clavier injecte (cf feedback_100pct_visual).
|
||||||
|
"""
|
||||||
|
# Le bouton 'Afficher le bureau' est au pixel tout en bas a droite
|
||||||
|
# de la taskbar. Sur Win11, c'est une fine bande cliquable.
|
||||||
|
x_pct = round((screen_w - 2) / screen_w, 6) # avant-dernier pixel
|
||||||
|
y_pct = round((screen_h - 2) / screen_h, 6) # idem vertical
|
||||||
|
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"action_id": f"act_setup_desktop_{uuid.uuid4().hex[:6]}",
|
||||||
|
"type": "click",
|
||||||
|
"x_pct": x_pct,
|
||||||
|
"y_pct": y_pct,
|
||||||
|
"button": "left",
|
||||||
|
"visual_mode": False, # position fixe, pas besoin de grounding
|
||||||
|
"wait_before": 0.3,
|
||||||
|
"_setup_action": True, # marqueur pour le distinguer des vrais clics
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"action_id": f"act_setup_wait_{uuid.uuid4().hex[:6]}",
|
||||||
|
"type": "wait",
|
||||||
|
"duration_ms": 1000,
|
||||||
|
"wait_before": 0,
|
||||||
|
"_setup_action": True,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def _simple_build_replay(events: List[Dict[str, Any]], session_dir: Path) -> List[Dict[str, Any]]:
|
def _simple_build_replay(events: List[Dict[str, Any]], session_dir: Path) -> List[Dict[str, Any]]:
|
||||||
"""Construire un replay simplifie sans dependre de stream_processor.
|
"""Construire un replay visuel depuis les evenements bruts.
|
||||||
|
|
||||||
Convertit les evenements bruts en actions normalisees simples :
|
Chaque clic est enrichi avec un target_spec complet qui alimente
|
||||||
- mouse_click -> action click (coordonnees en pixels)
|
la cascade de resolution du GroundingEngine :
|
||||||
- text_input / type -> action type
|
UIA local (10ms) → template matching (100ms) → serveur docTR/VLM (2-5s)
|
||||||
- key_combo / key_press -> action key_combo
|
|
||||||
|
|
||||||
C'est un fallback pour quand build_replay_from_raw_events n'est pas disponible.
|
Les coordonnees x_pct/y_pct sont incluses comme hint de derniere chance.
|
||||||
Les coordonnees ne sont PAS converties en pourcentages (le serveur les accepte
|
Lea ne clique pas en aveugle — elle VOIT l'ecran et CHERCHE l'element.
|
||||||
aussi en pixels).
|
|
||||||
|
Le replay commence par un nettoyage du bureau (clic 'Afficher le bureau')
|
||||||
|
pour partir d'un etat propre — exactement comme un humain.
|
||||||
"""
|
"""
|
||||||
actions: List[Dict[str, Any]] = []
|
actions: List[Dict[str, Any]] = []
|
||||||
click_count = 0
|
click_count = 0
|
||||||
@@ -900,6 +1057,9 @@ def _simple_build_replay(events: List[Dict[str, Any]], session_dir: Path) -> Lis
|
|||||||
screen_w, screen_h = int(res[0]), int(res[1])
|
screen_w, screen_h = int(res[0]), int(res[1])
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# ── Étape 0 : nettoyer le bureau ──
|
||||||
|
actions.extend(_build_desktop_cleanup_actions(screen_w, screen_h))
|
||||||
|
|
||||||
for ev in events:
|
for ev in events:
|
||||||
inner = ev.get("event", {})
|
inner = ev.get("event", {})
|
||||||
etype = inner.get("type", "")
|
etype = inner.get("type", "")
|
||||||
@@ -913,15 +1073,35 @@ def _simple_build_replay(events: List[Dict[str, Any]], session_dir: Path) -> Lis
|
|||||||
pos = inner.get("pos", [0, 0])
|
pos = inner.get("pos", [0, 0])
|
||||||
click_count += 1
|
click_count += 1
|
||||||
|
|
||||||
|
x_pct = round(pos[0] / screen_w, 6) if screen_w else 0.0
|
||||||
|
y_pct = round(pos[1] / screen_h, 6) if screen_h else 0.0
|
||||||
|
|
||||||
action = {
|
action = {
|
||||||
"action_id": action_id,
|
"action_id": action_id,
|
||||||
"type": "click",
|
"type": "click",
|
||||||
"x_pct": round(pos[0] / screen_w, 6) if screen_w else 0.0,
|
"x_pct": x_pct,
|
||||||
"y_pct": round(pos[1] / screen_h, 6) if screen_h else 0.0,
|
"y_pct": y_pct,
|
||||||
"button": inner.get("button", "left"),
|
"button": inner.get("button", "left"),
|
||||||
"visual_mode": False, # pas d'enrichissement → coords brutes
|
|
||||||
"wait_before": 0.5,
|
"wait_before": 0.5,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Enrichir avec la cascade visuelle complete
|
||||||
|
target_spec = _build_full_target_spec(inner, session_dir)
|
||||||
|
if target_spec:
|
||||||
|
action["visual_mode"] = True
|
||||||
|
action["target_spec"] = target_spec
|
||||||
|
uia_name = inner.get("uia_snapshot", {}).get("name", "?")
|
||||||
|
methods = target_spec.get("resolve_order", [])
|
||||||
|
logger.info(
|
||||||
|
"Action %s enrichie [%s] : '%s' (%s)",
|
||||||
|
action_id, "+".join(methods), uia_name,
|
||||||
|
inner.get("uia_snapshot", {}).get("control_type", "?"),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Pas de donnee visuelle du tout → coords brutes en dernier recours
|
||||||
|
action["visual_mode"] = False
|
||||||
|
logger.warning("Action %s : aucune donnee visuelle, coords brutes", action_id)
|
||||||
|
|
||||||
actions.append(action)
|
actions.append(action)
|
||||||
|
|
||||||
elif etype in ("text_input", "type"):
|
elif etype in ("text_input", "type"):
|
||||||
|
|||||||
Reference in New Issue
Block a user