Pipeline replay visuel : - VLM-first : l'agent appelle Ollama directement pour trouver les éléments - Template matching en fallback (seuil strict 0.90) - Stop immédiat si élément non trouvé (pas de clic blind) - Replay depuis session brute (/replay-session) sans attendre le VLM - Vérification post-action (screenshot hash avant/après) - Gestion des popups (Enter/Escape/Tab+Enter) Worker VLM séparé : - run_worker.py : process distinct du serveur HTTP - Communication par fichiers (_worker_queue.txt + _replay_active.lock) - Le serveur HTTP ne fait plus jamais de VLM → toujours réactif - Service systemd rpa-worker.service Capture clavier : - raw_keys (vk + press/release) pour replay exact indépendant du layout - Fix AZERTY : ToUnicodeEx + AltGr detection - Enter capturé comme \n, Tab comme \t - Filtrage modificateurs seuls (Ctrl/Alt/Shift parasites) - Fusion text_input consécutifs, dédup key_combo Sécurité & Internet : - HTTPS Let's Encrypt (lea.labs + vwb.labs.laurinebazin.design) - Token API fixe dans .env.local - HTTP Basic Auth sur VWB - Security headers (HSTS, CSP, nosniff) - CORS domaines publics, plus de wildcard Infrastructure : - DPI awareness (SetProcessDpiAwareness) Python + Rust - Métadonnées système (dpi_scale, window_bounds, monitors, os_theme) - Template matching multi-scale [0.5, 2.0] - Résolution dynamique (plus de hardcode 1920x1080) - VLM prefill fix (47x speedup, 3.5s au lieu de 180s) Modules : - core/auth/ : credential vault (Fernet AES), TOTP (RFC 6238), auth handler - core/federation/ : LearningPack export/import anonymisé, FAISS global - deploy/ : package Léa (config.txt, Lea.bat, install.bat, LISEZMOI.txt) UX : - Filtrage OS (VWB + Chat montrent que les workflows de l'OS courant) - Bibliothèque persistante (cache local + SQLite) - Clustering hybride (titre fenêtre + DBSCAN) - EdgeConstraints + PostConditions peuplés - GraphBuilder compound actions (toutes les frappes) Agent Rust : - Token Bearer auth (network.rs) - sysinfo.rs (DPI, résolution, 
window bounds via Win32 API) - config.txt lu automatiquement - Support Chrome/Brave/Firefox (pas que Edge) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
196 lines
6.2 KiB
Python
196 lines
6.2 KiB
Python
# agent_v1/vision/system_info.py
"""
System metadata capture used to enrich recorded events.

Collects the DPI scale, screen resolution, active window, monitor layout,
OS theme and OS language. The Windows-specific code paths (ctypes.windll,
winreg) fall back gracefully on Linux/Mac.
"""

import locale
import logging
import platform
from typing import Any, Dict, List, Optional, Tuple

# Operating-system name, resolved once at import time to avoid repeated calls.
_SYSTEM = platform.system()

logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_dpi_scale() -> int:
    """Return the system DPI scale as a percentage (100 = normal, 150 = high DPI).

    On Windows the value is read from ``user32.GetDpiForSystem()`` and
    converted with 96 DPI == 100 %. On every other platform, and whenever
    the Windows call fails, 100 is returned.

    NOTE (from the original author, not verifiable from this file): the
    process must already be DPI-aware (SetProcessDpiAwareness(2), said to be
    called in config.py) for GetDpiForSystem to report the real DPI.
    """
    # Guard clause: only Windows exposes the API used below.
    if _SYSTEM != "Windows":
        return 100  # Linux/Mac fallback

    try:
        import ctypes

        raw_dpi = ctypes.windll.user32.GetDpiForSystem()
        percent = raw_dpi * 100 / 96  # 96 DPI = 100%
        return round(percent)
    except Exception as exc:
        logger.debug(f"Impossible de lire le DPI Windows : {exc}")
        return 100
|
|
|
|
|
|
def get_window_bounds() -> Optional[List[int]]:
    """Return ``[x, y, width, height]`` of the active window, or ``None``.

    Windows: ``GetWindowRect(GetForegroundWindow())`` through ctypes.
    Linux: best-effort query through the ``xdotool`` command-line tool.
    Any other platform, and any failure on the platforms above, yields
    ``None``; failures are logged at debug level and never raised.
    """
    if _SYSTEM == "Windows":
        try:
            import ctypes
            import ctypes.wintypes

            user32 = ctypes.windll.user32
            hwnd = user32.GetForegroundWindow()
            if not hwnd:
                # No foreground window (e.g. focus is changing).
                return None

            rect = ctypes.wintypes.RECT()
            # GetWindowRect returns 0 on failure and leaves `rect` zeroed;
            # report "unknown" instead of a bogus [0, 0, 0, 0] rectangle.
            if not user32.GetWindowRect(hwnd, ctypes.byref(rect)):
                return None

            return [
                rect.left,
                rect.top,
                rect.right - rect.left,
                rect.bottom - rect.top,
            ]
        except Exception as e:
            logger.debug(f"Impossible de lire les bounds fenetre : {e}")
            return None

    # Linux: best-effort attempt through xdotool.
    if _SYSTEM == "Linux":
        try:
            import subprocess

            wid = subprocess.check_output(
                ["xdotool", "getactivewindow"],
                stderr=subprocess.DEVNULL,
            ).decode().strip()
            geom = subprocess.check_output(
                ["xdotool", "getwindowgeometry", "--shell", wid],
                stderr=subprocess.DEVNULL,
            ).decode()

            # Parse the shell-style output: "X=...\nY=...\nWIDTH=...\nHEIGHT=..."
            vals: Dict[str, int] = {}
            for line in geom.strip().splitlines():
                if "=" in line:
                    k, v = line.split("=", 1)
                    vals[k.strip()] = int(v.strip())

            if {"X", "Y", "WIDTH", "HEIGHT"} <= vals.keys():
                return [vals["X"], vals["Y"], vals["WIDTH"], vals["HEIGHT"]]
        except Exception as e:
            # xdotool missing, no active window, or unexpected output:
            # stay best-effort, but leave a trace like the Windows branch does.
            logger.debug(
                f"Impossible de lire les bounds fenetre via xdotool : {e}"
            )

    return None
|
|
|
|
|
|
def get_monitor_info() -> Tuple[int, List[Dict[str, int]]]:
    """Return ``(monitor_index, monitors)``.

    ``monitors`` is a list of ``{"width", "height", "x", "y"}`` dicts read
    from ``mss`` (the virtual "all monitors" entry at index 0 is skipped).
    If ``mss`` cannot be used, a single 1920x1080 monitor at (0, 0) is
    returned instead.

    ``monitor_index`` is the index, within ``monitors``, of the monitor that
    contains the active window; it is 0 when there is a single monitor, when
    the window bounds are unknown, or when no monitor matches.
    """
    monitors: List[Dict[str, int]] = []

    try:
        import mss

        with mss.mss() as sct:
            # sct.monitors[0] is the virtual screen spanning every monitor.
            monitors = [
                {
                    "width": mon["width"],
                    "height": mon["height"],
                    "x": mon["left"],
                    "y": mon["top"],
                }
                for mon in sct.monitors[1:]
            ]
    except Exception as e:
        logger.debug(f"mss indisponible, resolution par defaut : {e}")
        monitors = [{"width": 1920, "height": 1080, "x": 0, "y": 0}]

    active_index = 0

    # With zero or one monitor the answer is always 0, so the (possibly
    # subprocess-based) window query is only made when it can matter.
    if len(monitors) > 1:
        bounds = get_window_bounds()
        if bounds:
            wx, wy, ww, wh = bounds
            # Test the window centre first: a maximized window on Windows
            # extends a few pixels past its monitor, so its top-left corner
            # can fall on the neighbouring monitor (or on none at all).
            # The top-left corner is kept as a fallback for windows whose
            # centre is off-screen.
            for px, py in ((wx + ww // 2, wy + wh // 2), (wx, wy)):
                match = next(
                    (
                        i
                        for i, mon in enumerate(monitors)
                        if mon["x"] <= px < mon["x"] + mon["width"]
                        and mon["y"] <= py < mon["y"] + mon["height"]
                    ),
                    None,
                )
                if match is not None:
                    active_index = match
                    break

    return active_index, monitors
|
|
|
|
|
|
def get_os_theme() -> str:
    """Return ``'light'``, ``'dark'`` or ``'unknown'``.

    Windows: reads ``AppsUseLightTheme`` under the current user's
    ``Themes\\Personalize`` registry key (1 means light, anything else dark).
    Linux: asks ``gsettings`` for the GNOME ``color-scheme`` setting.
    Any other platform, and any failure, yields ``'unknown'``; failures are
    logged at debug level and never raised.
    """
    if _SYSTEM == "Windows":
        try:
            import winreg

            # The handle is a context manager: it is closed even when
            # QueryValueEx raises (e.g. the value does not exist).
            with winreg.OpenKey(
                winreg.HKEY_CURRENT_USER,
                r"Software\Microsoft\Windows\CurrentVersion\Themes\Personalize",
            ) as key:
                value, _ = winreg.QueryValueEx(key, "AppsUseLightTheme")
            return "light" if value == 1 else "dark"
        except Exception as e:
            logger.debug(f"Impossible de lire le theme Windows : {e}")
            return "unknown"

    # Linux: best-effort attempt through gsettings (GNOME).
    if _SYSTEM == "Linux":
        try:
            import subprocess

            result = subprocess.check_output(
                ["gsettings", "get", "org.gnome.desktop.interface", "color-scheme"],
                stderr=subprocess.DEVNULL,
            ).decode().strip().strip("'\"")
            scheme = result.lower()
            if "dark" in scheme:
                return "dark"
            if "light" in scheme or "default" in scheme:
                return "light"
        except Exception as e:
            # gsettings missing or schema/key unavailable: stay best-effort,
            # but leave a trace like the Windows branch does.
            logger.debug(f"Impossible de lire le theme via gsettings : {e}")

    return "unknown"
|
|
|
|
|
|
def get_os_language() -> str:
    """Return the OS language code (``fr``, ``en``, ``de``, ...).

    The code is the language part of the default locale name, i.e. the text
    before the first ``_`` (or ``-``) separator, lower-cased. Returns
    ``'unknown'`` when the locale cannot be determined.
    """
    try:
        lang = locale.getdefaultlocale()[0]  # e.g. 'fr_FR'
    except Exception as e:
        logger.debug(f"Impossible de lire la langue systeme : {e}")
        return "unknown"

    if not lang:
        return "unknown"

    # Split on the separator instead of slicing two characters: language
    # codes may be three letters long ('fil_PH', 'ast_ES'), and truncating
    # them yields a different, valid-looking language ('fi', 'as').
    code = lang.replace("-", "_").split("_", 1)[0].lower()
    return code or "unknown"
|
|
|
|
|
|
def get_screen_metadata() -> Dict[str, Any]:
    """Collect every piece of system metadata in a single call.

    Per the original author's note, this is meant to be called once at
    startup and again on every focus change, and the result is attached to
    the events sent to the server (callers are not visible in this file).

    Returns a dict with the keys ``dpi_scale``, ``monitor_index``,
    ``monitors``, ``screen_resolution`` (width/height of the first monitor,
    or 1920x1080 if the monitor list is empty), ``window_bounds``,
    ``os_theme`` and ``os_language``.
    """
    active_monitor, all_monitors = get_monitor_info()

    # The first listed monitor provides the reported screen resolution.
    if all_monitors:
        primary = all_monitors[0]
    else:
        primary = {"width": 1920, "height": 1080}

    # Filled in one key at a time, in the same order as the probes are run.
    metadata: Dict[str, Any] = {}
    metadata["dpi_scale"] = get_dpi_scale()
    metadata["monitor_index"] = active_monitor
    metadata["monitors"] = all_monitors
    metadata["screen_resolution"] = [primary["width"], primary["height"]]
    metadata["window_bounds"] = get_window_bounds()
    metadata["os_theme"] = get_os_theme()
    metadata["os_language"] = get_os_language()
    return metadata
|