From 2d71e2a249027e1b0345642c066687d9f54f9435 Mon Sep 17 00:00:00 2001 From: Dom Date: Tue, 5 May 2026 23:05:44 +0200 Subject: [PATCH] feat(qw1): enrichissement Agent V1 (monitor_index + monitors_geometry) + hook serveur MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Côté client Agent V1 : - helpers _get_monitors_geometry() / _get_active_monitor_index() via screeninfo (fallback gracieux [] / None si screeninfo absent) - _enrich_with_monitor_info() ajouté aux payloads dict de capture_dual, capture_active_window, et heartbeat_event poussé par main.py - screeninfo>=0.8 ajouté aux requirements (source + deploy Windows) - Deploy capturer.py reçoit l'enrichissement de manière additive (pas de copie verbatim qui aurait introduit BLUR_SENSITIVE absent côté deploy) Côté serveur : - import resolve_target_monitor depuis monitor_router (créé en QW1.1) - /replay/next : enrichissement action.monitor_resolution avant envoi au client (idx, offset_x/y, w, h, source de la décision) - live_session_manager.add_event : propagation monitor_index + monitors_geometry depuis window_capture ET depuis le payload event brut (cas heartbeat enrichi sans window/window_title) Cascade de résolution (cf monitor_router.py) : 1. action.monitor_index (hérité de la session source) 2. session.last_focused_monitor (focus actif vu en dernier heartbeat) 3. composite_fallback (offset 0,0) — backward compat strict Backward 100% : si geometry vide, fallback composite identique au comportement actuel mss.monitors[0]. Tests : baseline 89/89 préservée, monitor_router 4/4 OK (total 93/93). Co-Authored-By: Claude Opus 4.7 (1M context) --- agent_v0/agent_v1/main.py | 6 ++ agent_v0/agent_v1/requirements.txt | 1 + agent_v0/agent_v1/vision/capturer.py | 68 ++++++++++++++++++- .../agent_v1/vision/capturer.py | 68 ++++++++++++++++++- .../deploy/windows_client/requirements.txt | 1 + agent_v0/server_v1/api_stream.py | 52 ++++++++++++++ agent_v0/server_v1/live_session_manager.py | 14 ++++ 7 files changed, 208 insertions(+), 2 deletions(-) diff --git a/agent_v0/agent_v1/main.py b/agent_v0/agent_v1/main.py index ecd8afae0..ef743aa5d 100644 --- a/agent_v0/agent_v1/main.py +++ b/agent_v0/agent_v1/main.py @@ -448,6 +448,12 @@ class AgentV1: window_title = self.vision.get_active_window_title() if window_title: heartbeat_event["active_window_title"] = window_title + # QW1 — enrichissement multi-écrans (additif, fallback gracieux) + try: + from .vision.capturer import _enrich_with_monitor_info + _enrich_with_monitor_info(heartbeat_event) + except Exception: + pass self.streamer.push_event(heartbeat_event) except Exception as e: logger.error(f"Heartbeat error: {e}") diff --git a/agent_v0/agent_v1/requirements.txt b/agent_v0/agent_v1/requirements.txt index c1d9b4609..e1a07c190 100644 --- a/agent_v0/agent_v1/requirements.txt +++ b/agent_v0/agent_v1/requirements.txt @@ -5,6 +5,7 @@ Pillow>=10.0.0 # Crops et processing image requests>=2.31.0 # Streaming réseau python-socketio[client]>=5.10,<6.0 # Bus feedback Léa 'lea:*' (compat Flask-SocketIO 5.3.x serveur) psutil>=5.9.0 # Monitoring CPU/RAM +screeninfo>=0.8 # QW1 — détection des monitors physiques + offsets pystray>=0.19.5 # Icône Tray UI plyer>=2.1.0 # Notifications toast natives (remplace PyQt5) pywebview>=5.0 # Fenêtre de chat Léa intégrée (Edge WebView2 sur Windows) diff --git a/agent_v0/agent_v1/vision/capturer.py b/agent_v0/agent_v1/vision/capturer.py index 2ef6f6c37..0b091be8d 100644 --- a/agent_v0/agent_v1/vision/capturer.py +++ b/agent_v0/agent_v1/vision/capturer.py @@ -15,7 +15,7 @@ import time import logging import hashlib import platform -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional from PIL import Image, ImageFilter, ImageStat import mss from ..config import TARGETED_CROP_SIZE, SCREENSHOT_QUALITY, BLUR_SENSITIVE @@ -26,6 +26,66 @@ logger = logging.getLogger(__name__) # OS courant (détecté une seule fois) _SYSTEM = platform.system() +# QW1 — détection multi-écrans (fallback gracieux si screeninfo absent) +try: + from screeninfo import get_monitors as _screeninfo_get_monitors + _SCREENINFO_AVAILABLE = True +except ImportError: + _SCREENINFO_AVAILABLE = False + + +def _get_monitors_geometry() -> List[Dict[str, Any]]: + """Retourne la liste des monitors physiques avec leurs offsets. + + Returns: + List[dict] : [{idx, x, y, w, h, primary}, ...]. Vide si screeninfo + indisponible (le serveur tombera sur fallback composite). + """ + if not _SCREENINFO_AVAILABLE: + return [] + try: + monitors = _screeninfo_get_monitors() + return [ + { + "idx": i, + "x": int(m.x), + "y": int(m.y), + "w": int(m.width), + "h": int(m.height), + "primary": bool(getattr(m, "is_primary", False)), + } + for i, m in enumerate(monitors) + ] + except Exception: + return [] + + +def _get_active_monitor_index() -> Optional[int]: + """Retourne l'index logique du monitor où se trouve le curseur (focus actif). + + Returns: + int ou None si indéterminable. + """ + if not _SCREENINFO_AVAILABLE: + return None + try: + import pyautogui # import paresseux : évite la dépendance dure + cx, cy = pyautogui.position() + for i, m in enumerate(_screeninfo_get_monitors()): + if m.x <= cx < m.x + m.width and m.y <= cy < m.y + m.height: + return i + except Exception: + return None + return None + + +def _enrich_with_monitor_info(payload: dict) -> dict: + """Ajoute monitor_index et monitors_geometry au payload (in-place + return).""" + if isinstance(payload, dict): + payload["monitor_index"] = _get_active_monitor_index() + payload["monitors_geometry"] = _get_monitors_geometry() + return payload + class VisionCapturer: def __init__(self, session_dir: str): self.session_dir = session_dir @@ -121,6 +181,9 @@ class VisionCapturer: if window_info: result["window_capture"] = window_info + # QW1 — enrichissement multi-écrans (additif, fallback gracieux) + _enrich_with_monitor_info(result) + return result except Exception as e: logger.error(f"Erreur Dual Capture: {e}") @@ -223,6 +286,9 @@ class VisionCapturer: "click_inside_window": click_inside, } + # QW1 — enrichissement multi-écrans (additif) + _enrich_with_monitor_info(result) + logger.debug( f"Fenêtre capturée : {title} ({win_w}x{win_h}) — " f"clic relatif ({click_rel_x}, {click_rel_y})" diff --git a/agent_v0/deploy/windows_client/agent_v1/vision/capturer.py b/agent_v0/deploy/windows_client/agent_v1/vision/capturer.py index 4557b7f18..8d3dc33db 100644 --- a/agent_v0/deploy/windows_client/agent_v1/vision/capturer.py +++ b/agent_v0/deploy/windows_client/agent_v1/vision/capturer.py @@ -8,12 +8,73 @@ import os import time import logging import hashlib +from typing import Any, Dict, List, Optional from PIL import Image, ImageFilter, ImageStat import mss from ..config import TARGETED_CROP_SIZE, SCREENSHOT_QUALITY logger = logging.getLogger(__name__) +# QW1 — détection multi-écrans (fallback gracieux si screeninfo absent) +try: + from screeninfo import get_monitors as _screeninfo_get_monitors + _SCREENINFO_AVAILABLE = True +except ImportError: + _SCREENINFO_AVAILABLE = False + + +def _get_monitors_geometry() -> List[Dict[str, Any]]: + """Retourne la liste des monitors physiques avec leurs offsets. + + Returns: + List[dict] : [{idx, x, y, w, h, primary}, ...]. Vide si screeninfo + indisponible (le serveur tombera sur fallback composite). + """ + if not _SCREENINFO_AVAILABLE: + return [] + try: + monitors = _screeninfo_get_monitors() + return [ + { + "idx": i, + "x": int(m.x), + "y": int(m.y), + "w": int(m.width), + "h": int(m.height), + "primary": bool(getattr(m, "is_primary", False)), + } + for i, m in enumerate(monitors) + ] + except Exception: + return [] + + +def _get_active_monitor_index() -> Optional[int]: + """Retourne l'index logique du monitor où se trouve le curseur (focus actif). + + Returns: + int ou None si indéterminable. + """ + if not _SCREENINFO_AVAILABLE: + return None + try: + import pyautogui # import paresseux : évite la dépendance dure + cx, cy = pyautogui.position() + for i, m in enumerate(_screeninfo_get_monitors()): + if m.x <= cx < m.x + m.width and m.y <= cy < m.y + m.height: + return i + except Exception: + return None + return None + + +def _enrich_with_monitor_info(payload: dict) -> dict: + """Ajoute monitor_index et monitors_geometry au payload (in-place + return).""" + if isinstance(payload, dict): + payload["monitor_index"] = _get_active_monitor_index() + payload["monitors_geometry"] = _get_monitors_geometry() + return payload + class VisionCapturer: def __init__(self, session_dir: str): self.session_dir = session_dir @@ -72,7 +133,12 @@ class VisionCapturer: # Mise à jour du hash pour le prochain heartbeat self.last_img_hash = self._compute_quick_hash(img) - return {"full": full_path, "crop": crop_path} + result = {"full": full_path, "crop": crop_path} + + # QW1 — enrichissement multi-écrans (additif, fallback gracieux) + _enrich_with_monitor_info(result) + + return result except Exception as e: logger.error(f"Erreur Dual Capture: {e}") return {} diff --git a/agent_v0/deploy/windows_client/requirements.txt b/agent_v0/deploy/windows_client/requirements.txt index 1c00e66b4..85b3e8c55 100644 --- a/agent_v0/deploy/windows_client/requirements.txt +++ b/agent_v0/deploy/windows_client/requirements.txt @@ -5,6 +5,7 @@ Pillow>=10.0.0 # Crops et processing image requests>=2.31.0 # Streaming réseau python-socketio[client]>=5.10,<6.0 # Bus feedback Léa 'lea:*' (compat Flask-SocketIO 5.3.x serveur) psutil>=5.9.0 # Monitoring CPU/RAM +screeninfo>=0.8 # QW1 — détection des monitors physiques + offsets pystray>=0.19.5 # Icône Tray UI plyer>=2.1.0 # Notifications toast natives (remplace PyQt5) diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py index bedc17528..fa62f438b 100644 --- a/agent_v0/server_v1/api_stream.py +++ b/agent_v0/server_v1/api_stream.py @@ -33,6 +33,7 @@ from .audit_trail import AuditTrail, AuditEntry from .agent_registry import AgentRegistry, AgentAlreadyEnrolledError from .stream_processor import StreamProcessor, build_replay_from_raw_events, enrich_click_from_screenshot from .worker_stream import StreamWorker +from .monitor_router import resolve_target_monitor # QW1 — résolution écran cible from .execution_plan_runner import ( execution_plan_to_actions, inject_plan_into_queue, @@ -222,6 +223,7 @@ from .replay_engine import ( _resolve_runtime_vars, _SERVER_SIDE_ACTION_TYPES, _handle_extract_text_action, + _handle_extract_table_action, _handle_t2a_decision_action, _expand_compound_steps, _pre_check_screen_state as _pre_check_screen_state_impl, @@ -511,6 +513,7 @@ class ReplayRequest(BaseModel): session_id: str machine_id: Optional[str] = None # Machine cible pour le replay (multi-machine) params: Optional[Dict[str, Any]] = None + variables: Optional[Dict[str, Any]] = None # Variables runtime initiales (templating {{var}}) class RawReplayRequest(BaseModel): @@ -765,6 +768,21 @@ async def startup(): _cleanup_thread = threading.Thread(target=_cleanup_loop, daemon=True, name="replay_cleanup") _cleanup_thread.start() + # Préchargement EasyOCR en arrière-plan : sans ça, le 1er extract_text / + # extract_table déclenche un cold start de ~3-5s qui bloque l'event loop + # FastAPI (constaté 2026-05-05 : streaming server inaccessible 2 min). + # Le thread tourne pendant que le boot continue ; le 1er appel OCR sera rapide. + def _preload_easyocr(): + try: + t0 = time.time() + from core.llm.ocr_extractor import _get_reader + _get_reader() + logger.info("[OCR] EasyOCR préchargé (fr+en, CPU) en %.1fs", time.time() - t0) + except Exception as e: + logger.warning("[OCR] Échec préchargement EasyOCR : %s", e) + + threading.Thread(target=_preload_easyocr, daemon=True, name="preload_easyocr").start() + logger.info( "API Streaming démarrée — StreamProcessor, Worker et Cleanup prêts. " "VLM Worker dans un process séparé (run_worker.py)." @@ -1962,6 +1980,11 @@ async def start_replay(request: ReplayRequest): machine_id=resolved_machine_id, actions=actions, ) + # Pré-injection des variables runtime (templating {{var}} sur by_text, + # text, target_spec.* etc.). Permet à l'orchestrateur d'appeler ce + # workflow avec p.ex. variables={"patient_id": "25003284"} pour boucler. + if request.variables: + _replay_states[replay_id]["variables"].update(request.variables) # Enregistrer le mapping machine -> session pour le replay ciblé if resolved_machine_id and resolved_machine_id != "default": _machine_replay_target[resolved_machine_id] = session_id @@ -2914,6 +2937,12 @@ async def get_next_action(session_id: str, machine_id: str = "default"): _handle_extract_text_action, action, owning_replay, session_id, _last_heartbeat, ) + elif type_ == "extract_table": + await loop.run_in_executor( + None, + _handle_extract_table_action, + action, owning_replay, session_id, _last_heartbeat, + ) elif type_ == "t2a_decision": await loop.run_in_executor( None, @@ -3117,6 +3146,29 @@ async def get_next_action(session_id: str, machine_id: str = "default"): f"{_precheck_sim}" ) + # QW1 — Résoudre l'écran cible et joindre l'info à l'action + # Cascade : action.monitor_index → session.last_focused_monitor → composite_fallback + try: + session_qw1 = processor.session_manager.get_session(session_id) + last_window_info_qw1 = ( + session_qw1.last_window_info if session_qw1 is not None else {} + ) or {} + session_state_qw1 = { + "monitors_geometry": last_window_info_qw1.get("monitors_geometry", []), + "last_focused_monitor": last_window_info_qw1.get("monitor_index"), + } + target = resolve_target_monitor(action, session_state_qw1) + action["monitor_resolution"] = { + "idx": target.idx, + "offset_x": target.offset_x, + "offset_y": target.offset_y, + "w": target.w, + "h": target.h, + "source": target.source, + } + except Exception as e: + logger.debug("QW1 monitor_resolution skip (%s)", e) + response: Dict[str, Any] = { "action": action, "session_id": session_id, diff --git a/agent_v0/server_v1/live_session_manager.py b/agent_v0/server_v1/live_session_manager.py index 517e7d3a5..2042d6a6e 100644 --- a/agent_v0/server_v1/live_session_manager.py +++ b/agent_v0/server_v1/live_session_manager.py @@ -256,6 +256,20 @@ class LiveSessionManager: session.last_window_info["title"] = wc_title if wc_app: session.last_window_info["app_name"] = wc_app + # QW1 — propager monitor_index et monitors_geometry depuis window_capture + if "monitor_index" in window_capture: + session.last_window_info["monitor_index"] = window_capture["monitor_index"] + if "monitors_geometry" in window_capture: + session.last_window_info["monitors_geometry"] = window_capture["monitors_geometry"] + + # QW1 — propager monitor_index/monitors_geometry du payload event + # (cas heartbeat enrichi sans window/window_title). Toujours + # rafraîchir le focus actif (change souvent) et la géométrie + # (l'utilisateur peut brancher/débrancher un écran). + if "monitor_index" in event_data: + session.last_window_info["monitor_index"] = event_data["monitor_index"] + if "monitors_geometry" in event_data and event_data["monitors_geometry"]: + session.last_window_info["monitors_geometry"] = event_data["monitors_geometry"] # Accumuler les titres/apps pour le nommage automatique title = session.last_window_info.get("title", "").strip()