feat(qw1): enrichissement Agent V1 (monitor_index + monitors_geometry) + hook serveur
Some checks failed
tests / Lint (ruff + black) (push) Successful in 16s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped

Côté client Agent V1 :
- helpers _get_monitors_geometry() / _get_active_monitor_index() via screeninfo
  (fallback gracieux [] / None si screeninfo absent)
- _enrich_with_monitor_info() ajouté aux payloads dict de capture_dual,
  capture_active_window, et heartbeat_event poussé par main.py
- screeninfo>=0.8 ajouté aux requirements (source + deploy Windows)
- Deploy capturer.py reçoit l'enrichissement de manière additive (pas de
  copie verbatim qui aurait introduit BLUR_SENSITIVE absent côté deploy)

Côté serveur :
- import resolve_target_monitor depuis monitor_router (créé en QW1.1)
- /replay/next : enrichissement action.monitor_resolution avant envoi
  au client (idx, offset_x/y, w, h, source de la décision)
- live_session_manager.add_event : propagation monitor_index +
  monitors_geometry depuis window_capture ET depuis le payload event
  brut (cas heartbeat enrichi sans window/window_title)

Cascade de résolution (cf monitor_router.py) :
1. action.monitor_index (hérité de la session source)
2. session.last_focused_monitor (focus actif vu en dernier heartbeat)
3. composite_fallback (offset 0,0) — backward compat strict

Backward 100% : si geometry vide, fallback composite identique au
comportement actuel mss.monitors[0].

Tests : baseline 89/89 préservée, monitor_router 4/4 OK (total 93/93).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-05-05 23:05:44 +02:00
parent fae95c5366
commit 2d71e2a249
7 changed files with 208 additions and 2 deletions

View File

@@ -448,6 +448,12 @@ class AgentV1:
window_title = self.vision.get_active_window_title()
if window_title:
heartbeat_event["active_window_title"] = window_title
# QW1 — enrichissement multi-écrans (additif, fallback gracieux)
try:
from .vision.capturer import _enrich_with_monitor_info
_enrich_with_monitor_info(heartbeat_event)
except Exception:
pass
self.streamer.push_event(heartbeat_event)
except Exception as e:
logger.error(f"Heartbeat error: {e}")

View File

@@ -5,6 +5,7 @@ Pillow>=10.0.0 # Crops et processing image
requests>=2.31.0 # Streaming réseau
python-socketio[client]>=5.10,<6.0 # Bus feedback Léa 'lea:*' (compat Flask-SocketIO 5.3.x serveur)
psutil>=5.9.0 # Monitoring CPU/RAM
screeninfo>=0.8 # QW1 — détection des monitors physiques + offsets
pystray>=0.19.5 # Icône Tray UI
plyer>=2.1.0 # Notifications toast natives (remplace PyQt5)
pywebview>=5.0 # Fenêtre de chat Léa intégrée (Edge WebView2 sur Windows)

View File

@@ -15,7 +15,7 @@ import time
import logging
import hashlib
import platform
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional
from PIL import Image, ImageFilter, ImageStat
import mss
from ..config import TARGETED_CROP_SIZE, SCREENSHOT_QUALITY, BLUR_SENSITIVE
@@ -26,6 +26,66 @@ logger = logging.getLogger(__name__)
# OS courant (détecté une seule fois)
_SYSTEM = platform.system()
# QW1 — détection multi-écrans (fallback gracieux si screeninfo absent)
try:
from screeninfo import get_monitors as _screeninfo_get_monitors
_SCREENINFO_AVAILABLE = True
except ImportError:
_SCREENINFO_AVAILABLE = False
def _get_monitors_geometry() -> List[Dict[str, Any]]:
"""Retourne la liste des monitors physiques avec leurs offsets.
Returns:
List[dict] : [{idx, x, y, w, h, primary}, ...]. Vide si screeninfo
indisponible (le serveur tombera sur fallback composite).
"""
if not _SCREENINFO_AVAILABLE:
return []
try:
monitors = _screeninfo_get_monitors()
return [
{
"idx": i,
"x": int(m.x),
"y": int(m.y),
"w": int(m.width),
"h": int(m.height),
"primary": bool(getattr(m, "is_primary", False)),
}
for i, m in enumerate(monitors)
]
except Exception:
return []
def _get_active_monitor_index() -> Optional[int]:
"""Retourne l'index logique du monitor où se trouve le curseur (focus actif).
Returns:
int ou None si indéterminable.
"""
if not _SCREENINFO_AVAILABLE:
return None
try:
import pyautogui # import paresseux : évite la dépendance dure
cx, cy = pyautogui.position()
for i, m in enumerate(_screeninfo_get_monitors()):
if m.x <= cx < m.x + m.width and m.y <= cy < m.y + m.height:
return i
except Exception:
return None
return None
def _enrich_with_monitor_info(payload: dict) -> dict:
"""Ajoute monitor_index et monitors_geometry au payload (in-place + return)."""
if isinstance(payload, dict):
payload["monitor_index"] = _get_active_monitor_index()
payload["monitors_geometry"] = _get_monitors_geometry()
return payload
class VisionCapturer:
def __init__(self, session_dir: str):
self.session_dir = session_dir
@@ -121,6 +181,9 @@ class VisionCapturer:
if window_info:
result["window_capture"] = window_info
# QW1 — enrichissement multi-écrans (additif, fallback gracieux)
_enrich_with_monitor_info(result)
return result
except Exception as e:
logger.error(f"Erreur Dual Capture: {e}")
@@ -223,6 +286,9 @@ class VisionCapturer:
"click_inside_window": click_inside,
}
# QW1 — enrichissement multi-écrans (additif)
_enrich_with_monitor_info(result)
logger.debug(
f"Fenêtre capturée : {title} ({win_w}x{win_h}) — "
f"clic relatif ({click_rel_x}, {click_rel_y})"

View File

@@ -8,12 +8,73 @@ import os
import time
import logging
import hashlib
from typing import Any, Dict, List, Optional
from PIL import Image, ImageFilter, ImageStat
import mss
from ..config import TARGETED_CROP_SIZE, SCREENSHOT_QUALITY
logger = logging.getLogger(__name__)
# QW1 — détection multi-écrans (fallback gracieux si screeninfo absent)
try:
from screeninfo import get_monitors as _screeninfo_get_monitors
_SCREENINFO_AVAILABLE = True
except ImportError:
_SCREENINFO_AVAILABLE = False
def _get_monitors_geometry() -> List[Dict[str, Any]]:
"""Retourne la liste des monitors physiques avec leurs offsets.
Returns:
List[dict] : [{idx, x, y, w, h, primary}, ...]. Vide si screeninfo
indisponible (le serveur tombera sur fallback composite).
"""
if not _SCREENINFO_AVAILABLE:
return []
try:
monitors = _screeninfo_get_monitors()
return [
{
"idx": i,
"x": int(m.x),
"y": int(m.y),
"w": int(m.width),
"h": int(m.height),
"primary": bool(getattr(m, "is_primary", False)),
}
for i, m in enumerate(monitors)
]
except Exception:
return []
def _get_active_monitor_index() -> Optional[int]:
"""Retourne l'index logique du monitor où se trouve le curseur (focus actif).
Returns:
int ou None si indéterminable.
"""
if not _SCREENINFO_AVAILABLE:
return None
try:
import pyautogui # import paresseux : évite la dépendance dure
cx, cy = pyautogui.position()
for i, m in enumerate(_screeninfo_get_monitors()):
if m.x <= cx < m.x + m.width and m.y <= cy < m.y + m.height:
return i
except Exception:
return None
return None
def _enrich_with_monitor_info(payload: dict) -> dict:
"""Ajoute monitor_index et monitors_geometry au payload (in-place + return)."""
if isinstance(payload, dict):
payload["monitor_index"] = _get_active_monitor_index()
payload["monitors_geometry"] = _get_monitors_geometry()
return payload
class VisionCapturer:
def __init__(self, session_dir: str):
self.session_dir = session_dir
@@ -72,7 +133,12 @@ class VisionCapturer:
# Mise à jour du hash pour le prochain heartbeat
self.last_img_hash = self._compute_quick_hash(img)
return {"full": full_path, "crop": crop_path}
result = {"full": full_path, "crop": crop_path}
# QW1 — enrichissement multi-écrans (additif, fallback gracieux)
_enrich_with_monitor_info(result)
return result
except Exception as e:
logger.error(f"Erreur Dual Capture: {e}")
return {}

View File

@@ -5,6 +5,7 @@ Pillow>=10.0.0 # Crops et processing image
requests>=2.31.0 # Streaming réseau
python-socketio[client]>=5.10,<6.0 # Bus feedback Léa 'lea:*' (compat Flask-SocketIO 5.3.x serveur)
psutil>=5.9.0 # Monitoring CPU/RAM
screeninfo>=0.8 # QW1 — détection des monitors physiques + offsets
pystray>=0.19.5 # Icône Tray UI
plyer>=2.1.0 # Notifications toast natives (remplace PyQt5)

View File

@@ -33,6 +33,7 @@ from .audit_trail import AuditTrail, AuditEntry
from .agent_registry import AgentRegistry, AgentAlreadyEnrolledError
from .stream_processor import StreamProcessor, build_replay_from_raw_events, enrich_click_from_screenshot
from .worker_stream import StreamWorker
from .monitor_router import resolve_target_monitor # QW1 — résolution écran cible
from .execution_plan_runner import (
execution_plan_to_actions,
inject_plan_into_queue,
@@ -222,6 +223,7 @@ from .replay_engine import (
_resolve_runtime_vars,
_SERVER_SIDE_ACTION_TYPES,
_handle_extract_text_action,
_handle_extract_table_action,
_handle_t2a_decision_action,
_expand_compound_steps,
_pre_check_screen_state as _pre_check_screen_state_impl,
@@ -511,6 +513,7 @@ class ReplayRequest(BaseModel):
session_id: str
machine_id: Optional[str] = None # Machine cible pour le replay (multi-machine)
params: Optional[Dict[str, Any]] = None
variables: Optional[Dict[str, Any]] = None # Variables runtime initiales (templating {{var}})
class RawReplayRequest(BaseModel):
@@ -765,6 +768,21 @@ async def startup():
_cleanup_thread = threading.Thread(target=_cleanup_loop, daemon=True, name="replay_cleanup")
_cleanup_thread.start()
# Préchargement EasyOCR en arrière-plan : sans ça, le 1er extract_text /
# extract_table déclenche un cold start de ~3-5s qui bloque l'event loop
# FastAPI (constaté 2026-05-05 : streaming server inaccessible 2 min).
# Le thread tourne pendant que le boot continue ; le 1er appel OCR sera rapide.
def _preload_easyocr():
try:
t0 = time.time()
from core.llm.ocr_extractor import _get_reader
_get_reader()
logger.info("[OCR] EasyOCR préchargé (fr+en, CPU) en %.1fs", time.time() - t0)
except Exception as e:
logger.warning("[OCR] Échec préchargement EasyOCR : %s", e)
threading.Thread(target=_preload_easyocr, daemon=True, name="preload_easyocr").start()
logger.info(
"API Streaming démarrée — StreamProcessor, Worker et Cleanup prêts. "
"VLM Worker dans un process séparé (run_worker.py)."
@@ -1962,6 +1980,11 @@ async def start_replay(request: ReplayRequest):
machine_id=resolved_machine_id,
actions=actions,
)
# Pré-injection des variables runtime (templating {{var}} sur by_text,
# text, target_spec.* etc.). Permet à l'orchestrateur d'appeler ce
# workflow avec p.ex. variables={"patient_id": "25003284"} pour boucler.
if request.variables:
_replay_states[replay_id]["variables"].update(request.variables)
# Enregistrer le mapping machine -> session pour le replay ciblé
if resolved_machine_id and resolved_machine_id != "default":
_machine_replay_target[resolved_machine_id] = session_id
@@ -2914,6 +2937,12 @@ async def get_next_action(session_id: str, machine_id: str = "default"):
_handle_extract_text_action,
action, owning_replay, session_id, _last_heartbeat,
)
elif type_ == "extract_table":
await loop.run_in_executor(
None,
_handle_extract_table_action,
action, owning_replay, session_id, _last_heartbeat,
)
elif type_ == "t2a_decision":
await loop.run_in_executor(
None,
@@ -3117,6 +3146,29 @@ async def get_next_action(session_id: str, machine_id: str = "default"):
f"{_precheck_sim}"
)
# QW1 — Résoudre l'écran cible et joindre l'info à l'action
# Cascade : action.monitor_index → session.last_focused_monitor → composite_fallback
try:
session_qw1 = processor.session_manager.get_session(session_id)
last_window_info_qw1 = (
session_qw1.last_window_info if session_qw1 is not None else {}
) or {}
session_state_qw1 = {
"monitors_geometry": last_window_info_qw1.get("monitors_geometry", []),
"last_focused_monitor": last_window_info_qw1.get("monitor_index"),
}
target = resolve_target_monitor(action, session_state_qw1)
action["monitor_resolution"] = {
"idx": target.idx,
"offset_x": target.offset_x,
"offset_y": target.offset_y,
"w": target.w,
"h": target.h,
"source": target.source,
}
except Exception as e:
logger.debug("QW1 monitor_resolution skip (%s)", e)
response: Dict[str, Any] = {
"action": action,
"session_id": session_id,

View File

@@ -256,6 +256,20 @@ class LiveSessionManager:
session.last_window_info["title"] = wc_title
if wc_app:
session.last_window_info["app_name"] = wc_app
# QW1 — propager monitor_index et monitors_geometry depuis window_capture
if "monitor_index" in window_capture:
session.last_window_info["monitor_index"] = window_capture["monitor_index"]
if "monitors_geometry" in window_capture:
session.last_window_info["monitors_geometry"] = window_capture["monitors_geometry"]
# QW1 — propager monitor_index/monitors_geometry du payload event
# (cas heartbeat enrichi sans window/window_title). Toujours
# rafraîchir le focus actif (change souvent) et la géométrie
# (l'utilisateur peut brancher/débrancher un écran).
if "monitor_index" in event_data:
session.last_window_info["monitor_index"] = event_data["monitor_index"]
if "monitors_geometry" in event_data and event_data["monitors_geometry"]:
session.last_window_info["monitors_geometry"] = event_data["monitors_geometry"]
# Accumuler les titres/apps pour le nommage automatique
title = session.last_window_info.get("title", "").strip()