feat(qw1): capture par monitor + propagation offsets dans grounding cascade

_capture_screen() accepte un monitor_idx optionnel (None = composite legacy).
Index logique 0..N-1 mappé sur mss.monitors[idx+1] (mss[0] = composite).

Les 3 niveaux de grounding (OCR, UI-TARS, VLM) propagent l'offset retourné
par la capture pour traduire les coordonnées locales monitor en coordonnées
absolues écran (correct pour pyautogui.click).

find_element_on_screen() accepte monitor_idx et le forwarde aux 3 niveaux.

Backward 100% : monitor_idx=None partout → comportement strictement actuel.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-05-05 22:55:04 +02:00
parent 6582a69d31
commit fae95c5366
2 changed files with 108 additions and 20 deletions

View File

@@ -22,6 +22,18 @@ try:
except ImportError:
PYAUTOGUI_AVAILABLE = False
try:
import mss
MSS_AVAILABLE = True
except ImportError:
MSS_AVAILABLE = False
try:
from PIL import Image as PILImage
PIL_AVAILABLE = True
except ImportError:
PIL_AVAILABLE = False
def safe_type_text(text: str):
"""Saisie de texte compatible VM/Citrix et claviers AZERTY/QWERTY.
@@ -157,11 +169,13 @@ def handle_detected_pattern(pattern: Dict[str, Any]) -> bool:
screenshot = sct.grab(monitor)
screen = Image.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')
# EasyOCR (rapide, bonne qualité GUI) avec fallback docTR
# EasyOCR (rapide, bonne qualité GUI) avec fallback docTR.
# gpu=True : harmonisé avec dialog_handler.py et title_verifier.py.
# Coût VRAM ~0.5 GB, sous le budget RTX 5070 (cf. deploy/VRAM_BUDGET.md).
words = []
try:
import easyocr
_reader = easyocr.Reader(['fr', 'en'], gpu=False, verbose=False)
_reader = easyocr.Reader(['fr', 'en'], gpu=True, verbose=False)
results = _reader.readtext(np.array(screen))
for (bbox_pts, text, conf) in results:
if not text or len(text.strip()) < 1:
@@ -312,6 +326,7 @@ def find_element_on_screen(
target_description: str = "",
anchor_image_base64: Optional[str] = None,
anchor_bbox: Optional[Dict] = None,
monitor_idx: Optional[int] = None,
) -> Optional[Dict[str, Any]]:
"""
Cherche un élément sur l'écran en utilisant 3 méthodes en cascade.
@@ -325,6 +340,7 @@ def find_element_on_screen(
target_description: Description plus longue (ex: "le dossier Demo sur le bureau")
anchor_image_base64: Image de référence de l'ancre (pour CLIP matching, réservé futur)
anchor_bbox: Position originale de l'ancre (pour désambiguïser les matchs multiples)
monitor_idx: Index logique 0..N-1 du monitor à scruter. None = composite legacy.
Returns:
{'x': int, 'y': int, 'method': str, 'confidence': float} ou None
@@ -347,6 +363,13 @@ def find_element_on_screen(
logger.debug("find_element_on_screen: ni target_text ni target_description fournis")
return None
# Propager monitor_idx au niveau OCR via anchor_bbox (sans muter l'argument original)
if monitor_idx is not None and anchor_bbox is not None:
anchor_bbox = dict(anchor_bbox) # copie pour ne pas muter l'argument
anchor_bbox["monitor_idx"] = monitor_idx
elif monitor_idx is not None:
anchor_bbox = {"monitor_idx": monitor_idx}
search_label = target_description or target_text
logger.info(f"[Grounding] Recherche élément: '{search_label}' (cascade 3 niveaux)")
@@ -356,12 +379,12 @@ def find_element_on_screen(
return result
# ─── Niveau 2 — UI-TARS grounding (~3s) ───
result = _grounding_ui_tars(target_text, target_description)
result = _grounding_ui_tars(target_text, target_description, monitor_idx=monitor_idx)
if result:
return result
# ─── Niveau 3 — VLM reasoning (~10s) ───
result = _grounding_vlm(target_text, target_description)
result = _grounding_vlm(target_text, target_description, monitor_idx=monitor_idx)
if result:
return result
@@ -411,20 +434,43 @@ def _describe_anchor_image(anchor_image_base64: str) -> Optional[str]:
return None
def _capture_screen():
"""Capture l'écran principal et retourne (PIL.Image, width, height)."""
try:
import mss
from PIL import Image as PILImage
def _capture_screen(monitor_idx=None):
"""Capture l'écran et retourne (PIL.Image, width, height, offset_x, offset_y).
Args:
monitor_idx: Index logique 0..N-1 du monitor à capturer (cf. screeninfo).
Si None : capture composite (mss.monitors[0]) — comportement legacy.
Returns:
(image, w, h, offset_x, offset_y). offset = (0,0) en mode composite.
"""
try:
with mss.mss() as sct:
monitor = sct.monitors[0]
if monitor_idx is None:
# Comportement actuel : composite tous écrans
monitor = sct.monitors[0]
offset_x, offset_y = 0, 0
else:
# mss skip monitors[0] (composite). Index logique 0 → mss.monitors[1].
mss_idx = int(monitor_idx) + 1
if mss_idx >= len(sct.monitors):
logger.warning(
"mss.monitors[%d] hors limites (n=%d) — fallback composite",
mss_idx, len(sct.monitors),
)
monitor = sct.monitors[0]
offset_x, offset_y = 0, 0
else:
monitor = sct.monitors[mss_idx]
offset_x = int(monitor.get("left", 0))
offset_y = int(monitor.get("top", 0))
screenshot = sct.grab(monitor)
screen = PILImage.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')
return screen, monitor['width'], monitor['height']
return screen, monitor['width'], monitor['height'], offset_x, offset_y
except Exception as e:
logger.debug(f"Capture écran échouée: {e}")
return None, 0, 0
return None, 0, 0, 0, 0
def _grounding_ocr(target_text: str, anchor_bbox: Optional[Dict] = None) -> Optional[Dict[str, Any]]:
@@ -439,7 +485,8 @@ def _grounding_ocr(target_text: str, anchor_bbox: Optional[Dict] = None) -> Opti
return None
try:
screen, screen_w, screen_h = _capture_screen()
monitor_idx_param = anchor_bbox.get("monitor_idx") if anchor_bbox else None
screen, screen_w, screen_h, ox, oy = _capture_screen(monitor_idx=monitor_idx_param)
if screen is None:
return None
@@ -503,14 +550,14 @@ def _grounding_ocr(target_text: str, anchor_bbox: Optional[Dict] = None) -> Opti
sel = " ← CHOISI" if m is best else ""
logger.info(f" [OCR] Candidat: '{m['text']}' à ({m['x']}, {m['y']}) [{m['type']}]{sel}")
return {'x': best['x'], 'y': best['y'], 'method': 'ocr', 'confidence': best['conf']}
return {'x': best['x'] + ox, 'y': best['y'] + oy, 'method': 'ocr', 'confidence': best['conf']}
except Exception as e:
logger.debug(f"[Grounding/OCR] Erreur: {e}")
return None
def _grounding_ui_tars(target_text: str, target_description: str = "") -> Optional[Dict[str, Any]]:
def _grounding_ui_tars(target_text: str, target_description: str = "", monitor_idx=None) -> Optional[Dict[str, Any]]:
"""Niveau 2 — UI-TARS grounding visuel (~3s)."""
try:
import requests
@@ -519,7 +566,7 @@ def _grounding_ui_tars(target_text: str, target_description: str = "") -> Option
import re
import os
screen, screen_w, screen_h = _capture_screen()
screen, screen_w, screen_h, ox, oy = _capture_screen(monitor_idx=monitor_idx)
if screen is None:
return None
@@ -564,7 +611,7 @@ def _grounding_ui_tars(target_text: str, target_description: str = "") -> Option
# Valider que les coordonnées sont dans l'écran
if 0 <= x <= screen_w and 0 <= y <= screen_h:
logger.info(f"[Grounding/UI-TARS] Grounding → ({x}, {y})")
return {'x': x, 'y': y, 'method': 'ui_tars', 'confidence': 0.85}
return {'x': x + ox, 'y': y + oy, 'method': 'ui_tars', 'confidence': 0.85}
else:
logger.warning(f"[Grounding/UI-TARS] Coordonnées hors écran: ({x}, {y}) pour {screen_w}x{screen_h}")
return None
@@ -624,7 +671,7 @@ def _parse_ui_tars_coordinates(text: str, screen_w: int, screen_h: int) -> Optio
return None
def _grounding_vlm(target_text: str, target_description: str = "") -> Optional[Dict[str, Any]]:
def _grounding_vlm(target_text: str, target_description: str = "", monitor_idx=None) -> Optional[Dict[str, Any]]:
"""Niveau 3 — VLM reasoning + confirmation OCR (~10s)."""
try:
search_label = target_description or target_text
@@ -646,7 +693,7 @@ def _grounding_vlm(target_text: str, target_description: str = "") -> Optional[D
logger.info(f"[Grounding/VLM] VLM suggère de cliquer sur: '{vlm_target}'")
# Confirmation par OCR : chercher le target VLM sur l'écran
screen, screen_w, screen_h = _capture_screen()
screen, screen_w, screen_h, ox, oy = _capture_screen(monitor_idx=monitor_idx)
if screen is None:
return None
@@ -668,7 +715,7 @@ def _grounding_vlm(target_text: str, target_description: str = "") -> Optional[D
x = int((x1 + x2) / 2)
y = int((y1 + y2) / 2)
logger.info(f"[Grounding/VLM] Confirmé par OCR: '{word['text']}' à ({x}, {y})")
return {'x': x, 'y': y, 'method': 'vlm', 'confidence': 0.75}
return {'x': x + ox, 'y': y + oy, 'method': 'vlm', 'confidence': 0.75}
logger.debug(f"[Grounding/VLM] Target VLM '{vlm_target}' non trouvé par OCR")
return None