feat(qw1): per-monitor capture + offset propagation in grounding cascade

_capture_screen() now accepts an optional monitor_idx (None = legacy composite capture). The logical index 0..N-1 maps to mss.monitors[idx+1] (mss.monitors[0] is the all-monitors composite). The three grounding levels (OCR, UI-TARS, VLM) propagate the offset returned by the capture to translate monitor-local coordinates into absolute screen coordinates (correct for pyautogui.click). find_element_on_screen() accepts monitor_idx and forwards it to all three levels. 100% backward compatible: monitor_idx=None everywhere → strictly the current behaviour.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
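A minimal usage sketch of the new parameter (illustrative only: the import path, the target text, and the monitor geometry below are assumptions; the function, its monitor_idx argument, and its return shape come from this change):

    import pyautogui
    from grounding import find_element_on_screen  # hypothetical import path

    # Search only logical monitor 1 (i.e. mss.monitors[2]); monitor_idx=None keeps the legacy composite.
    hit = find_element_on_screen(target_text="Demo", monitor_idx=1)
    if hit:
        # 'x'/'y' are already absolute screen coordinates: the grounding level that found
        # the match added the monitor's (left, top) offset. For example, a local match at
        # (200, 150) on a monitor with left=1920, top=0 comes back as (2120, 150),
        # which is what pyautogui.click expects.
        pyautogui.click(hit['x'], hit['y'])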
@@ -22,6 +22,18 @@ try:
 except ImportError:
     PYAUTOGUI_AVAILABLE = False
 
+try:
+    import mss
+    MSS_AVAILABLE = True
+except ImportError:
+    MSS_AVAILABLE = False
+
+try:
+    from PIL import Image as PILImage
+    PIL_AVAILABLE = True
+except ImportError:
+    PIL_AVAILABLE = False
+
 
 def safe_type_text(text: str):
     """Saisie de texte compatible VM/Citrix et claviers AZERTY/QWERTY.
@@ -157,11 +169,13 @@ def handle_detected_pattern(pattern: Dict[str, Any]) -> bool:
         screenshot = sct.grab(monitor)
         screen = Image.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')
 
-    # EasyOCR (rapide, bonne qualité GUI) avec fallback docTR
+    # EasyOCR (rapide, bonne qualité GUI) avec fallback docTR.
+    # gpu=True : harmonisé avec dialog_handler.py et title_verifier.py.
+    # Coût VRAM ~0.5 GB, sous le budget RTX 5070 (cf. deploy/VRAM_BUDGET.md).
     words = []
     try:
         import easyocr
-        _reader = easyocr.Reader(['fr', 'en'], gpu=False, verbose=False)
+        _reader = easyocr.Reader(['fr', 'en'], gpu=True, verbose=False)
         results = _reader.readtext(np.array(screen))
         for (bbox_pts, text, conf) in results:
             if not text or len(text.strip()) < 1:
@@ -312,6 +326,7 @@ def find_element_on_screen(
     target_description: str = "",
     anchor_image_base64: Optional[str] = None,
     anchor_bbox: Optional[Dict] = None,
+    monitor_idx: Optional[int] = None,
 ) -> Optional[Dict[str, Any]]:
     """
     Cherche un élément sur l'écran en utilisant 3 méthodes en cascade.
@@ -325,6 +340,7 @@ def find_element_on_screen(
         target_description: Description plus longue (ex: "le dossier Demo sur le bureau")
         anchor_image_base64: Image de référence de l'ancre (pour CLIP matching, réservé futur)
         anchor_bbox: Position originale de l'ancre (pour désambiguïser les matchs multiples)
+        monitor_idx: Index logique 0..N-1 du monitor à scruter. None = composite legacy.
 
     Returns:
         {'x': int, 'y': int, 'method': str, 'confidence': float} ou None
@@ -347,6 +363,13 @@ def find_element_on_screen(
         logger.debug("find_element_on_screen: ni target_text ni target_description fournis")
         return None
 
+    # Propager monitor_idx au niveau OCR via anchor_bbox (sans muter l'argument original)
+    if monitor_idx is not None and anchor_bbox is not None:
+        anchor_bbox = dict(anchor_bbox)  # copie pour ne pas muter l'argument
+        anchor_bbox["monitor_idx"] = monitor_idx
+    elif monitor_idx is not None:
+        anchor_bbox = {"monitor_idx": monitor_idx}
+
     search_label = target_description or target_text
     logger.info(f"[Grounding] Recherche élément: '{search_label}' (cascade 3 niveaux)")
 
@@ -356,12 +379,12 @@ find_element_on_screen
         return result
 
     # ─── Niveau 2 — UI-TARS grounding (~3s) ───
-    result = _grounding_ui_tars(target_text, target_description)
+    result = _grounding_ui_tars(target_text, target_description, monitor_idx=monitor_idx)
     if result:
         return result
 
     # ─── Niveau 3 — VLM reasoning (~10s) ───
-    result = _grounding_vlm(target_text, target_description)
+    result = _grounding_vlm(target_text, target_description, monitor_idx=monitor_idx)
     if result:
         return result
 
@@ -411,20 +434,43 @@ def _describe_anchor_image(anchor_image_base64: str) -> Optional[str]:
     return None
 
 
-def _capture_screen():
-    """Capture l'écran principal et retourne (PIL.Image, width, height)."""
-    try:
-        import mss
-        from PIL import Image as PILImage
+def _capture_screen(monitor_idx=None):
+    """Capture l'écran et retourne (PIL.Image, width, height, offset_x, offset_y).
+
+    Args:
+        monitor_idx: Index logique 0..N-1 du monitor à capturer (cf. screeninfo).
+            Si None : capture composite (mss.monitors[0]) — comportement legacy.
+
+    Returns:
+        (image, w, h, offset_x, offset_y). offset = (0,0) en mode composite.
+    """
+    try:
         with mss.mss() as sct:
-            monitor = sct.monitors[0]
+            if monitor_idx is None:
+                # Comportement actuel : composite tous écrans
+                monitor = sct.monitors[0]
+                offset_x, offset_y = 0, 0
+            else:
+                # mss skip monitors[0] (composite). Index logique 0 → mss.monitors[1].
+                mss_idx = int(monitor_idx) + 1
+                if mss_idx >= len(sct.monitors):
+                    logger.warning(
+                        "mss.monitors[%d] hors limites (n=%d) — fallback composite",
+                        mss_idx, len(sct.monitors),
+                    )
+                    monitor = sct.monitors[0]
+                    offset_x, offset_y = 0, 0
+                else:
+                    monitor = sct.monitors[mss_idx]
+                    offset_x = int(monitor.get("left", 0))
+                    offset_y = int(monitor.get("top", 0))
 
             screenshot = sct.grab(monitor)
             screen = PILImage.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')
-            return screen, monitor['width'], monitor['height']
+            return screen, monitor['width'], monitor['height'], offset_x, offset_y
     except Exception as e:
         logger.debug(f"Capture écran échouée: {e}")
-        return None, 0, 0
+        return None, 0, 0, 0, 0
 
 
 def _grounding_ocr(target_text: str, anchor_bbox: Optional[Dict] = None) -> Optional[Dict[str, Any]]:
@@ -439,7 +485,8 @@ def _grounding_ocr(target_text: str, anchor_bbox: Optional[Dict] = None) -> Opti
         return None
 
     try:
-        screen, screen_w, screen_h = _capture_screen()
+        monitor_idx_param = anchor_bbox.get("monitor_idx") if anchor_bbox else None
+        screen, screen_w, screen_h, ox, oy = _capture_screen(monitor_idx=monitor_idx_param)
         if screen is None:
             return None
 
@@ -503,14 +550,14 @@ def _grounding_ocr(target_text: str, anchor_bbox: Optional[Dict] = None) -> Opti
             sel = " ← CHOISI" if m is best else ""
             logger.info(f" [OCR] Candidat: '{m['text']}' à ({m['x']}, {m['y']}) [{m['type']}]{sel}")
 
-        return {'x': best['x'], 'y': best['y'], 'method': 'ocr', 'confidence': best['conf']}
+        return {'x': best['x'] + ox, 'y': best['y'] + oy, 'method': 'ocr', 'confidence': best['conf']}
 
     except Exception as e:
         logger.debug(f"[Grounding/OCR] Erreur: {e}")
         return None
 
 
-def _grounding_ui_tars(target_text: str, target_description: str = "") -> Optional[Dict[str, Any]]:
+def _grounding_ui_tars(target_text: str, target_description: str = "", monitor_idx=None) -> Optional[Dict[str, Any]]:
     """Niveau 2 — UI-TARS grounding visuel (~3s)."""
     try:
         import requests
@@ -519,7 +566,7 @@ def _grounding_ui_tars(target_text: str, target_description: str = "") -> Option
         import re
         import os
 
-        screen, screen_w, screen_h = _capture_screen()
+        screen, screen_w, screen_h, ox, oy = _capture_screen(monitor_idx=monitor_idx)
         if screen is None:
             return None
 
@@ -564,7 +611,7 @@ def _grounding_ui_tars(target_text: str, target_description: str = "") -> Option
         # Valider que les coordonnées sont dans l'écran
         if 0 <= x <= screen_w and 0 <= y <= screen_h:
             logger.info(f"[Grounding/UI-TARS] Grounding → ({x}, {y})")
-            return {'x': x, 'y': y, 'method': 'ui_tars', 'confidence': 0.85}
+            return {'x': x + ox, 'y': y + oy, 'method': 'ui_tars', 'confidence': 0.85}
         else:
             logger.warning(f"[Grounding/UI-TARS] Coordonnées hors écran: ({x}, {y}) pour {screen_w}x{screen_h}")
             return None
@@ -624,7 +671,7 @@ def _parse_ui_tars_coordinates(text: str, screen_w: int, screen_h: int) -> Optio
     return None
 
 
-def _grounding_vlm(target_text: str, target_description: str = "") -> Optional[Dict[str, Any]]:
+def _grounding_vlm(target_text: str, target_description: str = "", monitor_idx=None) -> Optional[Dict[str, Any]]:
     """Niveau 3 — VLM reasoning + confirmation OCR (~10s)."""
     try:
         search_label = target_description or target_text
@@ -646,7 +693,7 @@ def _grounding_vlm(target_text: str, target_description: str = "") -> Optional[D
         logger.info(f"[Grounding/VLM] VLM suggère de cliquer sur: '{vlm_target}'")
 
         # Confirmation par OCR : chercher le target VLM sur l'écran
-        screen, screen_w, screen_h = _capture_screen()
+        screen, screen_w, screen_h, ox, oy = _capture_screen(monitor_idx=monitor_idx)
         if screen is None:
             return None
 
@@ -668,7 +715,7 @@ def _grounding_vlm(target_text: str, target_description: str = "") -> Optional[D
                 x = int((x1 + x2) / 2)
                 y = int((y1 + y2) / 2)
                 logger.info(f"[Grounding/VLM] Confirmé par OCR: '{word['text']}' à ({x}, {y})")
-                return {'x': x, 'y': y, 'method': 'vlm', 'confidence': 0.75}
+                return {'x': x + ox, 'y': y + oy, 'method': 'vlm', 'confidence': 0.75}
 
         logger.debug(f"[Grounding/VLM] Target VLM '{vlm_target}' non trouvé par OCR")
         return None
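For reference, a small sketch of the logical-index convention the new _capture_screen relies on (an illustration only, not part of the change; it uses only the mss calls already present in this diff):

    import mss

    # mss.monitors[0] is the composite of all screens; physical monitors start at index 1,
    # so logical monitor_idx 0..N-1 corresponds to sct.monitors[monitor_idx + 1].
    with mss.mss() as sct:
        for monitor_idx, mon in enumerate(sct.monitors[1:]):
            print(monitor_idx, mon["left"], mon["top"], mon["width"], mon["height"])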