diff --git a/core/execution/observe_reason_act.py b/core/execution/observe_reason_act.py
index db2703ef1..8cc414c40 100644
--- a/core/execution/observe_reason_act.py
+++ b/core/execution/observe_reason_act.py
@@ -866,18 +866,30 @@ Règles:
         x, y = None, None
         method_used = ''
 
-        # --- Méthode 1 : Template matching direct (~1-10ms) ---
-        if screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
+        # --- Méthode 1 : UI-TARS grounding (~3s, 94% précision) ---
+        # Le plus fiable : on dit "click on X" et UI-TARS trouve les coordonnées
+        if target_text or target_desc:
+            try:
+                from core.execution.input_handler import _grounding_ui_tars
+                click_label = target_desc or target_text
+                logger.info(f"🎯 [ORA/UI-TARS] Recherche: '{click_label}'")
+                result = _grounding_ui_tars(target_text, target_desc)
+                if result:
+                    x, y = result['x'], result['y']
+                    method_used = 'ui_tars'
+                    logger.info(f"✅ [ORA/UI-TARS] Trouvé à ({x}, {y})")
+            except Exception as e:
+                logger.debug(f"⚠️ [ORA/UI-TARS] Erreur: {e}")
+
+        # --- Méthode 2 : Template matching (~80ms) ---
+        if x is None and screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
             try:
                 import io as _io
-
-                # Capturer l'écran
                 with mss_lib.mss() as sct:
                     mon = sct.monitors[0]
                     grab = sct.grab(mon)
                     screen_img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
 
-                # Décoder l'ancre
                 raw_b64 = screenshot_b64.split(',')[1] if ',' in screenshot_b64 else screenshot_b64
                 anchor_data = base64.b64decode(raw_b64)
                 anchor_img = Image.open(_io.BytesIO(anchor_data))
@@ -888,10 +900,9 @@ Règles:
                 if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]:
                     t0 = time.time()
                     result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
-                    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result_tm)
+                    _, max_val, _, max_loc = cv2.minMaxLoc(result_tm)
                     elapsed_ms = (time.time() - t0) * 1000
                     logger.info(f"⚡ [ORA/template] score={max_val:.3f} pos={max_loc} ({elapsed_ms:.0f}ms)")
-
                     if max_val > 0.75:
                         x = max_loc[0] + anchor_cv.shape[1] // 2
                         y = max_loc[1] + anchor_cv.shape[0] // 2
@@ -899,22 +910,17 @@ Règles:
             except Exception as e:
                 logger.debug(f"⚠️ [ORA/template] Erreur: {e}")
 
-        # --- Méthode 2 : find_element_on_screen (OCR → UI-TARS → VLM) ---
-        if x is None and (target_text or target_desc):
+        # --- Méthode 3 : OCR texte (~1s) ---
+        if x is None and target_text:
             try:
-                from core.execution.input_handler import find_element_on_screen
-                grounding_result = find_element_on_screen(
-                    target_text=target_text,
-                    target_description=target_desc,
-                    anchor_image_base64=screenshot_b64,
-                    anchor_bbox=bbox if bbox else None,
-                )
-                if grounding_result:
-                    x, y = grounding_result['x'], grounding_result['y']
-                    method_used = f"grounding_{grounding_result['method']}"
-                    logger.info(f"🔍 [ORA/grounding] Trouvé via {method_used} à ({x}, {y})")
+                from core.execution.input_handler import _grounding_ocr
+                result = _grounding_ocr(target_text, anchor_bbox=bbox if bbox else None)
+                if result:
+                    x, y = result['x'], result['y']
+                    method_used = 'ocr'
+                    logger.info(f"🔍 [ORA/OCR] Trouvé à ({x}, {y})")
             except Exception as e:
-                logger.debug(f"⚠️ [ORA/grounding] Erreur: {e}")
+                logger.debug(f"⚠️ [ORA/OCR] Erreur: {e}")
 
         # --- Exécuter le clic ---
         if x is None: