refactor(ORA): UI-TARS en PREMIER pour les clics

Ordre : UI-TARS (3s, 94%) → Template (80ms) → OCR (1s).

On dit à UI-TARS "click on CR_patient_demo" et il trouve les coordonnées comme un humain. Le template matching échoue sur les icônes Windows (micro-différences visuelles → score 0.38 au lieu de 0.95).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-22 15:59:45 +02:00
parent f04398d5a7
commit eba6fea779
1 changed file with 27 additions and 21 deletions
--- a/core/execution/observe_reason_act.py
+++ b/core/execution/observe_reason_act.py
@@ -866,18 +866,30 @@ Règles:
        x, y = None, None
        method_used = ''

-        # --- Méthode 1 : Template matching direct (~1-10ms) ---
-        if screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
+        # --- Méthode 1 : UI-TARS grounding (~3s, 94% précision) ---
+        # Le plus fiable : on dit "click on X" et UI-TARS trouve les coordonnées
+        if target_text or target_desc:
+            try:
+                from core.execution.input_handler import _grounding_ui_tars
+                click_label = target_desc or target_text
+                logger.info(f"🎯 [ORA/UI-TARS] Recherche: '{click_label}'")
+                result = _grounding_ui_tars(target_text, target_desc)
+                if result:
+                    x, y = result['x'], result['y']
+                    method_used = 'ui_tars'
+                    logger.info(f"✅ [ORA/UI-TARS] Trouvé à ({x}, {y})")
+            except Exception as e:
+                logger.debug(f"⚠️ [ORA/UI-TARS] Erreur: {e}")
+
+        # --- Méthode 2 : Template matching (~80ms) ---
+        if x is None and screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
            try:
                import io as _io
-
-                # Capturer l'écran
                with mss_lib.mss() as sct:
                    mon = sct.monitors[0]
                    grab = sct.grab(mon)
                    screen_img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')

-                # Décoder l'ancre
                raw_b64 = screenshot_b64.split(',')[1] if ',' in screenshot_b64 else screenshot_b64
                anchor_data = base64.b64decode(raw_b64)
                anchor_img = Image.open(_io.BytesIO(anchor_data))
@@ -888,10 +900,9 @@ Règles:
                if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]:
                    t0 = time.time()
                    result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
-                    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result_tm)
+                    _, max_val, _, max_loc = cv2.minMaxLoc(result_tm)
                    elapsed_ms = (time.time() - t0) * 1000
                    logger.info(f"⚡ [ORA/template] score={max_val:.3f} pos={max_loc} ({elapsed_ms:.0f}ms)")
-
                    if max_val > 0.75:
                        x = max_loc[0] + anchor_cv.shape[1] // 2
                        y = max_loc[1] + anchor_cv.shape[0] // 2
@@ -899,22 +910,17 @@ Règles:
            except Exception as e:
                logger.debug(f"⚠️ [ORA/template] Erreur: {e}")

-        # --- Méthode 2 : find_element_on_screen (OCR → UI-TARS → VLM) ---
-        if x is None and (target_text or target_desc):
+        # --- Méthode 3 : OCR texte (~1s) ---
+        if x is None and target_text:
            try:
-                from core.execution.input_handler import find_element_on_screen
-                grounding_result = find_element_on_screen(
-                    target_text=target_text,
-                    target_description=target_desc,
-                    anchor_image_base64=screenshot_b64,
-                    anchor_bbox=bbox if bbox else None,
-                )
-                if grounding_result:
-                    x, y = grounding_result['x'], grounding_result['y']
-                    method_used = f"grounding_{grounding_result['method']}"
-                    logger.info(f"🔍 [ORA/grounding] Trouvé via {method_used} à ({x}, {y})")
+                from core.execution.input_handler import _grounding_ocr
+                result = _grounding_ocr(target_text, anchor_bbox=bbox if bbox else None)
+                if result:
+                    x, y = result['x'], result['y']
+                    method_used = 'ocr'
+                    logger.info(f"🔍 [ORA/OCR] Trouvé à ({x}, {y})")
            except Exception as e:
-                logger.debug(f"⚠️ [ORA/grounding] Erreur: {e}")
+                logger.debug(f"⚠️ [ORA/OCR] Erreur: {e}")

        # --- Exécuter le clic ---
        if x is None: