refactor(ORA): UI-TARS en PREMIER pour les clics
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 15s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 16s
tests / Tests unitaires (sans GPU) (push) Failing after 16s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 15s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 16s
tests / Tests unitaires (sans GPU) (push) Failing after 16s
tests / Tests sécurité (critique) (push) Has been skipped
Ordre : UI-TARS (3s, 94%) → Template (80ms) → OCR (1s).

On dit à UI-TARS "click on CR_patient_demo" et il trouve les coordonnées comme un humain. Le template matching échoue sur les icônes Windows (micro-différences visuelles → score 0.38 au lieu de 0.95).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -866,18 +866,30 @@ Règles:
|
||||
x, y = None, None
|
||||
method_used = ''
|
||||
|
||||
# --- Méthode 1 : Template matching direct (~1-10ms) ---
|
||||
if screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
|
||||
# --- Méthode 1 : UI-TARS grounding (~3s, 94% précision) ---
|
||||
# Le plus fiable : on dit "click on X" et UI-TARS trouve les coordonnées
|
||||
if target_text or target_desc:
|
||||
try:
|
||||
from core.execution.input_handler import _grounding_ui_tars
|
||||
click_label = target_desc or target_text
|
||||
logger.info(f"🎯 [ORA/UI-TARS] Recherche: '{click_label}'")
|
||||
result = _grounding_ui_tars(target_text, target_desc)
|
||||
if result:
|
||||
x, y = result['x'], result['y']
|
||||
method_used = 'ui_tars'
|
||||
logger.info(f"✅ [ORA/UI-TARS] Trouvé à ({x}, {y})")
|
||||
except Exception as e:
|
||||
logger.debug(f"⚠️ [ORA/UI-TARS] Erreur: {e}")
|
||||
|
||||
# --- Méthode 2 : Template matching (~80ms) ---
|
||||
if x is None and screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
|
||||
try:
|
||||
import io as _io
|
||||
|
||||
# Capturer l'écran
|
||||
with mss_lib.mss() as sct:
|
||||
mon = sct.monitors[0]
|
||||
grab = sct.grab(mon)
|
||||
screen_img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
|
||||
|
||||
# Décoder l'ancre
|
||||
raw_b64 = screenshot_b64.split(',')[1] if ',' in screenshot_b64 else screenshot_b64
|
||||
anchor_data = base64.b64decode(raw_b64)
|
||||
anchor_img = Image.open(_io.BytesIO(anchor_data))
|
||||
@@ -888,10 +900,9 @@ Règles:
|
||||
if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]:
|
||||
t0 = time.time()
|
||||
result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
|
||||
min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result_tm)
|
||||
_, max_val, _, max_loc = cv2.minMaxLoc(result_tm)
|
||||
elapsed_ms = (time.time() - t0) * 1000
|
||||
logger.info(f"⚡ [ORA/template] score={max_val:.3f} pos={max_loc} ({elapsed_ms:.0f}ms)")
|
||||
|
||||
if max_val > 0.75:
|
||||
x = max_loc[0] + anchor_cv.shape[1] // 2
|
||||
y = max_loc[1] + anchor_cv.shape[0] // 2
|
||||
@@ -899,22 +910,17 @@ Règles:
|
||||
except Exception as e:
|
||||
logger.debug(f"⚠️ [ORA/template] Erreur: {e}")
|
||||
|
||||
# --- Méthode 2 : find_element_on_screen (OCR → UI-TARS → VLM) ---
|
||||
if x is None and (target_text or target_desc):
|
||||
# --- Méthode 3 : OCR texte (~1s) ---
|
||||
if x is None and target_text:
|
||||
try:
|
||||
from core.execution.input_handler import find_element_on_screen
|
||||
grounding_result = find_element_on_screen(
|
||||
target_text=target_text,
|
||||
target_description=target_desc,
|
||||
anchor_image_base64=screenshot_b64,
|
||||
anchor_bbox=bbox if bbox else None,
|
||||
)
|
||||
if grounding_result:
|
||||
x, y = grounding_result['x'], grounding_result['y']
|
||||
method_used = f"grounding_{grounding_result['method']}"
|
||||
logger.info(f"🔍 [ORA/grounding] Trouvé via {method_used} à ({x}, {y})")
|
||||
from core.execution.input_handler import _grounding_ocr
|
||||
result = _grounding_ocr(target_text, anchor_bbox=bbox if bbox else None)
|
||||
if result:
|
||||
x, y = result['x'], result['y']
|
||||
method_used = 'ocr'
|
||||
logger.info(f"🔍 [ORA/OCR] Trouvé à ({x}, {y})")
|
||||
except Exception as e:
|
||||
logger.debug(f"⚠️ [ORA/grounding] Erreur: {e}")
|
||||
logger.debug(f"⚠️ [ORA/OCR] Erreur: {e}")
|
||||
|
||||
# --- Exécuter le clic ---
|
||||
if x is None:
|
||||
|
||||
Reference in New Issue
Block a user