refactor(ORA): UI-TARS en PREMIER pour les clics
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 15s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 16s
tests / Tests unitaires (sans GPU) (push) Failing after 16s
tests / Tests sécurité (critique) (push) Has been skipped

Ordre : UI-TARS (~3s, 94% de précision) → Template matching (~80ms) → OCR (~1s)

On demande à UI-TARS "click on CR_patient_demo" et il trouve les coordonnées
comme le ferait un humain. Le template matching échoue sur les icônes Windows
(micro-différences visuelles → score 0.38 au lieu de 0.95, donc sous le seuil de 0.75).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-04-22 15:59:45 +02:00
parent f04398d5a7
commit eba6fea779

View File

@@ -866,18 +866,30 @@ Règles:
x, y = None, None
method_used = ''
# --- Méthode 1 : Template matching direct (~1-10ms) ---
if screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
# --- Méthode 1 : UI-TARS grounding (~3s, 94% précision) ---
# Le plus fiable : on dit "click on X" et UI-TARS trouve les coordonnées
if target_text or target_desc:
try:
from core.execution.input_handler import _grounding_ui_tars
click_label = target_desc or target_text
logger.info(f"🎯 [ORA/UI-TARS] Recherche: '{click_label}'")
result = _grounding_ui_tars(target_text, target_desc)
if result:
x, y = result['x'], result['y']
method_used = 'ui_tars'
logger.info(f"✅ [ORA/UI-TARS] Trouvé à ({x}, {y})")
except Exception as e:
logger.debug(f"⚠️ [ORA/UI-TARS] Erreur: {e}")
# --- Méthode 2 : Template matching (~80ms) ---
if x is None and screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
try:
import io as _io
# Capturer l'écran
with mss_lib.mss() as sct:
mon = sct.monitors[0]
grab = sct.grab(mon)
screen_img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
# Décoder l'ancre
raw_b64 = screenshot_b64.split(',')[1] if ',' in screenshot_b64 else screenshot_b64
anchor_data = base64.b64decode(raw_b64)
anchor_img = Image.open(_io.BytesIO(anchor_data))
@@ -888,10 +900,9 @@ Règles:
if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]:
t0 = time.time()
result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result_tm)
_, max_val, _, max_loc = cv2.minMaxLoc(result_tm)
elapsed_ms = (time.time() - t0) * 1000
logger.info(f"⚡ [ORA/template] score={max_val:.3f} pos={max_loc} ({elapsed_ms:.0f}ms)")
if max_val > 0.75:
x = max_loc[0] + anchor_cv.shape[1] // 2
y = max_loc[1] + anchor_cv.shape[0] // 2
@@ -899,22 +910,17 @@ Règles:
except Exception as e:
logger.debug(f"⚠️ [ORA/template] Erreur: {e}")
# --- Méthode 2 : find_element_on_screen (OCR → UI-TARS → VLM) ---
if x is None and (target_text or target_desc):
# --- Méthode 3 : OCR texte (~1s) ---
if x is None and target_text:
try:
from core.execution.input_handler import find_element_on_screen
grounding_result = find_element_on_screen(
target_text=target_text,
target_description=target_desc,
anchor_image_base64=screenshot_b64,
anchor_bbox=bbox if bbox else None,
)
if grounding_result:
x, y = grounding_result['x'], grounding_result['y']
method_used = f"grounding_{grounding_result['method']}"
logger.info(f"🔍 [ORA/grounding] Trouvé via {method_used} à ({x}, {y})")
from core.execution.input_handler import _grounding_ocr
result = _grounding_ocr(target_text, anchor_bbox=bbox if bbox else None)
if result:
x, y = result['x'], result['y']
method_used = 'ocr'
logger.info(f"🔍 [ORA/OCR] Trouvé à ({x}, {y})")
except Exception as e:
logger.debug(f"⚠️ [ORA/grounding] Erreur: {e}")
logger.debug(f"⚠️ [ORA/OCR] Erreur: {e}")
# --- Exécuter le clic ---
if x is None: