From de1026ee2e1498f3c04cd70fd72cd792be5a1bf7 Mon Sep 17 00:00:00 2001
From: Dom
Date: Tue, 21 Apr 2026 19:17:08 +0200
Subject: [PATCH] perf: direct template matching FIRST (~1-10ms)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cv2.matchTemplate searches for the anchor directly in the screenshot.
No RF-DETR, no CLIP, no 90 comparisons.
Threshold 0.75 to avoid false positives.

Order: template (1ms) → CLIP (fallback) → OCR/UI-TARS (last resort)

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../backend/api_v3/execute.py | 68 ++++++++++++++-----
 1 file changed, 52 insertions(+), 16 deletions(-)

diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py
index 9b38ea08f..e0cc61344 100644
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -845,23 +845,59 @@ def execute_action(action_type: str, params: dict) -> dict:
 
     x, y, confidence, method_used = None, None, 0, ''
 
-    # === METHOD 1: CLIP visual (fast, reliable if the screen is similar) ===
-    from services.intelligent_executor import find_and_click
-    print(f"🧠 [Vision] CLIP visual search...")
-    result = find_and_click(
-        anchor_image_base64=screenshot_base64,
-        anchor_bbox=anchor_bbox,
-        method='clip',
-        detection_threshold=0.35,
-        target_text=_fc_target_text,
-        target_description=_fc_target_desc
-    )
-    if result['found'] and result['coordinates']:
-        x, y = result['coordinates']['x'], result['coordinates']['y']
-        confidence = result['confidence']
-        method_used = 'clip'
+    # === METHOD 1: Direct template matching (~1-10ms) ===
+    try:
+        import cv2
+        import numpy as np
+        import mss as mss_lib
+        from PIL import Image as PILImage
 
-    # === METHOD 2: OCR → UI-TARS (if CLIP fails) ===
+        with mss_lib.mss() as sct:
+            mon = sct.monitors[0]
+            grab = sct.grab(mon)
+            screen_img = PILImage.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
+
+        # Decode the anchor
+        anchor_data = base64.b64decode(screenshot_base64.split(',')[1] if ',' in screenshot_base64 else screenshot_base64)
+        anchor_img = PILImage.open(io.BytesIO(anchor_data))
+
+        screen_cv = cv2.cvtColor(np.array(screen_img), cv2.COLOR_RGB2BGR)
+        anchor_cv = cv2.cvtColor(np.array(anchor_img), cv2.COLOR_RGB2BGR)
+
+        if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]:
+            import time as _t
+            _t0 = _t.time()
+            result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
+            min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result_tm)
+            _elapsed = (_t.time() - _t0) * 1000
+            print(f"⚡ [Template] Score={max_val:.3f} pos={max_loc} ({_elapsed:.0f}ms)")
+
+            if max_val > 0.75:
+                x = max_loc[0] + anchor_cv.shape[1] // 2
+                y = max_loc[1] + anchor_cv.shape[0] // 2
+                confidence = max_val
+                method_used = 'template'
+    except Exception as tmpl_err:
+        print(f"⚠️ [Template] Error: {tmpl_err}")
+
+    # === METHOD 2: RF-DETR + CLIP (if template fails) ===
+    if x is None:
+        from services.intelligent_executor import find_and_click
+        print(f"🧠 [Vision] CLIP fallback...")
+        result = find_and_click(
+            anchor_image_base64=screenshot_base64,
+            anchor_bbox=anchor_bbox,
+            method='clip',
+            detection_threshold=0.35,
+            target_text=_fc_target_text,
+            target_description=_fc_target_desc
+        )
+        if result['found'] and result['coordinates']:
+            x, y = result['coordinates']['x'], result['coordinates']['y']
+            confidence = result['confidence']
+            method_used = 'clip'
+
+    # === METHOD 3: OCR → UI-TARS (if everything fails) ===
     if x is None and _fc_target_text and _fc_target_text not in _action_types:
         print(f"🔍 [Grounding] Fallback OCR/UI-TARS: '{_fc_target_text}'")
         grounding_result = _shared_find_element(
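
For reviewers, here is a minimal standalone sketch of the template-matching
fast path, runnable outside execute.py. It assumes opencv-python, numpy,
mss and Pillow are installed; the helper name find_anchor_on_screen is
illustrative, and only cv2.matchTemplate with TM_CCOEFF_NORMED and the
0.75 threshold come from the patch itself.

import cv2
import numpy as np
import mss
from PIL import Image

def find_anchor_on_screen(anchor: Image.Image, threshold: float = 0.75):
    """Return (x, y, score) at the centre of the best match, or None."""
    # Grab the full virtual screen (monitors[0] spans all displays in mss).
    with mss.mss() as sct:
        grab = sct.grab(sct.monitors[0])
        screen = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')

    screen_cv = cv2.cvtColor(np.array(screen), cv2.COLOR_RGB2BGR)
    anchor_cv = cv2.cvtColor(np.array(anchor.convert('RGB')), cv2.COLOR_RGB2BGR)

    # matchTemplate requires the template to fit inside the searched image.
    if (anchor_cv.shape[0] >= screen_cv.shape[0]
            or anchor_cv.shape[1] >= screen_cv.shape[1]):
        return None

    # TM_CCOEFF_NORMED scores every position in [-1, 1]; 1.0 is a perfect match.
    scores = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, max_loc = cv2.minMaxLoc(scores)
    if max_val <= threshold:
        return None  # below 0.75: treat as a miss and fall back to CLIP

    # max_loc is the top-left corner of the match; aim for its centre.
    x = max_loc[0] + anchor_cv.shape[1] // 2
    y = max_loc[1] + anchor_cv.shape[0] // 2
    return x, y, max_val

Template matching only tolerates near pixel-identical rendering, which is
why the slower CLIP and OCR/UI-TARS stages stay behind it as fallbacks for
scaled or re-themed screens.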