From de1026ee2e1498f3c04cd70fd72cd792be5a1bf7 Mon Sep 17 00:00:00 2001
From: Dom
Date: Tue, 21 Apr 2026 19:17:08 +0200
Subject: [PATCH] perf: direct template matching FIRST (~1-10ms)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cv2.matchTemplate searches for the anchor directly in the screenshot.
No RF-DETR, no CLIP, no 90 comparisons.
Threshold 0.75 to avoid false positives.

Order: template (1ms) → CLIP (fallback) → OCR/UI-TARS (last resort)

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../backend/api_v3/execute.py | 68 ++++++++++++++-----
 1 file changed, 52 insertions(+), 16 deletions(-)

diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py
index 9b38ea08f..e0cc61344 100644
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -845,23 +845,59 @@ def execute_action(action_type: str, params: dict) -> dict:
 
     x, y, confidence, method_used = None, None, 0, ''
 
-    # === METHOD 1: CLIP visual (fast, reliable if the screen is similar) ===
-    from services.intelligent_executor import find_and_click
-    print(f"🧠 [Vision] CLIP visual search...")
-    result = find_and_click(
-        anchor_image_base64=screenshot_base64,
-        anchor_bbox=anchor_bbox,
-        method='clip',
-        detection_threshold=0.35,
-        target_text=_fc_target_text,
-        target_description=_fc_target_desc
-    )
-    if result['found'] and result['coordinates']:
-        x, y = result['coordinates']['x'], result['coordinates']['y']
-        confidence = result['confidence']
-        method_used = 'clip'
+    # === METHOD 1: Direct template matching (~1-10ms) ===
+    try:
+        import cv2
+        import numpy as np
+        import mss as mss_lib
+        from PIL import Image as PILImage
 
-    # === METHOD 2: OCR → UI-TARS (if CLIP fails) ===
+        with mss_lib.mss() as sct:
+            mon = sct.monitors[0]
+            grab = sct.grab(mon)
+            screen_img = PILImage.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
+
+        # Decode the anchor
+        anchor_data = base64.b64decode(screenshot_base64.split(',')[1] if ',' in screenshot_base64 else screenshot_base64)
+        anchor_img = PILImage.open(io.BytesIO(anchor_data))
+
+        screen_cv = cv2.cvtColor(np.array(screen_img), cv2.COLOR_RGB2BGR)
+        anchor_cv = cv2.cvtColor(np.array(anchor_img), cv2.COLOR_RGB2BGR)
+
+        if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]:
+            import time as _t
+            _t0 = _t.time()
+            result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
+            min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result_tm)
+            _elapsed = (_t.time() - _t0) * 1000
+            print(f"⚡ [Template] Score={max_val:.3f} pos={max_loc} ({_elapsed:.0f}ms)")
+
+            if max_val > 0.75:
+                x = max_loc[0] + anchor_cv.shape[1] // 2
+                y = max_loc[1] + anchor_cv.shape[0] // 2
+                confidence = max_val
+                method_used = 'template'
+    except Exception as tmpl_err:
+        print(f"⚠️ [Template] Error: {tmpl_err}")
+
+    # === METHOD 2: RF-DETR + CLIP (if template fails) ===
+    if x is None:
+        from services.intelligent_executor import find_and_click
+        print(f"🧠 [Vision] CLIP fallback...")
+        result = find_and_click(
+            anchor_image_base64=screenshot_base64,
+            anchor_bbox=anchor_bbox,
+            method='clip',
+            detection_threshold=0.35,
+            target_text=_fc_target_text,
+            target_description=_fc_target_desc
+        )
+        if result['found'] and result['coordinates']:
+            x, y = result['coordinates']['x'], result['coordinates']['y']
+            confidence = result['confidence']
+            method_used = 'clip'
+
+    # === METHOD 3: OCR → UI-TARS (if everything fails) ===
     if x is None and _fc_target_text and _fc_target_text not in _action_types:
         print(f"🔍 [Grounding] Fallback OCR/UI-TARS: '{_fc_target_text}'")
         grounding_result = _shared_find_element(
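
For reviewers, here is a minimal standalone sketch of the template-matching
fast path, runnable outside execute.py. It assumes opencv-python, numpy,
mss and Pillow are installed; the helper name find_anchor_on_screen is
illustrative, and only cv2.matchTemplate with TM_CCOEFF_NORMED and the
0.75 threshold come from the patch itself.

import cv2
import numpy as np
import mss
from PIL import Image

def find_anchor_on_screen(anchor: Image.Image, threshold: float = 0.75):
    """Return (x, y, score) at the centre of the best match, or None."""
    # Grab the full virtual screen (monitors[0] spans all displays in mss).
    with mss.mss() as sct:
        grab = sct.grab(sct.monitors[0])
        screen = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')

    screen_cv = cv2.cvtColor(np.array(screen), cv2.COLOR_RGB2BGR)
    anchor_cv = cv2.cvtColor(np.array(anchor.convert('RGB')), cv2.COLOR_RGB2BGR)

    # matchTemplate requires the template to fit inside the searched image.
    if (anchor_cv.shape[0] >= screen_cv.shape[0]
            or anchor_cv.shape[1] >= screen_cv.shape[1]):
        return None

    # TM_CCOEFF_NORMED scores every position in [-1, 1]; 1.0 is a perfect match.
    scores = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, max_loc = cv2.minMaxLoc(scores)
    if max_val <= threshold:
        return None  # below 0.75: treat as a miss and fall back to CLIP

    # max_loc is the top-left corner of the match; aim for its centre.
    x = max_loc[0] + anchor_cv.shape[1] // 2
    y = max_loc[1] + anchor_cv.shape[0] // 2
    return x, y, max_val

Template matching only tolerates near pixel-identical rendering, which is
why the slower CLIP and OCR/UI-TARS stages stay behind it as fallbacks for
scaled or re-themed screens.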