From 27490849a88f8fdfcefbc8aea42900381def5bbc Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Tue, 21 Apr 2026 14:40:38 +0200
Subject: [PATCH] refactor: OCR/UI-TARS en PREMIER, CLIP en fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Le grounding par texte (OCR → UI-TARS) est maintenant la méthode
PRINCIPALE. CLIP n'est appelé que si le grounding échoue, ou s'il est
sauté faute de texte cible exploitable (texte vide ou simple nom de
type d'action).

Avant : CLIP (faux positifs confiants) → cascade grounding (rarement atteinte)
Après : OCR 1s → UI-TARS 3s → CLIP (fallback visuel pur)

C'est comme ça que font UI-TARS, Agent-S3 et AppAgent.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../backend/api_v3/execute.py                 | 61 +++++++++++--------
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py
index f430b9313..e20f8c328 100644
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -807,11 +807,8 @@ def execute_action(action_type: str, params: dict) -> dict:
             # === MODE INTELLIGENT / DEBUG ===
             if execution_mode in ['intelligent', 'debug'] and screenshot_base64:
                 try:
-                    from services.intelligent_executor import find_and_click
+                    print(f"🧠 [Action] Mode {execution_mode}: recherche de l'ancre...")
 
-                    print(f"🧠 [Action] Mode {execution_mode}: recherche visuelle de l'ancre...")
-
-                    # Convertir bbox au format attendu
                     anchor_bbox = {
                         'x': bbox.get('x', 0),
                         'y': bbox.get('y', 0),
@@ -819,11 +816,10 @@ def execute_action(action_type: str, params: dict) -> dict:
                         'height': bbox.get('height', 0)
                     }
 
-                    # Extraire le texte cible pour le grounding et la vérification CLIP
+                    # Extraire le texte cible
                     _fc_target_text = params.get('visual_anchor', {}).get('target_text', '')
                     if not _fc_target_text:
                         _fc_target_text = params.get('_step_label', '')
-                    # Si le label est juste le type d'action, essayer de décrire l'ancre
                     _action_types = {'click_anchor', 'double_click_anchor', 'right_click_anchor',
                                      'hover_anchor', 'focus_anchor', 'scroll_to_anchor'}
                     if _fc_target_text in _action_types and screenshot_base64:
@@ -833,27 +829,46 @@ def execute_action(action_type: str, params: dict) -> dict:
                             if _desc:
                                 print(f"🏷️ [Vision] Ancre décrite: '{_desc}'")
                                 _fc_target_text = _desc
-                        except Exception as e:
-                            print(f"⚠️ [Vision] Description ancre échouée: {e}")
+                        except Exception:
+                            pass
                     _fc_target_desc = params.get('visual_anchor', {}).get('description', '')
 
-                    # Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md)
-                    result = find_and_click(
-                        anchor_image_base64=screenshot_base64,
-                        anchor_bbox=anchor_bbox,
-                        method='clip',  # UI-DETR-1 + CLIP avec pondération par distance
-                        detection_threshold=0.35,
-                        target_text=_fc_target_text,
-                        target_description=_fc_target_desc
-                    )
+                    x, y, confidence, method_used = None, None, 0, ''
 
-                    if result['found'] and result['coordinates']:
-                        x, y = result['coordinates']['x'], result['coordinates']['y']
-                        confidence = result['confidence']
+                    # === MÉTHODE PRINCIPALE : OCR → UI-TARS (fiable, 1-5s) ===
+                    if _fc_target_text and _fc_target_text not in _action_types:
+                        print(f"🔍 [Grounding] Recherche par texte: '{_fc_target_text}'")
+                        grounding_result = _shared_find_element(
+                            target_text=_fc_target_text,
+                            target_description=_fc_target_desc,
+                            anchor_image_base64=screenshot_base64
+                        )
+                        if grounding_result:
+                            x, y = grounding_result['x'], grounding_result['y']
+                            confidence = grounding_result['confidence']
+                            method_used = f"grounding_{grounding_result['method']}"
+                            print(f"✅ [Grounding] Trouvé via {grounding_result['method']} à ({x}, {y}) conf={confidence:.2f}")
 
-                        print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) - confiance: {confidence:.2f}")
+                    # === FALLBACK : CLIP (si le grounding n'a rien trouvé) ===
+                    if x is None:
+                        print(f"🔄 [Vision] Fallback CLIP...")
+                        from services.intelligent_executor import find_and_click
+                        result = find_and_click(
+                            anchor_image_base64=screenshot_base64,
+                            anchor_bbox=anchor_bbox,
+                            method='clip',
+                            detection_threshold=0.35,
+                            target_text=_fc_target_text,
+                            target_description=_fc_target_desc
+                        )
+                        if result['found'] and result['coordinates']:
+                            x, y = result['coordinates']['x'], result['coordinates']['y']
+                            confidence = result['confidence']
+                            method_used = 'clip'
+
+                    if x is not None:
+                        print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) — {method_used} (conf={confidence:.2f})")
 
-                        # Effectuer le clic
                         if click_type == 'double':
                             pyautogui.doubleClick(x, y)
                         elif click_type == 'right':
@@ -861,8 +876,6 @@ def execute_action(action_type: str, params: dict) -> dict:
                         else:
                             pyautogui.click(x, y)
 
-                        # Délai après le clic pour que l'application réagisse
-                        # 2 secondes pour laisser le temps aux applications de s'ouvrir
                         time.sleep(2.0)
 
                         return {