From 7feef3b6a95a52e9673007afa65cd6ebecb3b8b2 Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Tue, 21 Apr 2026 18:36:20 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20CLIP=20en=20premier,=20suppression=20v?=
 =?UTF-8?q?=C3=A9rification=20OCR=20crois=C3=A9e,=20fix=20indentation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../backend/api_v3/execute.py                 | 40 +++++++++----------
 .../backend/services/intelligent_executor.py  | 40 ++++---------------
 2 files changed, 27 insertions(+), 53 deletions(-)

diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py
index eea38eea4..9b38ea08f 100644
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -845,9 +845,25 @@ def execute_action(action_type: str, params: dict) -> dict:
 
                     x, y, confidence, method_used = None, None, 0, ''
 
-                    # === MÉTHODE PRINCIPALE : OCR → UI-TARS (fiable, 1-5s) ===
-                    if _fc_target_text and _fc_target_text not in _action_types:
-                        print(f"🔍 [Grounding] Recherche par texte: '{_fc_target_text}'")
+                    # === MÉTHODE 1 : CLIP visuel (rapide, fiable si écran similaire) ===
+                    from services.intelligent_executor import find_and_click
+                    print(f"🧠 [Vision] Recherche visuelle CLIP...")
+                    result = find_and_click(
+                            anchor_image_base64=screenshot_base64,
+                            anchor_bbox=anchor_bbox,
+                            method='clip',
+                            detection_threshold=0.35,
+                            target_text=_fc_target_text,
+                            target_description=_fc_target_desc
+                        )
+                    if result['found'] and result['coordinates']:
+                        x, y = result['coordinates']['x'], result['coordinates']['y']
+                        confidence = result['confidence']
+                        method_used = 'clip'
+
+                    # === MÉTHODE 2 : OCR → UI-TARS (si CLIP échoue) ===
+                    if x is None and _fc_target_text and _fc_target_text not in _action_types:
+                        print(f"🔍 [Grounding] Fallback OCR/UI-TARS: '{_fc_target_text}'")
                         grounding_result = _shared_find_element(
                             target_text=_fc_target_text,
                             target_description=_fc_target_desc,
@@ -858,24 +874,6 @@ def execute_action(action_type: str, params: dict) -> dict:
                             x, y = grounding_result['x'], grounding_result['y']
                             confidence = grounding_result['confidence']
                             method_used = f"grounding_{grounding_result['method']}"
-                            print(f"✅ [Grounding] Trouvé via {grounding_result['method']} à ({x}, {y}) conf={confidence:.2f}")
-
-                    # === FALLBACK : CLIP (si le grounding n'a rien trouvé) ===
-                    if x is None:
-                        print(f"🔄 [Vision] Fallback CLIP...")
-                        from services.intelligent_executor import find_and_click
-                        result = find_and_click(
-                            anchor_image_base64=screenshot_base64,
-                            anchor_bbox=anchor_bbox,
-                            method='clip',
-                            detection_threshold=0.35,
-                            target_text=_fc_target_text,
-                            target_description=_fc_target_desc
-                        )
-                        if result['found'] and result['coordinates']:
-                            x, y = result['coordinates']['x'], result['coordinates']['y']
-                            confidence = result['confidence']
-                            method_used = 'clip'
 
                     if x is not None:
                         print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) — {method_used} (conf={confidence:.2f})")
diff --git a/visual_workflow_builder/backend/services/intelligent_executor.py b/visual_workflow_builder/backend/services/intelligent_executor.py
index d270f8d37..912dcc75e 100644
--- a/visual_workflow_builder/backend/services/intelligent_executor.py
+++ b/visual_workflow_builder/backend/services/intelligent_executor.py
@@ -724,38 +724,14 @@ def find_and_click(
 
                 # clip_result.found est conditionné par les seuils dans find_anchor_in_screen
                 if clip_result.found:
-                    # Vérification croisée OCR : le texte à cette position correspond-il ?
-                    clip_validated = True
-                    if target_text and target_text not in ('click_anchor', 'double_click_anchor',
-                            'right_click_anchor', 'hover_anchor', 'focus_anchor'):
-                        try:
-                            from services.ocr_service import ocr_extract_words
-                            words = ocr_extract_words(screen_image)
-                            cx, cy = clip_result.center['x'], clip_result.center['y']
-                            nearby_texts = []
-                            for w in words:
-                                wx = (w['bbox'][0] + w['bbox'][2]) / 2
-                                wy = (w['bbox'][1] + w['bbox'][3]) / 2
-                                dist = ((wx - cx)**2 + (wy - cy)**2) ** 0.5
-                                if dist < 100:
-                                    nearby_texts.append(w['text'])
-                            nearby_str = ' '.join(nearby_texts).lower()
-                            target_lower = target_text.lower()
-                            if target_lower not in nearby_str and not any(t.lower() in target_lower for t in nearby_texts if len(t) > 2):
-                                print(f"⛔ [Vision] CLIP rejeté par OCR: texte proche='{nearby_str}' ne contient pas '{target_text}'")
-                                clip_validated = False
-                        except Exception as ocr_err:
-                            print(f"⚠️ [Vision] Vérification OCR échouée: {ocr_err}")
-
-                    if clip_validated:
-                        print(f"✅ [Vision] UI-DETR-1+CLIP réussi! Confiance: {clip_result.confidence:.2f}")
-                        return {
-                            'found': True,
-                            'confidence': clip_result.confidence,
-                            'coordinates': clip_result.center,
-                            'bbox': clip_result.bbox,
-                            'method': 'clip',
-                            'search_time_ms': (_time.time() - start_time) * 1000
+                    print(f"✅ [Vision] UI-DETR-1+CLIP réussi! Confiance: {clip_result.confidence:.2f}")
+                    return {
+                        'found': True,
+                        'confidence': clip_result.confidence,
+                        'coordinates': clip_result.center,
+                        'bbox': clip_result.bbox,
+                        'method': 'clip',
+                        'search_time_ms': (_time.time() - start_time) * 1000
                         }
                 else:
                     print(f"⚠️ [Vision] UI-DETR-1+CLIP: rejeté (confiance: {clip_result.confidence:.2f})")