fix: CLIP en premier, suppression vérification OCR croisée, fix indentation

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 18:36:20 +02:00
parent 0b06db222d
commit 7feef3b6a9
2 changed files with 27 additions and 53 deletions
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -845,9 +845,25 @@ def execute_action(action_type: str, params: dict) -> dict:

                    x, y, confidence, method_used = None, None, 0, ''

-                    # === MÉTHODE PRINCIPALE : OCR → UI-TARS (fiable, 1-5s) ===
-                    if _fc_target_text and _fc_target_text not in _action_types:
-                        print(f"🔍 [Grounding] Recherche par texte: '{_fc_target_text}'")
+                    # === MÉTHODE 1 : CLIP visuel (rapide, fiable si écran similaire) ===
+                    from services.intelligent_executor import find_and_click
+                    print(f"🧠 [Vision] Recherche visuelle CLIP...")
+                    result = find_and_click(
+                            anchor_image_base64=screenshot_base64,
+                            anchor_bbox=anchor_bbox,
+                            method='clip',
+                            detection_threshold=0.35,
+                            target_text=_fc_target_text,
+                            target_description=_fc_target_desc
+                        )
+                    if result['found'] and result['coordinates']:
+                        x, y = result['coordinates']['x'], result['coordinates']['y']
+                        confidence = result['confidence']
+                        method_used = 'clip'
+
+                    # === MÉTHODE 2 : OCR → UI-TARS (si CLIP échoue) ===
+                    if x is None and _fc_target_text and _fc_target_text not in _action_types:
+                        print(f"🔍 [Grounding] Fallback OCR/UI-TARS: '{_fc_target_text}'")
                        grounding_result = _shared_find_element(
                            target_text=_fc_target_text,
                            target_description=_fc_target_desc,
@@ -858,24 +874,6 @@ def execute_action(action_type: str, params: dict) -> dict:
                            x, y = grounding_result['x'], grounding_result['y']
                            confidence = grounding_result['confidence']
                            method_used = f"grounding_{grounding_result['method']}"
-                            print(f"✅ [Grounding] Trouvé via {grounding_result['method']} à ({x}, {y}) conf={confidence:.2f}")
-
-                    # === FALLBACK : CLIP (si le grounding n'a rien trouvé) ===
-                    if x is None:
-                        print(f"🔄 [Vision] Fallback CLIP...")
-                        from services.intelligent_executor import find_and_click
-                        result = find_and_click(
-                            anchor_image_base64=screenshot_base64,
-                            anchor_bbox=anchor_bbox,
-                            method='clip',
-                            detection_threshold=0.35,
-                            target_text=_fc_target_text,
-                            target_description=_fc_target_desc
-                        )
-                        if result['found'] and result['coordinates']:
-                            x, y = result['coordinates']['x'], result['coordinates']['y']
-                            confidence = result['confidence']
-                            method_used = 'clip'

                    if x is not None:
                        print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) — {method_used} (conf={confidence:.2f})")