refactor: OCR/UI-TARS FIRST, CLIP as fallback
Some checks failed
security-audit / Bandit (static scan) (push) Successful in 13s
security-audit / pip-audit (dependency CVEs) (push) Successful in 11s
security-audit / Secret scan (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 15s
tests / Unit tests (no GPU) (push) Failing after 14s
tests / Security tests (critical) (push) Has been skipped

Text-based grounding (OCR → UI-TARS) is now the PRIMARY method.
CLIP is only called when grounding fails.

Before: CLIP (confident false positives) → grounding cascade (rarely reached)
After: OCR ~1 s → UI-TARS ~3 s → CLIP (pure visual fallback)

This is the same ordering used by UI-TARS, Agent-S3 and AppAgent.
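In outline, the change reduces to a small dispatch loop: try the cheap, precise locators first and only fall back to CLIP when they find nothing. A minimal sketch, assuming hypothetical ocr/uitars/clip locator callables (the real entry points are _shared_find_element and find_and_click in the diff below):

    from typing import Callable, Optional, Tuple

    # A locator takes (target_text, description) and returns
    # (x, y, confidence) on a hit, or None on a miss.
    Locator = Callable[[str, str], Optional[Tuple[int, int, float]]]

    def locate_anchor(target_text: str, description: str,
                      ocr: Locator, uitars: Locator,
                      clip: Locator) -> Optional[dict]:
        # Cheap, reliable methods first; CLIP only as a last resort.
        for name, locator in (('ocr', ocr), ('uitars', uitars), ('clip', clip)):
            hit = locator(target_text, description)
            if hit is not None:
                x, y, confidence = hit
                return {'x': x, 'y': y, 'confidence': confidence, 'method': name}
        return None

Each locator returns None on a miss, so a later stage runs only when every earlier one has failed.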

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Dom
Date: 2026-04-21 14:40:38 +02:00
parent cebbf0809a
commit 27490849a8


@@ -807,11 +807,8 @@ def execute_action(action_type: str, params: dict) -> dict:
     # === MODE INTELLIGENT / DEBUG ===
     if execution_mode in ['intelligent', 'debug'] and screenshot_base64:
         try:
-            from services.intelligent_executor import find_and_click
-            print(f"🧠 [Action] Mode {execution_mode}: recherche visuelle de l'ancre...")
-
-            # Convertir bbox au format attendu
+            print(f"🧠 [Action] Mode {execution_mode}: recherche de l'ancre...")
             anchor_bbox = {
                 'x': bbox.get('x', 0),
                 'y': bbox.get('y', 0),
@@ -819,11 +816,10 @@ def execute_action(action_type: str, params: dict) -> dict:
                 'height': bbox.get('height', 0)
             }
-            # Extraire le texte cible pour le grounding et la vérification CLIP
+            # Extraire le texte cible
             _fc_target_text = params.get('visual_anchor', {}).get('target_text', '')
             if not _fc_target_text:
                 _fc_target_text = params.get('_step_label', '')
-            # Si le label est juste le type d'action, essayer de décrire l'ancre
             _action_types = {'click_anchor', 'double_click_anchor', 'right_click_anchor',
                              'hover_anchor', 'focus_anchor', 'scroll_to_anchor'}
             if _fc_target_text in _action_types and screenshot_base64:
@@ -833,27 +829,46 @@ def execute_action(action_type: str, params: dict) -> dict:
                     if _desc:
                         print(f"🏷️ [Vision] Ancre décrite: '{_desc}'")
                         _fc_target_text = _desc
-                except Exception as e:
-                    print(f"⚠️ [Vision] Description ancre échouée: {e}")
+                except Exception:
+                    pass
             _fc_target_desc = params.get('visual_anchor', {}).get('description', '')
-            # Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md)
-            result = find_and_click(
-                anchor_image_base64=screenshot_base64,
-                anchor_bbox=anchor_bbox,
-                method='clip',  # UI-DETR-1 + CLIP avec pondération par distance
-                detection_threshold=0.35,
-                target_text=_fc_target_text,
-                target_description=_fc_target_desc
-            )
-            if result['found'] and result['coordinates']:
-                x, y = result['coordinates']['x'], result['coordinates']['y']
-                confidence = result['confidence']
-                print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) - confiance: {confidence:.2f}")
+            x, y, confidence, method_used = None, None, 0, ''
+
+            # === MÉTHODE PRINCIPALE : OCR → UI-TARS (fiable, 1-5s) ===
+            if _fc_target_text and _fc_target_text not in _action_types:
+                print(f"🔍 [Grounding] Recherche par texte: '{_fc_target_text}'")
+                grounding_result = _shared_find_element(
+                    target_text=_fc_target_text,
+                    target_description=_fc_target_desc,
+                    anchor_image_base64=screenshot_base64
+                )
+                if grounding_result:
+                    x, y = grounding_result['x'], grounding_result['y']
+                    confidence = grounding_result['confidence']
+                    method_used = f"grounding_{grounding_result['method']}"
+                    print(f"✅ [Grounding] Trouvé via {grounding_result['method']} à ({x}, {y}) conf={confidence:.2f}")
+
+            # === FALLBACK : CLIP (si le grounding n'a rien trouvé) ===
+            if x is None:
+                print(f"🔄 [Vision] Fallback CLIP...")
+                from services.intelligent_executor import find_and_click
+                result = find_and_click(
+                    anchor_image_base64=screenshot_base64,
+                    anchor_bbox=anchor_bbox,
+                    method='clip',
+                    detection_threshold=0.35,
+                    target_text=_fc_target_text,
+                    target_description=_fc_target_desc
+                )
+                if result['found'] and result['coordinates']:
+                    x, y = result['coordinates']['x'], result['coordinates']['y']
+                    confidence = result['confidence']
+                    method_used = 'clip'
+
+            if x is not None:
+                print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) — {method_used} (conf={confidence:.2f})")
+                # Effectuer le clic
                 if click_type == 'double':
                     pyautogui.doubleClick(x, y)
                 elif click_type == 'right':
@@ -861,8 +876,6 @@ def execute_action(action_type: str, params: dict) -> dict:
                 else:
                     pyautogui.click(x, y)
-                # Délai après le clic pour que l'application réagisse
-                # 2 secondes pour laisser le temps aux applications de s'ouvrir
                 time.sleep(2.0)
                 return {
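For reference, the new path assumes _shared_find_element returns either None or a dict with 'x', 'y', 'confidence' and 'method' keys. A minimal stub honouring that assumed contract (hypothetical; the OCR/UI-TARS internals are elided) can stand in for the GPU models when exercising the CLIP fallback branch in unit tests:

    from typing import Optional

    def _shared_find_element(target_text: str,
                             target_description: str = '',
                             anchor_image_base64: str = '') -> Optional[dict]:
        # Stub: a real implementation would try OCR first, then UI-TARS.
        # Returning None sends the caller into the CLIP fallback branch.
        return None

Monkeypatching this name in tests keeps the primary path deterministic without loading any model.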