From 27490849a88f8fdfcefbc8aea42900381def5bbc Mon Sep 17 00:00:00 2001 From: Dom Date: Tue, 21 Apr 2026 14:40:38 +0200 Subject: [PATCH] refactor: OCR/UI-TARS en PREMIER, CLIP en fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Le grounding par texte (OCR → UI-TARS) est maintenant la méthode PRINCIPALE. CLIP n'est appelé que si le grounding échoue. Avant : CLIP (faux positifs confiants) → cascade grounding (rarement atteinte) Après : OCR 1s → UI-TARS 3s → CLIP (fallback visuel pur) C'est comme ça que font UI-TARS, Agent-S3 et AppAgent. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/api_v3/execute.py | 61 +++++++++++-------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py index f430b9313..e20f8c328 100644 --- a/visual_workflow_builder/backend/api_v3/execute.py +++ b/visual_workflow_builder/backend/api_v3/execute.py @@ -807,11 +807,8 @@ def execute_action(action_type: str, params: dict) -> dict: # === MODE INTELLIGENT / DEBUG === if execution_mode in ['intelligent', 'debug'] and screenshot_base64: try: - from services.intelligent_executor import find_and_click + print(f"🧠 [Action] Mode {execution_mode}: recherche de l'ancre...") - print(f"🧠 [Action] Mode {execution_mode}: recherche visuelle de l'ancre...") - - # Convertir bbox au format attendu anchor_bbox = { 'x': bbox.get('x', 0), 'y': bbox.get('y', 0), @@ -819,11 +816,10 @@ def execute_action(action_type: str, params: dict) -> dict: 'height': bbox.get('height', 0) } - # Extraire le texte cible pour le grounding et la vérification CLIP + # Extraire le texte cible _fc_target_text = params.get('visual_anchor', {}).get('target_text', '') if not _fc_target_text: _fc_target_text = params.get('_step_label', '') - # Si le label est juste le type d'action, essayer de décrire l'ancre _action_types = {'click_anchor', 'double_click_anchor', 'right_click_anchor', 'hover_anchor', 'focus_anchor', 'scroll_to_anchor'} if _fc_target_text in _action_types and screenshot_base64: @@ -833,27 +829,46 @@ def execute_action(action_type: str, params: dict) -> dict: if _desc: print(f"🏷️ [Vision] Ancre décrite: '{_desc}'") _fc_target_text = _desc - except Exception as e: - print(f"⚠️ [Vision] Description ancre échouée: {e}") + except Exception: + pass _fc_target_desc = params.get('visual_anchor', {}).get('description', '') - # Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md) - result = find_and_click( - anchor_image_base64=screenshot_base64, - anchor_bbox=anchor_bbox, - method='clip', # UI-DETR-1 + CLIP avec pondération par distance - detection_threshold=0.35, - target_text=_fc_target_text, - target_description=_fc_target_desc - ) + x, y, confidence, method_used = None, None, 0, '' - if result['found'] and result['coordinates']: - x, y = result['coordinates']['x'], result['coordinates']['y'] - confidence = result['confidence'] + # === MÉTHODE PRINCIPALE : OCR → UI-TARS (fiable, 1-5s) === + if _fc_target_text and _fc_target_text not in _action_types: + print(f"🔍 [Grounding] Recherche par texte: '{_fc_target_text}'") + grounding_result = _shared_find_element( + target_text=_fc_target_text, + target_description=_fc_target_desc, + anchor_image_base64=screenshot_base64 + ) + if grounding_result: + x, y = grounding_result['x'], grounding_result['y'] + confidence = grounding_result['confidence'] + method_used = f"grounding_{grounding_result['method']}" + print(f"✅ [Grounding] Trouvé via {grounding_result['method']} à ({x}, {y}) conf={confidence:.2f}") - print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) - confiance: {confidence:.2f}") + # === FALLBACK : CLIP (si le grounding n'a rien trouvé) === + if x is None: + print(f"🔄 [Vision] Fallback CLIP...") + from services.intelligent_executor import find_and_click + result = find_and_click( + anchor_image_base64=screenshot_base64, + anchor_bbox=anchor_bbox, + method='clip', + detection_threshold=0.35, + target_text=_fc_target_text, + target_description=_fc_target_desc + ) + if result['found'] and result['coordinates']: + x, y = result['coordinates']['x'], result['coordinates']['y'] + confidence = result['confidence'] + method_used = 'clip' + + if x is not None: + print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) — {method_used} (conf={confidence:.2f})") - # Effectuer le clic if click_type == 'double': pyautogui.doubleClick(x, y) elif click_type == 'right': @@ -861,8 +876,6 @@ def execute_action(action_type: str, params: dict) -> dict: else: pyautogui.click(x, y) - # Délai après le clic pour que l'application réagisse - # 2 secondes pour laisser le temps aux applications de s'ouvrir time.sleep(2.0) return {