feat: chaîne de grounding 3 niveaux + refonte capture écran

Grounding en cascade quand CLIP/template échouent :
1. OCR (docTR) → cherche le texte exact sur l'écran (~1s)
2. UI-TARS grounding → "click on X" → coordonnées (~3s, 94% ScreenSpot)
3. VLM reasoning → raisonnement complet + confirmation OCR (~10s)

find_element_on_screen() dans input_handler.py (partagé VWB + Léa).
Câblé dans find_and_click() et execute_action() comme fallback.

Refonte capture écran :
- mss.monitors[0] (composite) pour capturer la VM en plein écran
- FullscreenSelector réécrit : overlay via getBoundingClientRect()
- Bboxes et sélection alignées avec l'image (calcul JS, pas CSS)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 09:31:38 +02:00
parent 14a9442343
commit 73ddcdb29d
3 changed files with 392 additions and 2 deletions
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -29,6 +29,7 @@ from core.execution.input_handler import (
    check_screen_for_patterns as _shared_check_patterns,
    handle_detected_pattern as _shared_handle_pattern,
    post_execution_cleanup as _shared_post_cleanup,
+    find_element_on_screen as _shared_find_element,
 )


@@ -213,6 +214,9 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app):
                        break

                    # === EXÉCUTION DE L'ACTION ===
+                    # Passer le label de l'étape pour le grounding textuel
+                    if step.label:
+                        params['_step_label'] = step.label
                    result = execute_action(step.action_type, params)

                    # === SELF-HEALING INTERACTIF ===
@@ -809,12 +813,20 @@ def execute_action(action_type: str, params: dict) -> dict:
                        'height': bbox.get('height', 0)
                    }

+                    # Extraire le texte cible pour le grounding en dernier recours
+                    _fc_target_text = params.get('visual_anchor', {}).get('target_text', '')
+                    if not _fc_target_text:
+                        _fc_target_text = params.get('_step_label', '')
+                    _fc_target_desc = params.get('visual_anchor', {}).get('description', '')
+
                    # Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md)
                    result = find_and_click(
                        anchor_image_base64=screenshot_base64,
                        anchor_bbox=anchor_bbox,
                        method='clip',  # UI-DETR-1 + CLIP avec pondération par distance
-                        detection_threshold=0.35
+                        detection_threshold=0.35,
+                        target_text=_fc_target_text,
+                        target_description=_fc_target_desc
                    )

                    if result['found'] and result['coordinates']:
@@ -853,6 +865,47 @@ def execute_action(action_type: str, params: dict) -> dict:
                        print(f"❌ [Vision] Ancre NON trouvée (confiance: {confidence:.2f})")
                        print(f"   Raison: {reason}")

+                        # === FALLBACK: Chaîne de grounding (OCR → UI-TARS → VLM) ===
+                        target_text = params.get('visual_anchor', {}).get('target_text', '')
+                        if not target_text:
+                            target_text = params.get('_step_label', '')
+                        target_desc = params.get('visual_anchor', {}).get('description', '')
+
+                        if target_text:
+                            print(f"🔗 [Grounding] Tentative cascade pour '{target_text}'...")
+                            grounding_result = _shared_find_element(
+                                target_text=target_text,
+                                target_description=target_desc,
+                                anchor_image_base64=screenshot_base64
+                            )
+                            if grounding_result:
+                                gx, gy = grounding_result['x'], grounding_result['y']
+                                gmethod = grounding_result['method']
+                                gconf = grounding_result['confidence']
+                                print(f"✅ [Grounding] Trouvé via {gmethod} à ({gx}, {gy}) conf={gconf:.2f}")
+
+                                # Effectuer le clic
+                                if click_type == 'double':
+                                    pyautogui.doubleClick(gx, gy)
+                                elif click_type == 'right':
+                                    pyautogui.rightClick(gx, gy)
+                                else:
+                                    pyautogui.click(gx, gy)
+
+                                time.sleep(2.0)
+
+                                return {
+                                    'success': True,
+                                    'output': {
+                                        'clicked_at': {'x': gx, 'y': gy},
+                                        'mode': execution_mode,
+                                        'confidence': gconf,
+                                        'method': f'grounding_{gmethod}'
+                                    }
+                                }
+                            else:
+                                print(f"❌ [Grounding] Cascade échouée pour '{target_text}'")
+
                        # Si self-healing interactif activé, proposer des alternatives
                        if _execution_state.get('execution_mode') == 'intelligent' and candidates:
                            print(f"🔄 [Self-Healing] {len(candidates)} candidats disponibles - attente choix utilisateur")