From 73ddcdb29dbfc950a3bbfaf13c15252aab7755e2 Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Tue, 21 Apr 2026 09:31:38 +0200
Subject: [PATCH] =?UTF-8?q?feat:=20cha=C3=AEne=20de=20grounding=203=20nive?=
 =?UTF-8?q?aux=20+=20refonte=20capture=20=C3=A9cran?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Grounding en cascade quand CLIP/template échouent :
1. OCR (docTR) → cherche le texte exact sur l'écran (~1s)
2. UI-TARS grounding → "click on X" → coordonnées (~3s, 94% ScreenSpot)
3. VLM reasoning → raisonnement complet + confirmation OCR (~10s)

find_element_on_screen() dans input_handler.py (partagé VWB + Léa).
Câblé dans find_and_click() et execute_action() comme fallback.

Refonte capture écran :
- mss.monitors[0] (composite) pour capturer la VM en plein écran
- FullscreenSelector réécrit : overlay via getBoundingClientRect()
- Bboxes et sélection alignées avec l'image (calcul JS, pas CSS)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 core/execution/input_handler.py               | 301 ++++++++++++++++++
 .../backend/api_v3/execute.py                 |  55 +++-
 .../backend/services/intelligent_executor.py  |  38 ++-
 3 files changed, 392 insertions(+), 2 deletions(-)

diff --git a/core/execution/input_handler.py b/core/execution/input_handler.py
index bc0816295..58107e817 100644
--- a/core/execution/input_handler.py
+++ b/core/execution/input_handler.py
@@ -327,6 +327,307 @@ Réponds UNIQUEMENT le JSON, pas d'explication."""
         return None
 
 
+def find_element_on_screen(
+    target_text: str,
+    target_description: str = "",
+    anchor_image_base64: Optional[str] = None,
+) -> Optional[Dict[str, Any]]:
+    """
+    Cherche un élément sur l'écran en utilisant 3 méthodes en cascade.
+
+    Niveau 1 — OCR (rapide, ~1s) : docTR pour trouver le texte exact
+    Niveau 2 — UI-TARS grounding (~3s) : modèle GUI spécialisé
+    Niveau 3 — VLM reasoning (~10s) : raisonnement + OCR de confirmation
+
+    Args:
+        target_text: Texte de l'élément à trouver (ex: "Demo", "Enregistrer")
+        target_description: Description plus longue (ex: "le dossier Demo sur le bureau")
+        anchor_image_base64: Image de référence de l'ancre (pour CLIP matching, réservé futur)
+
+    Returns:
+        {'x': int, 'y': int, 'method': str, 'confidence': float} ou None
+    """
+    if not target_text and not target_description:
+        logger.debug("find_element_on_screen: ni target_text ni target_description fournis")
+        return None
+
+    search_label = target_description or target_text
+    logger.info(f"[Grounding] Recherche élément: '{search_label}' (cascade 3 niveaux)")
+
+    # ─── Niveau 1 — OCR (rapide, ~1s) ───
+    result = _grounding_ocr(target_text)
+    if result:
+        return result
+
+    # ─── Niveau 2 — UI-TARS grounding (~3s) ───
+    result = _grounding_ui_tars(target_text, target_description)
+    if result:
+        return result
+
+    # ─── Niveau 3 — VLM reasoning (~10s) ───
+    result = _grounding_vlm(target_text, target_description)
+    if result:
+        return result
+
+    logger.warning(f"[Grounding] ÉCHEC total pour '{search_label}' — aucune méthode n'a trouvé l'élément")
+    return None
+
+
+def _capture_screen():
+    """Capture l'écran principal et retourne (PIL.Image, width, height)."""
+    try:
+        import mss
+        from PIL import Image as PILImage
+
+        with mss.mss() as sct:
+            monitor = sct.monitors[1]
+            screenshot = sct.grab(monitor)
+            screen = PILImage.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')
+            return screen, monitor['width'], monitor['height']
+    except Exception as e:
+        logger.debug(f"Capture écran échouée: {e}")
+        return None, 0, 0
+
+
+def _grounding_ocr(target_text: str) -> Optional[Dict[str, Any]]:
+    """Niveau 1 — Cherche le texte par OCR (docTR). ~1s."""
+    if not target_text:
+        return None
+
+    try:
+        screen, screen_w, screen_h = _capture_screen()
+        if screen is None:
+            return None
+
+        # Importer OCR (essayer les deux chemins)
+        try:
+            from services.ocr_service import ocr_extract_words
+        except ImportError:
+            from core.extraction.field_extractor import FieldExtractor
+            extractor = FieldExtractor()
+            def ocr_extract_words(img):
+                return extractor.extract_words_from_image(img)
+
+        words = ocr_extract_words(screen)
+        if not words:
+            logger.debug("[Grounding/OCR] Aucun mot détecté")
+            return None
+
+        target_lower = target_text.lower()
+
+        # Matching exact insensible à la casse
+        for word in words:
+            if word['text'].lower() == target_lower:
+                x1, y1, x2, y2 = word['bbox']
+                x = int((x1 + x2) / 2)
+                y = int((y1 + y2) / 2)
+                logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match exact")
+                return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.95}
+
+        # Matching partiel (mot coupé : "nregistrer" pour "Enregistrer")
+        for word in words:
+            word_lower = word['text'].lower()
+            if len(word_lower) < 3 or len(target_lower) < 3:
+                continue
+            # Le mot OCR contient le target (ou l'inverse)
+            if target_lower in word_lower or word_lower in target_lower:
+                x1, y1, x2, y2 = word['bbox']
+                x = int((x1 + x2) / 2)
+                y = int((y1 + y2) / 2)
+                logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match partiel")
+                return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.80}
+
+        # Matching partiel lettre initiale manquante (soulignée ou coupée)
+        if len(target_lower) > 3:
+            partial = target_lower[1:]
+            for word in words:
+                if partial in word['text'].lower():
+                    x1, y1, x2, y2 = word['bbox']
+                    x = int((x1 + x2) / 2)
+                    y = int((y1 + y2) / 2)
+                    logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match partiel (lettre initiale manquante)")
+                    return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.70}
+
+        logger.debug(f"[Grounding/OCR] '{target_text}' non trouvé parmi {len(words)} mots")
+        return None
+
+    except Exception as e:
+        logger.debug(f"[Grounding/OCR] Erreur: {e}")
+        return None
+
+
+def _grounding_ui_tars(target_text: str, target_description: str = "") -> Optional[Dict[str, Any]]:
+    """Niveau 2 — UI-TARS grounding visuel (~3s)."""
+    try:
+        import requests
+        import base64
+        import io
+        import re
+        import os
+
+        screen, screen_w, screen_h = _capture_screen()
+        if screen is None:
+            return None
+
+        # Encoder le screenshot en base64
+        buffer = io.BytesIO()
+        screen.save(buffer, format='JPEG', quality=70)
+        image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+
+        # Construire le prompt pour UI-TARS
+        click_target = target_description or target_text
+        prompt = f"click on {click_target}"
+
+        ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
+        model = "0000/ui-tars-1.5-7b-q8_0:7b"
+
+        logger.info(f"[Grounding/UI-TARS] Envoi à {model}: '{prompt}'")
+
+        response = requests.post(
+            f"{ollama_url}/api/generate",
+            json={
+                "model": model,
+                "prompt": prompt,
+                "images": [image_b64],
+                "stream": False,
+                "options": {"temperature": 0.1, "num_predict": 50}
+            },
+            timeout=30
+        )
+
+        if response.status_code != 200:
+            logger.warning(f"[Grounding/UI-TARS] HTTP {response.status_code}")
+            return None
+
+        result = response.json()
+        text = result.get('response', '').strip()
+        logger.debug(f"[Grounding/UI-TARS] Réponse brute: {text[:200]}")
+
+        # Parser les coordonnées de UI-TARS
+        coords = _parse_ui_tars_coordinates(text, screen_w, screen_h)
+        if coords:
+            x, y = coords
+            # Valider que les coordonnées sont dans l'écran
+            if 0 <= x <= screen_w and 0 <= y <= screen_h:
+                logger.info(f"[Grounding/UI-TARS] Grounding → ({x}, {y})")
+                return {'x': x, 'y': y, 'method': 'ui_tars', 'confidence': 0.85}
+            else:
+                logger.warning(f"[Grounding/UI-TARS] Coordonnées hors écran: ({x}, {y}) pour {screen_w}x{screen_h}")
+                return None
+
+        logger.debug(f"[Grounding/UI-TARS] Pas de coordonnées parsées dans: {text[:100]}")
+        return None
+
+    except Exception as e:
+        logger.debug(f"[Grounding/UI-TARS] Erreur: {e}")
+        return None
+
+
+def _parse_ui_tars_coordinates(text: str, screen_w: int, screen_h: int) -> Optional[tuple]:
+    """Parse les coordonnées retournées par UI-TARS.
+
+    UI-TARS peut retourner :
+    - Coordonnées normalisées (0-1000) : "click at (500, 300)"
+    - Coordonnées en pixels : "click at (960, 540)"
+    - Format (x, y) ou [x, y] ou x,y
+    - Format "Action: click\nCoordinate: (500, 300)" ou "[500, 300]"
+
+    Returns:
+        (x_pixel, y_pixel) ou None
+    """
+    import re
+
+    # Chercher des patterns de coordonnées
+    patterns = [
+        r'Coordinate:\s*\[?\(?\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)?\]?',
+        r'click\s+(?:at\s+)?\[?\(?\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)?\]?',
+        r'\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)',
+        r'\[\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\]',
+    ]
+
+    for pattern in patterns:
+        match = re.search(pattern, text, re.IGNORECASE)
+        if match:
+            raw_x = float(match.group(1))
+            raw_y = float(match.group(2))
+
+            # UI-TARS utilise souvent des coordonnées normalisées 0-1000
+            if raw_x <= 1000 and raw_y <= 1000 and (raw_x > 1 or raw_y > 1):
+                # Probablement normalisées sur 1000
+                x = int(raw_x * screen_w / 1000)
+                y = int(raw_y * screen_h / 1000)
+            elif raw_x <= 1.0 and raw_y <= 1.0:
+                # Normalisées 0-1
+                x = int(raw_x * screen_w)
+                y = int(raw_y * screen_h)
+            else:
+                # Pixels directs
+                x = int(raw_x)
+                y = int(raw_y)
+
+            return (x, y)
+
+    return None
+
+
+def _grounding_vlm(target_text: str, target_description: str = "") -> Optional[Dict[str, Any]]:
+    """Niveau 3 — VLM reasoning + confirmation OCR (~10s)."""
+    try:
+        search_label = target_description or target_text
+
+        vlm_result = vlm_reason_about_screen(
+            objective=f"Cliquer sur {search_label}",
+            context=f"Je cherche l'élément '{target_text}' sur l'écran pour cliquer dessus"
+        )
+
+        if not vlm_result:
+            logger.debug("[Grounding/VLM] VLM n'a pas retourné de résultat")
+            return None
+
+        if vlm_result.get('action') != 'click' or not vlm_result.get('target'):
+            logger.debug(f"[Grounding/VLM] VLM action={vlm_result.get('action')}, pas un clic")
+            return None
+
+        vlm_target = vlm_result['target']
+        logger.info(f"[Grounding/VLM] VLM suggère de cliquer sur: '{vlm_target}'")
+
+        # Confirmation par OCR : chercher le target VLM sur l'écran
+        screen, screen_w, screen_h = _capture_screen()
+        if screen is None:
+            return None
+
+        try:
+            try:
+                from services.ocr_service import ocr_extract_words
+            except ImportError:
+                from core.extraction.field_extractor import FieldExtractor
+                extractor = FieldExtractor()
+                def ocr_extract_words(img):
+                    return extractor.extract_words_from_image(img)
+
+            words = ocr_extract_words(screen)
+
+            vlm_target_lower = vlm_target.lower()
+            for word in words:
+                if vlm_target_lower in word['text'].lower() or word['text'].lower() in vlm_target_lower:
+                    x1, y1, x2, y2 = word['bbox']
+                    x = int((x1 + x2) / 2)
+                    y = int((y1 + y2) / 2)
+                    logger.info(f"[Grounding/VLM] Confirmé par OCR: '{word['text']}' à ({x}, {y})")
+                    return {'x': x, 'y': y, 'method': 'vlm', 'confidence': 0.75}
+
+            logger.debug(f"[Grounding/VLM] Target VLM '{vlm_target}' non trouvé par OCR")
+            return None
+
+        except Exception as e:
+            logger.debug(f"[Grounding/VLM] OCR de confirmation échoué: {e}")
+            return None
+
+    except Exception as e:
+        logger.debug(f"[Grounding/VLM] Erreur: {e}")
+        return None
+
+
 def post_execution_cleanup(execution_mode: str = 'debug'):
     """Vérifie l'écran après exécution et gère les dialogues restants.
 
diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py
index c9e5864b9..7e978cdff 100644
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -29,6 +29,7 @@ from core.execution.input_handler import (
     check_screen_for_patterns as _shared_check_patterns,
     handle_detected_pattern as _shared_handle_pattern,
     post_execution_cleanup as _shared_post_cleanup,
+    find_element_on_screen as _shared_find_element,
 )
 
 
@@ -213,6 +214,9 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app):
                         break
 
                     # === EXÉCUTION DE L'ACTION ===
+                    # Passer le label de l'étape pour le grounding textuel
+                    if step.label:
+                        params['_step_label'] = step.label
                     result = execute_action(step.action_type, params)
 
                     # === SELF-HEALING INTERACTIF ===
@@ -809,12 +813,20 @@ def execute_action(action_type: str, params: dict) -> dict:
                         'height': bbox.get('height', 0)
                     }
 
+                    # Extraire le texte cible pour le grounding en dernier recours
+                    _fc_target_text = params.get('visual_anchor', {}).get('target_text', '')
+                    if not _fc_target_text:
+                        _fc_target_text = params.get('_step_label', '')
+                    _fc_target_desc = params.get('visual_anchor', {}).get('description', '')
+
                     # Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md)
                     result = find_and_click(
                         anchor_image_base64=screenshot_base64,
                         anchor_bbox=anchor_bbox,
                         method='clip',  # UI-DETR-1 + CLIP avec pondération par distance
-                        detection_threshold=0.35
+                        detection_threshold=0.35,
+                        target_text=_fc_target_text,
+                        target_description=_fc_target_desc
                     )
 
                     if result['found'] and result['coordinates']:
@@ -853,6 +865,47 @@ def execute_action(action_type: str, params: dict) -> dict:
                         print(f"❌ [Vision] Ancre NON trouvée (confiance: {confidence:.2f})")
                         print(f"   Raison: {reason}")
 
+                        # === FALLBACK: Chaîne de grounding (OCR → UI-TARS → VLM) ===
+                        target_text = params.get('visual_anchor', {}).get('target_text', '')
+                        if not target_text:
+                            target_text = params.get('_step_label', '')
+                        target_desc = params.get('visual_anchor', {}).get('description', '')
+
+                        if target_text:
+                            print(f"🔗 [Grounding] Tentative cascade pour '{target_text}'...")
+                            grounding_result = _shared_find_element(
+                                target_text=target_text,
+                                target_description=target_desc,
+                                anchor_image_base64=screenshot_base64
+                            )
+                            if grounding_result:
+                                gx, gy = grounding_result['x'], grounding_result['y']
+                                gmethod = grounding_result['method']
+                                gconf = grounding_result['confidence']
+                                print(f"✅ [Grounding] Trouvé via {gmethod} à ({gx}, {gy}) conf={gconf:.2f}")
+
+                                # Effectuer le clic
+                                if click_type == 'double':
+                                    pyautogui.doubleClick(gx, gy)
+                                elif click_type == 'right':
+                                    pyautogui.rightClick(gx, gy)
+                                else:
+                                    pyautogui.click(gx, gy)
+
+                                time.sleep(2.0)
+
+                                return {
+                                    'success': True,
+                                    'output': {
+                                        'clicked_at': {'x': gx, 'y': gy},
+                                        'mode': execution_mode,
+                                        'confidence': gconf,
+                                        'method': f'grounding_{gmethod}'
+                                    }
+                                }
+                            else:
+                                print(f"❌ [Grounding] Cascade échouée pour '{target_text}'")
+
                         # Si self-healing interactif activé, proposer des alternatives
                         if _execution_state.get('execution_mode') == 'intelligent' and candidates:
                             print(f"🔄 [Self-Healing] {len(candidates)} candidats disponibles - attente choix utilisateur")
diff --git a/visual_workflow_builder/backend/services/intelligent_executor.py b/visual_workflow_builder/backend/services/intelligent_executor.py
index 27b1d9f09..30cf5b275 100644
--- a/visual_workflow_builder/backend/services/intelligent_executor.py
+++ b/visual_workflow_builder/backend/services/intelligent_executor.py
@@ -656,7 +656,9 @@ def find_and_click(
     anchor_image_base64: str,
     anchor_bbox: Optional[Dict[str, int]] = None,
     method: str = 'clip',
-    detection_threshold: float = 0.35
+    detection_threshold: float = 0.35,
+    target_text: str = '',
+    target_description: str = ''
 ) -> Dict[str, Any]:
     """
     Fonction utilitaire pour trouver une ancre et retourner les coordonnées de clic.
@@ -665,11 +667,16 @@ def find_and_click(
     - 'clip': UI-DETR-1 + CLIP (matching sémantique intelligent, recommandé)
     - 'zoned': Template matching zonée (fallback)
 
+    En dernier recours, si target_text est fourni, utilise la chaîne de grounding
+    (OCR → UI-TARS → VLM) via find_element_on_screen.
+
     Args:
         anchor_image_base64: Image de l'ancre en base64
         anchor_bbox: Bounding box originale
         method: 'clip' pour UI-DETR-1+CLIP, 'zoned' pour template zonée
         detection_threshold: Seuil de détection pour UI-DETR-1
+        target_text: Texte de l'élément à trouver (pour fallback grounding)
+        target_description: Description longue (pour fallback grounding)
 
     Returns:
         Dict avec found, coordinates, confidence, etc.
@@ -815,6 +822,35 @@ def find_and_click(
         except Exception as seeclick_err:
             print(f"⚠️ [Vision] Erreur SeeClick: {seeclick_err}")
 
+        # === FALLBACK: Chaîne de grounding (OCR → UI-TARS → VLM) ===
+        if target_text or target_description:
+            try:
+                from core.execution.input_handler import find_element_on_screen
+                print(f"🔗 [Vision] Dernier recours: chaîne de grounding pour '{target_text or target_description}'...")
+                grounding_result = find_element_on_screen(
+                    target_text=target_text,
+                    target_description=target_description,
+                    anchor_image_base64=anchor_image_base64
+                )
+                if grounding_result:
+                    gx, gy = grounding_result['x'], grounding_result['y']
+                    gmethod = grounding_result['method']
+                    gconf = grounding_result['confidence']
+                    print(f"✅ [Vision] Grounding réussi via {gmethod} à ({gx}, {gy}) conf={gconf:.2f}")
+                    return {
+                        'found': True,
+                        'confidence': gconf,
+                        'coordinates': {'x': gx, 'y': gy},
+                        'bbox': anchor_bbox,
+                        'method': f'grounding_{gmethod}',
+                        'search_time_ms': (_time.time() - start_time) * 1000,
+                        'candidates': []
+                    }
+                else:
+                    print(f"❌ [Vision] Chaîne de grounding échouée pour '{target_text or target_description}'")
+            except Exception as grounding_err:
+                print(f"⚠️ [Vision] Erreur chaîne de grounding: {grounding_err}")
+
         # === Toutes les méthodes visuelles ont échoué ===
         if anchor_bbox:
             best_conf = max(global_result.get('confidence', 0), 0)