From 73ddcdb29dbfc950a3bbfaf13c15252aab7755e2 Mon Sep 17 00:00:00 2001 From: Dom Date: Tue, 21 Apr 2026 09:31:38 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20cha=C3=AEne=20de=20grounding=203=20nive?= =?UTF-8?q?aux=20+=20refonte=20capture=20=C3=A9cran?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Grounding en cascade quand CLIP/template échouent : 1. OCR (docTR) → cherche le texte exact sur l'écran (~1s) 2. UI-TARS grounding → "click on X" → coordonnées (~3s, 94% ScreenSpot) 3. VLM reasoning → raisonnement complet + confirmation OCR (~10s) find_element_on_screen() dans input_handler.py (partagé VWB + Léa). Câblé dans find_and_click() et execute_action() comme fallback. Refonte capture écran : - mss.monitors[0] (composite) pour capturer la VM en plein écran - FullscreenSelector réécrit : overlay via getBoundingClientRect() - Bboxes et sélection alignées avec l'image (calcul JS, pas CSS) Co-Authored-By: Claude Opus 4.6 (1M context) --- core/execution/input_handler.py | 301 ++++++++++++++++++ .../backend/api_v3/execute.py | 55 +++- .../backend/services/intelligent_executor.py | 38 ++- 3 files changed, 392 insertions(+), 2 deletions(-) diff --git a/core/execution/input_handler.py b/core/execution/input_handler.py index bc0816295..58107e817 100644 --- a/core/execution/input_handler.py +++ b/core/execution/input_handler.py @@ -327,6 +327,307 @@ Réponds UNIQUEMENT le JSON, pas d'explication.""" return None +def find_element_on_screen( + target_text: str, + target_description: str = "", + anchor_image_base64: Optional[str] = None, +) -> Optional[Dict[str, Any]]: + """ + Cherche un élément sur l'écran en utilisant 3 méthodes en cascade. + + Niveau 1 — OCR (rapide, ~1s) : docTR pour trouver le texte exact + Niveau 2 — UI-TARS grounding (~3s) : modèle GUI spécialisé + Niveau 3 — VLM reasoning (~10s) : raisonnement + OCR de confirmation + + Args: + target_text: Texte de l'élément à trouver (ex: "Demo", "Enregistrer") + target_description: Description plus longue (ex: "le dossier Demo sur le bureau") + anchor_image_base64: Image de référence de l'ancre (pour CLIP matching, réservé futur) + + Returns: + {'x': int, 'y': int, 'method': str, 'confidence': float} ou None + """ + if not target_text and not target_description: + logger.debug("find_element_on_screen: ni target_text ni target_description fournis") + return None + + search_label = target_description or target_text + logger.info(f"[Grounding] Recherche élément: '{search_label}' (cascade 3 niveaux)") + + # ─── Niveau 1 — OCR (rapide, ~1s) ─── + result = _grounding_ocr(target_text) + if result: + return result + + # ─── Niveau 2 — UI-TARS grounding (~3s) ─── + result = _grounding_ui_tars(target_text, target_description) + if result: + return result + + # ─── Niveau 3 — VLM reasoning (~10s) ─── + result = _grounding_vlm(target_text, target_description) + if result: + return result + + logger.warning(f"[Grounding] ÉCHEC total pour '{search_label}' — aucune méthode n'a trouvé l'élément") + return None + + +def _capture_screen(): + """Capture l'écran principal et retourne (PIL.Image, width, height).""" + try: + import mss + from PIL import Image as PILImage + + with mss.mss() as sct: + monitor = sct.monitors[1] + screenshot = sct.grab(monitor) + screen = PILImage.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX') + return screen, monitor['width'], monitor['height'] + except Exception as e: + logger.debug(f"Capture écran échouée: {e}") + return None, 0, 0 + + +def _grounding_ocr(target_text: str) -> Optional[Dict[str, Any]]: + """Niveau 1 — Cherche le texte par OCR (docTR). ~1s.""" + if not target_text: + return None + + try: + screen, screen_w, screen_h = _capture_screen() + if screen is None: + return None + + # Importer OCR (essayer les deux chemins) + try: + from services.ocr_service import ocr_extract_words + except ImportError: + from core.extraction.field_extractor import FieldExtractor + extractor = FieldExtractor() + def ocr_extract_words(img): + return extractor.extract_words_from_image(img) + + words = ocr_extract_words(screen) + if not words: + logger.debug("[Grounding/OCR] Aucun mot détecté") + return None + + target_lower = target_text.lower() + + # Matching exact insensible à la casse + for word in words: + if word['text'].lower() == target_lower: + x1, y1, x2, y2 = word['bbox'] + x = int((x1 + x2) / 2) + y = int((y1 + y2) / 2) + logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match exact") + return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.95} + + # Matching partiel (mot coupé : "nregistrer" pour "Enregistrer") + for word in words: + word_lower = word['text'].lower() + if len(word_lower) < 3 or len(target_lower) < 3: + continue + # Le mot OCR contient le target (ou l'inverse) + if target_lower in word_lower or word_lower in target_lower: + x1, y1, x2, y2 = word['bbox'] + x = int((x1 + x2) / 2) + y = int((y1 + y2) / 2) + logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match partiel") + return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.80} + + # Matching partiel lettre initiale manquante (soulignée ou coupée) + if len(target_lower) > 3: + partial = target_lower[1:] + for word in words: + if partial in word['text'].lower(): + x1, y1, x2, y2 = word['bbox'] + x = int((x1 + x2) / 2) + y = int((y1 + y2) / 2) + logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match partiel (lettre initiale manquante)") + return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.70} + + logger.debug(f"[Grounding/OCR] '{target_text}' non trouvé parmi {len(words)} mots") + return None + + except Exception as e: + logger.debug(f"[Grounding/OCR] Erreur: {e}") + return None + + +def _grounding_ui_tars(target_text: str, target_description: str = "") -> Optional[Dict[str, Any]]: + """Niveau 2 — UI-TARS grounding visuel (~3s).""" + try: + import requests + import base64 + import io + import re + import os + + screen, screen_w, screen_h = _capture_screen() + if screen is None: + return None + + # Encoder le screenshot en base64 + buffer = io.BytesIO() + screen.save(buffer, format='JPEG', quality=70) + image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8') + + # Construire le prompt pour UI-TARS + click_target = target_description or target_text + prompt = f"click on {click_target}" + + ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434") + model = "0000/ui-tars-1.5-7b-q8_0:7b" + + logger.info(f"[Grounding/UI-TARS] Envoi à {model}: '{prompt}'") + + response = requests.post( + f"{ollama_url}/api/generate", + json={ + "model": model, + "prompt": prompt, + "images": [image_b64], + "stream": False, + "options": {"temperature": 0.1, "num_predict": 50} + }, + timeout=30 + ) + + if response.status_code != 200: + logger.warning(f"[Grounding/UI-TARS] HTTP {response.status_code}") + return None + + result = response.json() + text = result.get('response', '').strip() + logger.debug(f"[Grounding/UI-TARS] Réponse brute: {text[:200]}") + + # Parser les coordonnées de UI-TARS + coords = _parse_ui_tars_coordinates(text, screen_w, screen_h) + if coords: + x, y = coords + # Valider que les coordonnées sont dans l'écran + if 0 <= x <= screen_w and 0 <= y <= screen_h: + logger.info(f"[Grounding/UI-TARS] Grounding → ({x}, {y})") + return {'x': x, 'y': y, 'method': 'ui_tars', 'confidence': 0.85} + else: + logger.warning(f"[Grounding/UI-TARS] Coordonnées hors écran: ({x}, {y}) pour {screen_w}x{screen_h}") + return None + + logger.debug(f"[Grounding/UI-TARS] Pas de coordonnées parsées dans: {text[:100]}") + return None + + except Exception as e: + logger.debug(f"[Grounding/UI-TARS] Erreur: {e}") + return None + + +def _parse_ui_tars_coordinates(text: str, screen_w: int, screen_h: int) -> Optional[tuple]: + """Parse les coordonnées retournées par UI-TARS. + + UI-TARS peut retourner : + - Coordonnées normalisées (0-1000) : "click at (500, 300)" + - Coordonnées en pixels : "click at (960, 540)" + - Format (x, y) ou [x, y] ou x,y + - Format "Action: click\nCoordinate: (500, 300)" ou "[500, 300]" + + Returns: + (x_pixel, y_pixel) ou None + """ + import re + + # Chercher des patterns de coordonnées + patterns = [ + r'Coordinate:\s*\[?\(?\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)?\]?', + r'click\s+(?:at\s+)?\[?\(?\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)?\]?', + r'\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)', + r'\[\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\]', + ] + + for pattern in patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + raw_x = float(match.group(1)) + raw_y = float(match.group(2)) + + # UI-TARS utilise souvent des coordonnées normalisées 0-1000 + if raw_x <= 1000 and raw_y <= 1000 and (raw_x > 1 or raw_y > 1): + # Probablement normalisées sur 1000 + x = int(raw_x * screen_w / 1000) + y = int(raw_y * screen_h / 1000) + elif raw_x <= 1.0 and raw_y <= 1.0: + # Normalisées 0-1 + x = int(raw_x * screen_w) + y = int(raw_y * screen_h) + else: + # Pixels directs + x = int(raw_x) + y = int(raw_y) + + return (x, y) + + return None + + +def _grounding_vlm(target_text: str, target_description: str = "") -> Optional[Dict[str, Any]]: + """Niveau 3 — VLM reasoning + confirmation OCR (~10s).""" + try: + search_label = target_description or target_text + + vlm_result = vlm_reason_about_screen( + objective=f"Cliquer sur {search_label}", + context=f"Je cherche l'élément '{target_text}' sur l'écran pour cliquer dessus" + ) + + if not vlm_result: + logger.debug("[Grounding/VLM] VLM n'a pas retourné de résultat") + return None + + if vlm_result.get('action') != 'click' or not vlm_result.get('target'): + logger.debug(f"[Grounding/VLM] VLM action={vlm_result.get('action')}, pas un clic") + return None + + vlm_target = vlm_result['target'] + logger.info(f"[Grounding/VLM] VLM suggère de cliquer sur: '{vlm_target}'") + + # Confirmation par OCR : chercher le target VLM sur l'écran + screen, screen_w, screen_h = _capture_screen() + if screen is None: + return None + + try: + try: + from services.ocr_service import ocr_extract_words + except ImportError: + from core.extraction.field_extractor import FieldExtractor + extractor = FieldExtractor() + def ocr_extract_words(img): + return extractor.extract_words_from_image(img) + + words = ocr_extract_words(screen) + + vlm_target_lower = vlm_target.lower() + for word in words: + if vlm_target_lower in word['text'].lower() or word['text'].lower() in vlm_target_lower: + x1, y1, x2, y2 = word['bbox'] + x = int((x1 + x2) / 2) + y = int((y1 + y2) / 2) + logger.info(f"[Grounding/VLM] Confirmé par OCR: '{word['text']}' à ({x}, {y})") + return {'x': x, 'y': y, 'method': 'vlm', 'confidence': 0.75} + + logger.debug(f"[Grounding/VLM] Target VLM '{vlm_target}' non trouvé par OCR") + return None + + except Exception as e: + logger.debug(f"[Grounding/VLM] OCR de confirmation échoué: {e}") + return None + + except Exception as e: + logger.debug(f"[Grounding/VLM] Erreur: {e}") + return None + + def post_execution_cleanup(execution_mode: str = 'debug'): """Vérifie l'écran après exécution et gère les dialogues restants. diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py index c9e5864b9..7e978cdff 100644 --- a/visual_workflow_builder/backend/api_v3/execute.py +++ b/visual_workflow_builder/backend/api_v3/execute.py @@ -29,6 +29,7 @@ from core.execution.input_handler import ( check_screen_for_patterns as _shared_check_patterns, handle_detected_pattern as _shared_handle_pattern, post_execution_cleanup as _shared_post_cleanup, + find_element_on_screen as _shared_find_element, ) @@ -213,6 +214,9 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app): break # === EXÉCUTION DE L'ACTION === + # Passer le label de l'étape pour le grounding textuel + if step.label: + params['_step_label'] = step.label result = execute_action(step.action_type, params) # === SELF-HEALING INTERACTIF === @@ -809,12 +813,20 @@ def execute_action(action_type: str, params: dict) -> dict: 'height': bbox.get('height', 0) } + # Extraire le texte cible pour le grounding en dernier recours + _fc_target_text = params.get('visual_anchor', {}).get('target_text', '') + if not _fc_target_text: + _fc_target_text = params.get('_step_label', '') + _fc_target_desc = params.get('visual_anchor', {}).get('description', '') + # Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md) result = find_and_click( anchor_image_base64=screenshot_base64, anchor_bbox=anchor_bbox, method='clip', # UI-DETR-1 + CLIP avec pondération par distance - detection_threshold=0.35 + detection_threshold=0.35, + target_text=_fc_target_text, + target_description=_fc_target_desc ) if result['found'] and result['coordinates']: @@ -853,6 +865,47 @@ def execute_action(action_type: str, params: dict) -> dict: print(f"❌ [Vision] Ancre NON trouvée (confiance: {confidence:.2f})") print(f" Raison: {reason}") + # === FALLBACK: Chaîne de grounding (OCR → UI-TARS → VLM) === + target_text = params.get('visual_anchor', {}).get('target_text', '') + if not target_text: + target_text = params.get('_step_label', '') + target_desc = params.get('visual_anchor', {}).get('description', '') + + if target_text: + print(f"🔗 [Grounding] Tentative cascade pour '{target_text}'...") + grounding_result = _shared_find_element( + target_text=target_text, + target_description=target_desc, + anchor_image_base64=screenshot_base64 + ) + if grounding_result: + gx, gy = grounding_result['x'], grounding_result['y'] + gmethod = grounding_result['method'] + gconf = grounding_result['confidence'] + print(f"✅ [Grounding] Trouvé via {gmethod} à ({gx}, {gy}) conf={gconf:.2f}") + + # Effectuer le clic + if click_type == 'double': + pyautogui.doubleClick(gx, gy) + elif click_type == 'right': + pyautogui.rightClick(gx, gy) + else: + pyautogui.click(gx, gy) + + time.sleep(2.0) + + return { + 'success': True, + 'output': { + 'clicked_at': {'x': gx, 'y': gy}, + 'mode': execution_mode, + 'confidence': gconf, + 'method': f'grounding_{gmethod}' + } + } + else: + print(f"❌ [Grounding] Cascade échouée pour '{target_text}'") + # Si self-healing interactif activé, proposer des alternatives if _execution_state.get('execution_mode') == 'intelligent' and candidates: print(f"🔄 [Self-Healing] {len(candidates)} candidats disponibles - attente choix utilisateur") diff --git a/visual_workflow_builder/backend/services/intelligent_executor.py b/visual_workflow_builder/backend/services/intelligent_executor.py index 27b1d9f09..30cf5b275 100644 --- a/visual_workflow_builder/backend/services/intelligent_executor.py +++ b/visual_workflow_builder/backend/services/intelligent_executor.py @@ -656,7 +656,9 @@ def find_and_click( anchor_image_base64: str, anchor_bbox: Optional[Dict[str, int]] = None, method: str = 'clip', - detection_threshold: float = 0.35 + detection_threshold: float = 0.35, + target_text: str = '', + target_description: str = '' ) -> Dict[str, Any]: """ Fonction utilitaire pour trouver une ancre et retourner les coordonnées de clic. @@ -665,11 +667,16 @@ def find_and_click( - 'clip': UI-DETR-1 + CLIP (matching sémantique intelligent, recommandé) - 'zoned': Template matching zonée (fallback) + En dernier recours, si target_text est fourni, utilise la chaîne de grounding + (OCR → UI-TARS → VLM) via find_element_on_screen. + Args: anchor_image_base64: Image de l'ancre en base64 anchor_bbox: Bounding box originale method: 'clip' pour UI-DETR-1+CLIP, 'zoned' pour template zonée detection_threshold: Seuil de détection pour UI-DETR-1 + target_text: Texte de l'élément à trouver (pour fallback grounding) + target_description: Description longue (pour fallback grounding) Returns: Dict avec found, coordinates, confidence, etc. @@ -815,6 +822,35 @@ def find_and_click( except Exception as seeclick_err: print(f"⚠️ [Vision] Erreur SeeClick: {seeclick_err}") + # === FALLBACK: Chaîne de grounding (OCR → UI-TARS → VLM) === + if target_text or target_description: + try: + from core.execution.input_handler import find_element_on_screen + print(f"🔗 [Vision] Dernier recours: chaîne de grounding pour '{target_text or target_description}'...") + grounding_result = find_element_on_screen( + target_text=target_text, + target_description=target_description, + anchor_image_base64=anchor_image_base64 + ) + if grounding_result: + gx, gy = grounding_result['x'], grounding_result['y'] + gmethod = grounding_result['method'] + gconf = grounding_result['confidence'] + print(f"✅ [Vision] Grounding réussi via {gmethod} à ({gx}, {gy}) conf={gconf:.2f}") + return { + 'found': True, + 'confidence': gconf, + 'coordinates': {'x': gx, 'y': gy}, + 'bbox': anchor_bbox, + 'method': f'grounding_{gmethod}', + 'search_time_ms': (_time.time() - start_time) * 1000, + 'candidates': [] + } + else: + print(f"❌ [Vision] Chaîne de grounding échouée pour '{target_text or target_description}'") + except Exception as grounding_err: + print(f"⚠️ [Vision] Erreur chaîne de grounding: {grounding_err}") + # === Toutes les méthodes visuelles ont échoué === if anchor_bbox: best_conf = max(global_result.get('confidence', 0), 0)