From ffd97ae9a5de43093eb96a6c08a32b2e9159df31 Mon Sep 17 00:00:00 2001 From: Dom Date: Mon, 20 Apr 2026 11:06:17 +0200 Subject: [PATCH] =?UTF-8?q?feat(knowledge):=20d=C3=A9tection=20et=20gestio?= =?UTF-8?q?n=20automatique=20des=20dialogues=20UI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UIPatternLibrary câblée dans l'executor et le stream processor. Pendant un wait_for_anchor, Léa surveille l'écran toutes les secondes : 1. OCR plein écran (docTR) 2. Pattern matching (dialogues Save, OK, Cancel, cookies...) 3. OCR ciblé pour trouver le bouton par son texte réel 4. Clic sur le match le plus bas (bouton, pas titre) Fix : seuil ratio supprimé (trigger trouvé = match, quelle que soit la longueur du texte OCR). Matching strict mot exact ≥3 chars (évite les faux positifs sur lettres isolées). Fallback recherche partielle pour les lettres soulignées (E_nregistrer). Plus aucune coordonnée hardcodée — 100% vision. Co-Authored-By: Claude Opus 4.6 (1M context) --- core/knowledge/ui_patterns.py | 3 +- .../backend/api_v3/execute.py | 155 +++++++++++++++--- 2 files changed, 131 insertions(+), 27 deletions(-) diff --git a/core/knowledge/ui_patterns.py b/core/knowledge/ui_patterns.py index 740c62e0e..7cc401b5a 100644 --- a/core/knowledge/ui_patterns.py +++ b/core/knowledge/ui_patterns.py @@ -50,6 +50,7 @@ BUILTIN_PATTERNS: List[Dict[str, Any]] = [ "triggers": [ "voulez-vous enregistrer", "do you want to save", "save changes", "enregistrer les modifications", + "enregistrer sous", "save as", "sauvegarder", "unsaved changes", ], "action": "click", @@ -328,7 +329,7 @@ class UIPatternLibrary: score = trigger_score matched_trigger = trigger - if score > best_score and score > 0.05: + if score > best_score and matched_trigger is not None: best_score = score best_match = { "pattern": pattern.name, diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py index df57d08a2..4e27fc831 
100644 --- a/visual_workflow_builder/backend/api_v3/execute.py +++ b/visual_workflow_builder/backend/api_v3/execute.py @@ -195,6 +195,10 @@ def _check_screen_for_patterns() -> Optional[Dict[str, Any]]: import numpy as np lib = UIPatternLibrary() + # Debug: vérifier les triggers du dialog_save + save_patterns = [p for p in lib._patterns if p.name == 'dialog_save'] + if save_patterns: + print(f" 🔎 [Pattern] dialog_save triggers: {save_patterns[0].triggers}") with mss.mss() as sct: monitor = sct.monitors[1] @@ -208,42 +212,107 @@ def _check_screen_for_patterns() -> Optional[Dict[str, Any]]: return None if not ocr_text or len(ocr_text) < 5: + print(f" 🔎 [Pattern] OCR vide ou trop court ({len(ocr_text) if ocr_text else 0} chars)") return None - pattern = lib.find_pattern(ocr_text) - if pattern and pattern['category'] in ('dialog', 'popup'): - print(f"🧠 [Pattern] Détecté: {pattern['pattern']} → {pattern['action']} '{pattern['target']}'") - print(f" Texte OCR: {ocr_text[:100]}...") - return pattern + print(f" 🔎 [Pattern] OCR ({len(ocr_text)} chars): {ocr_text[:500]}") - return None + pattern = lib.find_pattern(ocr_text) + if pattern: + print(f" 🔎 [Pattern] Match: {pattern['pattern']} (category={pattern['category']})") + if pattern['category'] in ('dialog', 'popup'): + print(f"🧠 [Pattern] DÉTECTÉ: {pattern['pattern']} → {pattern['action']} '{pattern['target']}'") + return pattern + else: + print(f" 🔎 [Pattern] Ignoré (catégorie {pattern['category']})") + return None + else: + print(f" 🔎 [Pattern] Aucun match dans le texte OCR") + return None except Exception as e: - logger.debug(f"Pattern check échoué: {e}") + import traceback + print(f" 🔎 [Pattern] EXCEPTION: {e}") + traceback.print_exc() return None def _handle_detected_pattern(pattern: Dict[str, Any]) -> bool: - """Gère automatiquement un pattern UI détecté (clic sur OK, fermer popup, etc.). + """Gère automatiquement un pattern UI détecté. - Returns: - True si le pattern a été géré avec succès. 
+ Cherche le bouton cible via OCR (position réelle sur l'écran), + sans fallback sur des coordonnées typiques : aucun clic si l'OCR ne le trouve pas. """ import pyautogui action = pattern.get('action') target = pattern.get('target', '') - bbox = pattern.get('typical_bbox') alternatives = pattern.get('alternatives', []) - if action == 'click' and bbox: - screen_w, screen_h = pyautogui.size() - x = int((bbox[0] + bbox[2]) / 2 * screen_w) - y = int((bbox[1] + bbox[3]) / 2 * screen_h) - print(f"🤖 [Pattern] Clic automatique sur '{target}' à ({x}, {y})") - pyautogui.click(x, y) - time.sleep(1.0) - return True + if action == 'click': + candidates = [target] + alternatives + + # Chercher le bouton via OCR sur l'écran actuel + try: + import mss + from PIL import Image + from services.ocr_service import ocr_extract_words + + with mss.mss() as sct: + monitor = sct.monitors[1] + screenshot = sct.grab(monitor) + screen = Image.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX') + + words = ocr_extract_words(screen) + + # Collecter TOUS les matchs, puis prendre le plus bas (boutons = bas du dialogue) + all_matches = [] + + for candidate in candidates: + candidate_lower = candidate.lower() + for word in words: + word_text = word['text'].lower() + if len(word_text) < 3 or len(candidate_lower) < 3: + continue + if word_text == candidate_lower: + x1, y1, x2, y2 = word['bbox'] + all_matches.append({ + 'text': word['text'], + 'x': int((x1 + x2) / 2), + 'y': int((y1 + y2) / 2), + 'match_type': 'exact', + }) + + # Recherche partielle (ex: "nregistrer" sans le E souligné) + if not all_matches: + for candidate in candidates: + if len(candidate) > 3: + partial = candidate[1:].lower() + for word in words: + if partial in word['text'].lower(): + x1, y1, x2, y2 = word['bbox'] + all_matches.append({ + 'text': word['text'], + 'x': int((x1 + x2) / 2), + 'y': int((y1 + y2) / 2), + 'match_type': 'partial', + }) + + if all_matches: + for m in all_matches: + print(f" 🔎 [Pattern] Candidat: 
'{m['text']}' à ({m['x']}, {m['y']}) [{m['match_type']}]") + + best = max(all_matches, key=lambda m: m['y']) + print(f"🤖 [Pattern] Clic sur '{best['text']}' à ({best['x']}, {best['y']}) [le plus bas = bouton]") + pyautogui.click(best['x'], best['y']) + time.sleep(1.0) + return True + + except Exception as e: + print(f" 🔎 [Pattern] OCR bouton échoué: {e}") + + print(f" 🔎 [Pattern] Bouton '{target}' introuvable par OCR — pas de clic") + return False elif action == 'hotkey': keys = target.split('+') @@ -309,10 +378,18 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app): db.session.commit() # Vérifier si un dialogue/popup bloque l'écran avant l'étape - if index > 0 and _execution_state.get('execution_mode') in ('intelligent', 'debug'): - detected = _check_screen_for_patterns() - if detected: - _handle_detected_pattern(detected) + if index > 0: + exec_mode = _execution_state.get('execution_mode', 'basic') + print(f" 🔎 [Pattern] Vérification avant étape {index+1} (mode={exec_mode})") + if exec_mode in ('intelligent', 'debug'): + detected = _check_screen_for_patterns() + if detected: + print(f" 🧠 [Pattern] TROUVÉ: {detected.get('pattern')} → {detected.get('action')} '{detected.get('target')}'") + _handle_detected_pattern(detected) + else: + print(f" 🔎 [Pattern] Aucun dialogue détecté") + else: + print(f" 🔎 [Pattern] Skip (mode {exec_mode})") print(f"\n{'='*60}") print(f"📋 [Execute] Étape {index + 1}/{len(steps)}: {step.action_type}") @@ -1084,9 +1161,35 @@ def execute_action(action_type: str, params: dict) -> dict: elif action_type in ['wait_for_anchor', 'wait']: timeout_ms = params.get('timeout_ms', params.get('timeout', 5000)) - print(f"⏳ [Action] Attente {timeout_ms}ms") - time.sleep(timeout_ms / 1000) - return {'success': True, 'output': {'waited_ms': timeout_ms}} + print(f"⏳ [Action] Attente {timeout_ms}ms (avec surveillance patterns)") + + elapsed = 0 + check_interval = 1000 + pattern_handled = None + + while elapsed < timeout_ms: + wait_chunk = 
min(check_interval, timeout_ms - elapsed) + time.sleep(wait_chunk / 1000) + elapsed += wait_chunk + + if execution_mode in ('intelligent', 'debug'): + try: + detected = _check_screen_for_patterns() + if detected: + print(f"🧠 [Wait] Dialogue détecté: {detected.get('pattern')} → {detected.get('target')}") + _handle_detected_pattern(detected) + pattern_handled = detected + break + except Exception as e: + print(f" 🔎 [Wait] Erreur check: {e}") + + return { + 'success': True, + 'output': { + 'waited_ms': elapsed, + 'pattern_handled': pattern_handled.get('pattern') if pattern_handled else None + } + } elif action_type == 'keyboard_shortcut': keys = params.get('keys', [])