feat(knowledge): détection et gestion automatique des dialogues UI

UIPatternLibrary câblée dans l'executor et le stream processor.

Pendant un wait_for_anchor, Léa surveille l'écran toutes les secondes :
1. OCR plein écran (docTR)
2. Pattern matching (dialogues Save, OK, Cancel, cookies...)
3. OCR ciblé pour trouver le bouton par son texte réel
4. Clic sur le match le plus bas (bouton, pas titre)

Fix : seuil de ratio supprimé (trigger trouvé = match, quelle que soit la longueur du texte OCR).
Matching strict sur mot exact ≥ 3 caractères (évite les faux positifs sur lettres isolées).
Fallback en recherche partielle pour les lettres soulignées (E_nregistrer).

Plus aucune coordonnée hardcodée — 100% vision.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-20 11:06:17 +02:00
parent d168833609
commit ffd97ae9a5
2 changed files with 131 additions and 27 deletions
--- a/core/knowledge/ui_patterns.py
+++ b/core/knowledge/ui_patterns.py
@@ -50,6 +50,7 @@ BUILTIN_PATTERNS: List[Dict[str, Any]] = [
        "triggers": [
            "voulez-vous enregistrer", "do you want to save",
            "save changes", "enregistrer les modifications",
+            "enregistrer sous", "save as",
            "sauvegarder", "unsaved changes",
        ],
        "action": "click",
@@ -328,7 +329,7 @@ class UIPatternLibrary:
                        score = trigger_score
                        matched_trigger = trigger

-            if score > best_score and score > 0.05:
+            if score > best_score and matched_trigger is not None:
                best_score = score
                best_match = {
                    "pattern": pattern.name,
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -195,6 +195,10 @@ def _check_screen_for_patterns() -> Optional[Dict[str, Any]]:
        import numpy as np

        lib = UIPatternLibrary()
+        # Debug: vérifier les triggers du dialog_save
+        save_patterns = [p for p in lib._patterns if p.name == 'dialog_save']
+        if save_patterns:
+            print(f"   🔎 [Pattern] dialog_save triggers: {save_patterns[0].triggers}")

        with mss.mss() as sct:
            monitor = sct.monitors[1]
@@ -208,42 +212,107 @@ def _check_screen_for_patterns() -> Optional[Dict[str, Any]]:
            return None

        if not ocr_text or len(ocr_text) < 5:
+            print(f"   🔎 [Pattern] OCR vide ou trop court ({len(ocr_text) if ocr_text else 0} chars)")
            return None

-        pattern = lib.find_pattern(ocr_text)
-        if pattern and pattern['category'] in ('dialog', 'popup'):
-            print(f"🧠 [Pattern] Détecté: {pattern['pattern']} → {pattern['action']} '{pattern['target']}'")
-            print(f"   Texte OCR: {ocr_text[:100]}...")
-            return pattern
+        print(f"   🔎 [Pattern] OCR ({len(ocr_text)} chars): {ocr_text[:500]}")

-        return None
+        pattern = lib.find_pattern(ocr_text)
+        if pattern:
+            print(f"   🔎 [Pattern] Match: {pattern['pattern']} (category={pattern['category']})")
+            if pattern['category'] in ('dialog', 'popup'):
+                print(f"🧠 [Pattern] DÉTECTÉ: {pattern['pattern']} → {pattern['action']} '{pattern['target']}'")
+                return pattern
+            else:
+                print(f"   🔎 [Pattern] Ignoré (catégorie {pattern['category']})")
+                return None
+        else:
+            print(f"   🔎 [Pattern] Aucun match dans le texte OCR")
+            return None

    except Exception as e:
-        logger.debug(f"Pattern check échoué: {e}")
+        import traceback
+        print(f"   🔎 [Pattern] EXCEPTION: {e}")
+        traceback.print_exc()
        return None


 def _handle_detected_pattern(pattern: Dict[str, Any]) -> bool:
-    """Gère automatiquement un pattern UI détecté (clic sur OK, fermer popup, etc.).
+    """Gère automatiquement un pattern UI détecté.

-    Returns:
-        True si le pattern a été géré avec succès.
+    Pour une action 'click', cherche le bouton cible via OCR (position réelle
+    sur l'écran) ; si l'OCR ne le trouve pas, aucun clic n'est fait (retourne False).
    """
    import pyautogui

    action = pattern.get('action')
    target = pattern.get('target', '')
-    bbox = pattern.get('typical_bbox')
    alternatives = pattern.get('alternatives', [])

-    if action == 'click' and bbox:
-        screen_w, screen_h = pyautogui.size()
-        x = int((bbox[0] + bbox[2]) / 2 * screen_w)
-        y = int((bbox[1] + bbox[3]) / 2 * screen_h)
-        print(f"🤖 [Pattern] Clic automatique sur '{target}' à ({x}, {y})")
-        pyautogui.click(x, y)
-        time.sleep(1.0)
-        return True
+    if action == 'click':
+        candidates = [target] + alternatives
+
+        # Chercher le bouton via OCR sur l'écran actuel
+        try:
+            import mss
+            from PIL import Image
+            from services.ocr_service import ocr_extract_words
+
+            with mss.mss() as sct:
+                monitor = sct.monitors[1]
+                screenshot = sct.grab(monitor)
+                screen = Image.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')
+
+            words = ocr_extract_words(screen)
+
+            # Collecter TOUS les matchs, puis prendre le plus bas (boutons = bas du dialogue)
+            all_matches = []
+
+            for candidate in candidates:
+                candidate_lower = candidate.lower()
+                for word in words:
+                    word_text = word['text'].lower()
+                    if len(word_text) < 3 or len(candidate_lower) < 3:
+                        continue
+                    if word_text == candidate_lower:
+                        x1, y1, x2, y2 = word['bbox']
+                        all_matches.append({
+                            'text': word['text'],
+                            'x': int((x1 + x2) / 2),
+                            'y': int((y1 + y2) / 2),
+                            'match_type': 'exact',
+                        })
+
+            # Recherche partielle (ex: "nregistrer" sans le E souligné)
+            if not all_matches:
+                for candidate in candidates:
+                    if len(candidate) > 3:
+                        partial = candidate[1:].lower()
+                        for word in words:
+                            if partial in word['text'].lower():
+                                x1, y1, x2, y2 = word['bbox']
+                                all_matches.append({
+                                    'text': word['text'],
+                                    'x': int((x1 + x2) / 2),
+                                    'y': int((y1 + y2) / 2),
+                                    'match_type': 'partial',
+                                })
+
+            if all_matches:
+                for m in all_matches:
+                    print(f"   🔎 [Pattern] Candidat: '{m['text']}' à ({m['x']}, {m['y']}) [{m['match_type']}]")
+
+                best = max(all_matches, key=lambda m: m['y'])
+                print(f"🤖 [Pattern] Clic sur '{best['text']}' à ({best['x']}, {best['y']}) [le plus bas = bouton]")
+                pyautogui.click(best['x'], best['y'])
+                time.sleep(1.0)
+                return True
+
+        except Exception as e:
+            print(f"   🔎 [Pattern] OCR bouton échoué: {e}")
+
+        print(f"   🔎 [Pattern] Bouton '{target}' introuvable par OCR — pas de clic")
+        return False

    elif action == 'hotkey':
        keys = target.split('+')
@@ -309,10 +378,18 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app):
                db.session.commit()

                # Vérifier si un dialogue/popup bloque l'écran avant l'étape
-                if index > 0 and _execution_state.get('execution_mode') in ('intelligent', 'debug'):
-                    detected = _check_screen_for_patterns()
-                    if detected:
-                        _handle_detected_pattern(detected)
+                if index > 0:
+                    exec_mode = _execution_state.get('execution_mode', 'basic')
+                    print(f"   🔎 [Pattern] Vérification avant étape {index+1} (mode={exec_mode})")
+                    if exec_mode in ('intelligent', 'debug'):
+                        detected = _check_screen_for_patterns()
+                        if detected:
+                            print(f"   🧠 [Pattern] TROUVÉ: {detected.get('pattern')} → {detected.get('action')} '{detected.get('target')}'")
+                            _handle_detected_pattern(detected)
+                        else:
+                            print(f"   🔎 [Pattern] Aucun dialogue détecté")
+                    else:
+                        print(f"   🔎 [Pattern] Skip (mode {exec_mode})")

                print(f"\n{'='*60}")
                print(f"📋 [Execute] Étape {index + 1}/{len(steps)}: {step.action_type}")
@@ -1084,9 +1161,35 @@ def execute_action(action_type: str, params: dict) -> dict:

        elif action_type in ['wait_for_anchor', 'wait']:
            timeout_ms = params.get('timeout_ms', params.get('timeout', 5000))
-            print(f"⏳ [Action] Attente {timeout_ms}ms")
-            time.sleep(timeout_ms / 1000)
-            return {'success': True, 'output': {'waited_ms': timeout_ms}}
+            print(f"⏳ [Action] Attente {timeout_ms}ms (avec surveillance patterns)")
+
+            elapsed = 0
+            check_interval = 1000
+            pattern_handled = None
+
+            while elapsed < timeout_ms:
+                wait_chunk = min(check_interval, timeout_ms - elapsed)
+                time.sleep(wait_chunk / 1000)
+                elapsed += wait_chunk
+
+                if execution_mode in ('intelligent', 'debug'):
+                    try:
+                        detected = _check_screen_for_patterns()
+                        if detected:
+                            print(f"🧠 [Wait] Dialogue détecté: {detected.get('pattern')} → {detected.get('target')}")
+                            _handle_detected_pattern(detected)
+                            pattern_handled = detected
+                            break
+                    except Exception as e:
+                        print(f"   🔎 [Wait] Erreur check: {e}")
+
+            return {
+                'success': True,
+                'output': {
+                    'waited_ms': elapsed,
+                    'pattern_handled': pattern_handled.get('pattern') if pattern_handled else None
+                }
+            }

        elif action_type == 'keyboard_shortcut':
            keys = params.get('keys', [])