feat(execution): cascade post-raccourci pilotée par DialogHandler/OCR

Le pHash global 8x8 sur écran 1920x1080 ne détecte pas l'ouverture d'un dialog modal dans une VM QEMU (un dialog 800x500 couvre ~3 pixels pHash, distance Hamming typique = 1-2, sous le seuil de 3). Découvert sur Win11/Notepad : Ctrl+Shift+S ouvrait bien le dialog mais Léa abortait à tort. _handle_post_shortcut() poll désormais DialogHandler.handle_if_dialog() toutes les 500ms (EasyOCR + KNOWN_DIALOGS). 8s pour le premier dialog, 3s de stabilité entre dialogs successifs, 60s de budget total. KNOWN_DIALOGS réordonné : popups modaux (confirmer/remplacer/écraser) prioritaires sur fenêtres parents (enregistrer sous/save as) car l'OCR full-screen capte les deux simultanément. DialogHandler bascule sur UITarsGrounder subprocess one-shot (au lieu du serveur HTTP localhost:8200 qui n'existait plus). InfiGUI worker, think_arbiter et ui_tars_grounder alignés sur le même contrat. Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
2026-04-26 20:19:39 +02:00
parent 3d6868f029
commit 487bcb8618
6 changed files with 474 additions and 243 deletions
--- a/core/execution/observe_reason_act.py
+++ b/core/execution/observe_reason_act.py
@@ -221,22 +221,31 @@ class ORALoop:
        if action_type in ('click_anchor', 'click', 'double_click_anchor', 'right_click_anchor'):
            target_text = anchor.get('target_text', '') or anchor.get('description', '')

-            # Si target_text est vide ou est un nom d'action → décrire le crop
-            if not target_text or target_text in _action_type_names:
-                screenshot_b64 = anchor.get('screenshot', '')
-                if screenshot_b64:
-                    try:
-                        from core.execution.input_handler import _describe_anchor_image
-                        desc = _describe_anchor_image(screenshot_b64)
-                        if desc and len(desc) > 2:
-                            target_text = desc
-                            print(f"🏷️ [ORA/reason] Ancre décrite par VLM: '{target_text}'")
-                    except Exception:
-                        pass
+            # Détecter les target_text absurdes : vide, nom d'action, ou bruit OCR
+            def _is_garbage(t):
+                """Return True when ``t`` is unusable as a click-target label."""
+                # Empty text, or text that is just the name of an action type.
+                if not t or t in _action_type_names:
+                    return True
+                # Only '-', ' ', '.' and '_' are discarded before measuring what
+                # remains; other punctuation still counts as content.
+                residue = ''.join(ch for ch in t if ch not in '- ._')
+                # Noise = fewer than 3 characters left, or digits only.
+                # NOTE(review): 2-letter labels such as "OK" are rejected too — confirm.
+                return len(residue) < 3 or residue.isdigit()
+
+            # Note: plus d'appel à _describe_anchor_image() (qwen2.5vl) ici.
+            # Le crop d'ancre (screenshot_b64) servira directement au template matching
+            # cv2 dans _act_click, puis fallback InfiGUI fusionné si nécessaire.
+            # Cela évite le conflit VRAM (qwen2.5vl 9.4GB + InfiGUI 2.4GB > 11.5GB GPU).

            # Dernier fallback : label si pas un nom d'action
-            if not target_text or target_text in _action_type_names:
+            if _is_garbage(target_text):
                target_text = label if label not in _action_type_names else ''
+                if target_text:
+                    print(f"🏷️ [ORA/reason] Label garbage, fallback texte: '{target_text}'")
+                else:
+                    print(f"🏷️ [ORA/reason] Pas de label texte — grounding via crop visuel uniquement")

            action = 'click'
            value = 'double' if action_type == 'double_click_anchor' else (
@@ -1245,6 +1254,7 @@ Règles:
            )

        print(f"🚀 [ORA] Démarrage workflow: {total} étapes, verify={self.verify_level}, retries={self.max_retries}")
+        print(f"🔧 [ORA] CODE VERSION: post-shortcut-dialog-handler ACTIF (26 avril 17h30)")

        for i, step in enumerate(steps):
            if not self._should_continue():
@@ -1326,6 +1336,47 @@ Règles:
                        )
                    )

+            # --- 3b. Post-raccourci : attendre changement écran + gérer dialogue ---
+            # Après un keyboard_shortcut (pas scroll), on interroge DialogHandler (OCR,
+            # pas le pHash) pour détecter un dialogue (ex: "Enregistrer sous" après
+            # Ctrl+Shift+S). Si aucun dialogue connu n'est géré → le workflow est stoppé.
+            if act_success and decision.action == 'hotkey' and not decision.value.startswith('scroll_'):
+                print(f"🔍 [ORA/post-shortcut] ENTRÉ dans le bloc post-shortcut (action={decision.action}, value={decision.value})")
+                dialog_handled = self._handle_post_shortcut(pre)
+                if dialog_handled:
+                    time.sleep(0.5)
+                    post = self.observe()
+                    self._last_post_phash = post.phash
+                    if on_progress:
+                        on_progress(i + 1, total, VerificationResult(
+                            success=True, change_level='major',
+                            matches_expected=True,
+                            detail="Dialogue géré visuellement après raccourci"
+                        ))
+                    continue
+                else:
+                    # Invariant : aucune étape suivante ne doit s'exécuter tant que
+                    # la cascade déclenchée par le raccourci n'est pas pleinement résolue.
+                    # Cas typique : Ctrl+S → "Enregistrer sous" non géré → on ABORT plutôt
+                    # que de cliquer sur des coordonnées potentiellement obsolètes.
+                    msg = (
+                        f"Étape {i+1}: raccourci '{decision.value}' — cascade post-raccourci "
+                        f"non résolue (dialogue absent ou bloqué). Workflow stoppé pour éviter "
+                        f"un clic dans un contexte incohérent."
+                    )
+                    print(f"❌ [ORA/post-shortcut] {msg}")
+                    logger.warning(f"🆘 [ORA] {msg}")
+                    if on_progress:
+                        on_progress(i + 1, total, VerificationResult(
+                            success=False, change_level='none',
+                            matches_expected=False,
+                            detail="Cascade post-raccourci non résolue"
+                        ))
+                    return LoopResult(
+                        success=False, steps_completed=i, total_steps=total,
+                        reason=msg,
+                    )
+
            # Petit délai pour laisser l'écran se stabiliser
            time.sleep(0.3)

@@ -1412,6 +1463,107 @@ Règles:
    # Méthodes privées — actions
    # ═══════════════════════════════════════════════════════════

+    def _handle_post_shortcut(self, pre_obs: 'Observation') -> bool:
+        """Resolve the cascade of reflex dialogs that follows a keyboard shortcut.
+
+        Driven by polling DialogHandler.handle_if_dialog(), NOT by pHash.
+        Author's rationale: a modal dialog opening inside a VM barely changes
+        the global 8x8 pHash of a 1920x1080 host screen (Hamming distance
+        often < 3). ``pre_obs`` is kept for interface compatibility and is
+        not read. Deadlines use time.monotonic() so clock jumps cannot skew them.
+
+        Returns:
+            True once at least one dialog was reported handled and the cascade
+            ended (quiet stability window, total budget spent, or iteration cap).
+            False if no dialog is handled within the initial window, or if the
+            handler raises while waiting for it (the caller then aborts).
+        """
+        from core.grounding.dialog_handler import DialogHandler
+
+        # Wait window for the FIRST dialog after the shortcut (author's note:
+        # on Win11/QEMU, "Enregistrer sous" typically appears in under 2s).
+        first_dialog_timeout = 8.0
+        # Total budget for the whole cascade (author's estimate: ~15s per dialog).
+        total_timeout = 60.0
+        # Stability window after the last handled dialog: if nothing else shows
+        # up during it, the cascade is considered finished. It is measured after
+        # the post_click_wait sleep, so it only needs to cover the next popup.
+        stable_window = 3.0
+        # Pause after a click before probing for the next dialog.
+        post_click_wait = 1.5
+        # Sleep between polls (author's note: a full-screen EasyOCR pass is ~500ms).
+        poll_interval = 0.5
+        # Upper bound on dialogs handled per shortcut (anti infinite-loop guard).
+        max_dialog_iterations = 5
+
+        t_start = time.monotonic()
+        dh = DialogHandler()
+        dialogs_handled = 0
+
+        def _elapsed() -> float:
+            return time.monotonic() - t_start
+
+        def _poll_dialog(deadline: float) -> Optional[Dict[str, Any]]:
+            """Poll DialogHandler until it reports a handled dialog or deadline.
+
+            Returns the handler's result when it is flagged 'handled', else None
+            (deadline reached, or the handler raised). ``deadline`` is on the
+            time.monotonic() scale. A handler call that starts before the
+            deadline may finish after it; its result is still returned because
+            the action has already been performed.
+            """
+            while time.monotonic() < deadline:
+                obs = self.observe()
+                try:
+                    result = dh.handle_if_dialog(obs.screenshot)
+                except Exception as e:
+                    print(f"⚠️ [ORA/post-shortcut] Erreur dialog handler: {e}")
+                    return None
+                if result and result.get('handled'):  # tolerate a None result
+                    return result
+                sleep_left = deadline - time.monotonic()
+                if sleep_left > 0:
+                    time.sleep(min(poll_interval, sleep_left))
+            return None
+
+        # --- Step 1: wait for the FIRST dialog ---
+        first_deadline = t_start + min(total_timeout, first_dialog_timeout)
+        result = _poll_dialog(first_deadline)
+        if result is None:
+            print(f"⏳ [ORA/post-shortcut] Aucun dialog connu détecté après "
+                  f"{_elapsed():.1f}s (fenêtre={first_dialog_timeout}s) — "
+                  f"raccourci sans effet attendu")
+            return False
+
+        dialogs_handled = 1
+        print(f"✅ [ORA/post-shortcut] Dialog #1 géré: {result.get('action')} "
+              f"({_elapsed():.1f}s)")
+        time.sleep(post_click_wait)
+
+        # --- Step 2: cascade — each further dialog must appear within stable_window ---
+        for _ in range(1, max_dialog_iterations):
+            if _elapsed() >= total_timeout:
+                print(f"⏳ [ORA/post-shortcut] Timeout cascade ({total_timeout:.0f}s, "
+                      f"{dialogs_handled} dialog(s) géré(s))")
+                return True  # at least one dialog handled → treated as OK
+
+            next_deadline = min(time.monotonic() + stable_window, t_start + total_timeout)
+            result = _poll_dialog(next_deadline)
+            if result is None:
+                # No new dialog within stable_window (or handler error) → cascade finished
+                print(f"✅ [ORA/post-shortcut] Cascade résolue "
+                      f"({dialogs_handled} dialog(s), {_elapsed():.1f}s)")
+                return True
+
+            dialogs_handled += 1
+            print(f"✅ [ORA/post-shortcut] Dialog #{dialogs_handled} géré: "
+                  f"{result.get('action')} ({_elapsed():.1f}s)")
+            time.sleep(post_click_wait)
+
+        print(f"⚠️ [ORA/post-shortcut] Trop d'itérations cascade "
+              f"({max_dialog_iterations}) — cascade malformée, on s'arrête là")
+        return True  # dialogs_handled >= 1 is guaranteed past step 1
+
    def _act_click(self, decision: Decision, step_params: dict) -> bool:
        """Exécute un clic (simple, double, droit, hover, focus).

@@ -1425,16 +1577,62 @@ Règles:
        anchor = step_params.get('visual_anchor', {})
        screenshot_b64 = anchor.get('screenshot')
        bbox = anchor.get('bounding_box', {})
-        target_text = anchor.get('target_text', '') or decision.target
+        # Utiliser le target nettoyé par reason_workflow_step (pas relire le garbage de l'ancre)
+        target_text = decision.target
        target_desc = anchor.get('description', '')

+        print(f"🎯 [ORA/_act_click] target='{target_text}', desc='{target_desc[:40]}', bbox={bbox.get('x','?')},{bbox.get('y','?')}")
+
        x, y = None, None
        method_used = ''
+        # Score et position du template-first (réutilisés en fallback intermédiaire)
+        template_score = 0.0
+        template_xy: Optional[tuple] = None

-        # --- Pipeline FAST→SMART→THINK ---
+        # --- AVANT-POSTE : template matching cv2 sur le crop d'ancre ---
+        # Si l'UI n'a pas changé (cas dominant en replay), un match pixel-perfect
+        # nous donne le clic en ~50ms sans toucher au GPU. On ne déclenche le
+        # pipeline VLM que si le score est insuffisant.
+        if screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
+            try:
+                import io as _io
+                with mss_lib.mss() as sct:
+                    mon = sct.monitors[0]
+                    grab = sct.grab(mon)
+                    screen_img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
+
+                raw_b64 = screenshot_b64.split(',')[1] if ',' in screenshot_b64 else screenshot_b64
+                anchor_data = base64.b64decode(raw_b64)
+                anchor_img = Image.open(_io.BytesIO(anchor_data))
+
+                screen_cv = cv2.cvtColor(np.array(screen_img), cv2.COLOR_RGB2BGR)
+                anchor_cv = cv2.cvtColor(np.array(anchor_img), cv2.COLOR_RGB2BGR)
+
+                if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]:
+                    t0 = time.time()
+                    result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
+                    _, max_val, _, max_loc = cv2.minMaxLoc(result_tm)
+                    elapsed_ms = (time.time() - t0) * 1000
+                    template_score = float(max_val)
+                    template_xy = (
+                        max_loc[0] + anchor_cv.shape[1] // 2,
+                        max_loc[1] + anchor_cv.shape[0] // 2,
+                    )
+                    print(f"⚡ [ORA/template-first] score={template_score:.3f} pos={max_loc} ({elapsed_ms:.0f}ms)")
+                    # Seuil élevé pour le mode "direct" : on veut être quasi-certain
+                    # que c'est le même élément, pixel-perfect, avant de zapper le VLM.
+                    if template_score >= 0.95:
+                        x, y = template_xy
+                        method_used = 'template_direct'
+                        print(f"✅ [ORA/template-first] Match direct → ({x}, {y}), skip pipeline")
+            except Exception as e:
+                print(f"⚠️ [ORA/template-first] Erreur: {e}")
+
+        # --- Pipeline FAST→SMART→THINK (escalade si template-first n'a pas tranché) ---
        _use_fast = os.environ.get('RPA_USE_FAST_PIPELINE', '1') == '1'

-        if _use_fast and (target_text or target_desc):
+        if x is None and _use_fast and (target_text or target_desc or screenshot_b64):
+            print(f"🎯 [ORA/_act_click] RPA_USE_FAST_PIPELINE={_use_fast}, has_target={bool(target_text or target_desc)}, template_score={template_score:.3f}")
            try:
                from core.grounding.fast_pipeline import FastSmartThinkPipeline
                from core.grounding.target import GroundingTarget
@@ -1471,34 +1669,13 @@ Règles:
            except Exception as e:
                print(f"⚠️ [ORA/pipeline] Erreur: {e}")

-        # --- Fallback : ancien pipeline (template → OCR → static) ---
-        if x is None and screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
-            try:
-                import io as _io
-                with mss_lib.mss() as sct:
-                    mon = sct.monitors[0]
-                    grab = sct.grab(mon)
-                    screen_img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
-
-                raw_b64 = screenshot_b64.split(',')[1] if ',' in screenshot_b64 else screenshot_b64
-                anchor_data = base64.b64decode(raw_b64)
-                anchor_img = Image.open(_io.BytesIO(anchor_data))
-
-                screen_cv = cv2.cvtColor(np.array(screen_img), cv2.COLOR_RGB2BGR)
-                anchor_cv = cv2.cvtColor(np.array(anchor_img), cv2.COLOR_RGB2BGR)
-
-                if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]:
-                    t0 = time.time()
-                    result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
-                    _, max_val, _, max_loc = cv2.minMaxLoc(result_tm)
-                    elapsed_ms = (time.time() - t0) * 1000
-                    print(f"⚡ [ORA/template] score={max_val:.3f} pos={max_loc} ({elapsed_ms:.0f}ms)")
-                    if max_val > 0.75:
-                        x = max_loc[0] + anchor_cv.shape[1] // 2
-                        y = max_loc[1] + anchor_cv.shape[0] // 2
-                        method_used = 'template'
-            except Exception as e:
-                print(f"⚠️ [ORA/template] Erreur: {e}")
+        # --- Fallback : on réutilise le score template-first si pertinent ---
+        # Si le pipeline VLM a échoué mais que le template-first avait un score
+        # intermédiaire (0.75-0.95), on accepte ce match comme secours.
+        if x is None and template_xy is not None and template_score >= 0.75:
+            x, y = template_xy
+            method_used = 'template_fallback'
+            print(f"⚡ [ORA/template-fallback] Réutilisation score={template_score:.3f} → ({x}, {y})")

        if x is None and target_text:
            try: