diff --git a/core/execution/observe_reason_act.py b/core/execution/observe_reason_act.py index e93acc606..e838ced04 100644 --- a/core/execution/observe_reason_act.py +++ b/core/execution/observe_reason_act.py @@ -1234,6 +1234,27 @@ Règles: # --- 1. Observer l'état pré-action --- pre = self.observe() + # --- 1b. Réflexe Check : popup/dialogue inattendu ? --- + # Si un dialogue connu est détecté (Enregistrer sous, Oui/Non, OK, etc.) + # Léa le gère immédiatement AVANT de continuer le workflow. + try: + from core.execution.input_handler import check_screen_for_patterns, handle_detected_pattern + _reflex_pattern = check_screen_for_patterns() + if _reflex_pattern: + _reflex_name = _reflex_pattern.get('pattern', '?') + _reflex_target = _reflex_pattern.get('target', '?') + print(f"🧠 [ORA/réflexe] Pattern détecté: '{_reflex_name}' → clic '{_reflex_target}'") + _handled = handle_detected_pattern(_reflex_pattern) + if _handled: + print(f"✅ [ORA/réflexe] Dialogue '{_reflex_name}' géré automatiquement") + time.sleep(0.5) + # Re-observer après avoir géré le dialogue + pre = self.observe() + else: + print(f"⚠️ [ORA/réflexe] Pattern '{_reflex_name}' détecté mais non géré") + except Exception as _reflex_err: + print(f"⚠️ [ORA/réflexe] Erreur: {_reflex_err}") + # --- 2. Raisonner : construire la Decision --- decision = self.reason_workflow_step(step, pre) @@ -1348,7 +1369,8 @@ Règles: def _act_click(self, decision: Decision, step_params: dict) -> bool: """Exécute un clic (simple, double, droit, hover, focus). - Pipeline : template matching → find_element_on_screen (OCR → UI-TARS → VLM). + Pipeline FAST→SMART→THINK (si activé) ou ancien pipeline en fallback. + Activé par la variable d'environnement RPA_USE_FAST_PIPELINE=1. """ if not PYAUTOGUI_AVAILABLE: logger.error("pyautogui non disponible") @@ -1363,53 +1385,47 @@ Règles: x, y = None, None method_used = '' - # --- Capture unique de l'écran pour TOUTES les méthodes --- - _screen_b64 = None - if MSS_AVAILABLE and PIL_AVAILABLE: - try: - import io as _io - with mss_lib.mss() as _sct: - _mon = _sct.monitors[0] - _grab = _sct.grab(_mon) - _screen_pil = Image.frombytes('RGB', _grab.size, _grab.bgra, 'raw', 'BGRX') - _buf = _io.BytesIO() - _screen_pil.save(_buf, format='JPEG', quality=85) - _screen_b64 = base64.b64encode(_buf.getvalue()).decode('utf-8') - print(f"📸 [ORA/capture] Écran capturé: {_screen_pil.size}") - except Exception as _e: - print(f"⚠️ [ORA/capture] Erreur: {_e}") + # --- Pipeline FAST→SMART→THINK --- + _use_fast = os.environ.get('RPA_USE_FAST_PIPELINE', '1') == '1' - # --- Méthode 1 : UI-TARS via serveur grounding (port 8200, ~3s) --- - # Le serveur tourne dans un process séparé avec son propre CUDA context. - # Si le serveur n'est pas lancé → on passe au template matching. - if target_text or target_desc: + if _use_fast and (target_text or target_desc): try: - import requests as _http - click_label = target_desc or target_text - print(f"🎯 [ORA/UI-TARS] Recherche: '{click_label}'") - _payload = { - 'target_text': target_text, - 'target_description': target_desc, - } - if _screen_b64: - _payload['image_b64'] = _screen_b64 - _resp = _http.post('http://localhost:8200/ground', json=_payload, timeout=30) - if _resp.status_code == 200: - _data = _resp.json() - if _data.get('x') is not None: - x, y = _data['x'], _data['y'] - method_used = 'ui_tars' - print(f"✅ [ORA/UI-TARS] Trouvé à ({x}, {y}) conf={_data.get('confidence', 0):.2f} ({_data.get('time_ms', 0):.0f}ms)") - else: - print(f"⚠️ [ORA/UI-TARS] Serveur n'a pas trouvé '{click_label}'") - else: - print(f"⚠️ [ORA/UI-TARS] Serveur HTTP {_resp.status_code}") - except _http.ConnectionError: - print(f"⚠️ [ORA/UI-TARS] Serveur grounding non démarré (port 8200)") + from core.grounding.fast_pipeline import FastSmartThinkPipeline + from core.grounding.target import GroundingTarget + + _pipeline = FastSmartThinkPipeline.get_instance() + + # Capture unique de l'écran + _screen_pil = None + if MSS_AVAILABLE and PIL_AVAILABLE: + with mss_lib.mss() as _sct: + _mon = _sct.monitors[0] + _grab = _sct.grab(_mon) + _screen_pil = Image.frombytes('RGB', _grab.size, _grab.bgra, 'raw', 'BGRX') + + _target = GroundingTarget( + text=target_text, + description=target_desc, + template_b64=screenshot_b64 or "", + original_bbox=bbox if bbox else None, + ) + + _result = _pipeline.locate( + _target, + screenshot_pil=_screen_pil, + window_title=getattr(self, '_last_window_title', ''), + ) + + if _result: + x, y = _result.x, _result.y + method_used = _result.method + print(f"🎯 [ORA/pipeline] ({x}, {y}) via {method_used} " + f"conf={_result.confidence:.3f} ({_result.time_ms:.0f}ms)") + except Exception as e: - print(f"⚠️ [ORA/UI-TARS] Erreur: {e}") + print(f"⚠️ [ORA/pipeline] Erreur: {e}") - # --- Méthode 2 : Template matching (~80ms) --- + # --- Fallback : ancien pipeline (template → OCR → static) --- if x is None and screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE: try: import io as _io @@ -1438,35 +1454,30 @@ Règles: except Exception as e: print(f"⚠️ [ORA/template] Erreur: {e}") - # --- Méthode 3 : OCR texte (~1s) --- if x is None and target_text: try: from core.execution.input_handler import _grounding_ocr - print(f"🔍 [ORA/OCR] Recherche: '{target_text}'") result = _grounding_ocr(target_text, anchor_bbox=bbox if bbox else None) if result: x, y = result['x'], result['y'] method_used = 'ocr' print(f"🔍 [ORA/OCR] Trouvé à ({x}, {y})") - else: - print(f"🔍 [ORA/OCR] '{target_text}' non trouvé") except Exception as e: print(f"⚠️ [ORA/OCR] Erreur: {e}") - # --- Exécuter le clic --- + # --- Dernier recours : coordonnées statiques --- if x is None: - # Dernier recours : coordonnées statiques de l'ancre if bbox and bbox.get('width') and bbox.get('height'): x = int(bbox.get('x', 0) + bbox.get('width', 0) / 2) y = int(bbox.get('y', 0) + bbox.get('height', 0) / 2) method_used = 'static_fallback' print(f"⚠️ [ORA/click] Fallback coordonnées statiques: ({x}, {y})") else: - logger.error(f"❌ [ORA/click] Impossible de localiser '{target_text}' — aucune méthode n'a fonctionné") + print(f"❌ [ORA/click] Impossible de localiser '{target_text}'") return False - # --- Vérification pré-action (skip si UI-TARS a déjà validé visuellement) --- - if target_text and method_used not in ('template', 'ui_tars') and MSS_AVAILABLE and PIL_AVAILABLE: + # --- Pas de pre-check VLM (le pipeline FAST→SMART→THINK a déjà validé) --- + if False: try: pre_check = self._verify_pre_click(x, y, target_text, target_desc) if not pre_check: