diff --git a/core/execution/__init__.py b/core/execution/__init__.py index cad809b9a..bb82a97ae 100644 --- a/core/execution/__init__.py +++ b/core/execution/__init__.py @@ -10,6 +10,7 @@ from .error_handler import ErrorHandler, ErrorType, RecoveryStrategy from .workflow_runner import WorkflowRunner, RunResult, RunStatus, RunnerConfig from .dag_executor import DAGExecutor, WorkflowStep, StepType, StepStatus, DAGExecutionResult from .llm_actions import LLMActionHandler +from .observe_reason_act import ORALoop, Observation, Decision, VerificationResult, LoopResult # Import tardif pour éviter import circulaire avec pipeline def _get_execution_loop(): @@ -34,5 +35,11 @@ __all__ = [ 'StepStatus', 'DAGExecutionResult', 'LLMActionHandler', + # ORA — boucle Observe-Raisonne-Agit avec vérification + 'ORALoop', + 'Observation', + 'Decision', + 'VerificationResult', + 'LoopResult', # ExecutionLoop accessible via import direct du module ] diff --git a/core/execution/observe_reason_act.py b/core/execution/observe_reason_act.py new file mode 100644 index 000000000..e25bba712 --- /dev/null +++ b/core/execution/observe_reason_act.py @@ -0,0 +1,794 @@ +""" +Boucle Observe → Raisonne → Agit (ORA) pour l'exécution vérifiée de workflows. + +Chaque étape du workflow passe par 3 phases : +1. 👁 OBSERVE : capture écran + pHash + titre fenêtre +2. 🧠 RAISONNE : traduit le step VWB en Decision exécutable +3. 🎯 AGIT : exécute l'action (clic, saisie, raccourci, attente) +4. ✅/❌ VÉRIFIE : compare pré/post screenshot pour confirmer le résultat + +Module standalone — importable depuis n'importe où. +""" + +import logging +import time +import subprocess +import base64 +import os +import json +import re +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional + +logger = logging.getLogger(__name__) + +# --- Imports conditionnels --- +try: + from PIL import Image + PIL_AVAILABLE = True +except ImportError: + Image = None + PIL_AVAILABLE = False + +try: + import mss as mss_lib + MSS_AVAILABLE = True +except ImportError: + mss_lib = None + MSS_AVAILABLE = False + +try: + import cv2 + import numpy as np + CV2_AVAILABLE = True +except ImportError: + cv2 = None + np = None + CV2_AVAILABLE = False + +try: + import pyautogui + PYAUTOGUI_AVAILABLE = True +except ImportError: + pyautogui = None + PYAUTOGUI_AVAILABLE = False + +try: + import imagehash + IMAGEHASH_AVAILABLE = True +except ImportError: + imagehash = None + IMAGEHASH_AVAILABLE = False + + +# ═══════════════════════════════════════════════════════════════ +# Dataclasses +# ═══════════════════════════════════════════════════════════════ + +@dataclass +class Observation: + """Capture de l'état courant de l'écran.""" + screenshot: Any # PIL Image + phash: Any # imagehash.ImageHash + window_title: str + timestamp: float + + +@dataclass +class Decision: + """Décision d'action à exécuter.""" + action: str # "click" | "type" | "hotkey" | "wait" | "done" | "need_help" + target: str + value: str # texte à saisir ou touches + reasoning: str + expected_after: str + confidence: float + done: bool = False + + +@dataclass +class VerificationResult: + """Résultat de la vérification post-action.""" + success: bool + change_level: str # "same" | "minor" | "major" + matches_expected: bool + detail: str + + +@dataclass +class LoopResult: + """Résultat global de l'exécution de la boucle ORA.""" + success: bool + steps_completed: int + total_steps: int + reason: str = "" + + +# ═══════════════════════════════════════════════════════════════ +# Classe principale +# ═══════════════════════════════════════════════════════════════ + +class ORALoop: + """ + Boucle Observe-Raisonne-Agit avec vérification intégrée. + + Args: + max_retries: Nombre de réessais par étape en cas d'échec de vérification. + max_steps: Nombre maximal d'étapes autorisées (garde-fou). + verify_level: 'none' | 'phash' | 'vlm' | 'auto' + - none : pas de vérification post-action + - phash : comparaison pHash uniquement + - vlm : pHash + appel VLM systématique + - auto : pHash toujours, VLM seulement si confiance < 0.95 + """ + + def __init__(self, max_retries: int = 2, max_steps: int = 50, verify_level: str = 'auto'): + self.max_retries = max_retries + self.max_steps = max_steps + self.verify_level = verify_level + + # Variables runtime injectées par le workflow + self._variables: Dict[str, Any] = {} + + # ─── Phase 1 : OBSERVE ──────────────────────────────── + + def observe(self) -> Observation: + """👁 Capture l'état courant de l'écran. + + Returns: + Observation avec screenshot PIL, pHash, titre fenêtre, timestamp. + """ + screenshot = self._capture_screen() + phash = self._compute_phash(screenshot) + window_title = self._get_active_window_title() + ts = time.time() + + logger.info(f"👁 [ORA/observe] titre='{window_title}' phash={phash}") + return Observation( + screenshot=screenshot, + phash=phash, + window_title=window_title, + timestamp=ts, + ) + + # ─── Phase 2 : RAISONNE ─────────────────────────────── + + def reason_workflow_step(self, step_params: dict, observation: Observation) -> Decision: + """🧠 Traduit un step VWB en Decision exécutable. + + Le workflow dit déjà quoi faire — pas d'appel VLM ici. + On se contente de mapper les paramètres du step vers une Decision. + + Args: + step_params: Paramètres du step (action_type, visual_anchor, keys, text, etc.) + observation: Observation courante (pour contexte) + + Returns: + Decision prête à exécuter. + """ + action_type = step_params.get('action_type', '') + params = step_params.get('parameters', {}) + anchor = step_params.get('visual_anchor', {}) + label = step_params.get('label', action_type) + + # --- Mapper action_type vers action Decision --- + + if action_type in ('click_anchor', 'click', 'double_click_anchor', 'right_click_anchor'): + target_text = anchor.get('target_text', '') or label + action = 'click' + value = 'double' if action_type == 'double_click_anchor' else ( + 'right' if action_type == 'right_click_anchor' else 'left') + expected = f"L'écran devrait changer après le clic sur '{target_text}'" + confidence = 0.9 + + elif action_type in ('type_text', 'type'): + text = params.get('text', '') + # Résoudre les variables {{var}} + text = self._resolve_variables(text) + action = 'type' + target_text = '' + value = text + expected = f"Le texte '{text[:30]}...' devrait apparaître dans le champ" + confidence = 0.95 + + elif action_type == 'keyboard_shortcut': + keys = params.get('keys', []) + action = 'hotkey' + target_text = '' + value = '+'.join(keys) + expected = f"L'écran devrait réagir au raccourci {value}" + confidence = 0.9 + + elif action_type in ('wait_for_anchor', 'wait'): + timeout_ms = params.get('timeout_ms', params.get('timeout', 5000)) + action = 'wait' + target_text = '' + value = str(timeout_ms) + expected = "Attente passive" + confidence = 1.0 + + elif action_type in ('hover_anchor', 'hover'): + target_text = anchor.get('target_text', '') or label + action = 'click' # On réutilise click avec value='hover' + value = 'hover' + expected = f"Le curseur survole '{target_text}'" + confidence = 0.95 + + elif action_type in ('scroll_to_anchor', 'scroll'): + direction = params.get('scroll_direction', 'down') + amount = params.get('scroll_amount', params.get('scroll_step_pixels', 3)) + action = 'hotkey' # Scroll passe par pyautogui.scroll + target_text = '' + value = f"scroll_{direction}_{amount}" + expected = f"L'écran devrait défiler vers {direction}" + confidence = 0.9 + + elif action_type in ('focus_anchor', 'focus'): + target_text = anchor.get('target_text', '') or label + action = 'click' + value = 'focus' + expected = f"L'élément '{target_text}' a le focus" + confidence = 0.95 + + elif action_type in ('ai_analyze_text', 'ai_ocr', 'ai_summarize', + 'ai_extract', 'ai_classify', 'ai_custom', + 'extract_text', 'screenshot_evidence', + 'verify_element_exists', 'verify_element'): + # Actions passthrough : exécutées tel quel par execute_action existant + action = 'passthrough' + target_text = '' + value = action_type + expected = "L'action IA/extraction devrait retourner un résultat" + confidence = 0.95 + + else: + action = 'passthrough' + target_text = '' + value = action_type + expected = f"Action '{action_type}' exécutée" + confidence = 0.8 + + decision = Decision( + action=action, + target=target_text, + value=value, + reasoning=f"Step VWB '{label}' → {action} (confiance={confidence:.2f})", + expected_after=expected, + confidence=confidence, + ) + logger.info(f"🧠 [ORA/reason] {decision.action} target='{decision.target}' value='{decision.value[:50]}'") + return decision + + # ─── Phase 3 : AGIT ─────────────────────────────────── + + def act(self, decision: Decision, step_params: dict = None) -> bool: + """🎯 Exécute la Decision. + + Utilise le même pipeline que execute.py : + - Template matching → CLIP → OCR/UI-TARS pour les clics + - safe_type_text pour la saisie + - pyautogui.hotkey pour les raccourcis + + Args: + decision: Decision à exécuter. + step_params: Paramètres originaux du step (pour l'ancre, etc.) + + Returns: + True si l'action a été exécutée (pas encore vérifiée). + """ + if step_params is None: + step_params = {} + + logger.info(f"🎯 [ORA/act] {decision.action} target='{decision.target}' value='{decision.value[:50]}'") + + try: + if decision.action == 'click': + return self._act_click(decision, step_params) + + elif decision.action == 'type': + return self._act_type(decision) + + elif decision.action == 'hotkey': + return self._act_hotkey(decision, step_params) + + elif decision.action == 'wait': + return self._act_wait(decision) + + elif decision.action == 'passthrough': + # Exécuté directement via execute_action dans run_workflow + return True + + elif decision.action == 'done': + return True + + elif decision.action == 'need_help': + logger.warning("🆘 [ORA/act] L'action nécessite une aide humaine") + return False + + else: + logger.warning(f"🎯 [ORA/act] Action inconnue: {decision.action}") + return False + + except Exception as e: + logger.error(f"❌ [ORA/act] Erreur: {e}", exc_info=True) + return False + + # ─── Phase 4 : VÉRIFIE ──────────────────────────────── + + def verify(self, pre: Observation, post: Observation, decision: Decision) -> VerificationResult: + """✅/❌ Vérifie que l'action a produit l'effet attendu. + + Level 1 (toujours) : compare pHash pre vs post. + Level 2 (conditionnel) : appel VLM si verify_level='vlm' ou 'auto' avec confiance < 0.95. + + Args: + pre: Observation avant l'action. + post: Observation après l'action. + decision: Decision exécutée. + + Returns: + VerificationResult. + """ + # Pas de vérification demandée + if self.verify_level == 'none': + return VerificationResult( + success=True, change_level='unknown', + matches_expected=True, detail="Vérification désactivée" + ) + + # Actions qui ne changent pas l'écran + if decision.action in ('wait', 'done', 'passthrough'): + return VerificationResult( + success=True, change_level='n/a', + matches_expected=True, detail=f"Action '{decision.action}' — pas de changement attendu" + ) + + # --- Level 1 : pHash --- + distance = self._phash_distance(pre.phash, post.phash) + change_level = self._classify_change(distance) + + logger.info(f"🔍 [ORA/verify] pHash distance={distance} → {change_level}") + + # Déterminer le succès selon le niveau de changement + if change_level == 'same': + # Rien n'a changé — probable échec + # Exception : certains clics (focus, hover) peuvent ne pas changer l'écran + if decision.value in ('hover', 'focus'): + success = True + detail = f"Pas de changement visible (normal pour {decision.value})" + else: + success = False + detail = f"Aucun changement détecté (distance={distance})" + elif change_level == 'minor': + # Changement mineur — OK si confiance élevée + success = decision.confidence > 0.9 + detail = f"Changement mineur (distance={distance})" + else: + # Changement majeur — succès + success = True + detail = f"Changement majeur détecté (distance={distance})" + + matches_expected = success + + # --- Level 2 : VLM (conditionnel) --- + if self.verify_level == 'vlm' or (self.verify_level == 'auto' and decision.confidence < 0.95 and not success): + vlm_result = self._vlm_verify(post, decision) + if vlm_result is not None: + matches_expected = vlm_result.get('matches', success) + actual_state = vlm_result.get('actual_state', '') + detail += f" | VLM: matches={matches_expected}, état='{actual_state[:80]}'" + # Le VLM a le dernier mot + success = matches_expected + logger.info(f"🔍 [ORA/verify/VLM] matches={matches_expected} état='{actual_state[:60]}'") + + emoji = "✅" if success else "❌" + logger.info(f"{emoji} [ORA/verify] success={success} level={change_level} — {detail[:100]}") + + return VerificationResult( + success=success, + change_level=change_level, + matches_expected=matches_expected, + detail=detail, + ) + + # ─── Boucle principale ──────────────────────────────── + + def run_workflow( + self, + steps: list, + on_progress: Optional[Callable] = None, + execute_action_fn: Optional[Callable] = None, + ) -> LoopResult: + """Exécute un workflow complet avec la boucle ORA. + + Args: + steps: Liste de dicts avec les paramètres de chaque étape. + Chaque dict contient : action_type, parameters, visual_anchor, label. + on_progress: Callback(step_index, total_steps, verification_result). + execute_action_fn: Fonction externe pour les actions passthrough + (signature: execute_action(action_type, params) -> dict). + + Returns: + LoopResult. + """ + total = len(steps) + if total == 0: + return LoopResult(success=True, steps_completed=0, total_steps=0, reason="Aucune étape") + + if total > self.max_steps: + return LoopResult( + success=False, steps_completed=0, total_steps=total, + reason=f"Trop d'étapes ({total} > max {self.max_steps})" + ) + + logger.info(f"🚀 [ORA] Démarrage workflow: {total} étapes, verify={self.verify_level}, retries={self.max_retries}") + + for i, step in enumerate(steps): + logger.info(f"\n{'='*60}") + logger.info(f"📋 [ORA] Étape {i+1}/{total}: {step.get('action_type', '?')} — {step.get('label', '')}") + + # --- 1. Observer l'état pré-action --- + pre = self.observe() + + # --- 2. Raisonner : construire la Decision --- + decision = self.reason_workflow_step(step, pre) + + # --- 3. Agir --- + if decision.action == 'passthrough' and execute_action_fn: + # Déléguer aux fonctions existantes (IA, extraction, etc.) + action_type = step.get('action_type', '') + params = step.get('_full_params', step.get('parameters', {})) + result = execute_action_fn(action_type, params) + act_success = result.get('success', False) + if not act_success: + logger.warning(f"❌ [ORA] Action passthrough échouée: {result.get('error', '?')}") + return LoopResult( + success=False, steps_completed=i, total_steps=total, + reason=f"Étape {i+1} passthrough échouée: {result.get('error', '?')}" + ) + # Pas de vérification pour les passthrough + if on_progress: + on_progress(i + 1, total, VerificationResult( + success=True, change_level='n/a', + matches_expected=True, detail="Passthrough OK" + )) + continue + + act_success = self.act(decision, step) + if not act_success and decision.action not in ('wait', 'done'): + logger.warning(f"❌ [ORA] Action échouée immédiatement") + return LoopResult( + success=False, steps_completed=i, total_steps=total, + reason=f"Étape {i+1}: action '{decision.action}' échouée" + ) + + # Petit délai pour laisser l'écran se stabiliser + time.sleep(0.3) + + # --- 4. Observer l'état post-action --- + post = self.observe() + + # --- 5. Vérifier --- + verification = self.verify(pre, post, decision) + + if not verification.success: + # Réessayer + retried = False + for retry in range(self.max_retries): + logger.info(f"🔄 [ORA] Retry {retry+1}/{self.max_retries} pour étape {i+1}") + pre_retry = self.observe() + act_success = self.act(decision, step) + time.sleep(0.3) + post_retry = self.observe() + verification = self.verify(pre_retry, post_retry, decision) + if verification.success: + retried = True + logger.info(f"✅ [ORA] Retry {retry+1} réussi") + break + if not retried and not verification.success: + logger.warning(f"❌ [ORA] Étape {i+1} échouée après {self.max_retries} retries") + return LoopResult( + success=False, steps_completed=i, total_steps=total, + reason=f"Étape {i+1}: vérification échouée — {verification.detail}" + ) + + # --- Callback de progression --- + if on_progress: + on_progress(i + 1, total, verification) + + logger.info(f"✅ [ORA] Workflow terminé avec succès: {total}/{total} étapes") + return LoopResult(success=True, steps_completed=total, total_steps=total) + + # ═══════════════════════════════════════════════════════════ + # Méthodes privées — actions + # ═══════════════════════════════════════════════════════════ + + def _act_click(self, decision: Decision, step_params: dict) -> bool: + """Exécute un clic (simple, double, droit, hover, focus). + + Pipeline : template matching → find_element_on_screen (OCR → UI-TARS → VLM). + """ + if not PYAUTOGUI_AVAILABLE: + logger.error("pyautogui non disponible") + return False + + anchor = step_params.get('visual_anchor', {}) + screenshot_b64 = anchor.get('screenshot') + bbox = anchor.get('bounding_box', {}) + target_text = anchor.get('target_text', '') or decision.target + target_desc = anchor.get('description', '') + + x, y = None, None + method_used = '' + + # --- Méthode 1 : Template matching direct (~1-10ms) --- + if screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE: + try: + import io as _io + + # Capturer l'écran + with mss_lib.mss() as sct: + mon = sct.monitors[0] + grab = sct.grab(mon) + screen_img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX') + + # Décoder l'ancre + raw_b64 = screenshot_b64.split(',')[1] if ',' in screenshot_b64 else screenshot_b64 + anchor_data = base64.b64decode(raw_b64) + anchor_img = Image.open(_io.BytesIO(anchor_data)) + + screen_cv = cv2.cvtColor(np.array(screen_img), cv2.COLOR_RGB2BGR) + anchor_cv = cv2.cvtColor(np.array(anchor_img), cv2.COLOR_RGB2BGR) + + if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]: + t0 = time.time() + result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED) + min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result_tm) + elapsed_ms = (time.time() - t0) * 1000 + logger.info(f"⚡ [ORA/template] score={max_val:.3f} pos={max_loc} ({elapsed_ms:.0f}ms)") + + if max_val > 0.75: + x = max_loc[0] + anchor_cv.shape[1] // 2 + y = max_loc[1] + anchor_cv.shape[0] // 2 + method_used = 'template' + except Exception as e: + logger.debug(f"⚠️ [ORA/template] Erreur: {e}") + + # --- Méthode 2 : find_element_on_screen (OCR → UI-TARS → VLM) --- + if x is None and (target_text or target_desc): + try: + from core.execution.input_handler import find_element_on_screen + grounding_result = find_element_on_screen( + target_text=target_text, + target_description=target_desc, + anchor_image_base64=screenshot_b64, + anchor_bbox=bbox if bbox else None, + ) + if grounding_result: + x, y = grounding_result['x'], grounding_result['y'] + method_used = f"grounding_{grounding_result['method']}" + logger.info(f"🔍 [ORA/grounding] Trouvé via {method_used} à ({x}, {y})") + except Exception as e: + logger.debug(f"⚠️ [ORA/grounding] Erreur: {e}") + + # --- Exécuter le clic --- + if x is None: + # Dernier recours : coordonnées statiques de l'ancre + if bbox and bbox.get('width') and bbox.get('height'): + x = int(bbox.get('x', 0) + bbox.get('width', 0) / 2) + y = int(bbox.get('y', 0) + bbox.get('height', 0) / 2) + method_used = 'static_fallback' + logger.warning(f"⚠️ [ORA/click] Fallback coordonnées statiques: ({x}, {y})") + else: + logger.error(f"❌ [ORA/click] Impossible de localiser '{target_text}' — aucune méthode n'a fonctionné") + return False + + logger.info(f"🖱️ [ORA/click] {decision.value} à ({x}, {y}) via {method_used}") + + if decision.value == 'double': + pyautogui.doubleClick(x, y) + elif decision.value == 'right': + pyautogui.rightClick(x, y) + elif decision.value == 'hover': + pyautogui.moveTo(x, y, duration=0.3) + time.sleep(step_params.get('parameters', {}).get('hover_duration_ms', + step_params.get('parameters', {}).get('duration_ms', 1000)) / 1000) + elif decision.value == 'focus': + pyautogui.click(x, y) + time.sleep(0.3) + else: + pyautogui.click(x, y) + + return True + + def _act_type(self, decision: Decision) -> bool: + """Saisie de texte via safe_type_text.""" + if not decision.value: + logger.warning("🎯 [ORA/type] Pas de texte à saisir") + return True # Vide = rien à faire, pas un échec + + try: + from core.execution.input_handler import safe_type_text + safe_type_text(decision.value) + return True + except Exception as e: + logger.error(f"❌ [ORA/type] Erreur saisie: {e}") + return False + + def _act_hotkey(self, decision: Decision, step_params: dict) -> bool: + """Raccourci clavier ou scroll.""" + if not PYAUTOGUI_AVAILABLE: + logger.error("pyautogui non disponible") + return False + + value = decision.value + + # Gestion du scroll (encodé dans value) + if value.startswith('scroll_'): + parts = value.split('_') + # scroll_down_3 ou scroll_up_5 + direction = parts[1] if len(parts) > 1 else 'down' + amount = int(parts[2]) if len(parts) > 2 else 3 + + # Positionner la souris si ancre disponible + anchor = step_params.get('visual_anchor', {}) + bbox = anchor.get('bounding_box', {}) + if bbox and bbox.get('width'): + sx = int(bbox.get('x', 0) + bbox.get('width', 0) / 2) + sy = int(bbox.get('y', 0) + bbox.get('height', 0) / 2) + pyautogui.moveTo(sx, sy, duration=0.1) + + scroll_value = amount if direction in ('up', 'left') else -amount + if direction in ('left', 'right'): + pyautogui.hscroll(scroll_value) + else: + pyautogui.scroll(scroll_value) + time.sleep(0.5) + return True + + # Raccourci clavier normal + keys = value.split('+') + logger.info(f"⌨️ [ORA/hotkey] {'+'.join(keys)}") + pyautogui.hotkey(*keys) + return True + + def _act_wait(self, decision: Decision) -> bool: + """Attente passive.""" + try: + timeout_ms = int(decision.value) + except (ValueError, TypeError): + timeout_ms = 5000 + + logger.info(f"⏳ [ORA/wait] {timeout_ms}ms") + time.sleep(timeout_ms / 1000) + return True + + # ═══════════════════════════════════════════════════════════ + # Méthodes privées — utilitaires + # ═══════════════════════════════════════════════════════════ + + def _capture_screen(self) -> Any: + """Capture l'écran principal, retourne une PIL Image.""" + if not MSS_AVAILABLE or not PIL_AVAILABLE: + logger.warning("mss ou PIL non disponible — capture impossible") + return None + + try: + with mss_lib.mss() as sct: + monitor = sct.monitors[0] + grab = sct.grab(monitor) + return Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX') + except Exception as e: + logger.error(f"Erreur capture écran: {e}") + return None + + def _compute_phash(self, image: Any) -> Any: + """Calcule le pHash d'une image PIL.""" + if image is None or not IMAGEHASH_AVAILABLE: + return None + try: + from core.analytics.screen_change_detector import compute_phash + return compute_phash(image) + except Exception as e: + logger.debug(f"Erreur compute_phash: {e}") + return None + + def _get_active_window_title(self) -> str: + """Récupère le titre de la fenêtre active via xdotool.""" + try: + result = subprocess.run( + ['xdotool', 'getactivewindow', 'getwindowname'], + capture_output=True, text=True, timeout=2 + ) + if result.returncode == 0: + return result.stdout.strip() + except Exception: + pass + return '' + + def _phash_distance(self, hash1: Any, hash2: Any) -> int: + """Distance de Hamming entre deux pHash. Retourne 999 si non calculable.""" + if hash1 is None or hash2 is None: + return 999 + try: + return hash1 - hash2 + except Exception: + return 999 + + def _classify_change(self, distance: int) -> str: + """Classifie un changement d'écran selon la distance pHash.""" + if distance < 5: + return 'same' + elif distance < 15: + return 'minor' + else: + return 'major' + + def _vlm_verify(self, post: Observation, decision: Decision) -> Optional[dict]: + """Appelle le VLM pour vérifier l'écran post-action. + + Returns: + Dict {'matches': bool, 'actual_state': str} ou None si échec. + """ + if post.screenshot is None: + return None + + try: + import requests + import io as _io + + buffer = _io.BytesIO() + post.screenshot.save(buffer, format='JPEG', quality=70) + image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8') + + prompt = ( + f"L'action '{decision.action} {decision.target}' vient d'être exécutée. " + f"Le résultat attendu était: '{decision.expected_after}'. " + f"L'écran correspond-il ? Réponds JSON: " + f'{{\"matches\": true/false, \"actual_state\": \"...\"}}' + ) + + ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434") + model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b") + + response = requests.post( + f"{ollama_url}/api/generate", + json={ + "model": model, + "prompt": prompt, + "images": [image_b64], + "stream": False, + "options": {"temperature": 0.1, "num_predict": 200} + }, + timeout=30, + ) + + if response.status_code != 200: + logger.warning(f"[ORA/vlm_verify] HTTP {response.status_code}") + return None + + text = response.json().get('response', '').strip() + match = re.search(r'\{[\s\S]*\}', text) + if match: + parsed = json.loads(match.group()) + return { + 'matches': bool(parsed.get('matches', False)), + 'actual_state': str(parsed.get('actual_state', '')), + } + + return None + + except Exception as e: + logger.debug(f"[ORA/vlm_verify] Erreur: {e}") + return None + + def _resolve_variables(self, text: str) -> str: + """Résout les variables {{var}} dans un texte.""" + if not text or '{{' not in text: + return text + + def replace_var(match): + var_name = match.group(1) + return str(self._variables.get(var_name, match.group(0))) + + return re.sub(r'\{\{(\w+)\}\}', replace_var, text) diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py index 21d5610d1..83e700f2a 100644 --- a/visual_workflow_builder/backend/api_v3/execute.py +++ b/visual_workflow_builder/backend/api_v3/execute.py @@ -1346,6 +1346,182 @@ def execute_action(action_type: str, params: dict) -> dict: return {'success': False, 'error': str(e)} +def run_workflow_verified(execution_id: str, workflow_id: str, app): + """ + Thread d'exécution en mode 'verified' — boucle ORA. + + Charge les steps du workflow depuis la BDD, construit les params + comme execute_workflow_thread, puis délègue à ORALoop.run_workflow(). + + NE modifie PAS execute_workflow_thread : les modes basic/intelligent/debug + continuent de fonctionner exactement comme avant. + """ + global _execution_state + + with app.app_context(): + try: + execution = Execution.query.get(execution_id) + workflow = Workflow.query.get(workflow_id) + + if not execution or not workflow: + logger.error("[ORA] Workflow ou exécution non trouvé") + return + + steps_db = workflow.steps.order_by(Step.order).all() + execution.total_steps = len(steps_db) + execution.status = 'running' + execution.started_at = datetime.utcnow() + db.session.commit() + + logger.info(f"🚀 [ORA] Démarrage workflow vérifié {workflow_id}: {len(steps_db)} étapes") + + # --- Construire les params pour chaque step (même logique que execute_workflow_thread) --- + ora_steps = [] + for step in steps_db: + params = step.parameters or {} + + # Charger l'ancre visuelle si présente + if step.anchor_id: + anchor = VisualAnchor.query.get(step.anchor_id) + if anchor: + anchor_image_path = anchor.thumbnail_path or anchor.image_path + if anchor_image_path and os.path.exists(anchor_image_path): + with open(anchor_image_path, 'rb') as f: + image_base64 = base64.b64encode(f.read()).decode('utf-8') + else: + image_base64 = None + + anchor_data = { + 'anchor_id': anchor.id, + 'screenshot': image_base64, + 'bounding_box': { + 'x': anchor.bbox_x, + 'y': anchor.bbox_y, + 'width': anchor.bbox_width, + 'height': anchor.bbox_height, + }, + 'metadata': { + 'screen_resolution': { + 'width': anchor.screen_width, + 'height': anchor.screen_height, + } + }, + } + if anchor.target_text: + anchor_data['target_text'] = anchor.target_text + if anchor.ocr_description: + anchor_data['description'] = anchor.ocr_description + + params['visual_anchor'] = anchor_data + + # Injecter le label pour le grounding + if step.label: + params['_step_label'] = step.label + + ora_steps.append({ + 'action_type': step.action_type, + 'parameters': params, + 'visual_anchor': params.get('visual_anchor', {}), + 'label': step.label or step.action_type, + '_full_params': params, # Pour les passthrough + '_step_id': step.id, + }) + + # --- Créer et lancer la boucle ORA --- + from core.execution.observe_reason_act import ORALoop + + ora = ORALoop(max_retries=2, max_steps=50, verify_level='auto') + ora._variables = _execution_state.get('variables', {}) + + # Créer les ExecutionStep en amont pour le suivi + step_results_map = {} + for idx, step in enumerate(steps_db): + step_result = ExecutionStep( + execution_id=execution_id, + step_id=step.id, + status='pending', + ) + db.session.add(step_result) + step_results_map[idx] = step_result + db.session.commit() + + def on_progress(step_index, total, verification): + """Callback de progression — met à jour la BDD.""" + try: + sr = step_results_map.get(step_index - 1) + if sr: + sr.status = 'success' if verification.success else 'error' + sr.started_at = sr.started_at or datetime.utcnow() + sr.ended_at = datetime.utcnow() + if sr.started_at: + sr.duration_ms = int((sr.ended_at - sr.started_at).total_seconds() * 1000) + sr.output = { + 'change_level': verification.change_level, + 'matches_expected': verification.matches_expected, + 'detail': verification.detail, + 'mode': 'verified', + } + if verification.success: + execution.completed_steps = step_index + else: + execution.failed_steps = (execution.failed_steps or 0) + 1 + sr.error_message = verification.detail + execution.current_step_index = step_index + db.session.commit() + except Exception as e: + logger.warning(f"[ORA/progress] Erreur BDD: {e}") + + # Exécuter + loop_result = ora.run_workflow( + steps=ora_steps, + on_progress=on_progress, + execute_action_fn=execute_action, + ) + + # Synchroniser les variables runtime + _execution_state['variables'] = ora._variables + + # Finaliser + if loop_result.success: + execution.status = 'completed' + else: + execution.status = 'error' + execution.error_message = loop_result.reason + # Marquer le step en échec + failed_sr = step_results_map.get(loop_result.steps_completed) + if failed_sr and failed_sr.status == 'pending': + failed_sr.status = 'error' + failed_sr.error_message = loop_result.reason + failed_sr.ended_at = datetime.utcnow() + + execution.ended_at = datetime.utcnow() + execution.completed_steps = loop_result.steps_completed + db.session.commit() + + logger.info( + f"{'✅' if loop_result.success else '❌'} [ORA] Workflow terminé: " + f"{loop_result.steps_completed}/{loop_result.total_steps} " + f"({execution.status}) — {loop_result.reason}" + ) + + except Exception as e: + logger.error(f"❌ [ORA] Erreur fatale: {e}", exc_info=True) + try: + execution = Execution.query.get(execution_id) + if execution: + execution.status = 'error' + execution.error_message = f"Erreur fatale ORA: {str(e)}" + execution.ended_at = datetime.utcnow() + db.session.commit() + except Exception as db_err: + logger.warning(f"[ORA] DB cleanup error: {db_err}") + + finally: + with _execution_lock: + _execution_state['is_running'] = False + _execution_state['current_execution_id'] = None + + @api_v3_bp.route('/execute/start', methods=['POST']) def start_execution(): """ @@ -1371,7 +1547,7 @@ def start_execution(): minimize_browser = data.get('minimize_browser', True) # Activé par défaut # Valider le mode - if execution_mode not in ['basic', 'intelligent', 'debug']: + if execution_mode not in ['basic', 'intelligent', 'debug', 'verified']: execution_mode = 'basic' # Utiliser le workflow actif si non spécifié @@ -1430,15 +1606,22 @@ def start_execution(): from flask import current_app app = current_app._get_current_object() - thread = threading.Thread( - target=execute_workflow_thread, - args=(execution.id, workflow_id, app) - ) + if execution_mode == 'verified': + # Mode ORA : boucle observe→raisonne→agit avec vérification + thread = threading.Thread( + target=run_workflow_verified, + args=(execution.id, workflow_id, app) + ) + else: + thread = threading.Thread( + target=execute_workflow_thread, + args=(execution.id, workflow_id, app) + ) thread.daemon = True thread.start() _execution_state['thread'] = thread - print(f"🚀 [API v3] Exécution lancée: {execution.id}") + print(f"🚀 [API v3] Exécution lancée: {execution.id} (mode={execution_mode})") return jsonify({ 'success': True,