feat: instructions en langage naturel via boucle ORA
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 11s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 11s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
reason_instruction() : le VLM regarde l'écran, décide la prochaine action atomique (click/type/hotkey/scroll/done), retourne un Decision avec expected_after pour la vérification. run_instruction() : boucle ORA complète pour instructions texte. CognitiveContext mis à jour à chaque étape (objectif, historique, faits appris, confiance). POST /api/v3/execute/instruction : endpoint API pour lancer une instruction en langage naturel. Thread daemon, polling du résultat via GET /api/v3/execute/instruction/result. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,14 @@ from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)


# Cognitive context (working memory) — optional import.
# When core.cognition is absent, the instruction loop still runs but in a
# degraded mode without working memory (see run_instruction).
try:
    from core.cognition.working_memory import CognitiveContext

    COGNITIVE_AVAILABLE = True
except ImportError:
    # Sentinel values checked at runtime before any CognitiveContext use.
    CognitiveContext = None
    COGNITIVE_AVAILABLE = False
|
||||
|
||||
# --- Imports conditionnels ---
|
||||
try:
|
||||
from PIL import Image
|
||||
@@ -112,6 +120,11 @@ class ORALoop:
|
||||
"""
|
||||
Boucle Observe-Raisonne-Agit avec vérification intégrée.
|
||||
|
||||
Deux modes d'utilisation :
|
||||
- **Workflow** : run_workflow() exécute une liste d'étapes VWB pré-définies.
|
||||
- **Instruction** : run_instruction() exécute une instruction en langage naturel
|
||||
via le VLM qui décide à chaque pas la prochaine action atomique (Phase 4).
|
||||
|
||||
Args:
|
||||
max_retries: Nombre de réessais par étape en cas d'échec de vérification.
|
||||
max_steps: Nombre maximal d'étapes autorisées (garde-fou).
|
||||
@@ -130,6 +143,9 @@ class ORALoop:
|
||||
# Variables runtime injectées par le workflow
|
||||
self._variables: Dict[str, Any] = {}
|
||||
|
||||
# Contexte cognitif pour le mode instruction (Phase 4)
|
||||
self.ctx: Optional['CognitiveContext'] = None
|
||||
|
||||
# ─── Phase 1 : OBSERVE ────────────────────────────────
|
||||
|
||||
def observe(self) -> Observation:
|
||||
@@ -259,6 +275,309 @@ class ORALoop:
|
||||
logger.info(f"🧠 [ORA/reason] {decision.action} target='{decision.target}' value='{decision.value[:50]}'")
|
||||
return decision
|
||||
|
||||
# ─── Phase 2b : RAISONNE (mode instruction) ─────────
|
||||
|
||||
def reason_instruction(self, instruction: str, observation: Observation) -> Decision:
    """Ask the VLM to look at the screen and decide the next atomic action.

    Instruction mode (Phase 4): there is no pre-defined workflow — the VLM
    drives the execution entirely from the natural-language objective and
    what it currently sees on screen.

    Args:
        instruction: User instruction in natural language.
        observation: Current observation (screenshot, window title, ...).

    Returns:
        Decision carrying the atomic action to execute.  On any failure
        (missing screenshot, HTTP error, unparseable reply) a ``need_help``
        Decision with confidence 0.0 is returned instead of raising.
    """
    try:
        # Local imports keep the module importable when these are optional.
        import requests
        import io as _io

        # --- Build the cognitive-context block for the prompt ---
        ctx_block = ""
        if self.ctx and COGNITIVE_AVAILABLE:
            # Record the current observation into working memory first,
            # so the rendered context reflects this step's screen state.
            self.ctx.observe(
                window_title=observation.window_title,
            )
            ctx_block = self.ctx.to_prompt_context()

        # --- Encode the screenshot as base64 JPEG ---
        # quality=70 keeps the request payload small for the local VLM.
        image_b64 = None
        if observation.screenshot is not None:
            buffer = _io.BytesIO()
            observation.screenshot.save(buffer, format='JPEG', quality=70)
            image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

        if image_b64 is None:
            # Without an image the VLM cannot reason — bail out as need_help.
            logger.error("🧠 [ORA/reason_instruction] Pas de screenshot disponible")
            return Decision(
                action='need_help', target='', value='',
                reasoning='Impossible de capturer l\'écran',
                expected_after='', confidence=0.0, done=False,
            )

        # --- Build the VLM prompt (French, strict-JSON answer contract) ---
        prompt = f"""Tu es Léa, un agent RPA visuel. Tu dois accomplir une tâche sur un ordinateur.

{ctx_block}

INSTRUCTION: {instruction}

ÉCRAN ACTUEL: [image jointe]

Quelle est la PROCHAINE ACTION ATOMIQUE à effectuer?

Réponds en JSON strict:
{{
"action": "click" | "type" | "hotkey" | "wait" | "scroll" | "done",
"target": "texte exact du bouton/champ/élément à cliquer",
"value": "texte à saisir ou touches (ctrl+s)",
"reasoning": "pourquoi cette action",
"expected_after": "ce que l'écran devrait montrer après",
"confidence": 0.0-1.0,
"done": false
}}

Règles:
- UNE SEULE action atomique par réponse
- Si l'objectif est atteint, action="done", done=true
- Si tu ne sais pas, confidence < 0.3
- expected_after est OBLIGATOIRE"""

        # --- Call the VLM (Ollama /api/generate, non-streaming) ---
        ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
        model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")

        logger.info(f"🧠 [ORA/reason_instruction] Appel VLM {model}...")

        response = requests.post(
            f"{ollama_url}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "images": [image_b64],
                "stream": False,
                # Low temperature for deterministic JSON; 300 tokens is
                # plenty for the single-action answer schema above.
                "options": {"temperature": 0.1, "num_predict": 300},
            },
            timeout=30,
        )

        if response.status_code != 200:
            logger.warning(f"🧠 [ORA/reason_instruction] HTTP {response.status_code}")
            return Decision(
                action='need_help', target='', value='',
                reasoning=f'VLM HTTP {response.status_code}',
                expected_after='', confidence=0.0, done=False,
            )

        # --- Extract the JSON object from the model's reply ---
        text = response.json().get('response', '').strip()
        logger.debug(f"🧠 [ORA/reason_instruction] Réponse brute: {text[:300]}")

        # Greedy first-'{' to last-'}' match: tolerates prose around the
        # JSON, but would mis-capture if the model emitted two objects.
        match = re.search(r'\{[\s\S]*\}', text)
        if not match:
            logger.warning(f"🧠 [ORA/reason_instruction] JSON introuvable dans: {text[:200]}")
            return Decision(
                action='need_help', target='', value='',
                reasoning=f'Réponse VLM non parseable: {text[:100]}',
                expected_after='', confidence=0.0, done=False,
            )

        parsed = json.loads(match.group())

        # Missing keys fall back to safe defaults (need_help / 0.5 conf).
        decision = Decision(
            action=parsed.get('action', 'need_help'),
            target=parsed.get('target', ''),
            value=parsed.get('value', ''),
            reasoning=parsed.get('reasoning', ''),
            expected_after=parsed.get('expected_after', ''),
            confidence=float(parsed.get('confidence', 0.5)),
            done=bool(parsed.get('done', False)),
        )

        logger.info(
            f"🧠 [ORA/reason_instruction] → {decision.action} "
            f"target='{decision.target}' value='{decision.value[:50]}' "
            f"conf={decision.confidence:.2f} done={decision.done}"
        )
        return decision

    except json.JSONDecodeError as e:
        # The regex matched but the content is not valid JSON.
        logger.warning(f"🧠 [ORA/reason_instruction] JSON parse error: {e}")
        return Decision(
            action='need_help', target='', value='',
            reasoning=f'JSON invalide: {e}',
            expected_after='', confidence=0.0, done=False,
        )
    except Exception as e:
        # Catch-all boundary: network errors, PIL save errors, bad
        # 'confidence' values, etc. — never let reasoning crash the loop.
        logger.error(f"🧠 [ORA/reason_instruction] Erreur: {e}", exc_info=True)
        return Decision(
            action='need_help', target='', value='',
            reasoning=f'Erreur interne: {e}',
            expected_after='', confidence=0.0, done=False,
        )
|
||||
|
||||
# ─── Boucle instruction (Phase 4) ────────────────────
|
||||
|
||||
def run_instruction(
    self,
    instruction: str,
    on_progress: Optional[Callable] = None,
) -> LoopResult:
    """Execute a natural-language instruction through the ORA loop.

    The VLM decides the next atomic action at every step.  No pre-defined
    workflow — the loop keeps reasoning until the objective is reached,
    the decision confidence drops below 0.3, the VLM asks for help, or
    the step budget (``self.max_steps``) is exhausted.

    Args:
        instruction: User instruction (ex: "Ouvre Calculatrice et tape 2+2").
        on_progress: Callback(step_num, max_steps, verification_result).

    Returns:
        LoopResult.
    """
    logger.info(f"🚀 [ORA/instruction] Démarrage: '{instruction}' (max {self.max_steps} étapes)")

    # --- Initialise the cognitive context (working memory) ---
    if COGNITIVE_AVAILABLE and CognitiveContext is not None:
        self.ctx = CognitiveContext(objective=instruction)
    else:
        # Degraded mode: the loop still runs, but reason_instruction gets
        # no memory block in its prompt and nothing is recorded.
        self.ctx = None
        logger.warning("🧠 [ORA/instruction] CognitiveContext non disponible — mode dégradé")

    for step_num in range(self.max_steps):
        logger.info(f"\n{'='*60}")
        logger.info(f"📋 [ORA/instruction] Étape {step_num + 1}/{self.max_steps}")

        # --- 1. Observe ---
        pre = self.observe()

        # --- 2. Reason ---
        decision = self.reason_instruction(instruction, pre)

        # --- Objective reached ---
        if decision.done:
            logger.info(f"✅ [ORA/instruction] Objectif atteint en {step_num + 1} étapes: {decision.reasoning}")
            if self.ctx:
                self.ctx.record_action('done', decision.target, result='Objectif atteint', success=True)
            return LoopResult(
                success=True,
                steps_completed=step_num + 1,
                total_steps=self.max_steps,
                reason=f"Objectif atteint: {decision.reasoning}",
            )

        # --- Confidence too low: stop rather than act blindly ---
        if decision.confidence < 0.3:
            logger.warning(
                f"❌ [ORA/instruction] Confiance trop basse ({decision.confidence:.2f}): "
                f"{decision.reasoning}"
            )
            if self.ctx:
                self.ctx.ask_for_help(f"Confiance {decision.confidence:.2f}: {decision.reasoning}")
            return LoopResult(
                success=False,
                steps_completed=step_num,
                total_steps=self.max_steps,
                reason=f"Confiance trop basse ({decision.confidence:.2f}): {decision.reasoning}",
            )

        # --- VLM explicitly asked for help ---
        if decision.action == 'need_help':
            logger.warning(f"🆘 [ORA/instruction] Besoin d'aide: {decision.reasoning}")
            if self.ctx:
                self.ctx.ask_for_help(decision.reasoning)
            return LoopResult(
                success=False,
                steps_completed=step_num,
                total_steps=self.max_steps,
                reason=f"Besoin d'aide: {decision.reasoning}",
            )

        # --- 3. Act ---
        act_success = self.act(decision)
        if not act_success and decision.action not in ('wait', 'done'):
            logger.warning(f"❌ [ORA/instruction] Action échouée: {decision.action}")

            # Record the failure so it appears in the next prompt's memory.
            if self.ctx:
                self.ctx.record_action(
                    decision.action, decision.target,
                    result='Échec exécution', success=False,
                )

            # Do not abort: the VLM will adapt on the next turn when it
            # sees the screen unchanged.  NOTE(review): the original
            # comment claimed a contextual-confidence decrement happens
            # here, but no such decrement exists — each failed action
            # simply consumes one step of the budget, and on_progress is
            # not invoked for this step.
            continue

        # Small delay to let the screen settle before re-observing.
        time.sleep(0.3)

        # --- 4. Observe post-action ---
        post = self.observe()

        # --- 5. Verify ---
        verification = self.verify(pre, post, decision)

        # --- Update the cognitive context ---
        if self.ctx:
            self.ctx.record_action(
                decision.action, decision.target,
                result=verification.detail[:80],
                success=verification.success,
            )
            self.ctx.set_expected_screen(decision.expected_after)
            self.ctx.advance_step()

        # --- Retries when verification fails ---
        if not verification.success:
            retried = False
            for retry in range(self.max_retries):
                logger.info(f"🔄 [ORA/instruction] Retry {retry + 1}/{self.max_retries}")
                # Re-observe before retrying so verify() compares against
                # the pre-retry screen, not the original one.
                pre_retry = self.observe()
                self.act(decision)
                time.sleep(0.3)
                post_retry = self.observe()
                verification = self.verify(pre_retry, post_retry, decision)
                if verification.success:
                    retried = True
                    logger.info(f"✅ [ORA/instruction] Retry {retry + 1} réussi")
                    if self.ctx:
                        self.ctx.record_action(
                            decision.action, decision.target,
                            result=f'Retry {retry + 1} OK', success=True,
                        )
                    break
            if not retried and not verification.success:
                # Non-blocking: the VLM will see the failure next turn.
                logger.warning(
                    f"⚠️ [ORA/instruction] Étape {step_num + 1} vérification échouée "
                    f"après {self.max_retries} retries — on continue"
                )
                if self.ctx:
                    self.ctx.learn(
                        f"L'action '{decision.action}' sur '{decision.target}' "
                        f"n'a pas produit le changement attendu"
                    )

        # --- Progress callback (skipped when act() failed above) ---
        if on_progress:
            on_progress(step_num + 1, self.max_steps, verification)

    # --- Step budget exhausted ---
    logger.warning(f"❌ [ORA/instruction] Max steps atteint ({self.max_steps})")
    return LoopResult(
        success=False,
        steps_completed=self.max_steps,
        total_steps=self.max_steps,
        reason=f"Nombre maximal d'étapes atteint ({self.max_steps})",
    )
|
||||
|
||||
# ─── Phase 3 : AGIT ───────────────────────────────────
|
||||
|
||||
def act(self, decision: Decision, step_params: dict = None) -> bool:
|
||||
@@ -291,6 +610,9 @@ class ORALoop:
|
||||
elif decision.action == 'hotkey':
|
||||
return self._act_hotkey(decision, step_params)
|
||||
|
||||
elif decision.action == 'scroll':
|
||||
return self._act_scroll(decision)
|
||||
|
||||
elif decision.action == 'wait':
|
||||
return self._act_wait(decision)
|
||||
|
||||
@@ -663,6 +985,37 @@ class ORALoop:
|
||||
time.sleep(timeout_ms / 1000)
|
||||
return True
|
||||
|
||||
def _act_scroll(self, decision: Decision) -> bool:
    """Scroll the screen (instruction mode — no anchor, scrolls in place).

    The VLM may put "down", "up", "left", "right", a bare count ("3"), or
    a combined form ("down_3") in ``decision.value``.  Defaults: direction
    "down", amount 3.

    Args:
        decision: Decision whose ``value`` encodes direction and amount.

    Returns:
        True after scrolling; False if pyautogui is unavailable.
    """
    if not PYAUTOGUI_AVAILABLE:
        logger.error("pyautogui non disponible")
        return False

    value = (decision.value or 'down').lower().strip()
    direction = 'down'
    amount = 3

    # First matching direction keyword wins.
    for d in ('up', 'down', 'left', 'right'):
        if d in value:
            direction = d
            break

    # Extract an explicit amount if present (e.g. "down_3" or "3").
    nums = re.findall(r'\d+', value)
    if nums:
        amount = int(nums[0])

    # pyautogui sign conventions: scroll() positive = up,
    # hscroll() positive = RIGHT.  (Bug fix: the original mapped 'left'
    # to a positive value, which scrolled right instead of left.)
    if direction in ('left', 'right'):
        scroll_value = amount if direction == 'right' else -amount
    else:
        scroll_value = amount if direction == 'up' else -amount

    logger.info(f"📜 [ORA/scroll] {direction} x{amount}")

    if direction in ('left', 'right'):
        pyautogui.hscroll(scroll_value)
    else:
        pyautogui.scroll(scroll_value)
    # Let the UI settle before the post-action observation.
    time.sleep(0.5)
    return True
|
||||
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# Méthodes privées — utilitaires
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
|
||||
@@ -1901,3 +1901,140 @@ def get_healing_candidates():
|
||||
'step_info': step_info,
|
||||
'original_bbox': step_info.get('original_bbox')
|
||||
})
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# Mode INSTRUCTION (Phase 4) — exécution par langage naturel
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
|
||||
@api_v3_bp.route('/execute/instruction', methods=['POST'])
def execute_instruction():
    """Execute a natural-language instruction.

    The VLM looks at the screen and decides each next atomic action until
    the objective is reached.  The execution runs in a daemon thread; the
    result is polled via GET /api/v3/execute/instruction/result.

    POST JSON:
        instruction (str): The natural-language instruction.
        max_steps (int, opt): Maximum number of steps (default 30).
        verify_level (str, opt): 'none' | 'phash' | 'vlm' | 'auto' (default 'auto').

    Returns:
        202 with the execution ID once the thread has started,
        400 on invalid input, 409 if an execution is already running,
        500 if the launch itself fails.
    """
    global _execution_state

    data = request.get_json() or {}
    instruction = data.get('instruction', '').strip()
    verify_level = data.get('verify_level', 'auto')

    # Validate max_steps up front: an unparseable value would otherwise
    # only blow up later inside the worker thread.
    try:
        max_steps = int(data.get('max_steps', 30))
    except (TypeError, ValueError):
        return jsonify({'success': False, 'error': 'max_steps invalide'}), 400

    if not instruction:
        return jsonify({'success': False, 'error': 'Instruction vide'}), 400

    # Generate the execution ID before claiming the running slot.
    exec_id = generate_id('instr')

    # Check-and-claim atomically.  (Bug fix: the original released the
    # lock between the is_running check and the later claim, so two
    # concurrent POSTs could both pass the check and start executions.)
    with _execution_lock:
        if _execution_state['is_running']:
            return jsonify({
                'success': False,
                'error': 'Une exécution est déjà en cours'
            }), 409
        _execution_state['is_running'] = True
        _execution_state['should_stop'] = False
        _execution_state['current_execution_id'] = exec_id

    def run():
        """Instruction execution thread."""
        try:
            from core.execution.observe_reason_act import ORALoop

            loop = ORALoop(
                max_retries=2,
                max_steps=max_steps,
                verify_level=verify_level,
            )

            logger.info(f"🚀 [Instruction] Démarrage: '{instruction}' (exec_id={exec_id})")

            def on_progress(step_num, total, verification):
                # Publish per-step progress for polling clients.
                with _execution_lock:
                    _execution_state['current_step_info'] = {
                        'index': step_num - 1,
                        'total': total,
                        'verification': {
                            'success': verification.success,
                            'change_level': verification.change_level,
                            'detail': verification.detail,
                        } if verification else None,
                    }

            result = loop.run_instruction(instruction, on_progress=on_progress)

            with _execution_lock:
                _execution_state['last_instruction_result'] = {
                    'success': result.success,
                    'steps_completed': result.steps_completed,
                    'total_steps': result.total_steps,
                    'reason': result.reason,
                    'instruction': instruction,
                    'exec_id': exec_id,
                }
                _execution_state['is_running'] = False
                _execution_state['current_execution_id'] = None

            emoji = "✅" if result.success else "❌"
            logger.info(
                f"{emoji} [Instruction] Terminé: success={result.success}, "
                f"steps={result.steps_completed}/{result.total_steps}, "
                f"reason={result.reason}"
            )

        except Exception as e:
            # Boundary handler: always release the running slot, even on
            # a fatal error, so the API never stays stuck in "running".
            logger.error(f"❌ [Instruction] Erreur fatale: {e}", exc_info=True)
            with _execution_lock:
                _execution_state['last_instruction_result'] = {
                    'success': False,
                    'steps_completed': 0,
                    'total_steps': max_steps,
                    'reason': f'Erreur fatale: {e}',
                    'instruction': instruction,
                    'exec_id': exec_id,
                }
                _execution_state['is_running'] = False
                _execution_state['current_execution_id'] = None

    try:
        # Minimize the VWB browser so the target screen is reachable,
        # then launch the execution thread.
        minimize_active_window()
        thread = threading.Thread(target=run, daemon=True, name=f'instruction-{exec_id}')
        thread.start()
    except Exception as e:
        # Roll back the claim so a failed launch does not leave the API
        # permanently reporting an execution in progress.
        with _execution_lock:
            _execution_state['is_running'] = False
            _execution_state['current_execution_id'] = None
        logger.error(f"❌ [Instruction] Échec du lancement: {e}", exc_info=True)
        return jsonify({'success': False, 'error': f'Échec du lancement: {e}'}), 500

    return jsonify({
        'success': True,
        'message': f'Instruction lancée: {instruction}',
        'exec_id': exec_id,
    }), 202
|
||||
|
||||
|
||||
@api_v3_bp.route('/execute/instruction/result', methods=['GET'])
def get_instruction_result():
    """Return the result of the most recent instruction execution.

    Returns:
        JSON payload: ``result`` is the last execution's summary (or null
        when no execution has completed yet) and ``is_running`` reports
        whether an execution is currently in progress.
    """
    # Take a consistent snapshot of the shared state under the lock.
    with _execution_lock:
        payload = {
            'success': True,
            'is_running': _execution_state['is_running'],
            'result': _execution_state.get('last_instruction_result'),
        }
    return jsonify(payload)
|
||||
|
||||
Reference in New Issue
Block a user