feat: instructions en langage naturel via boucle ORA
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 11s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 11s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
reason_instruction() : le VLM regarde l'écran, décide la prochaine action atomique (click/type/hotkey/scroll/done), retourne un Decision avec expected_after pour la vérification. run_instruction() : boucle ORA complète pour instructions texte. CognitiveContext mis à jour à chaque étape (objectif, historique, faits appris, confiance). POST /api/v3/execute/instruction : endpoint API pour lancer une instruction en langage naturel. Thread daemon, polling du résultat via GET /api/v3/execute/instruction/result. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,14 @@ from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)


# Cognitive context (working memory) — optional import.
# When core.cognition is absent, the instruction loop still runs but in a
# degraded mode without working memory (see run_instruction).
try:
    from core.cognition.working_memory import CognitiveContext

    COGNITIVE_AVAILABLE = True
except ImportError:
    # Sentinel values checked at runtime before any CognitiveContext use.
    CognitiveContext = None
    COGNITIVE_AVAILABLE = False
|
||||
|
||||
# --- Imports conditionnels ---
|
||||
try:
|
||||
from PIL import Image
|
||||
@@ -112,6 +120,11 @@ class ORALoop:
|
||||
"""
|
||||
Boucle Observe-Raisonne-Agit avec vérification intégrée.
|
||||
|
||||
Deux modes d'utilisation :
|
||||
- **Workflow** : run_workflow() exécute une liste d'étapes VWB pré-définies.
|
||||
- **Instruction** : run_instruction() exécute une instruction en langage naturel
|
||||
via le VLM qui décide à chaque pas la prochaine action atomique (Phase 4).
|
||||
|
||||
Args:
|
||||
max_retries: Nombre de réessais par étape en cas d'échec de vérification.
|
||||
max_steps: Nombre maximal d'étapes autorisées (garde-fou).
|
||||
@@ -130,6 +143,9 @@ class ORALoop:
|
||||
# Variables runtime injectées par le workflow
|
||||
self._variables: Dict[str, Any] = {}
|
||||
|
||||
# Contexte cognitif pour le mode instruction (Phase 4)
|
||||
self.ctx: Optional['CognitiveContext'] = None
|
||||
|
||||
# ─── Phase 1 : OBSERVE ────────────────────────────────
|
||||
|
||||
def observe(self) -> Observation:
|
||||
@@ -259,6 +275,309 @@ class ORALoop:
|
||||
logger.info(f"🧠 [ORA/reason] {decision.action} target='{decision.target}' value='{decision.value[:50]}'")
|
||||
return decision
|
||||
|
||||
# ─── Phase 2b : RAISONNE (mode instruction) ─────────
|
||||
|
||||
def reason_instruction(self, instruction: str, observation: Observation) -> Decision:
    """Ask the VLM to look at the screen and decide the next atomic action.

    Instruction mode (Phase 4): there is no pre-defined workflow — the VLM
    drives the execution entirely from the natural-language objective and
    what it currently sees on screen.

    Args:
        instruction: User instruction in natural language.
        observation: Current observation (screenshot, window title, ...).

    Returns:
        Decision carrying the atomic action to execute.  On any failure
        (missing screenshot, HTTP error, unparseable reply) a ``need_help``
        Decision with confidence 0.0 is returned instead of raising.
    """
    try:
        # Local imports keep the module importable when these are optional.
        import requests
        import io as _io

        # --- Build the cognitive-context block for the prompt ---
        ctx_block = ""
        if self.ctx and COGNITIVE_AVAILABLE:
            # Record the current observation into working memory first,
            # so the rendered context reflects this step's screen state.
            self.ctx.observe(
                window_title=observation.window_title,
            )
            ctx_block = self.ctx.to_prompt_context()

        # --- Encode the screenshot as base64 JPEG ---
        # quality=70 keeps the request payload small for the local VLM.
        image_b64 = None
        if observation.screenshot is not None:
            buffer = _io.BytesIO()
            observation.screenshot.save(buffer, format='JPEG', quality=70)
            image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

        if image_b64 is None:
            # Without an image the VLM cannot reason — bail out as need_help.
            logger.error("🧠 [ORA/reason_instruction] Pas de screenshot disponible")
            return Decision(
                action='need_help', target='', value='',
                reasoning='Impossible de capturer l\'écran',
                expected_after='', confidence=0.0, done=False,
            )

        # --- Build the VLM prompt (French, strict-JSON answer contract) ---
        prompt = f"""Tu es Léa, un agent RPA visuel. Tu dois accomplir une tâche sur un ordinateur.

{ctx_block}

INSTRUCTION: {instruction}

ÉCRAN ACTUEL: [image jointe]

Quelle est la PROCHAINE ACTION ATOMIQUE à effectuer?

Réponds en JSON strict:
{{
"action": "click" | "type" | "hotkey" | "wait" | "scroll" | "done",
"target": "texte exact du bouton/champ/élément à cliquer",
"value": "texte à saisir ou touches (ctrl+s)",
"reasoning": "pourquoi cette action",
"expected_after": "ce que l'écran devrait montrer après",
"confidence": 0.0-1.0,
"done": false
}}

Règles:
- UNE SEULE action atomique par réponse
- Si l'objectif est atteint, action="done", done=true
- Si tu ne sais pas, confidence < 0.3
- expected_after est OBLIGATOIRE"""

        # --- Call the VLM (Ollama /api/generate, non-streaming) ---
        ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
        model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")

        logger.info(f"🧠 [ORA/reason_instruction] Appel VLM {model}...")

        response = requests.post(
            f"{ollama_url}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "images": [image_b64],
                "stream": False,
                # Low temperature for deterministic JSON; 300 tokens is
                # plenty for the single-action answer schema above.
                "options": {"temperature": 0.1, "num_predict": 300},
            },
            timeout=30,
        )

        if response.status_code != 200:
            logger.warning(f"🧠 [ORA/reason_instruction] HTTP {response.status_code}")
            return Decision(
                action='need_help', target='', value='',
                reasoning=f'VLM HTTP {response.status_code}',
                expected_after='', confidence=0.0, done=False,
            )

        # --- Extract the JSON object from the model's reply ---
        text = response.json().get('response', '').strip()
        logger.debug(f"🧠 [ORA/reason_instruction] Réponse brute: {text[:300]}")

        # Greedy first-'{' to last-'}' match: tolerates prose around the
        # JSON, but would mis-capture if the model emitted two objects.
        match = re.search(r'\{[\s\S]*\}', text)
        if not match:
            logger.warning(f"🧠 [ORA/reason_instruction] JSON introuvable dans: {text[:200]}")
            return Decision(
                action='need_help', target='', value='',
                reasoning=f'Réponse VLM non parseable: {text[:100]}',
                expected_after='', confidence=0.0, done=False,
            )

        parsed = json.loads(match.group())

        # Missing keys fall back to safe defaults (need_help / 0.5 conf).
        decision = Decision(
            action=parsed.get('action', 'need_help'),
            target=parsed.get('target', ''),
            value=parsed.get('value', ''),
            reasoning=parsed.get('reasoning', ''),
            expected_after=parsed.get('expected_after', ''),
            confidence=float(parsed.get('confidence', 0.5)),
            done=bool(parsed.get('done', False)),
        )

        logger.info(
            f"🧠 [ORA/reason_instruction] → {decision.action} "
            f"target='{decision.target}' value='{decision.value[:50]}' "
            f"conf={decision.confidence:.2f} done={decision.done}"
        )
        return decision

    except json.JSONDecodeError as e:
        # The regex matched but the content is not valid JSON.
        logger.warning(f"🧠 [ORA/reason_instruction] JSON parse error: {e}")
        return Decision(
            action='need_help', target='', value='',
            reasoning=f'JSON invalide: {e}',
            expected_after='', confidence=0.0, done=False,
        )
    except Exception as e:
        # Catch-all boundary: network errors, PIL save errors, bad
        # 'confidence' values, etc. — never let reasoning crash the loop.
        logger.error(f"🧠 [ORA/reason_instruction] Erreur: {e}", exc_info=True)
        return Decision(
            action='need_help', target='', value='',
            reasoning=f'Erreur interne: {e}',
            expected_after='', confidence=0.0, done=False,
        )
|
||||
|
||||
# ─── Boucle instruction (Phase 4) ────────────────────
|
||||
|
||||
def run_instruction(
    self,
    instruction: str,
    on_progress: Optional[Callable] = None,
) -> LoopResult:
    """Execute a natural-language instruction through the ORA loop.

    The VLM decides the next atomic action at every step.  No pre-defined
    workflow — the loop keeps reasoning until the objective is reached,
    the decision confidence drops below 0.3, the VLM asks for help, or
    the step budget (``self.max_steps``) is exhausted.

    Args:
        instruction: User instruction (ex: "Ouvre Calculatrice et tape 2+2").
        on_progress: Callback(step_num, max_steps, verification_result).

    Returns:
        LoopResult.
    """
    logger.info(f"🚀 [ORA/instruction] Démarrage: '{instruction}' (max {self.max_steps} étapes)")

    # --- Initialise the cognitive context (working memory) ---
    if COGNITIVE_AVAILABLE and CognitiveContext is not None:
        self.ctx = CognitiveContext(objective=instruction)
    else:
        # Degraded mode: the loop still runs, but reason_instruction gets
        # no memory block in its prompt and nothing is recorded.
        self.ctx = None
        logger.warning("🧠 [ORA/instruction] CognitiveContext non disponible — mode dégradé")

    for step_num in range(self.max_steps):
        logger.info(f"\n{'='*60}")
        logger.info(f"📋 [ORA/instruction] Étape {step_num + 1}/{self.max_steps}")

        # --- 1. Observe ---
        pre = self.observe()

        # --- 2. Reason ---
        decision = self.reason_instruction(instruction, pre)

        # --- Objective reached ---
        if decision.done:
            logger.info(f"✅ [ORA/instruction] Objectif atteint en {step_num + 1} étapes: {decision.reasoning}")
            if self.ctx:
                self.ctx.record_action('done', decision.target, result='Objectif atteint', success=True)
            return LoopResult(
                success=True,
                steps_completed=step_num + 1,
                total_steps=self.max_steps,
                reason=f"Objectif atteint: {decision.reasoning}",
            )

        # --- Confidence too low: stop rather than act blindly ---
        if decision.confidence < 0.3:
            logger.warning(
                f"❌ [ORA/instruction] Confiance trop basse ({decision.confidence:.2f}): "
                f"{decision.reasoning}"
            )
            if self.ctx:
                self.ctx.ask_for_help(f"Confiance {decision.confidence:.2f}: {decision.reasoning}")
            return LoopResult(
                success=False,
                steps_completed=step_num,
                total_steps=self.max_steps,
                reason=f"Confiance trop basse ({decision.confidence:.2f}): {decision.reasoning}",
            )

        # --- VLM explicitly asked for help ---
        if decision.action == 'need_help':
            logger.warning(f"🆘 [ORA/instruction] Besoin d'aide: {decision.reasoning}")
            if self.ctx:
                self.ctx.ask_for_help(decision.reasoning)
            return LoopResult(
                success=False,
                steps_completed=step_num,
                total_steps=self.max_steps,
                reason=f"Besoin d'aide: {decision.reasoning}",
            )

        # --- 3. Act ---
        act_success = self.act(decision)
        if not act_success and decision.action not in ('wait', 'done'):
            logger.warning(f"❌ [ORA/instruction] Action échouée: {decision.action}")

            # Record the failure so it appears in the next prompt's memory.
            if self.ctx:
                self.ctx.record_action(
                    decision.action, decision.target,
                    result='Échec exécution', success=False,
                )

            # Do not abort: the VLM will adapt on the next turn when it
            # sees the screen unchanged.  NOTE(review): the original
            # comment claimed a contextual-confidence decrement happens
            # here, but no such decrement exists — each failed action
            # simply consumes one step of the budget, and on_progress is
            # not invoked for this step.
            continue

        # Small delay to let the screen settle before re-observing.
        time.sleep(0.3)

        # --- 4. Observe post-action ---
        post = self.observe()

        # --- 5. Verify ---
        verification = self.verify(pre, post, decision)

        # --- Update the cognitive context ---
        if self.ctx:
            self.ctx.record_action(
                decision.action, decision.target,
                result=verification.detail[:80],
                success=verification.success,
            )
            self.ctx.set_expected_screen(decision.expected_after)
            self.ctx.advance_step()

        # --- Retries when verification fails ---
        if not verification.success:
            retried = False
            for retry in range(self.max_retries):
                logger.info(f"🔄 [ORA/instruction] Retry {retry + 1}/{self.max_retries}")
                # Re-observe before retrying so verify() compares against
                # the pre-retry screen, not the original one.
                pre_retry = self.observe()
                self.act(decision)
                time.sleep(0.3)
                post_retry = self.observe()
                verification = self.verify(pre_retry, post_retry, decision)
                if verification.success:
                    retried = True
                    logger.info(f"✅ [ORA/instruction] Retry {retry + 1} réussi")
                    if self.ctx:
                        self.ctx.record_action(
                            decision.action, decision.target,
                            result=f'Retry {retry + 1} OK', success=True,
                        )
                    break
            if not retried and not verification.success:
                # Non-blocking: the VLM will see the failure next turn.
                logger.warning(
                    f"⚠️ [ORA/instruction] Étape {step_num + 1} vérification échouée "
                    f"après {self.max_retries} retries — on continue"
                )
                if self.ctx:
                    self.ctx.learn(
                        f"L'action '{decision.action}' sur '{decision.target}' "
                        f"n'a pas produit le changement attendu"
                    )

        # --- Progress callback (skipped when act() failed above) ---
        if on_progress:
            on_progress(step_num + 1, self.max_steps, verification)

    # --- Step budget exhausted ---
    logger.warning(f"❌ [ORA/instruction] Max steps atteint ({self.max_steps})")
    return LoopResult(
        success=False,
        steps_completed=self.max_steps,
        total_steps=self.max_steps,
        reason=f"Nombre maximal d'étapes atteint ({self.max_steps})",
    )
|
||||
|
||||
# ─── Phase 3 : AGIT ───────────────────────────────────
|
||||
|
||||
def act(self, decision: Decision, step_params: dict = None) -> bool:
|
||||
@@ -291,6 +610,9 @@ class ORALoop:
|
||||
elif decision.action == 'hotkey':
|
||||
return self._act_hotkey(decision, step_params)
|
||||
|
||||
elif decision.action == 'scroll':
|
||||
return self._act_scroll(decision)
|
||||
|
||||
elif decision.action == 'wait':
|
||||
return self._act_wait(decision)
|
||||
|
||||
@@ -663,6 +985,37 @@ class ORALoop:
|
||||
time.sleep(timeout_ms / 1000)
|
||||
return True
|
||||
|
||||
def _act_scroll(self, decision: Decision) -> bool:
    """Scroll the screen (instruction mode — no anchor, scrolls in place).

    The VLM may put "down", "up", "left", "right", a bare count ("3"), or
    a combined form ("down_3") in ``decision.value``.  Defaults: direction
    "down", amount 3.

    Args:
        decision: Decision whose ``value`` encodes direction and amount.

    Returns:
        True after scrolling; False if pyautogui is unavailable.
    """
    if not PYAUTOGUI_AVAILABLE:
        logger.error("pyautogui non disponible")
        return False

    value = (decision.value or 'down').lower().strip()
    direction = 'down'
    amount = 3

    # First matching direction keyword wins.
    for d in ('up', 'down', 'left', 'right'):
        if d in value:
            direction = d
            break

    # Extract an explicit amount if present (e.g. "down_3" or "3").
    nums = re.findall(r'\d+', value)
    if nums:
        amount = int(nums[0])

    # pyautogui sign conventions: scroll() positive = up,
    # hscroll() positive = RIGHT.  (Bug fix: the original mapped 'left'
    # to a positive value, which scrolled right instead of left.)
    if direction in ('left', 'right'):
        scroll_value = amount if direction == 'right' else -amount
    else:
        scroll_value = amount if direction == 'up' else -amount

    logger.info(f"📜 [ORA/scroll] {direction} x{amount}")

    if direction in ('left', 'right'):
        pyautogui.hscroll(scroll_value)
    else:
        pyautogui.scroll(scroll_value)
    # Let the UI settle before the post-action observation.
    time.sleep(0.5)
    return True
|
||||
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# Méthodes privées — utilitaires
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
|
||||
@@ -1901,3 +1901,140 @@ def get_healing_candidates():
|
||||
'step_info': step_info,
|
||||
'original_bbox': step_info.get('original_bbox')
|
||||
})
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
# Mode INSTRUCTION (Phase 4) — exécution par langage naturel
|
||||
# ═══════════════════════════════════════════════════════════
|
||||
|
||||
@api_v3_bp.route('/execute/instruction', methods=['POST'])
def execute_instruction():
    """Execute a natural-language instruction.

    The VLM looks at the screen and decides each next atomic action until
    the objective is reached.  The execution runs in a daemon thread; the
    result is polled via GET /api/v3/execute/instruction/result.

    POST JSON:
        instruction (str): The natural-language instruction.
        max_steps (int, opt): Maximum number of steps (default 30).
        verify_level (str, opt): 'none' | 'phash' | 'vlm' | 'auto' (default 'auto').

    Returns:
        202 with the execution ID once the thread has started,
        400 on invalid input, 409 if an execution is already running,
        500 if the launch itself fails.
    """
    global _execution_state

    data = request.get_json() or {}
    instruction = data.get('instruction', '').strip()
    verify_level = data.get('verify_level', 'auto')

    # Validate max_steps up front: an unparseable value would otherwise
    # only blow up later inside the worker thread.
    try:
        max_steps = int(data.get('max_steps', 30))
    except (TypeError, ValueError):
        return jsonify({'success': False, 'error': 'max_steps invalide'}), 400

    if not instruction:
        return jsonify({'success': False, 'error': 'Instruction vide'}), 400

    # Generate the execution ID before claiming the running slot.
    exec_id = generate_id('instr')

    # Check-and-claim atomically.  (Bug fix: the original released the
    # lock between the is_running check and the later claim, so two
    # concurrent POSTs could both pass the check and start executions.)
    with _execution_lock:
        if _execution_state['is_running']:
            return jsonify({
                'success': False,
                'error': 'Une exécution est déjà en cours'
            }), 409
        _execution_state['is_running'] = True
        _execution_state['should_stop'] = False
        _execution_state['current_execution_id'] = exec_id

    def run():
        """Instruction execution thread."""
        try:
            from core.execution.observe_reason_act import ORALoop

            loop = ORALoop(
                max_retries=2,
                max_steps=max_steps,
                verify_level=verify_level,
            )

            logger.info(f"🚀 [Instruction] Démarrage: '{instruction}' (exec_id={exec_id})")

            def on_progress(step_num, total, verification):
                # Publish per-step progress for polling clients.
                with _execution_lock:
                    _execution_state['current_step_info'] = {
                        'index': step_num - 1,
                        'total': total,
                        'verification': {
                            'success': verification.success,
                            'change_level': verification.change_level,
                            'detail': verification.detail,
                        } if verification else None,
                    }

            result = loop.run_instruction(instruction, on_progress=on_progress)

            with _execution_lock:
                _execution_state['last_instruction_result'] = {
                    'success': result.success,
                    'steps_completed': result.steps_completed,
                    'total_steps': result.total_steps,
                    'reason': result.reason,
                    'instruction': instruction,
                    'exec_id': exec_id,
                }
                _execution_state['is_running'] = False
                _execution_state['current_execution_id'] = None

            emoji = "✅" if result.success else "❌"
            logger.info(
                f"{emoji} [Instruction] Terminé: success={result.success}, "
                f"steps={result.steps_completed}/{result.total_steps}, "
                f"reason={result.reason}"
            )

        except Exception as e:
            # Boundary handler: always release the running slot, even on
            # a fatal error, so the API never stays stuck in "running".
            logger.error(f"❌ [Instruction] Erreur fatale: {e}", exc_info=True)
            with _execution_lock:
                _execution_state['last_instruction_result'] = {
                    'success': False,
                    'steps_completed': 0,
                    'total_steps': max_steps,
                    'reason': f'Erreur fatale: {e}',
                    'instruction': instruction,
                    'exec_id': exec_id,
                }
                _execution_state['is_running'] = False
                _execution_state['current_execution_id'] = None

    try:
        # Minimize the VWB browser so the target screen is reachable,
        # then launch the execution thread.
        minimize_active_window()
        thread = threading.Thread(target=run, daemon=True, name=f'instruction-{exec_id}')
        thread.start()
    except Exception as e:
        # Roll back the claim so a failed launch does not leave the API
        # permanently reporting an execution in progress.
        with _execution_lock:
            _execution_state['is_running'] = False
            _execution_state['current_execution_id'] = None
        logger.error(f"❌ [Instruction] Échec du lancement: {e}", exc_info=True)
        return jsonify({'success': False, 'error': f'Échec du lancement: {e}'}), 500

    return jsonify({
        'success': True,
        'message': f'Instruction lancée: {instruction}',
        'exec_id': exec_id,
    }), 202
|
||||
|
||||
|
||||
@api_v3_bp.route('/execute/instruction/result', methods=['GET'])
def get_instruction_result():
    """Return the result of the most recent instruction execution.

    Returns:
        JSON payload: ``result`` is the last execution's summary (or null
        when no execution has completed yet) and ``is_running`` reports
        whether an execution is currently in progress.
    """
    # Take a consistent snapshot of the shared state under the lock.
    with _execution_lock:
        payload = {
            'success': True,
            'is_running': _execution_state['is_running'],
            'result': _execution_state.get('last_instruction_result'),
        }
    return jsonify(payload)
|
||||
|
||||
Reference in New Issue
Block a user