Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 12s
security-audit / Scan secrets (grep) (push) Successful in 7s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped
VRAMOrchestrator : bascule automatique entre modes SHADOW et REPLAY. - SHADOW : streaming server + agent_chat actifs - REPLAY : VLM qwen2.5vl:7b chargé, services non-essentiels stoppés vlm_reason_about_screen() appelle ensure_reasoning_ready() avant chaque raisonnement — libère la VRAM si nécessaire. Benchmark : qwen2.5vl:7b en 10s (warm) vs 44s quand VRAM saturée. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
354 lines
12 KiB
Python
354 lines
12 KiB
Python
"""
|
|
Module partagé de saisie texte et gestion des dialogues.
|
|
|
|
Utilisé par les deux executors :
|
|
- VWB executor (visual_workflow_builder/backend/api_v3/execute.py)
|
|
- Core executor (core/execution/action_executor.py)
|
|
|
|
Garantit le même comportement AZERTY/VM/Citrix partout.
|
|
"""
|
|
|
|
import logging
|
|
import subprocess
|
|
import shutil
|
|
import time
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Optional dependency: pyautogui requires a GUI environment, so keep the
# module importable when it is absent. All entry points below check
# PYAUTOGUI_AVAILABLE before using it.
try:
    import pyautogui

    PYAUTOGUI_AVAILABLE = True
except ImportError:
    PYAUTOGUI_AVAILABLE = False
|
|
|
|
|
|
def safe_type_text(text: str) -> None:
    """Type text in a way that works in VMs/Citrix and on AZERTY/QWERTY layouts.

    Strategy, in order of preference:
    1. ``xdotool type`` after refreshing the keyboard layout — traverses
       spice/QEMU VM boundaries.
    2. Clipboard (``xclip``) + Ctrl+V — fallback.
    3. ``pyautogui.write()`` — last resort (AZERTY layout not guaranteed).

    Args:
        text: The text to type. Empty text is a no-op.
    """
    if not text:
        return

    # Method 1: xdotool type with a keyboard-layout refresh beforehand.
    if shutil.which('xdotool') and shutil.which('setxkbmap'):
        try:
            subprocess.run(['setxkbmap', 'fr'], timeout=2)
            subprocess.run(
                ['xdotool', 'type', '--delay', '0', '--clearmodifiers', '--', text],
                # At least 30s; long texts get a proportional budget.
                timeout=max(30, len(text) * 0.05),
                check=True
            )
            logger.debug(f"Saisie via xdotool type ({len(text)} car.)")
            return
        except Exception as e:
            logger.debug(f"xdotool type échoué: {e}")

    # Method 2: clipboard + Ctrl+V.
    # subprocess.run(input=...) feeds stdin, waits for and reaps the child —
    # the previous Popen-based version never wait()ed and leaked a zombie
    # process per call.
    xclip = shutil.which('xclip')
    if xclip and PYAUTOGUI_AVAILABLE:
        try:
            subprocess.run(
                ['xclip', '-selection', 'clipboard'],
                input=text.encode('utf-8'),
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=10,
            )
            time.sleep(0.2)  # let the selection owner settle before pasting
            pyautogui.hotkey('ctrl', 'v')
            time.sleep(0.3)
            logger.debug(f"Saisie via presse-papier ({len(text)} car.)")
            return
        except Exception as e:
            logger.debug(f"xclip échoué: {e}")

    # Method 3: direct pyautogui typing (layout-dependent, last resort).
    if PYAUTOGUI_AVAILABLE:
        logger.warning("Saisie via pyautogui.write() (AZERTY non garanti)")
        pyautogui.write(text, interval=0.02)
    else:
        logger.warning(f"Aucune méthode de saisie disponible pour: {text[:50]}")
|
|
|
|
|
|
def check_screen_for_patterns() -> Optional[Dict[str, Any]]:
    """Check whether the current screen shows a known UI pattern (dialog, popup).

    Grabs a screenshot, extracts its text via OCR, and looks the text up in
    the UIPatternLibrary. Entirely best-effort: any failure yields None.

    Returns:
        The matching pattern dict, or None.
    """
    try:
        from core.knowledge.ui_patterns import UIPatternLibrary
        import mss
        from PIL import Image

        library = UIPatternLibrary()

        # Capture the first physical monitor.
        with mss.mss() as sct:
            shot = sct.grab(sct.monitors[1])
            screen = Image.frombytes('RGB', shot.size, shot.bgra, 'raw', 'BGRX')

        try:
            # Prefer the docTR-based OCR service; fall back to the
            # field extractor when it is not importable.
            try:
                from services.ocr_service import ocr_extract_text
            except ImportError:
                from core.extraction.field_extractor import FieldExtractor
                extractor = FieldExtractor()

                def ocr_extract_text(img):
                    return extractor.extract_text_from_image(img)

            ocr_text = ocr_extract_text(screen)
        except ImportError:
            logger.debug("OCR non disponible pour pattern check")
            return None

        # Too little text on screen — nothing worth matching.
        if not ocr_text or len(ocr_text) < 5:
            return None

        found = library.find_pattern(ocr_text)
        if found and found['category'] in ('dialog', 'popup'):
            logger.info(f"Pattern UI détecté: {found['pattern']} → {found['action']} '{found['target']}'")
            return found

        return None

    except Exception as e:
        logger.debug(f"Pattern check échoué: {e}")
        return None
|
|
|
|
|
|
def _bbox_center(bbox) -> tuple:
    """Return the integer (x, y) center of an (x1, y1, x2, y2) bounding box."""
    x1, y1, x2, y2 = bbox
    return int((x1 + x2) / 2), int((y1 + y2) / 2)


def _find_label_matches(words, labels) -> List[Dict[str, Any]]:
    """Find OCR words matching any of the candidate button labels.

    Exact (case-insensitive) matches are collected first; only when none
    exist does it fall back to a partial search that drops the first letter
    of each label (OCR often misses the underlined accelerator letter).

    Args:
        words: OCR word dicts with 'text' and 'bbox' keys.
        labels: Candidate button labels, best first.

    Returns:
        Match dicts with 'text', 'x', 'y' and 'match_type' ('exact'/'partial').
    """
    matches: List[Dict[str, Any]] = []
    for label in labels:
        label_lower = label.lower()
        for word in words:
            word_text = word['text'].lower()
            # Ignore 1-char words/labels — too noisy to match reliably.
            if len(word_text) < 2 or len(label_lower) < 2:
                continue
            if word_text == label_lower:
                x, y = _bbox_center(word['bbox'])
                matches.append({
                    'text': word['text'],
                    'x': x,
                    'y': y,
                    'match_type': 'exact',
                })
    if matches:
        return matches

    # Partial search (missing underlined first letter).
    for label in labels:
        if len(label) > 3:
            partial = label[1:].lower()
            for word in words:
                if partial in word['text'].lower():
                    x, y = _bbox_center(word['bbox'])
                    matches.append({
                        'text': word['text'],
                        'x': x,
                        'y': y,
                        'match_type': 'partial',
                    })
    return matches


def handle_detected_pattern(pattern: Dict[str, Any]) -> bool:
    """Automatically handle a detected UI pattern.

    Locates the target button via OCR (real on-screen position) —
    100% vision, zero hardcoded coordinate. Falls back to VLM reasoning
    when OCR cannot find the button.

    Args:
        pattern: Pattern dict with 'action' ('click' or 'hotkey'), 'target',
            and optional 'alternatives'.

    Returns:
        True if the pattern was handled successfully.
    """
    if not PYAUTOGUI_AVAILABLE:
        logger.warning("pyautogui non disponible — impossible de gérer le pattern")
        return False

    action = pattern.get('action')
    target = pattern.get('target', '')
    alternatives = pattern.get('alternatives', [])

    if action == 'click':
        candidates_labels = [target] + alternatives

        try:
            import mss
            from PIL import Image

            # Import OCR (try both project paths).
            try:
                from services.ocr_service import ocr_extract_words
            except ImportError:
                from core.extraction.field_extractor import FieldExtractor
                extractor = FieldExtractor()

                def ocr_extract_words(img):
                    return extractor.extract_words_from_image(img)

            with mss.mss() as sct:
                monitor = sct.monitors[1]
                screenshot = sct.grab(monitor)
                screen = Image.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')

            words = ocr_extract_words(screen)

            all_matches = _find_label_matches(words, candidates_labels)

            if all_matches:
                # Buttons sit at the bottom of a dialog: take the lowest match.
                best = max(all_matches, key=lambda m: m['y'])
                logger.info(f"Clic sur '{best['text']}' à ({best['x']}, {best['y']})")
                pyautogui.click(best['x'], best['y'])
                time.sleep(1.0)
                return True

            # OCR failed to locate the button — ask the VLM for guidance.
            logger.info(f"Bouton '{target}' introuvable par OCR — appel VLM...")
            vlm_result = vlm_reason_about_screen(
                objective=f"Cliquer sur le bouton '{target}'",
                context=f"Un dialogue '{pattern.get('pattern')}' est détecté"
            )
            if vlm_result and vlm_result.get('action') == 'click' and vlm_result.get('target'):
                vlm_target = vlm_result['target']
                for word in words:
                    if vlm_target.lower() in word['text'].lower():
                        x, y = _bbox_center(word['bbox'])
                        logger.info(f"VLM → clic sur '{word['text']}' à ({x}, {y})")
                        pyautogui.click(x, y)
                        time.sleep(1.0)
                        return True

            return False

        except Exception as e:
            logger.warning(f"OCR bouton échoué: {e}")
            return False

    elif action == 'hotkey':
        keys = target.split('+')
        logger.info(f"Raccourci automatique: {target}")
        pyautogui.hotkey(*keys)
        time.sleep(0.5)
        return True

    return False
|
|
|
|
|
|
def vlm_reason_about_screen(objective: str = "", context: str = "") -> Optional[Dict[str, Any]]:
    """Ask the VLM to reason about the current screen and propose an action.

    Used when the reflexes (UI patterns) are not enough: the VLM sees the
    screen and decides what to do.

    Args:
        objective: What Léa is trying to do (e.g. "cliquer sur Enregistrer").
        context: Additional context (e.g. "un dialogue est apparu").

    Returns:
        Dict with 'action', 'target', 'reasoning', or None if the VLM
        cannot help (HTTP error, unparseable answer, any exception).
    """
    try:
        import mss
        import requests
        import json
        import base64
        import io
        import os
        from PIL import Image

        # Capture the first physical monitor (in mss, monitors[0] is the
        # combined virtual screen, monitors[1] the first real one).
        with mss.mss() as sct:
            monitor = sct.monitors[1]
            screenshot = sct.grab(monitor)
            screen = Image.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')

        # JPEG at quality 70 keeps the base64 payload small for the API call.
        buffer = io.BytesIO()
        screen.save(buffer, format='JPEG', quality=70)
        image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

        prompt = f"""Analyse cet écran et dis-moi quoi faire.

Objectif : {objective or "Interagir avec l'interface visible"}
Contexte : {context or "Aucun contexte supplémentaire"}

Réponds en JSON strict :
{{
"action": "click" ou "type" ou "wait" ou "nothing",
"target": "texte exact du bouton ou champ à cliquer",
"reasoning": "explication courte de ton choix"
}}

Si tu vois un dialogue ou une popup, indique quel bouton cliquer.
Si l'écran est normal sans action nécessaire, réponds action="nothing".
Réponds UNIQUEMENT le JSON, pas d'explication."""

        # Free VRAM for the reasoning model if needed before calling Ollama
        # (VRAMOrchestrator switches from SHADOW to REPLAY mode).
        from core.cognition.vram_orchestrator import get_orchestrator
        orch = get_orchestrator()
        orch.ensure_reasoning_ready()

        ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
        model = os.environ.get("RPA_REASONING_MODEL", "qwen2.5vl:7b")

        response = requests.post(
            f"{ollama_url}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "images": [image_b64],
                "stream": False,
                # Low temperature + short completion: we want terse JSON.
                "options": {"temperature": 0.1, "num_predict": 200}
            },
            timeout=30
        )

        if response.status_code != 200:
            logger.warning(f"VLM reasoning failed: HTTP {response.status_code}")
            return None

        result = response.json()
        text = result.get('response', '').strip()

        # Extract the first {...} span — the model may wrap the JSON in prose.
        import re
        match = re.search(r'\{[\s\S]*\}', text)
        if match:
            parsed = json.loads(match.group())
            logger.info(f"VLM reasoning: {parsed.get('action')} '{parsed.get('target')}' — {parsed.get('reasoning', '')[:80]}")
            return parsed

        logger.debug(f"VLM response not parseable: {text[:100]}")
        return None

    except Exception as e:
        logger.debug(f"VLM reasoning failed: {e}")
        return None
|
|
|
|
|
|
def post_execution_cleanup(execution_mode: str = 'debug') -> None:
    """Inspect the screen after execution and handle leftover dialogs.

    Called after the last step of a workflow so the screen is left clean.
    Up to three pattern-handling rounds are attempted; once no known pattern
    remains, a single VLM sanity check runs and its suggestion is only
    logged (advisory — no blind click on a VLM guess here).

    Args:
        execution_mode: Cleanup only runs for 'intelligent' and 'debug';
            any other mode returns immediately.
    """
    if execution_mode not in ('intelligent', 'debug'):
        return

    logger.info("Vérification écran final...")
    time.sleep(1.0)

    for _ in range(3):
        detected = check_screen_for_patterns()
        if detected:
            logger.info(f"Dialogue résiduel détecté: {detected.get('pattern')}")
            handle_detected_pattern(detected)
            time.sleep(1.0)
        else:
            # Screen looks clean: one final VLM check, then stop.
            # (The previous version only broke when the VLM proposed an
            # action, so a clean screen cost up to 3 redundant VLM calls.)
            vlm_result = vlm_reason_about_screen(
                objective="Vérifier que l'écran est propre après l'exécution",
                context="Le workflow vient de se terminer"
            )
            if vlm_result and vlm_result.get('action') in ('click', 'type'):
                logger.info(f"VLM post-workflow: {vlm_result.get('action')} '{vlm_result.get('target')}'")
            break
|