feat: replay visuel VLM-first, worker séparé, package Léa, AZERTY, sécurité HTTPS
Pipeline replay visuel : - VLM-first : l'agent appelle Ollama directement pour trouver les éléments - Template matching en fallback (seuil strict 0.90) - Stop immédiat si élément non trouvé (pas de clic blind) - Replay depuis session brute (/replay-session) sans attendre le VLM - Vérification post-action (screenshot hash avant/après) - Gestion des popups (Enter/Escape/Tab+Enter) Worker VLM séparé : - run_worker.py : process distinct du serveur HTTP - Communication par fichiers (_worker_queue.txt + _replay_active.lock) - Le serveur HTTP ne fait plus jamais de VLM → toujours réactif - Service systemd rpa-worker.service Capture clavier : - raw_keys (vk + press/release) pour replay exact indépendant du layout - Fix AZERTY : ToUnicodeEx + AltGr detection - Enter capturé comme \n, Tab comme \t - Filtrage modificateurs seuls (Ctrl/Alt/Shift parasites) - Fusion text_input consécutifs, dédup key_combo Sécurité & Internet : - HTTPS Let's Encrypt (lea.labs + vwb.labs.laurinebazin.design) - Token API fixe dans .env.local - HTTP Basic Auth sur VWB - Security headers (HSTS, CSP, nosniff) - CORS domaines publics, plus de wildcard Infrastructure : - DPI awareness (SetProcessDpiAwareness) Python + Rust - Métadonnées système (dpi_scale, window_bounds, monitors, os_theme) - Template matching multi-scale [0.5, 2.0] - Résolution dynamique (plus de hardcode 1920x1080) - VLM prefill fix (47x speedup, 3.5s au lieu de 180s) Modules : - core/auth/ : credential vault (Fernet AES), TOTP (RFC 6238), auth handler - core/federation/ : LearningPack export/import anonymisé, FAISS global - deploy/ : package Léa (config.txt, Lea.bat, install.bat, LISEZMOI.txt) UX : - Filtrage OS (VWB + Chat montrent que les workflows de l'OS courant) - Bibliothèque persistante (cache local + SQLite) - Clustering hybride (titre fenêtre + DBSCAN) - EdgeConstraints + PostConditions peuplés - GraphBuilder compound actions (toutes les frappes) Agent Rust : - Token Bearer auth (network.rs) - sysinfo.rs (DPI, résolution, 
window bounds via Win32 API) - config.txt lu automatiquement - Support Chrome/Brave/Firefox (pas que Edge) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
165
visual_workflow_builder/backend/vlm_provider.py
Normal file
165
visual_workflow_builder/backend/vlm_provider.py
Normal file
@@ -0,0 +1,165 @@
|
||||
import os
|
||||
import io
|
||||
import json
|
||||
import base64
|
||||
from typing import Optional, Dict, Any, List
|
||||
from PIL import Image
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables from the first ".env.local" file that exists.
# Candidates are probed in this order: the current working directory, a
# "rpa_vision_v3" sub-directory of it, then three directory levels above
# the folder containing this module.
env_paths = [
    os.path.join(base_dir, relative_name)
    for base_dir, relative_name in (
        (os.getcwd(), ".env.local"),
        (os.getcwd(), "rpa_vision_v3/.env.local"),
        (os.path.dirname(__file__), "../../../.env.local"),
    )
]
for path in env_paths:
    if not os.path.exists(path):
        continue
    # override=True: values from the file replace variables already set in
    # the process environment. Only the first existing file is loaded.
    load_dotenv(path, override=True)
    break
|
||||
|
||||
class VLMProvider:
    """Multi-provider semantic vision hub (OpenAI, Gemini, Anthropic, Ollama).

    A cloud provider is queried only when its API key is present in the
    environment. The local Ollama server is always tried last, whether or
    not any cloud key is configured.
    """

    def __init__(self):
        """Read API keys and the Ollama settings from the environment."""
        # API keys (None when the variable is not set).
        self.openai_key = os.getenv("OPENAI_API_KEY")
        self.gemini_key = os.getenv("GOOGLE_API_KEY")
        self.anthropic_key = os.getenv("ANTHROPIC_API_KEY")
        # NOTE(review): not read by any method of this class.
        self.deepseek_key = os.getenv("DEEPSEEK_API_KEY")

        # Local Ollama configuration.
        self.ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
        self.local_model = os.getenv("VLM_MODEL", "qwen3-vl:8b")

        # Default priority (gpt-4o is the UI reference).
        # NOTE(review): not read by any method of this class; the provider
        # order is fixed in detect_ui_element().
        self.preferred_cloud = "openai"
        print(f"🔧 [VLM Hub] Initialisé. OpenAI: {bool(self.openai_key)}, Gemini: {bool(self.gemini_key)}, Anthropic: {bool(self.anthropic_key)}")

    def _to_base64(self, image_input) -> str:
        """Convert any supported image input to a bare base64 string.

        Args:
            image_input: one of
                * ``str`` data URI starting with ``data:image`` -> the part
                  after the first comma is returned;
                * ``str`` path of an existing file -> the file bytes are
                  encoded;
                * any other ``str`` -> assumed to already be base64 and
                  returned unchanged;
                * ``PIL.Image.Image`` -> serialised as PNG, then encoded;
                * bytes-like object -> encoded as is.

        Returns:
            The base64 payload, without any ``data:`` prefix.

        Raises:
            TypeError: if the input is none of the above (for example
                ``None``).
            IndexError: if a ``data:image`` URI contains no comma.
        """
        if isinstance(image_input, str):
            if image_input.startswith("data:image"):
                return image_input.split(",", 1)[1]
            if os.path.exists(image_input):
                with open(image_input, "rb") as f:
                    return base64.b64encode(f.read()).decode("utf-8")
            return image_input  # assumed to be raw base64

        # Plain byte buffers are handled before the PIL check.
        if isinstance(image_input, (bytes, bytearray, memoryview)):
            return base64.b64encode(image_input).decode("utf-8")

        if isinstance(image_input, Image.Image):
            buffer = io.BytesIO()
            image_input.save(buffer, format="PNG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")

        # Any other bytes-like object; b64encode raises TypeError otherwise.
        return base64.b64encode(image_input).decode("utf-8")

    @staticmethod
    def _is_found(result) -> bool:
        """Return True only when *result* is a dict with a truthy 'found'.

        Models occasionally answer with a JSON list or scalar; treating
        those as "not found" keeps the fallback chain running instead of
        raising AttributeError on ``.get``.
        """
        return isinstance(result, dict) and bool(result.get("found"))

    @staticmethod
    def _extract_json_object(text: str) -> Any:
        """Parse the outermost ``{...}`` span found in *text*.

        Raises:
            ValueError: if *text* has no ``{...}`` span or the span is not
                valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        """
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("no JSON object found in model response")
        return json.loads(text[start:end + 1])

    @staticmethod
    def _sniff_media_type(b64_data: str) -> str:
        """Guess the image MIME type from the first bytes of a base64 payload.

        Recognises JPEG, GIF and WebP signatures; anything else, including
        undecodable input, is reported as ``image/png``, which was the
        previously hard-coded value.
        """
        try:
            # 24 base64 characters decode to 18 bytes, enough for every
            # signature tested below.
            header = base64.b64decode(b64_data[:24])
        except (ValueError, TypeError):
            return "image/png"
        if header.startswith(b"\xff\xd8\xff"):
            return "image/jpeg"
        if header.startswith((b"GIF87a", b"GIF89a")):
            return "image/gif"
        if header[:4] == b"RIFF" and header[8:12] == b"WEBP":
            return "image/webp"
        return "image/png"

    def _anthropic_image_block(self, image_input) -> Dict[str, Any]:
        """Build an Anthropic base64 image content block for *image_input*."""
        data = self._to_base64(image_input)
        return {
            "type": "image",
            "source": {
                "type": "base64",
                # _to_base64 passes file bytes through untouched, so the
                # payload is not necessarily PNG.
                # NOTE(review): the Anthropic API is understood to reject a
                # media_type that does not match the data — confirm.
                "media_type": self._sniff_media_type(data),
                "data": data,
            },
        }

    def detect_ui_element(self, screenshot, anchor_image=None, description: str = "") -> Optional[Dict[str, Any]]:
        """Locate a UI element, trying providers in decreasing quality order.

        Order: OpenAI, Gemini, Anthropic (each only if its API key is set),
        then the local Ollama server.

        Args:
            screenshot: image of the screen, in any form accepted by
                ``_to_base64``.
            anchor_image: optional reference image of the target element.
            description: textual description of the element to locate.

        Returns:
            The first cloud answer that is a dict with a truthy ``found``;
            otherwise whatever ``_call_ollama_local`` returns (possibly
            ``None`` or a ``{"found": False, ...}`` dict).
        """
        cloud_attempts = (
            (self.openai_key, self._call_openai),        # 1. UI vision reference
            (self.gemini_key, self._call_gemini),        # 2. vision backup
            (self.anthropic_key, self._call_anthropic),  # 3. logical precision
        )
        for api_key, call_provider in cloud_attempts:
            if not api_key:
                continue
            result = call_provider(screenshot, anchor_image, description)
            if self._is_found(result):
                return result

        # 4. Local fallback (Ollama) - crucial for the DGX Spark.
        return self._call_ollama_local(screenshot, anchor_image, description)

    def _call_openai(self, screenshot, anchor_image, description):
        """Query OpenAI ``gpt-4o``; return the parsed JSON, or None on error."""
        try:
            from openai import OpenAI
            client = OpenAI(api_key=self.openai_key)
            prompt = f"Expert UI: Localise précisément '{description}'. Retourne JSON: {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000), 'confidence': float}}"

            content = [{"type": "text", "text": prompt}]
            content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self._to_base64(screenshot)}"}})
            if anchor_image:
                content.append({"type": "text", "text": "Ancre de référence:"})
                content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self._to_base64(anchor_image)}"}})

            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": content}],
                response_format={"type": "json_object"},
                temperature=0
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            # Best-effort provider: log and let the caller fall through.
            print(f"⚠️ [Hub] OpenAI Error: {e}")
            return None

    def _call_gemini(self, screenshot, anchor_image, description):
        """Query ``gemini-1.5-flash``; return the parsed JSON, or None on error."""
        try:
            from google import genai
            client = genai.Client(api_key=self.gemini_key)
            prompt = f"Expert UI: Localise précisément '{description}'. Retourne JSON: {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000), 'confidence': float}}"

            # Round-trip through base64 so that every input form accepted by
            # _to_base64 ends up as a PIL image.
            contents = [prompt, Image.open(io.BytesIO(base64.b64decode(self._to_base64(screenshot))))]
            if anchor_image:
                contents.append(Image.open(io.BytesIO(base64.b64decode(self._to_base64(anchor_image)))))

            response = client.models.generate_content(
                model="gemini-1.5-flash",
                contents=contents,
                config={"response_mime_type": "application/json"}
            )
            return json.loads(response.text)
        except Exception as e:
            # Best-effort provider: log and let the caller fall through.
            print(f"⚠️ [Hub] Gemini Error: {e}")
            return None

    def _call_anthropic(self, screenshot, anchor_image, description):
        """Query Claude 3.5 Sonnet; return the parsed JSON, or None on error.

        The anchor image, when given, is sent after the prompt, and the
        prompt states the same 0-1000 bbox scale as the other providers.
        """
        try:
            import anthropic
            client = anthropic.Anthropic(api_key=self.anthropic_key)
            # Claude 3.5 Sonnet supports vision but no strict JSON output
            # mode via config, so a reinforced prompt is used instead.
            prompt = f"Localise '{description}'. Réponds UNIQUEMENT en JSON : {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000), 'confidence': float}}"

            content = [self._anthropic_image_block(screenshot),
                       {"type": "text", "text": prompt}]
            if anchor_image:
                content.append({"type": "text", "text": "Ancre de référence:"})
                content.append(self._anthropic_image_block(anchor_image))

            response = client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=1000,
                messages=[{"role": "user", "content": content}]
            )
            # The reply may wrap the JSON in prose; keep only the {...} span.
            return self._extract_json_object(response.content[0].text)
        except Exception as e:
            # Best-effort provider: log and let the caller fall through.
            print(f"⚠️ [Hub] Anthropic Error: {e}")
            return None

    def _call_ollama_local(self, screenshot, anchor_image, description):
        """Query the local Ollama server (DGX Spark / offline mode).

        Returns:
            The parsed JSON answer on HTTP 200, ``None`` on any other HTTP
            status, or ``{"found": False, "error": <message>}`` when an
            exception is raised.
        """
        try:
            import requests
            print(f"🏠 [Hub] Fallback Local Ollama ({self.local_model})...")
            prompt = f"Localise l'élément '{description}'. Retourne JSON: {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000)}}"

            payload = {
                "model": self.local_model,
                "prompt": prompt,
                "images": [self._to_base64(screenshot)],
                "stream": False,
                "format": "json"
            }
            if anchor_image:
                payload["images"].append(self._to_base64(anchor_image))

            response = requests.post(f"{self.ollama_url}/api/generate", json=payload, timeout=60)
            if response.status_code == 200:
                return json.loads(response.json().get('response', '{}'))
            # Log the failure so a None result can be traced.
            print(f"⚠️ [Hub] Local Ollama HTTP {response.status_code}")
            return None
        except Exception as e:
            print(f"❌ [Hub] Local Ollama Error: {e}")
            return {"found": False, "error": str(e)}
|
||||
|
||||
# Instance unique
|
||||
# Module-level shared instance, built once at import time: the constructor
# reads the API keys from the environment and prints a status line.
vlm_hub = VLMProvider()
|
||||
Reference in New Issue
Block a user