Files
rpa_vision_v3/visual_workflow_builder/backend/vlm_provider.py
Dom d5deac3029 feat: VLM-first visual replay, separate worker, Léa package, AZERTY, HTTPS security
Visual replay pipeline:
- VLM-first: the agent calls Ollama directly to locate UI elements
- Template matching as fallback (strict 0.90 threshold)
- Immediate stop when an element is not found (no blind clicks)
- Replay from a raw session (/replay-session) without waiting for the VLM
- Post-action verification (screenshot hash before/after; see the sketch after this list)
- Popup handling (Enter/Escape/Tab+Enter)
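
A minimal sketch of that post-action check, assuming PIL's ImageGrab for capture and a caller-supplied action callback (screen_hash and verify_action are illustrative names, not the project's actual helpers):

import hashlib
import time

from PIL import ImageGrab

def screen_hash() -> str:
    """Cheap fingerprint of the current screen for before/after comparison."""
    img = ImageGrab.grab()  # full-screen capture (Windows/macOS)
    return hashlib.sha256(img.tobytes()).hexdigest()

def verify_action(action, timeout_s: float = 2.0) -> bool:
    """Run an action, then confirm the screen actually changed."""
    before = screen_hash()
    action()
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        if screen_hash() != before:
            return True   # the UI reacted to the action
        time.sleep(0.1)
    return False          # no visible change: abort the replay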

Separate VLM worker (a polling sketch follows this list):
- run_worker.py: a process distinct from the HTTP server
- File-based communication (_worker_queue.txt + _replay_active.lock)
- The HTTP server never runs VLM inference anymore → always responsive
- systemd service rpa-worker.service
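
A minimal sketch of the file-based hand-off, assuming one job per line in _worker_queue.txt and a caller-supplied handle_job callback; the polling details are assumptions, not the actual run_worker.py:

import os
import time

QUEUE_FILE = "_worker_queue.txt"
LOCK_FILE = "_replay_active.lock"

def worker_loop(handle_job):
    """Poll the file queue and run one VLM job at a time."""
    while True:
        if os.path.exists(QUEUE_FILE) and not os.path.exists(LOCK_FILE):
            with open(QUEUE_FILE, "r", encoding="utf-8") as f:
                job = f.readline().strip()
            if job:
                open(LOCK_FILE, "w").close()   # signal: replay in progress
                try:
                    handle_job(job)            # slow VLM work happens here
                finally:
                    os.remove(LOCK_FILE)       # release the lock even on failure
                # drop the consumed line, keep whatever was appended meanwhile
                with open(QUEUE_FILE, "r", encoding="utf-8") as f:
                    rest = f.readlines()[1:]
                with open(QUEUE_FILE, "w", encoding="utf-8") as f:
                    f.writelines(rest)
        time.sleep(0.5)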

Keyboard capture:
- raw_keys (vk + press/release) for exact, layout-independent replay
- AZERTY fix: ToUnicodeEx + AltGr detection
- Enter captured as \n, Tab as \t
- Modifier-only events filtered out (stray Ctrl/Alt/Shift)
- Consecutive text_input events merged, key_combo deduplicated (see the sketch after this list)
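
A minimal sketch of that event post-processing, assuming a list of dicts with "type", "text" and "combo" keys (the event schema is illustrative):

MODIFIER_KEYS = {"ctrl", "alt", "shift"}

def clean_events(events):
    """Drop modifier-only events, fuse consecutive text_input, dedup repeated key_combos."""
    out = []
    for ev in events:
        if ev["type"] == "key_combo" and set(ev["combo"]) <= MODIFIER_KEYS:
            continue  # stray Ctrl/Alt/Shift press with no companion key
        if ev["type"] == "text_input" and out and out[-1]["type"] == "text_input":
            out[-1]["text"] += ev["text"]  # fuse consecutive keystrokes
            continue
        if ev["type"] == "key_combo" and out and out[-1] == ev:
            continue  # exact duplicate combo (e.g. key auto-repeat)
        out.append(ev)
    return out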

Sécurité & Internet :
- HTTPS Let's Encrypt (lea.labs + vwb.labs.laurinebazin.design)
- Token API fixe dans .env.local
- HTTP Basic Auth sur VWB
- Security headers (HSTS, CSP, nosniff)
- CORS domaines publics, plus de wildcard
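
A middleware sketch of those security headers; the FastAPI/Starlette framework choice here is an assumption, the actual backend may wire them differently:

from fastapi import FastAPI, Request

app = FastAPI()

SECURITY_HEADERS = {
    "Strict-Transport-Security": "max-age=31536000; includeSubDomains",
    "Content-Security-Policy": "default-src 'self'",
    "X-Content-Type-Options": "nosniff",
}

@app.middleware("http")
async def add_security_headers(request: Request, call_next):
    # Attach the security headers to every response
    response = await call_next(request)
    for name, value in SECURITY_HEADERS.items():
        response.headers[name] = value
    return response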

Infrastructure:
- DPI awareness (SetProcessDpiAwareness) in Python + Rust
- System metadata (dpi_scale, window_bounds, monitors, os_theme)
- Multi-scale template matching over [0.5, 2.0] (see the sketch after this list)
- Dynamic resolution (no more hardcoded 1920x1080)
- VLM prefill fix (47x speedup, 3.5 s instead of 180 s)
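
A minimal sketch of multi-scale template matching with OpenCV over the [0.5, 2.0] range; the scale sampling step and the return shape are assumptions:

import cv2
import numpy as np

def match_template_multiscale(screen_bgr, template_bgr,
                              scales=np.linspace(0.5, 2.0, 16), threshold=0.90):
    """Search the template over a range of scales; return the best hit above threshold."""
    best = None
    for s in scales:
        t = cv2.resize(template_bgr, None, fx=s, fy=s, interpolation=cv2.INTER_AREA)
        if t.shape[0] > screen_bgr.shape[0] or t.shape[1] > screen_bgr.shape[1]:
            continue  # template scaled past the screenshot size
        result = cv2.matchTemplate(screen_bgr, t, cv2.TM_CCOEFF_NORMED)
        _, score, _, loc = cv2.minMaxLoc(result)
        if score >= threshold and (best is None or score > best[0]):
            best = (score, loc, s)
    if best is None:
        return None  # strict threshold: better to stop than to click blind
    score, (x, y), s = best
    h, w = template_bgr.shape[:2]
    return {"x": x, "y": y, "w": int(w * s), "h": int(h * s), "confidence": float(score)}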

Modules:
- core/auth/: credential vault (Fernet AES), TOTP (RFC 6238; a stdlib sketch follows this list), auth handler
- core/federation/: anonymised LearningPack export/import, global FAISS index
- deploy/: Léa package (config.txt, Lea.bat, install.bat, LISEZMOI.txt)
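
For the TOTP part, a minimal RFC 6238 sketch using only the standard library (vault integration and code verification are omitted):

import base64
import hashlib
import hmac
import struct
import time

def totp(secret_b32: str, period: int = 30, digits: int = 6) -> str:
    """RFC 6238 time-based one-time password (SHA-1, the common default)."""
    key = base64.b32decode(secret_b32.upper() + "=" * (-len(secret_b32) % 8))
    counter = struct.pack(">Q", int(time.time()) // period)  # RFC 4226 moving factor
    mac = hmac.new(key, counter, hashlib.sha1).digest()
    offset = mac[-1] & 0x0F                                  # dynamic truncation
    code = (struct.unpack(">I", mac[offset:offset + 4])[0] & 0x7FFFFFFF) % (10 ** digits)
    return str(code).zfill(digits)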

UX:
- OS filtering (VWB + Chat only show workflows for the current OS)
- Persistent library (local cache + SQLite)
- Hybrid clustering (window title + DBSCAN; see the sketch after this list)
- EdgeConstraints + PostConditions populated
- GraphBuilder compound actions (all keystrokes)
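
A minimal sketch of title-based clustering with scikit-learn's DBSCAN; the TF-IDF character n-gram features are an assumption about how the "hybrid" part is fed:

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

def cluster_by_window_title(titles, eps=0.4, min_samples=2):
    """Group recorded sessions whose window titles look alike."""
    vectors = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4)).fit_transform(titles)
    labels = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit_predict(vectors)
    return labels  # -1 marks titles that belong to no cluster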

Rust agent:
- Bearer token auth (network.rs; the server-side check is sketched below)
- sysinfo.rs (DPI, resolution, window bounds via the Win32 API)
- config.txt read automatically
- Chrome/Brave/Firefox support (not just Edge)
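
On the Python side, the matching Bearer verification could look like this constant-time sketch (the RPA_API_TOKEN variable name is hypothetical):

import hmac
import os

API_TOKEN = os.getenv("RPA_API_TOKEN", "")  # hypothetical env variable name

def check_bearer(auth_header: str) -> bool:
    """Validate 'Authorization: Bearer <token>' with a constant-time compare."""
    if not auth_header.startswith("Bearer "):
        return False
    return hmac.compare_digest(auth_header[7:], API_TOKEN)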

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 10:19:18 +01:00


import os
import io
import json
import base64
from typing import Optional, Dict, Any
from PIL import Image
from dotenv import load_dotenv

# Load environment variables from the first .env.local found
env_paths = [
    os.path.join(os.getcwd(), ".env.local"),
    os.path.join(os.getcwd(), "rpa_vision_v3/.env.local"),
    os.path.join(os.path.dirname(__file__), "../../../.env.local")
]
for path in env_paths:
    if os.path.exists(path):
        load_dotenv(path, override=True)
        break
class VLMProvider:
    """Multi-provider semantic vision hub (OpenAI, Gemini, Anthropic, Ollama)."""

    def __init__(self):
        # API keys
        self.openai_key = os.getenv("OPENAI_API_KEY")
        self.gemini_key = os.getenv("GOOGLE_API_KEY")
        self.anthropic_key = os.getenv("ANTHROPIC_API_KEY")
        self.deepseek_key = os.getenv("DEEPSEEK_API_KEY")
        # Local Ollama configuration
        self.ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
        self.local_model = os.getenv("VLM_MODEL", "qwen3-vl:8b")
        # Default priority
        self.preferred_cloud = "openai"  # gpt-4o is the UI-vision reference
        print(f"🔧 [VLM Hub] Initialized. OpenAI: {bool(self.openai_key)}, Gemini: {bool(self.gemini_key)}, Anthropic: {bool(self.anthropic_key)}")
    def _to_base64(self, image_input) -> str:
        """Convert any image input (PIL image, data URL, file path, raw base64, bytes) to a plain base64 string."""
        if isinstance(image_input, Image.Image):
            buffer = io.BytesIO()
            image_input.save(buffer, format="PNG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")
        elif isinstance(image_input, str):
            if image_input.startswith("data:image"):
                return image_input.split(",", 1)[1]
            elif os.path.exists(image_input):
                with open(image_input, "rb") as f:
                    return base64.b64encode(f.read()).decode("utf-8")
            return image_input  # assumed to already be raw base64
        return base64.b64encode(image_input).decode("utf-8")
    def detect_ui_element(self, screenshot, anchor_image=None, description: str = "") -> Optional[Dict[str, Any]]:
        """Try to locate the element, querying providers in decreasing order of quality."""
        # 1. Try OpenAI (UI-vision reference)
        if self.openai_key:
            res = self._call_openai(screenshot, anchor_image, description)
            if res and res.get('found'):
                return res
        # 2. Try Gemini (excellent vision backup)
        if self.gemini_key:
            res = self._call_gemini(screenshot, anchor_image, description)
            if res and res.get('found'):
                return res
        # 3. Try Anthropic (logical precision)
        if self.anthropic_key:
            res = self._call_anthropic(screenshot, anchor_image, description)
            if res and res.get('found'):
                return res
        # 4. Local fallback (Ollama) - crucial for the DGX Spark
        return self._call_ollama_local(screenshot, anchor_image, description)
    def _call_openai(self, screenshot, anchor_image, description):
        try:
            from openai import OpenAI
            client = OpenAI(api_key=self.openai_key)
            prompt = f"Expert UI: Localise précisément '{description}'. Retourne JSON: {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000), 'confidence': float}}"
            content = [{"type": "text", "text": prompt}]
            content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self._to_base64(screenshot)}"}})
            if anchor_image:
                content.append({"type": "text", "text": "Ancre de référence:"})
                content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self._to_base64(anchor_image)}"}})
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": content}],
                response_format={"type": "json_object"},
                temperature=0
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            print(f"⚠️ [Hub] OpenAI Error: {e}")
            return None
    def _call_gemini(self, screenshot, anchor_image, description):
        try:
            from google import genai
            client = genai.Client(api_key=self.gemini_key)
            prompt = f"Expert UI: Localise précisément '{description}'. Retourne JSON: {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000), 'confidence': float}}"
            contents = [prompt, Image.open(io.BytesIO(base64.b64decode(self._to_base64(screenshot))))]
            if anchor_image:
                contents.append(Image.open(io.BytesIO(base64.b64decode(self._to_base64(anchor_image)))))
            response = client.models.generate_content(
                model="gemini-1.5-flash",
                contents=contents,
                config={"response_mime_type": "application/json"}
            )
            return json.loads(response.text)
        except Exception as e:
            print(f"⚠️ [Hub] Gemini Error: {e}")
            return None
    def _call_anthropic(self, screenshot, anchor_image, description):
        try:
            import anthropic
            client = anthropic.Anthropic(api_key=self.anthropic_key)
            # Claude 3.5 Sonnet supports vision but has no native strict-JSON output mode,
            # so a reinforced prompt is used instead.
            prompt = f"Localise '{description}'. Réponds UNIQUEMENT en JSON : {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax], 'confidence': float}}"
            content = [{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": self._to_base64(screenshot)}},
                       {"type": "text", "text": prompt}]
            response = client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=1000,
                messages=[{"role": "user", "content": content}]
            )
            text = response.content[0].text
            # Extract the first JSON object from the raw completion
            return json.loads(text[text.find('{'):text.rfind('}') + 1])
        except Exception as e:
            print(f"⚠️ [Hub] Anthropic Error: {e}")
            return None
    def _call_ollama_local(self, screenshot, anchor_image, description):
        """Call the local Ollama instance (DGX Spark / offline mode)."""
        try:
            import requests
            print(f"🏠 [Hub] Local Ollama fallback ({self.local_model})...")
            prompt = f"Localise l'élément '{description}'. Retourne JSON: {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000)}}"
            payload = {
                "model": self.local_model,
                "prompt": prompt,
                "images": [self._to_base64(screenshot)],
                "stream": False,
                "format": "json"
            }
            if anchor_image:
                payload["images"].append(self._to_base64(anchor_image))
            response = requests.post(f"{self.ollama_url}/api/generate", json=payload, timeout=60)
            if response.status_code == 200:
                return json.loads(response.json().get('response', '{}'))
            return None
        except Exception as e:
            print(f"❌ [Hub] Local Ollama Error: {e}")
            return {"found": False, "error": str(e)}

# Singleton instance shared by the backend
vlm_hub = VLMProvider()
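
Example usage of the singleton; the bbox decoding assumes the 0-1000 normalised format requested in the prompts above:

# Locate a button on the current screen and compute its centre in pixels
from PIL import ImageGrab

shot = ImageGrab.grab()
hit = vlm_hub.detect_ui_element(shot, description="bouton Enregistrer")
if hit and hit.get("found"):
    ymin, xmin, ymax, xmax = hit["bbox"]  # normalised to 0-1000
    cx = (xmin + xmax) / 2 / 1000 * shot.width
    cy = (ymin + ymax) / 2 / 1000 * shot.height
    print(f"Element at ({cx:.0f}, {cy:.0f}), confidence={hit.get('confidence')}")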