- Smart systray (pystray+plyer) replaces PyQt5: toast notifications, dynamic menu with workflows, "Que dois-je faire ?" chat, colored icon
- GPU preflight: check_machine_ready() + @pytest.mark.gpu in conftest
- Fixed 63 broken tests → 0 failed (1200 passed)
- Obsolete VWB tests moved to _a_trier/
- qwen3-vl:8b support on GPU (replaces qwen2.5vl:3b)
  - fix images < 32x32 (Ollama panic)
  - fix force_json=False (qwen3-vl incompatible)
  - fix temperature 0.1 (0.0 stalls with images)
- Windows captor fix: Key.esc, _get_key_name()
- LeaServerClient fix: check_connection, list_workflows format
- deploy_windows.py: clean packaging of the Windows client
- VWB: visible edges (#607d8b) + automatic fitView

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

"""
|
|
OllamaClient - Client pour Vision-Language Models via Ollama
|
|
|
|
Interface pour communiquer avec des VLM (Qwen, LLaVA, etc.) via Ollama.
|
|
"""
|
|
|
|
import base64
import io
import json
import logging
from typing import Any, Dict, List, Optional

import requests
from PIL import Image

logger = logging.getLogger(__name__)


class OllamaClient:
    """
    Ollama client for VLMs

    Sends images and prompts to a VLM through the Ollama API.
    """

    def __init__(self,
                 endpoint: str = "http://localhost:11434",
                 model: str = "qwen3-vl:8b",
                 timeout: int = 60):
        """
        Initialize the Ollama client

        Args:
            endpoint: Ollama API URL
            model: Name of the VLM model to use
            timeout: Request timeout in seconds
        """
        self.endpoint = endpoint.rstrip('/')
        self.model = model
        self.timeout = timeout
        self._check_connection()

    def _check_connection(self) -> bool:
        """Check the connection to Ollama"""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            if response.status_code == 200:
                models = response.json().get('models', [])
                model_names = [m['name'] for m in models]
                if self.model not in model_names:
                    logger.warning(f"Model '{self.model}' not found in Ollama")
                    logger.info(f"Available models: {model_names}")
                return True
        except Exception as e:
            logger.warning(f"Cannot connect to Ollama at {self.endpoint}: {e}")
            return False
        return False

    def generate(self,
                 prompt: str,
                 image_path: Optional[str] = None,
                 image: Optional[Image.Image] = None,
                 system_prompt: Optional[str] = None,
                 temperature: float = 0.1,
                 max_tokens: int = 500,
                 force_json: bool = False) -> Dict[str, Any]:
        """
        Generate a response from the VLM

        Args:
            prompt: Text prompt
            image_path: Path to an image (optional)
            image: PIL image (optional)
            system_prompt: System prompt (optional)
            temperature: Generation temperature
            max_tokens: Maximum number of tokens
            force_json: Request JSON-formatted output (not supported by qwen3-vl)

        Returns:
            Dict with 'response', 'success', 'error'
        """
        try:
            # Prepare the image, if provided
            image_data = None
            if image_path:
                image_data = self._encode_image_from_path(image_path)
            elif image:
                image_data = self._encode_image_from_pil(image)

            # Build the request with thinking mode disabled.
            # For Qwen3, prefix the prompt with /nothink.
            effective_prompt = prompt
            if "qwen" in self.model.lower():
                effective_prompt = f"/nothink {prompt}"

            payload = {
                "model": self.model,
                "prompt": effective_prompt,
                "stream": False,
                "options": {
                    "temperature": temperature,
                    "num_predict": max_tokens,
                    "num_ctx": 2048,  # Reduced context for speed
                    "top_k": 1  # Faster for classification tasks
                }
            }

            # Force JSON output when requested (drastically reduces parsing errors)
            if force_json:
                payload["format"] = "json"

            if system_prompt:
                payload["system"] = system_prompt

            if image_data:
                payload["images"] = [image_data]

            # Send the request
            response = requests.post(
                f"{self.endpoint}/api/generate",
                json=payload,
                timeout=self.timeout
            )

            if response.status_code == 200:
                result = response.json()
                return {
                    "response": result.get("response", ""),
                    "success": True,
                    "error": None
                }
            else:
                return {
                    "response": "",
                    "success": False,
                    "error": f"HTTP {response.status_code}: {response.text}"
                }

        except Exception as e:
            return {
                "response": "",
                "success": False,
                "error": str(e)
            }

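    # Usage sketch (illustrative, not part of the original API surface;
    # assumes a local Ollama serving qwen3-vl:8b):
    #
    #   client = OllamaClient()
    #   result = client.generate("Describe this screenshot",
    #                            image_path="screenshot.png")
    #   if result["success"]:
    #       print(result["response"])
    #   else:
    #       print("VLM error:", result["error"])
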
    def detect_ui_elements(self, image_path: str) -> Dict[str, Any]:
        """
        Detect the UI elements in an image

        Args:
            image_path: Path to the screenshot

        Returns:
            Dict with the list of detected elements
        """
        prompt = """Analyze this screenshot and list all interactive UI elements you can see.
For each element, provide:
- Type (button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item)
- Position (approximate x, y coordinates)
- Label or text content
- Semantic role (primary_action, cancel, submit, form_input, search_field, navigation, settings, close)

Format your response as JSON."""

        result = self.generate(prompt, image_path=image_path, temperature=0.1)

        if result["success"]:
            try:
                # Parse the JSON response
                elements = json.loads(result["response"])
                return {"elements": elements, "success": True}
            except json.JSONDecodeError:
                # Not valid JSON: return the raw text instead
                return {"elements": [], "success": False, "raw_response": result["response"]}

        return {"elements": [], "success": False, "error": result["error"]}

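    # Expected shape (illustrative; the actual JSON structure depends on the model):
    #
    #   result = client.detect_ui_elements("screenshot.png")
    #   # result["elements"] -> e.g. [{"type": "button", "position": [412, 305],
    #   #                              "label": "OK", "role": "submit"}, ...]
    #   # On a parse failure: {"elements": [], "success": False, "raw_response": "..."}
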
    def classify_element_type(self,
                              element_image: Image.Image,
                              context: Optional[str] = None) -> Dict[str, Any]:
        """
        Classify the type of a UI element

        Args:
            element_image: Image of the element
            context: Additional context

        Returns:
            Dict with 'type' and 'confidence'
        """
        types_list = "button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item"

        prompt = f"""What type of UI element is this?
Choose ONLY ONE from: {types_list}

Respond with just the type name, nothing else."""

        if context:
            prompt += f"\n\nContext: {context}"

        # temperature=0.1: qwen3-vl stalls at 0.0 when an image is attached
        result = self.generate(prompt, image=element_image, temperature=0.1)

        if result["success"]:
            element_type = result["response"].strip().lower()
            # Validate that it is a known type
            valid_types = types_list.split(", ")
            if element_type in valid_types:
                return {"type": element_type, "confidence": 0.9, "success": True}
            else:
                # Try to find the closest matching type
                for vtype in valid_types:
                    if vtype in element_type:
                        return {"type": vtype, "confidence": 0.7, "success": True}

        return {"type": "unknown", "confidence": 0.0, "success": False}

    def classify_element_role(self,
                              element_image: Image.Image,
                              element_type: str,
                              context: Optional[str] = None) -> Dict[str, Any]:
        """
        Classify the semantic role of an element

        Args:
            element_image: Image of the element
            element_type: Type of the element
            context: Additional context

        Returns:
            Dict with 'role' and 'confidence'
        """
        roles_list = "primary_action, cancel, submit, form_input, search_field, navigation, settings, close, delete, edit, save"

        prompt = f"""This is a {element_type}. What is its semantic role or purpose?
Choose ONLY ONE from: {roles_list}

Respond with just the role name, nothing else."""

        if context:
            prompt += f"\n\nContext: {context}"

        # temperature=0.1: qwen3-vl stalls at 0.0 when an image is attached
        result = self.generate(prompt, image=element_image, temperature=0.1)

        if result["success"]:
            role = result["response"].strip().lower()
            # Validate that it is a known role
            valid_roles = roles_list.split(", ")
            if role in valid_roles:
                return {"role": role, "confidence": 0.9, "success": True}
            else:
                # Try to find the closest matching role
                for vrole in valid_roles:
                    if vrole in role:
                        return {"role": vrole, "confidence": 0.7, "success": True}

        return {"role": "unknown", "confidence": 0.0, "success": False}

    def extract_text(self, image: Image.Image) -> Dict[str, Any]:
        """
        Extract the text from an image

        Args:
            image: PIL image

        Returns:
            Dict with the extracted 'text'
        """
        prompt = "Extract all visible text from this image. Return only the text, nothing else."

        # temperature=0.1: qwen3-vl stalls at 0.0 when an image is attached
        result = self.generate(prompt, image=image, temperature=0.1)

        if result["success"]:
            return {"text": result["response"].strip(), "success": True}

        return {"text": "", "success": False, "error": result["error"]}

    def classify_element_complete(self, element_image: Image.Image) -> Dict[str, Any]:
        """
        Fully classify a UI element in a SINGLE VLM call (optimized)

        Instead of 3 separate calls (type, role, text), this method
        makes ONE call to obtain all the information.

        Speed-up: 3 calls → 1 call = ~66% faster.

        Args:
            element_image: PIL image of the element

        Returns:
            Dict with 'type', 'role', 'text', 'confidence', 'success'
        """
        # Direct system prompt: no thinking, JSON only
        system_prompt = "You are a JSON-only UI classifier. No thinking. No explanation. Output raw JSON only."

        # User prompt with explicit examples to guide the model
        prompt = """/no_think
Look at this UI element image and classify it. Reply with ONLY a JSON object, nothing else.

Types: button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item
Roles: primary_action, cancel, submit, form_input, search_field, navigation, settings, close, delete, edit, save

Example 1: {"type": "button", "role": "submit", "text": "OK"}
Example 2: {"type": "text_input", "role": "form_input", "text": ""}
Example 3: {"type": "icon", "role": "close", "text": "X"}

Your answer:"""

        # Note: force_json=False because qwen3-vl does not support format:json
        # temperature=0.1 because qwen3-vl stalls at 0.0 with images
        result = self.generate(
            prompt,
            image=element_image,
            system_prompt=system_prompt,
            temperature=0.1,
            max_tokens=200,
            force_json=False
        )

        if result["success"]:
            try:
                # Parse the JSON response
                response_text = result["response"].strip()

                # Strip markdown fences if the response contains them
                if response_text.startswith("```"):
                    lines = response_text.split("\n")
                    response_text = "\n".join([l for l in lines if not l.startswith("```")])
                    response_text = response_text.strip()

                data = json.loads(response_text)

                # Validate the values
                valid_types = ["button", "text_input", "checkbox", "radio", "dropdown",
                               "tab", "link", "icon", "table_row", "menu_item"]
                valid_roles = ["primary_action", "cancel", "submit", "form_input",
                               "search_field", "navigation", "settings", "close",
                               "delete", "edit", "save"]

                elem_type = data.get("type", "unknown").lower()
                elem_role = data.get("role", "unknown").lower()
                elem_text = data.get("text", "")

                # Fall back if type/role are invalid
                if elem_type not in valid_types:
                    elem_type = "unknown"
                if elem_role not in valid_roles:
                    elem_role = "unknown"

                return {
                    "type": elem_type,
                    "role": elem_role,
                    "text": elem_text,
                    "confidence": 0.85,
                    "success": True
                }

            except json.JSONDecodeError as e:
                logger.warning(f"JSON parse error in classify_element_complete: {e}")
                logger.debug(f"Raw response: {result['response'][:200]}")
                return {
                    "type": "unknown",
                    "role": "unknown",
                    "text": "",
                    "confidence": 0.0,
                    "success": False,
                    "error": f"JSON parse error: {e}"
                }

        return {
            "type": "unknown",
            "role": "unknown",
            "text": "",
            "confidence": 0.0,
            "success": False,
            "error": result.get("error", "VLM call failed")
        }

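    # Usage sketch (illustrative): one call instead of three.
    #
    #   crop = screenshot.crop((x1, y1, x2, y2))  # hypothetical PIL crop of one element
    #   info = client.classify_element_complete(crop)
    #   # e.g. {"type": "button", "role": "submit", "text": "OK",
    #   #       "confidence": 0.85, "success": True}
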
    def _encode_image_from_path(self, image_path: str) -> str:
        """Encode an image file as base64 (raw bytes, no preprocessing)"""
        with open(image_path, 'rb') as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def _encode_image_from_pil(self, image: Image.Image) -> str:
        """Encode a PIL image as base64, with optimized preprocessing"""
        # 1. Convert to RGB if needed (avoids errors with transparent PNGs)
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # 1b. Enforce a 32x32 minimum (required by qwen3-vl; smaller images make Ollama panic)
        min_size = 32
        if image.width < min_size or image.height < min_size:
            new_w = max(image.width, min_size)
            new_h = max(image.height, min_size)
            image = image.resize((new_w, new_h), Image.NEAREST)

        # 2. Smart downscaling: at most 1280px on the long side
        max_size = 1280
        if max(image.size) > max_size:
            ratio = max_size / max(image.size)
            new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
            image = image.resize(new_size, Image.Resampling.LANCZOS)

        # 3. Save as JPEG quality 90 (lighter, works better for VLMs)
        buffer = io.BytesIO()
        image.save(buffer, format='JPEG', quality=90)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

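    # Preprocessing sketch (illustrative): how the size guards behave.
    # A 10x20 crop is upscaled to 32x32; a 2000x800 screenshot becomes 1280x512.
    #
    #   from PIL import Image
    #   client = OllamaClient()
    #   tiny = Image.new('RGB', (10, 20))
    #   b64 = client._encode_image_from_pil(tiny)  # upscaled to 32x32 before encoding
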
    def list_models(self) -> List[str]:
        """List the models available in Ollama"""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            if response.status_code == 200:
                models = response.json().get('models', [])
                return [m['name'] for m in models]
        except Exception as e:
            logger.error(f"Error listing models: {e}")
        return []

    def pull_model(self, model_name: str) -> bool:
        """
        Download a model into Ollama

        Args:
            model_name: Name of the model to download

        Returns:
            True on success
        """
        try:
            logger.info(f"Pulling model {model_name}...")
            response = requests.post(
                f"{self.endpoint}/api/pull",
                json={"name": model_name},
                stream=True,
                timeout=600
            )

            if response.status_code == 200:
                for line in response.iter_lines():
                    if line:
                        data = json.loads(line)
                        if 'status' in data:
                            logger.info(f" {data['status']}")
                return True
        except Exception as e:
            logger.error(f"Error pulling model: {e}")
        # Non-200 responses and exceptions both end up here
        return False


# ============================================================================
# Utility functions
# ============================================================================

def create_ollama_client(model: str = "qwen3-vl:8b",
                         endpoint: str = "http://localhost:11434") -> OllamaClient:
    """
    Create an Ollama client

    Args:
        model: Name of the VLM model
        endpoint: Ollama API URL

    Returns:
        A configured OllamaClient
    """
    return OllamaClient(endpoint=endpoint, model=model)


def check_ollama_available(endpoint: str = "http://localhost:11434") -> bool:
    """
    Check whether Ollama is available

    Args:
        endpoint: Ollama API URL

    Returns:
        True if available
    """
    try:
        response = requests.get(f"{endpoint}/api/tags", timeout=5)
        return response.status_code == 200
    except (requests.RequestException, ConnectionError, TimeoutError):
        return False
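

# Minimal smoke test (illustrative sketch, not part of the original module):
# assumes a local Ollama instance; adjust the endpoint/model as needed.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    if not check_ollama_available():
        print("Ollama is not reachable at http://localhost:11434")
    else:
        client = create_ollama_client()
        print("Available models:", client.list_models())
        result = client.generate("Reply with the single word: pong")
        print("Generate:", result["response"] if result["success"] else result["error"])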