v1.0 - Stable version: multi-PC, UI-DETR-1 detection, 3 execution modes
- Frontend v4 reachable on the local network (192.168.1.40)
- Open ports: 3002 (frontend), 5001 (backend), 5004 (dashboard)
- Ollama running on GPU
- Interactive self-healing
- Confidence dashboard

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
core/detection/ollama_client.py (new file)
@@ -0,0 +1,471 @@
"""
OllamaClient - Client for Vision-Language Models via Ollama

Interface for communicating with VLMs (Qwen, LLaVA, etc.) through Ollama.
"""

import logging
from typing import Dict, List, Optional, Any
import requests
import json
import base64
from PIL import Image
import io

logger = logging.getLogger(__name__)


class OllamaClient:
    """
    Ollama client for VLMs.

    Sends images and prompts to a VLM through the Ollama API.
    """

    def __init__(self,
                 endpoint: str = "http://localhost:11434",
                 model: str = "qwen3-vl:8b",
                 timeout: int = 60):
        """
        Initialize the Ollama client.

        Args:
            endpoint: Ollama API URL
            model: Name of the VLM model to use
            timeout: Request timeout in seconds
        """
        self.endpoint = endpoint.rstrip('/')
        self.model = model
        self.timeout = timeout
        self._check_connection()

    def _check_connection(self) -> bool:
        """Check the connection to Ollama."""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            if response.status_code == 200:
                models = response.json().get('models', [])
                model_names = [m['name'] for m in models]
                if self.model not in model_names:
                    logger.warning(f"Model '{self.model}' not found in Ollama")
                    logger.info(f"Available models: {model_names}")
                return True
        except Exception as e:
            logger.warning(f"Cannot connect to Ollama at {self.endpoint}: {e}")
            return False
        return False

    def generate(self,
                 prompt: str,
                 image_path: Optional[str] = None,
                 image: Optional[Image.Image] = None,
                 system_prompt: Optional[str] = None,
                 temperature: float = 0.1,
                 max_tokens: int = 500,
                 force_json: bool = False) -> Dict[str, Any]:
        """
        Generate a response from the VLM.

        Args:
            prompt: Text prompt
            image_path: Path to an image file (optional)
            image: PIL image (optional)
            system_prompt: System prompt (optional)
            temperature: Sampling temperature
            max_tokens: Maximum number of tokens to generate
            force_json: Constrain the model output to valid JSON

        Returns:
            Dict with 'response', 'success', 'error'
        """
        try:
            # Prepare the image if one was provided
            image_data = None
            if image_path:
                image_data = self._encode_image_from_path(image_path)
            elif image:
                image_data = self._encode_image_from_pil(image)

            # Build the request with thinking mode disabled.
            # For Qwen3 models, prefix the prompt with /nothink.
            effective_prompt = prompt
            if "qwen" in self.model.lower():
                effective_prompt = f"/nothink {prompt}"

            payload = {
                "model": self.model,
                "prompt": effective_prompt,
                "stream": False,
                "options": {
                    "temperature": temperature,
                    "num_predict": max_tokens,
                    "num_ctx": 2048,  # Reduced context window for speed
                    "top_k": 1  # Faster for classification-style tasks
                }
            }

            # Force JSON output when requested (drastically reduces parsing errors)
            if force_json:
                payload["format"] = "json"

            if system_prompt:
                payload["system"] = system_prompt

            if image_data:
                payload["images"] = [image_data]

            # Send the request
            response = requests.post(
                f"{self.endpoint}/api/generate",
                json=payload,
                timeout=self.timeout
            )

            if response.status_code == 200:
                result = response.json()
                return {
                    "response": result.get("response", ""),
                    "success": True,
                    "error": None
                }
            else:
                return {
                    "response": "",
                    "success": False,
                    "error": f"HTTP {response.status_code}: {response.text}"
                }

        except Exception as e:
            return {
                "response": "",
                "success": False,
                "error": str(e)
            }

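    # A minimal usage sketch for generate() (illustrative only; it assumes a
    # running local Ollama with the configured model pulled, and
    # "screenshot.png" is a hypothetical file):
    #
    #     client = OllamaClient()
    #     out = client.generate("Describe this screenshot.",
    #                           image_path="screenshot.png")
    #     if out["success"]:
    #         print(out["response"])
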
    def detect_ui_elements(self, image_path: str) -> Dict[str, Any]:
        """
        Detect the UI elements in an image.

        Args:
            image_path: Path to the screenshot

        Returns:
            Dict with the list of detected elements
        """
        prompt = """Analyze this screenshot and list all interactive UI elements you can see.
For each element, provide:
- Type (button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item)
- Position (approximate x, y coordinates)
- Label or text content
- Semantic role (primary_action, cancel, submit, form_input, search_field, navigation, settings, close)

Format your response as JSON."""

        result = self.generate(prompt, image_path=image_path, temperature=0.1,
                               force_json=True)

        if result["success"]:
            try:
                # Parse the JSON response
                elements = json.loads(result["response"])
                return {"elements": elements, "success": True}
            except json.JSONDecodeError:
                # Not valid JSON: return the raw text instead
                return {"elements": [], "success": False, "raw_response": result["response"]}

        return {"elements": [], "success": False, "error": result["error"]}

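    # Illustrative shape of a successful detect_ui_elements() result. The
    # element fields are an assumption: the VLM chooses its own JSON layout,
    # so this is not a guaranteed schema:
    #
    #     {"elements": [{"type": "button",
    #                    "position": {"x": 120, "y": 48},
    #                    "label": "Save",
    #                    "role": "submit"}],
    #      "success": True}
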
    def classify_element_type(self,
                              element_image: Image.Image,
                              context: Optional[str] = None) -> Dict[str, Any]:
        """
        Classify the type of a UI element.

        Args:
            element_image: Image of the element
            context: Additional context

        Returns:
            Dict with 'type' and 'confidence'
        """
        types_list = "button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item"

        prompt = f"""What type of UI element is this?
Choose ONLY ONE from: {types_list}

Respond with just the type name, nothing else."""

        if context:
            prompt += f"\n\nContext: {context}"

        result = self.generate(prompt, image=element_image, temperature=0.0)

        if result["success"]:
            element_type = result["response"].strip().lower()
            # Validate that it is a known type
            valid_types = types_list.split(", ")
            if element_type in valid_types:
                return {"type": element_type, "confidence": 0.9, "success": True}
            else:
                # Fall back to the closest matching type
                for vtype in valid_types:
                    if vtype in element_type:
                        return {"type": vtype, "confidence": 0.7, "success": True}

        return {"type": "unknown", "confidence": 0.0, "success": False}

    def classify_element_role(self,
                              element_image: Image.Image,
                              element_type: str,
                              context: Optional[str] = None) -> Dict[str, Any]:
        """
        Classify the semantic role of an element.

        Args:
            element_image: Image of the element
            element_type: Type of the element
            context: Additional context

        Returns:
            Dict with 'role' and 'confidence'
        """
        roles_list = "primary_action, cancel, submit, form_input, search_field, navigation, settings, close, delete, edit, save"

        prompt = f"""This is a {element_type}. What is its semantic role or purpose?
Choose ONLY ONE from: {roles_list}

Respond with just the role name, nothing else."""

        if context:
            prompt += f"\n\nContext: {context}"

        result = self.generate(prompt, image=element_image, temperature=0.0)

        if result["success"]:
            role = result["response"].strip().lower()
            # Validate that it is a known role
            valid_roles = roles_list.split(", ")
            if role in valid_roles:
                return {"role": role, "confidence": 0.9, "success": True}
            else:
                # Fall back to the closest matching role
                for vrole in valid_roles:
                    if vrole in role:
                        return {"role": vrole, "confidence": 0.7, "success": True}

        return {"role": "unknown", "confidence": 0.0, "success": False}

    def extract_text(self, image: Image.Image) -> Dict[str, Any]:
        """
        Extract the text from an image.

        Args:
            image: PIL image

        Returns:
            Dict with the extracted 'text'
        """
        prompt = "Extract all visible text from this image. Return only the text, nothing else."

        result = self.generate(prompt, image=image, temperature=0.0)

        if result["success"]:
            return {"text": result["response"].strip(), "success": True}

        return {"text": "", "success": False, "error": result["error"]}

    def classify_element_complete(self, element_image: Image.Image) -> Dict[str, Any]:
        """
        Fully classify a UI element in a SINGLE VLM call (optimized).

        Instead of 3 separate calls (type, role, text), this method makes
        ONE call to obtain all the information.

        Performance gain: 3 calls -> 1 call, roughly 66% faster.

        Args:
            element_image: PIL image of the element

        Returns:
            Dict with 'type', 'role', 'text', 'confidence', 'success'
        """
        # "Zero tolerance" system prompt - forces the VLM to produce ONLY JSON
        system_prompt = """You are a UI element classifier.
Your ONLY task is to output valid JSON. Never explain. Never comment. Never discuss.
Expected format:
{"type": "...", "role": "...", "text": "..."}"""

        # Short, direct user prompt
        prompt = """Classify this UI element:
- Type: Choose ONE from [button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item]
- Role: Choose ONE from [primary_action, cancel, submit, form_input, search_field, navigation, settings, close, delete, edit, save]
- Text: Any visible text (empty string if none)

Output JSON only."""

        result = self.generate(
            prompt,
            image=element_image,
            system_prompt=system_prompt,
            temperature=0.0,
            max_tokens=150,
            force_json=True
        )

        if result["success"]:
            try:
                # Parse the JSON response
                response_text = result["response"].strip()

                # Strip markdown fences if the model wrapped its output
                if response_text.startswith("```"):
                    lines = response_text.split("\n")
                    response_text = "\n".join([l for l in lines if not l.startswith("```")])
                    response_text = response_text.strip()

                data = json.loads(response_text)

                # Validate the values
                valid_types = ["button", "text_input", "checkbox", "radio", "dropdown",
                               "tab", "link", "icon", "table_row", "menu_item"]
                valid_roles = ["primary_action", "cancel", "submit", "form_input",
                               "search_field", "navigation", "settings", "close",
                               "delete", "edit", "save"]

                elem_type = data.get("type", "unknown").lower()
                elem_role = data.get("role", "unknown").lower()
                elem_text = data.get("text", "")

                # Fall back to "unknown" for invalid type/role values
                if elem_type not in valid_types:
                    elem_type = "unknown"
                if elem_role not in valid_roles:
                    elem_role = "unknown"

                return {
                    "type": elem_type,
                    "role": elem_role,
                    "text": elem_text,
                    "confidence": 0.85,
                    "success": True
                }

            except json.JSONDecodeError as e:
                logger.warning(f"JSON parse error in classify_element_complete: {e}")
                logger.debug(f"Raw response: {result['response'][:200]}")
                return {
                    "type": "unknown",
                    "role": "unknown",
                    "text": "",
                    "confidence": 0.0,
                    "success": False,
                    "error": f"JSON parse error: {e}"
                }

        return {
            "type": "unknown",
            "role": "unknown",
            "text": "",
            "confidence": 0.0,
            "success": False,
            "error": result.get("error", "VLM call failed")
        }

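    # Sketch of the single-call path versus the legacy three-call path
    # (illustrative; `img` is a hypothetical PIL crop of one element):
    #
    #     # 1 VLM round-trip:
    #     info = client.classify_element_complete(img)
    #
    #     # vs. 3 round-trips:
    #     t = client.classify_element_type(img)
    #     r = client.classify_element_role(img, t["type"])
    #     x = client.extract_text(img)
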
    def _encode_image_from_path(self, image_path: str) -> str:
        """Base64-encode an image read from a file."""
        with open(image_path, 'rb') as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def _encode_image_from_pil(self, image: Image.Image) -> str:
        """Base64-encode a PIL image with optimized preprocessing."""
        # 1. Convert to RGB if needed (avoids errors with transparent PNGs)
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # 2. Smart resize: at most 1280px on the long side
        max_size = 1280
        if max(image.size) > max_size:
            ratio = max_size / max(image.size)
            new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
            image = image.resize(new_size, Image.Resampling.LANCZOS)

        # 3. Save as JPEG at quality 90 (lighter, works better for VLMs)
        buffer = io.BytesIO()
        image.save(buffer, format='JPEG', quality=90)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

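    # Worked example of the resize rule above: a 2560x1440 screenshot gives
    # ratio = 1280 / 2560 = 0.5, so it is downscaled to 1280x720 before JPEG
    # encoding; a 1024x768 image is already under the cap and passes through
    # unchanged.
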
    def list_models(self) -> List[str]:
        """List the models available in Ollama."""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            if response.status_code == 200:
                models = response.json().get('models', [])
                return [m['name'] for m in models]
        except Exception as e:
            logger.error(f"Error listing models: {e}")
        return []

    def pull_model(self, model_name: str) -> bool:
        """
        Download a model into Ollama.

        Args:
            model_name: Name of the model to download

        Returns:
            True on success
        """
        try:
            logger.info(f"Pulling model {model_name}...")
            response = requests.post(
                f"{self.endpoint}/api/pull",
                json={"name": model_name},
                stream=True,
                timeout=600
            )

            if response.status_code == 200:
                for line in response.iter_lines():
                    if line:
                        data = json.loads(line)
                        if 'status' in data:
                            logger.info(f" {data['status']}")
                return True
        except Exception as e:
            logger.error(f"Error pulling model: {e}")
        return False


# ============================================================================
# Utility functions
# ============================================================================

def create_ollama_client(model: str = "qwen3-vl:8b",
                         endpoint: str = "http://localhost:11434") -> OllamaClient:
    """
    Create an Ollama client.

    Args:
        model: Name of the VLM model
        endpoint: Ollama API URL

    Returns:
        A configured OllamaClient
    """
    return OllamaClient(endpoint=endpoint, model=model)


def check_ollama_available(endpoint: str = "http://localhost:11434") -> bool:
    """
    Check whether Ollama is available.

    Args:
        endpoint: Ollama API URL

    Returns:
        True if available
    """
    try:
        response = requests.get(f"{endpoint}/api/tags", timeout=5)
        return response.status_code == 200
    except (requests.RequestException, ConnectionError, TimeoutError):
        return False
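

# A minimal smoke test (a sketch, not part of the original commit's API: it
# assumes Ollama is listening on http://localhost:11434 and that the default
# model has been pulled, e.g. with `ollama pull qwen3-vl:8b`).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    if not check_ollama_available():
        print("Ollama is not reachable on http://localhost:11434")
    else:
        client = create_ollama_client()
        print(f"Available models: {client.list_models()}")
        result = client.generate("Reply with the single word: ready")
        print(f"success={result['success']} response={result['response']!r}")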