""" OllamaClient - Client pour Vision-Language Models via Ollama Interface pour communiquer avec des VLM (Qwen, LLaVA, etc.) via Ollama. """ import logging from typing import Dict, List, Optional, Any import requests import json import base64 from pathlib import Path from PIL import Image import io logger = logging.getLogger(__name__) class OllamaClient: """ Client Ollama pour VLM Permet d'envoyer des images et prompts à un VLM via l'API Ollama. """ def __init__(self, endpoint: str = "http://localhost:11434", model: str = "qwen3-vl:8b", timeout: int = 60): """ Initialiser le client Ollama Args: endpoint: URL de l'API Ollama model: Nom du modèle VLM à utiliser timeout: Timeout en secondes """ self.endpoint = endpoint.rstrip('/') self.model = model self.timeout = timeout self._check_connection() def _check_connection(self) -> bool: """Vérifier la connexion à Ollama""" try: response = requests.get(f"{self.endpoint}/api/tags", timeout=5) if response.status_code == 200: models = response.json().get('models', []) model_names = [m['name'] for m in models] if self.model not in model_names: logger.warning(f" Model '{self.model}' not found in Ollama") logger.info(f"Available models: {model_names}") return True except Exception as e: logger.warning(f" Cannot connect to Ollama at {self.endpoint}: {e}") return False return False def generate(self, prompt: str, image_path: Optional[str] = None, image: Optional[Image.Image] = None, system_prompt: Optional[str] = None, temperature: float = 0.1, max_tokens: int = 500, force_json: bool = False) -> Dict[str, Any]: """ Générer une réponse du VLM Args: prompt: Prompt textuel image_path: Chemin vers une image (optionnel) image: Image PIL (optionnel) system_prompt: Prompt système (optionnel) temperature: Température de génération max_tokens: Nombre max de tokens Returns: Dict avec 'response', 'success', 'error' """ try: # Préparer l'image si fournie image_data = None if image_path: image_data = self._encode_image_from_path(image_path) elif image: image_data = self._encode_image_from_pil(image) # Construire la requête avec thinking mode désactivé # Pour Qwen3, utiliser /nothink au début du prompt effective_prompt = prompt if "qwen" in self.model.lower(): effective_prompt = f"/nothink {prompt}" payload = { "model": self.model, "prompt": effective_prompt, "stream": False, "options": { "temperature": temperature, "num_predict": max_tokens, "num_ctx": 2048, # Contexte réduit pour plus de vitesse "top_k": 1 # Plus rapide pour les tâches de classification } } # Forcer la sortie JSON si demandé (réduit drastiquement les erreurs de parsing) if force_json: payload["format"] = "json" if system_prompt: payload["system"] = system_prompt if image_data: payload["images"] = [image_data] # Envoyer la requête response = requests.post( f"{self.endpoint}/api/generate", json=payload, timeout=self.timeout ) if response.status_code == 200: result = response.json() return { "response": result.get("response", ""), "success": True, "error": None } else: return { "response": "", "success": False, "error": f"HTTP {response.status_code}: {response.text}" } except Exception as e: return { "response": "", "success": False, "error": str(e) } def detect_ui_elements(self, image_path: str) -> Dict[str, Any]: """ Détecter les éléments UI dans une image Args: image_path: Chemin vers le screenshot Returns: Dict avec liste d'éléments détectés """ prompt = """Analyze this screenshot and list all interactive UI elements you can see. 

    def detect_ui_elements(self, image_path: str) -> Dict[str, Any]:
        """
        Detect the UI elements in an image

        Args:
            image_path: Path to the screenshot

        Returns:
            Dict with the list of detected elements
        """
        prompt = """Analyze this screenshot and list all interactive UI elements you can see.

For each element, provide:
- Type (button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item)
- Position (approximate x, y coordinates)
- Label or text content
- Semantic role (primary_action, cancel, submit, form_input, search_field, navigation, settings, close)

Format your response as JSON."""

        result = self.generate(prompt, image_path=image_path, temperature=0.1)

        if result["success"]:
            try:
                # Parse the JSON response
                elements = json.loads(result["response"])
                return {"elements": elements, "success": True}
            except json.JSONDecodeError:
                # Not valid JSON: return the raw text instead
                return {"elements": [], "success": False,
                        "raw_response": result["response"]}

        return {"elements": [], "success": False, "error": result["error"]}

    def classify_element_type(self,
                              element_image: Image.Image,
                              context: Optional[str] = None) -> Dict[str, Any]:
        """
        Classify the type of a UI element

        Args:
            element_image: Image of the element
            context: Additional context

        Returns:
            Dict with 'type' and 'confidence'
        """
        types_list = ("button, text_input, checkbox, radio, dropdown, tab, "
                      "link, icon, table_row, menu_item")

        prompt = f"""What type of UI element is this?

Choose ONLY ONE from: {types_list}

Respond with just the type name, nothing else."""

        if context:
            prompt += f"\n\nContext: {context}"

        result = self.generate(prompt, image=element_image, temperature=0.0)

        if result["success"]:
            element_type = result["response"].strip().lower()

            # Validate that it is a known type
            valid_types = types_list.split(", ")
            if element_type in valid_types:
                return {"type": element_type, "confidence": 0.9, "success": True}
            else:
                # Try to find the closest matching type
                for vtype in valid_types:
                    if vtype in element_type:
                        return {"type": vtype, "confidence": 0.7, "success": True}

        return {"type": "unknown", "confidence": 0.0, "success": False}

    def classify_element_role(self,
                              element_image: Image.Image,
                              element_type: str,
                              context: Optional[str] = None) -> Dict[str, Any]:
        """
        Classify the semantic role of an element

        Args:
            element_image: Image of the element
            element_type: Type of the element
            context: Additional context

        Returns:
            Dict with 'role' and 'confidence'
        """
        roles_list = ("primary_action, cancel, submit, form_input, search_field, "
                      "navigation, settings, close, delete, edit, save")

        prompt = f"""This is a {element_type}. What is its semantic role or purpose?

Choose ONLY ONE from: {roles_list}

Respond with just the role name, nothing else."""

        if context:
            prompt += f"\n\nContext: {context}"

        result = self.generate(prompt, image=element_image, temperature=0.0)

        if result["success"]:
            role = result["response"].strip().lower()

            # Validate that it is a known role
            valid_roles = roles_list.split(", ")
            if role in valid_roles:
                return {"role": role, "confidence": 0.9, "success": True}
            else:
                # Try to find the closest matching role
                for vrole in valid_roles:
                    if vrole in role:
                        return {"role": vrole, "confidence": 0.7, "success": True}

        return {"role": "unknown", "confidence": 0.0, "success": False}
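
    # Sketch of the two-step classification chain (type first, then role).
    # `crop` stands for a hypothetical PIL crop of a single UI element:
    #
    #   type_info = client.classify_element_type(crop, context="login form")
    #   if type_info["success"]:
    #       role_info = client.classify_element_role(crop, type_info["type"])
    #       print(type_info["type"], role_info.get("role"))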

    def extract_text(self, image: Image.Image) -> Dict[str, Any]:
        """
        Extract the text from an image

        Args:
            image: PIL image

        Returns:
            Dict with the extracted 'text'
        """
        prompt = "Extract all visible text from this image. Return only the text, nothing else."

        result = self.generate(prompt, image=image, temperature=0.0)

        if result["success"]:
            return {"text": result["response"].strip(), "success": True}

        return {"text": "", "success": False, "error": result["error"]}

    def classify_element_complete(self, element_image: Image.Image) -> Dict[str, Any]:
        """
        Fully classify a UI element in a SINGLE VLM call (optimized)

        Instead of 3 separate calls (type, role, text), this method makes
        ONE call to obtain all the information: 3 calls -> 1 call, i.e.
        about 66% fewer VLM round-trips.

        Args:
            element_image: PIL image of the element

        Returns:
            Dict with 'type', 'role', 'text', 'confidence', 'success'
        """
        # Direct system prompt: no thinking, JSON only
        system_prompt = ("You are a JSON-only UI classifier. No thinking. "
                         "No explanation. Output raw JSON only.")

        # User prompt with explicit examples to guide the model
        prompt = """/no_think Look at this UI element image and classify it.

Reply with ONLY a JSON object, nothing else.

Types: button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item
Roles: primary_action, cancel, submit, form_input, search_field, navigation, settings, close, delete, edit, save

Example 1: {"type": "button", "role": "submit", "text": "OK"}
Example 2: {"type": "text_input", "role": "form_input", "text": ""}
Example 3: {"type": "icon", "role": "close", "text": "X"}

Your answer:"""

        # Note: force_json=False because qwen3-vl does not support format:json;
        # temperature=0.1 because qwen3-vl stalls at 0.0 when images are attached
        result = self.generate(
            prompt,
            image=element_image,
            system_prompt=system_prompt,
            temperature=0.1,
            max_tokens=200,
            force_json=False
        )

        if result["success"]:
            try:
                # Parse the JSON response
                response_text = result["response"].strip()

                # Strip markdown fences if the model wrapped its answer in them
                if response_text.startswith("```"):
                    lines = response_text.split("\n")
                    response_text = "\n".join(
                        [l for l in lines if not l.startswith("```")]
                    )
                    response_text = response_text.strip()

                data = json.loads(response_text)

                # Validate the values
                valid_types = ["button", "text_input", "checkbox", "radio",
                               "dropdown", "tab", "link", "icon", "table_row",
                               "menu_item"]
                valid_roles = ["primary_action", "cancel", "submit", "form_input",
                               "search_field", "navigation", "settings", "close",
                               "delete", "edit", "save"]

                elem_type = data.get("type", "unknown").lower()
                elem_role = data.get("role", "unknown").lower()
                elem_text = data.get("text", "")

                # Fall back to "unknown" when type/role are invalid
                if elem_type not in valid_types:
                    elem_type = "unknown"
                if elem_role not in valid_roles:
                    elem_role = "unknown"

                return {
                    "type": elem_type,
                    "role": elem_role,
                    "text": elem_text,
                    "confidence": 0.85,
                    "success": True
                }

            except json.JSONDecodeError as e:
                logger.warning(f"JSON parse error in classify_element_complete: {e}")
                logger.debug(f"Raw response: {result['response'][:200]}")
                return {
                    "type": "unknown",
                    "role": "unknown",
                    "text": "",
                    "confidence": 0.0,
                    "success": False,
                    "error": f"JSON parse error: {e}"
                }

        return {
            "type": "unknown",
            "role": "unknown",
            "text": "",
            "confidence": 0.0,
            "success": False,
            "error": result.get("error", "VLM call failed")
        }
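
    # Usage sketch for the single-call classifier, replacing the chain shown
    # above (`crop` is again a hypothetical PIL crop of one UI element):
    #
    #   info = client.classify_element_complete(crop)
    #   if info["success"]:
    #       print(f"{info['type']} / {info['role']}: {info['text']!r}")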

    def _encode_image_from_path(self, image_path: str) -> str:
        """Encode an image file as base64"""
        with open(image_path, 'rb') as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def _encode_image_from_pil(self, image: Image.Image) -> str:
        """Encode a PIL image as base64 with optimized preprocessing"""
        # 1. Convert to RGB if needed (avoids transparent-PNG errors)
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # 1b. Enforce a 32x32 minimum (required by qwen3-vl, otherwise Ollama panics)
        min_size = 32
        if image.width < min_size or image.height < min_size:
            new_w = max(image.width, min_size)
            new_h = max(image.height, min_size)
            image = image.resize((new_w, new_h), Image.Resampling.NEAREST)

        # 2. Smart downscaling: cap the long side at 1280px
        max_size = 1280
        if max(image.size) > max_size:
            ratio = max_size / max(image.size)
            new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
            image = image.resize(new_size, Image.Resampling.LANCZOS)

        # 3. Save as JPEG quality 90 (lighter, works better for VLMs)
        buffer = io.BytesIO()
        image.save(buffer, format='JPEG', quality=90)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    def list_models(self) -> List[str]:
        """List the models available in Ollama"""
        try:
            response = requests.get(f"{self.endpoint}/api/tags", timeout=5)
            if response.status_code == 200:
                models = response.json().get('models', [])
                return [m['name'] for m in models]
        except Exception as e:
            logger.error(f"Error listing models: {e}")
        return []

    def pull_model(self, model_name: str) -> bool:
        """
        Download a model into Ollama

        Args:
            model_name: Name of the model to download

        Returns:
            True on success
        """
        try:
            logger.info(f"Pulling model {model_name}...")
            response = requests.post(
                f"{self.endpoint}/api/pull",
                json={"name": model_name},
                stream=True,
                timeout=600
            )

            if response.status_code == 200:
                for line in response.iter_lines():
                    if line:
                        data = json.loads(line)
                        if 'status' in data:
                            logger.info(f" {data['status']}")
                return True

        except Exception as e:
            logger.error(f"Error pulling model: {e}")

        return False


# ============================================================================
# Utility functions
# ============================================================================

def create_ollama_client(model: str = "qwen3-vl:8b",
                         endpoint: str = "http://localhost:11434") -> OllamaClient:
    """
    Create an Ollama client

    Args:
        model: Name of the VLM model
        endpoint: Ollama API URL

    Returns:
        A configured OllamaClient
    """
    return OllamaClient(endpoint=endpoint, model=model)


def check_ollama_available(endpoint: str = "http://localhost:11434") -> bool:
    """
    Check whether Ollama is available

    Args:
        endpoint: Ollama API URL

    Returns:
        True if available
    """
    try:
        response = requests.get(f"{endpoint}/api/tags", timeout=5)
        return response.status_code == 200
    except (requests.RequestException, ConnectionError, TimeoutError):
        return False
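

if __name__ == "__main__":
    # Minimal smoke test (a sketch: assumes Ollama is running locally and a
    # "screenshot.png" file exists; adjust the model and path to your setup).
    logging.basicConfig(level=logging.INFO)

    if check_ollama_available():
        client = create_ollama_client()
        print("Available models:", client.list_models())

        result = client.generate(
            "Describe this screenshot in one sentence.",
            image_path="screenshot.png",
        )
        print(result["response"] if result["success"] else result["error"])
    else:
        print("Ollama is not reachable at http://localhost:11434")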