""" Analyse visuelle des actions utilisateur. Extrait et analyse la région autour d'une action pour créer une signature visuelle. """ import numpy as np import cv2 from typing import Dict, Any, Optional, Tuple, Union from pathlib import Path from PIL import Image from .embeddings_manager import EmbeddingsManager from .embedders import EmbeddingManager as NewEmbeddingManager from .utils.vision_utils import VisionUtils from .llm_manager import LLMManager from .logger import Logger class VisionAnalysis: """ Analyse visuelle des actions pour créer des signatures réutilisables. """ def __init__( self, embeddings_manager: Union[EmbeddingsManager, NewEmbeddingManager], vision_utils: VisionUtils, llm_manager: Optional[LLMManager] = None, logger: Optional[Logger] = None ): """ Initialise l'analyseur visuel. Args: embeddings_manager: Pour créer les embeddings (old or new system) vision_utils: Pour la détection d'éléments llm_manager: Pour le contexte (optionnel) logger: Pour la journalisation """ self.embeddings = embeddings_manager self.vision = vision_utils self.llm = llm_manager self.logger = logger # Detect if using new embedding system self._use_new_system = isinstance(embeddings_manager, NewEmbeddingManager) def analyze_action( self, screenshot: np.ndarray, x: int, y: int, action_type: str, window: str ) -> Dict[str, Any]: """ Analyse une action utilisateur et crée sa signature visuelle. Args: screenshot: Image complète de l'écran x, y: Position de l'action action_type: Type d'action (mouse_click, key_press, etc.) window: Fenêtre active Returns: Signature visuelle de l'action (always returns a valid signature) """ # Initialize defaults element_type = "unknown" element_description = "" region = None region_coords = (0, 0, 0, 0) embedding = None try: # 1. Extraire la région autour de l'action region, region_coords = self._extract_region(screenshot, x, y, size=100) # 2. Créer l'embedding de la région if self._use_new_system: # New system: convert numpy to PIL, then embed region_rgb = cv2.cvtColor(region, cv2.COLOR_BGR2RGB) pil_image = Image.fromarray(region_rgb.astype(np.uint8)) embedding = self.embeddings.embed(pil_image) else: # Old system: use encode_image directly embedding = self.embeddings.encode_image(region) except Exception as e: if self.logger: self.logger.log_action({ "action": "region_extraction_failed", "error": str(e), "position": (x, y) }) # Continue with defaults # 3. Détecter le type d'élément avec le LLM try: # Utiliser Qwen3-VL pour identifier l'élément if self.llm and region is not None: try: prompt = f"""Analyse cette région d'interface utilisateur où l'utilisateur a cliqué. Position du clic: ({x}, {y}) Type d'action: {action_type} Identifie l'élément UI en une courte phrase (max 30 caractères). Exemples: "Bouton Rafraîchir", "Icône Paramètres", "Champ de texte" Réponds UNIQUEMENT avec l'identification, sans explication.""" response = self.llm.generate_with_vision( prompt=prompt, images=[region] ) element_description = response.strip()[:50] # Extraire le type (premier mot généralement) words = element_description.lower().split() if words: if "bouton" in words or "button" in words: element_type = "button" elif "icône" in words or "icon" in words: element_type = "icon" elif "champ" in words or "field" in words or "input" in words: element_type = "text_field" elif "lien" in words or "link" in words: element_type = "link" else: element_type = words[0] except Exception as llm_error: if self.logger: self.logger.log_action({ "action": "llm_analysis_failed", "error": str(llm_error), "position": (x, y) }) # Fallback: détection visuelle classique if element_type == "unknown" and region is not None: try: all_detections = [] for elem_type in ["button", "icon", "text field"]: detections = self.vision.detect(elem_type, screenshot) all_detections.extend(detections) if all_detections: closest = self._find_closest_detection(all_detections, x, y) if closest: element_type = closest.label except Exception as vision_error: if self.logger: self.logger.log_action({ "action": "vision_detection_failed", "error": str(vision_error), "position": (x, y) }) except Exception as e: if self.logger: self.logger.log_action({ "action": "element_detection_failed", "error": str(e), "position": (x, y) }) # 4. Créer la signature (always return a valid signature) signature = { "position": (x, y), "region_coords": region_coords, "region_image": region, "embedding": embedding, "element_type": element_type, "element_description": element_description, "action_type": action_type, "window": window, "screenshot_shape": screenshot.shape } return signature def _extract_region( self, image: np.ndarray, x: int, y: int, size: int = 100 ) -> Tuple[np.ndarray, Tuple[int, int, int, int]]: """ Extrait une région carrée autour d'un point. Returns: (région extraite, coordonnées (x1, y1, x2, y2)) """ h, w = image.shape[:2] half_size = size // 2 # Calculer les coordonnées en restant dans l'image x1 = max(0, x - half_size) y1 = max(0, y - half_size) x2 = min(w, x + half_size) y2 = min(h, y + half_size) region = image[y1:y2, x1:x2].copy() # Redimensionner à size x size si nécessaire if region.shape[0] != size or region.shape[1] != size: region = cv2.resize(region, (size, size)) return region, (x1, y1, x2, y2) def _find_closest_detection( self, detections: list, x: int, y: int, max_distance: int = 50 ): """ Trouve la détection la plus proche d'un point. Returns: Détection la plus proche ou None """ if not detections: return None closest = None min_dist = float('inf') for det in detections: # Gérer à la fois les objets Detection et les dicts if hasattr(det, 'bbox'): bbox = det.bbox # Objet Detection else: bbox = det.get("bbox", []) # Dict if bbox and len(bbox) >= 4: # bbox format: (x, y, w, h) cx = bbox[0] + bbox[2] / 2 cy = bbox[1] + bbox[3] / 2 # Distance au point dist = np.sqrt((cx - x)**2 + (cy - y)**2) if dist < min_dist and dist < max_distance: min_dist = dist closest = det return closest def compare_signatures( self, sig1: Dict[str, Any], sig2: Dict[str, Any] ) -> float: """ Compare deux signatures visuelles. Returns: Score de similarité (0-1) """ # Comparer les embeddings emb1 = sig1.get("embedding") emb2 = sig2.get("embedding") if emb1 is None or emb2 is None: return 0.0 # Similarité cosinus similarity = np.dot(emb1, emb2) / ( np.linalg.norm(emb1) * np.linalg.norm(emb2) ) return float(similarity)