"""
|
|
Analyse visuelle des actions utilisateur.
|
|
Extrait et analyse la région autour d'une action pour créer une signature visuelle.
|
|
"""

from pathlib import Path
from typing import Dict, Any, Optional, Tuple, Union

import cv2
import numpy as np
from PIL import Image

from .embeddings_manager import EmbeddingsManager
from .embedders import EmbeddingManager as NewEmbeddingManager
from .utils.vision_utils import VisionUtils
from .llm_manager import LLMManager
from .logger import Logger


class VisionAnalysis:
    """
    Visual analysis of user actions to build reusable visual signatures.

    A signature bundles the cropped region around an action, its embedding,
    and a best-effort identification of the UI element that was interacted
    with (vision LLM first, classical detection as fallback).
    """

    def __init__(
        self,
        embeddings_manager: Union[EmbeddingsManager, NewEmbeddingManager],
        vision_utils: VisionUtils,
        llm_manager: Optional[LLMManager] = None,
        logger: Optional[Logger] = None
    ):
        """
        Initialize the visual analyzer.

        Args:
            embeddings_manager: Creates region embeddings (old or new system)
            vision_utils: Classical UI-element detection fallback
            llm_manager: Vision LLM used to describe elements (optional)
            logger: Action logger (optional)
        """
        self.embeddings = embeddings_manager
        self.vision = vision_utils
        self.llm = llm_manager
        self.logger = logger

        # The two embedding systems expose different APIs (embed() taking a
        # PIL image vs. encode_image() taking a numpy array); remember which
        # one was injected so analyze_action can dispatch correctly.
        self._use_new_system = isinstance(embeddings_manager, NewEmbeddingManager)

    def analyze_action(
        self,
        screenshot: np.ndarray,
        x: int,
        y: int,
        action_type: str,
        window: str
    ) -> Dict[str, Any]:
        """
        Analyze a user action and build its visual signature.

        Args:
            screenshot: Full screen image (BGR, as produced by OpenCV)
            x, y: Position of the action
            action_type: Action kind (mouse_click, key_press, etc.)
            window: Active window

        Returns:
            Visual signature of the action. Always returns a valid signature:
            any step that fails is logged and its field keeps its default.
        """
        # Defaults used whenever a step below fails.
        element_type = "unknown"
        element_description = ""
        region = None
        region_coords = (0, 0, 0, 0)
        embedding = None

        try:
            # 1. Extract the region around the action.
            region, region_coords = self._extract_region(screenshot, x, y, size=100)

            # 2. Embed the region.
            if self._use_new_system:
                # New system: expects a PIL RGB image.
                region_rgb = cv2.cvtColor(region, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(region_rgb.astype(np.uint8))
                embedding = self.embeddings.embed(pil_image)
            else:
                # Old system: accepts the BGR numpy array directly.
                embedding = self.embeddings.encode_image(region)

        except Exception as e:
            if self.logger:
                self.logger.log_action({
                    "action": "region_extraction_failed",
                    "error": str(e),
                    "position": (x, y)
                })
            # Continue with defaults.

        # 3. Identify the element type, preferring the vision LLM.
        try:
            if self.llm and region is not None:
                try:
                    # NOTE: the prompt is a runtime string sent to the model
                    # (Qwen3-VL) and is intentionally kept in French.
                    prompt = f"""Analyse cette région d'interface utilisateur où l'utilisateur a cliqué.

Position du clic: ({x}, {y})
Type d'action: {action_type}

Identifie l'élément UI en une courte phrase (max 30 caractères).
Exemples: "Bouton Rafraîchir", "Icône Paramètres", "Champ de texte"

Réponds UNIQUEMENT avec l'identification, sans explication."""

                    response = self.llm.generate_with_vision(
                        prompt=prompt,
                        images=[region]
                    )

                    element_description = response.strip()[:50]
                    element_type = self._infer_element_type(element_description)

                except Exception as llm_error:
                    if self.logger:
                        self.logger.log_action({
                            "action": "llm_analysis_failed",
                            "error": str(llm_error),
                            "position": (x, y)
                        })

            # Fallback: classical visual detection on the full screenshot.
            if element_type == "unknown" and region is not None:
                try:
                    all_detections = []
                    for elem_type in ["button", "icon", "text field"]:
                        detections = self.vision.detect(elem_type, screenshot)
                        all_detections.extend(detections)

                    if all_detections:
                        closest = self._find_closest_detection(all_detections, x, y)
                        if closest:
                            element_type = closest.label

                except Exception as vision_error:
                    if self.logger:
                        self.logger.log_action({
                            "action": "vision_detection_failed",
                            "error": str(vision_error),
                            "position": (x, y)
                        })

        except Exception as e:
            if self.logger:
                self.logger.log_action({
                    "action": "element_detection_failed",
                    "error": str(e),
                    "position": (x, y)
                })

        # 4. Build the signature (always returns a valid signature).
        signature = {
            "position": (x, y),
            "region_coords": region_coords,
            "region_image": region,
            "embedding": embedding,
            "element_type": element_type,
            "element_description": element_description,
            "action_type": action_type,
            "window": window,
            "screenshot_shape": screenshot.shape
        }

        return signature

    def _infer_element_type(self, description: str) -> str:
        """
        Map a free-text element description to a coarse element type.

        Matches French and English keywords; falls back to the first word of
        the description, or "unknown" for an empty description.
        """
        words = description.lower().split()
        if not words:
            return "unknown"
        if "bouton" in words or "button" in words:
            return "button"
        if "icône" in words or "icon" in words:
            return "icon"
        if "champ" in words or "field" in words or "input" in words:
            return "text_field"
        if "lien" in words or "link" in words:
            return "link"
        return words[0]

    def _extract_region(
        self,
        image: np.ndarray,
        x: int,
        y: int,
        size: int = 100
    ) -> Tuple[np.ndarray, Tuple[int, int, int, int]]:
        """
        Extract a square region centered on a point.

        Returns:
            (extracted region of size x size, coordinates (x1, y1, x2, y2))
        """
        h, w = image.shape[:2]
        half_size = size // 2

        # Clamp the center into the frame: a click reported just outside the
        # screenshot (e.g. multi-monitor offsets) would otherwise produce an
        # empty slice and make cv2.resize raise.
        x = min(max(0, x), w - 1)
        y = min(max(0, y), h - 1)

        # Compute coordinates, staying inside the image.
        x1 = max(0, x - half_size)
        y1 = max(0, y - half_size)
        x2 = min(w, x + half_size)
        y2 = min(h, y + half_size)

        region = image[y1:y2, x1:x2].copy()

        # Resize to size x size if the crop was clipped by an image border.
        if region.shape[0] != size or region.shape[1] != size:
            region = cv2.resize(region, (size, size))

        return region, (x1, y1, x2, y2)

    def _find_closest_detection(
        self,
        detections: list,
        x: int,
        y: int,
        max_distance: int = 50
    ):
        """
        Find the detection closest to a point.

        Args:
            detections: Detection objects (with .bbox) or dicts (with "bbox")
            x, y: Reference point
            max_distance: Detections farther than this are ignored

        Returns:
            Closest detection, or None if none is within max_distance
        """
        if not detections:
            return None

        closest = None
        min_dist = float('inf')

        for det in detections:
            # Handle both Detection objects and plain dicts.
            if hasattr(det, 'bbox'):
                bbox = det.bbox  # Detection object
            else:
                bbox = det.get("bbox", [])  # dict

            if bbox and len(bbox) >= 4:
                # bbox format: (x, y, w, h) -> center of the box
                cx = bbox[0] + bbox[2] / 2
                cy = bbox[1] + bbox[3] / 2

                # Distance from the box center to the point.
                dist = np.sqrt((cx - x)**2 + (cy - y)**2)

                if dist < min_dist and dist < max_distance:
                    min_dist = dist
                    closest = det

        return closest

    def compare_signatures(
        self,
        sig1: Dict[str, Any],
        sig2: Dict[str, Any]
    ) -> float:
        """
        Compare two visual signatures by their embeddings.

        Returns:
            Cosine similarity score (0.0 when either embedding is missing
            or has zero norm, so the result is never NaN).
        """
        emb1 = sig1.get("embedding")
        emb2 = sig2.get("embedding")

        if emb1 is None or emb2 is None:
            return 0.0

        # Guard against zero-norm embeddings: the original division would
        # produce NaN (0/0) instead of a usable score.
        norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
        if norm_product == 0:
            return 0.0

        # Cosine similarity.
        return float(np.dot(emb1, emb2) / norm_product)
|