Initial commit
This commit is contained in:
279
geniusia2/core/vision_analysis.py
Normal file
279
geniusia2/core/vision_analysis.py
Normal file
@@ -0,0 +1,279 @@
|
||||
"""
|
||||
Analyse visuelle des actions utilisateur.
|
||||
Extrait et analyse la région autour d'une action pour créer une signature visuelle.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
from typing import Dict, Any, Optional, Tuple, Union
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
|
||||
from .embeddings_manager import EmbeddingsManager
|
||||
from .embedders import EmbeddingManager as NewEmbeddingManager
|
||||
from .utils.vision_utils import VisionUtils
|
||||
from .llm_manager import LLMManager
|
||||
from .logger import Logger
|
||||
|
||||
|
||||
class VisionAnalysis:
    """
    Visual analysis of user actions to create reusable signatures.

    Extracts the screen region around an action, embeds it, and asks an
    (optional) vision LLM — with a classical-detection fallback — to
    identify the UI element, producing a "visual signature" dict that can
    later be compared against other actions via cosine similarity.
    """

    def __init__(
        self,
        embeddings_manager: Union[EmbeddingsManager, NewEmbeddingManager],
        vision_utils: VisionUtils,
        llm_manager: Optional[LLMManager] = None,
        logger: Optional[Logger] = None
    ):
        """
        Initialise the visual analyser.

        Args:
            embeddings_manager: Creates region embeddings (old or new system).
            vision_utils: Classical UI-element detection fallback.
            llm_manager: Vision LLM for element identification (optional).
            logger: Structured action logger (optional).
        """
        self.embeddings = embeddings_manager
        self.vision = vision_utils
        self.llm = llm_manager
        self.logger = logger

        # The new system exposes embed(PIL.Image); the old one exposes
        # encode_image(np.ndarray). Detect once which API we must use.
        self._use_new_system = isinstance(embeddings_manager, NewEmbeddingManager)

    def analyze_action(
        self,
        screenshot: np.ndarray,
        x: int,
        y: int,
        action_type: str,
        window: str
    ) -> Dict[str, Any]:
        """
        Analyse a user action and build its visual signature.

        Every step is best-effort: failures are logged (when a logger is
        available) and replaced by defaults, so a valid signature dict is
        always returned.

        Args:
            screenshot: Full screen image (BGR ndarray).
            x: Horizontal position of the action.
            y: Vertical position of the action.
            action_type: Kind of action (mouse_click, key_press, ...).
            window: Title of the active window.

        Returns:
            Visual signature dict (always valid, fields may hold defaults).
        """
        # Defaults used whenever a step below fails.
        element_type = "unknown"
        element_description = ""
        region = None
        region_coords = (0, 0, 0, 0)
        embedding = None

        try:
            # 1. Extract the region around the action.
            region, region_coords = self._extract_region(screenshot, x, y, size=100)

            # 2. Embed the region with whichever embedding system we hold.
            if self._use_new_system:
                # New system expects a PIL image; cv2 arrays are BGR.
                region_rgb = cv2.cvtColor(region, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(region_rgb.astype(np.uint8))
                embedding = self.embeddings.embed(pil_image)
            else:
                # Old system consumes the ndarray directly.
                embedding = self.embeddings.encode_image(region)

        except Exception as e:
            if self.logger:
                self.logger.log_action({
                    "action": "region_extraction_failed",
                    "error": str(e),
                    "position": (x, y)
                })
            # Continue with defaults.

        # 3. Identify the element, preferably with the vision LLM.
        try:
            if self.llm and region is not None:
                try:
                    prompt = f"""Analyse cette région d'interface utilisateur où l'utilisateur a cliqué.

Position du clic: ({x}, {y})
Type d'action: {action_type}

Identifie l'élément UI en une courte phrase (max 30 caractères).
Exemples: "Bouton Rafraîchir", "Icône Paramètres", "Champ de texte"

Réponds UNIQUEMENT avec l'identification, sans explication."""

                    response = self.llm.generate_with_vision(
                        prompt=prompt,
                        images=[region]
                    )

                    element_description = response.strip()[:50]

                    # Derive a coarse element type from the description
                    # (French and English keywords are both recognised).
                    words = element_description.lower().split()
                    if words:
                        if "bouton" in words or "button" in words:
                            element_type = "button"
                        elif "icône" in words or "icon" in words:
                            element_type = "icon"
                        elif "champ" in words or "field" in words or "input" in words:
                            element_type = "text_field"
                        elif "lien" in words or "link" in words:
                            element_type = "link"
                        else:
                            element_type = words[0]

                except Exception as llm_error:
                    if self.logger:
                        self.logger.log_action({
                            "action": "llm_analysis_failed",
                            "error": str(llm_error),
                            "position": (x, y)
                        })

            # Fallback: classical visual detection over the whole screenshot,
            # keeping the detection closest to the click point.
            if element_type == "unknown" and region is not None:
                try:
                    all_detections = []
                    for elem_type in ["button", "icon", "text field"]:
                        detections = self.vision.detect(elem_type, screenshot)
                        all_detections.extend(detections)

                    if all_detections:
                        closest = self._find_closest_detection(all_detections, x, y)
                        if closest:
                            element_type = closest.label

                except Exception as vision_error:
                    if self.logger:
                        self.logger.log_action({
                            "action": "vision_detection_failed",
                            "error": str(vision_error),
                            "position": (x, y)
                        })

        except Exception as e:
            if self.logger:
                self.logger.log_action({
                    "action": "element_detection_failed",
                    "error": str(e),
                    "position": (x, y)
                })

        # 4. Build the signature (always returns a valid dict).
        signature = {
            "position": (x, y),
            "region_coords": region_coords,
            "region_image": region,
            "embedding": embedding,
            "element_type": element_type,
            "element_description": element_description,
            "action_type": action_type,
            "window": window,
            "screenshot_shape": screenshot.shape
        }

        return signature

    def _extract_region(
        self,
        image: np.ndarray,
        x: int,
        y: int,
        size: int = 100
    ) -> Tuple[np.ndarray, Tuple[int, int, int, int]]:
        """
        Extract a square region centred on a point, clipped to the image.

        Args:
            image: Source image (ndarray, 2-D or 3-D).
            x: Centre x coordinate.
            y: Centre y coordinate.
            size: Side length of the returned square region.

        Returns:
            (extracted region resized to size x size,
             clipped coordinates (x1, y1, x2, y2))
        """
        h, w = image.shape[:2]
        half_size = size // 2

        # Clamp the window to the image bounds.
        x1 = max(0, x - half_size)
        y1 = max(0, y - half_size)
        x2 = min(w, x + half_size)
        y2 = min(h, y + half_size)

        region = image[y1:y2, x1:x2].copy()

        # Bug fix: a click outside the image yields an empty crop, and
        # cv2.resize raises on empty input — fall back to a black patch.
        if region.size == 0:
            region = np.zeros((size, size, *image.shape[2:]), dtype=image.dtype)
        elif region.shape[0] != size or region.shape[1] != size:
            # Pad/stretch border crops back to the canonical size.
            region = cv2.resize(region, (size, size))

        return region, (x1, y1, x2, y2)

    def _find_closest_detection(
        self,
        detections: list,
        x: int,
        y: int,
        max_distance: int = 50
    ):
        """
        Find the detection whose bbox centre is closest to a point.

        Args:
            detections: Detection objects (with .bbox) or dicts (with "bbox").
            x: Reference x coordinate.
            y: Reference y coordinate.
            max_distance: Detections farther than this are ignored.

        Returns:
            The closest detection within max_distance, or None.
        """
        if not detections:
            return None

        closest = None
        min_dist = float('inf')

        for det in detections:
            # Handle both Detection objects and plain dicts.
            if hasattr(det, 'bbox'):
                bbox = det.bbox  # Detection object
            else:
                bbox = det.get("bbox", [])  # dict

            if bbox and len(bbox) >= 4:
                # bbox format: (x, y, w, h) -> centre of the box.
                cx = bbox[0] + bbox[2] / 2
                cy = bbox[1] + bbox[3] / 2

                # Euclidean distance to the reference point.
                dist = np.sqrt((cx - x)**2 + (cy - y)**2)

                if dist < min_dist and dist < max_distance:
                    min_dist = dist
                    closest = det

        return closest

    def compare_signatures(
        self,
        sig1: Dict[str, Any],
        sig2: Dict[str, Any]
    ) -> float:
        """
        Compare two visual signatures by embedding cosine similarity.

        Args:
            sig1: First signature dict (uses its "embedding" entry).
            sig2: Second signature dict (uses its "embedding" entry).

        Returns:
            Cosine similarity score (0.0 when either embedding is missing
            or degenerate).
        """
        emb1 = sig1.get("embedding")
        emb2 = sig2.get("embedding")

        if emb1 is None or emb2 is None:
            return 0.0

        # Bug fix: guard against zero-norm embeddings, which previously
        # produced a division by zero (nan) instead of a usable score.
        norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
        if norm_product == 0:
            return 0.0

        return float(np.dot(emb1, emb2) / norm_product)
|
||||
Reference in New Issue
Block a user