"""Multi-modal embedding manager for Phase 3 - Full Mode.

Fuses the embeddings of several modalities into a single unified embedding.

Supported modalities:
- Image (full screenshot)
- Text (detected text)
- Title (window_title)
- UI (important UI elements)
- Context (workflow metadata)
"""
import numpy as np
from typing import List, Optional, Dict, Any
from pathlib import Path

from .ui_element_models import (
    UIElement, StateEmbedding, EmbeddingComponents, ComponentInfo
)
from .llm_manager import LLMManager
from .logger import Logger

# Optional import of EmbeddingManager
try:
    from .embedders.embedding_manager import EmbeddingManager as BaseEmbeddingManager
except ImportError:
    BaseEmbeddingManager = None

from dataclasses import dataclass


@dataclass
class EmbeddingWeights:
    """Per-modality weights used when fusing multi-modal embeddings."""

    image: float = 0.4    # weight of the global screenshot embedding
    text: float = 0.2     # weight of the detected-text embedding
    title: float = 0.1    # weight of the window-title embedding
    ui: float = 0.2       # weight of the aggregated UI-element embedding
    context: float = 0.1  # weight of the workflow-context embedding

    # Fixed field order shared by normalize/to_dict/from_dict.
    _FIELDS = ("image", "text", "title", "ui", "context")

    def normalize(self) -> 'EmbeddingWeights':
        """Return a copy whose weights sum to 1.0.

        If every weight is zero, return the default weights instead of
        dividing by zero.
        """
        values = self.to_dict()
        total = sum(values.values())
        if total == 0:
            return EmbeddingWeights()
        return EmbeddingWeights(**{name: w / total for name, w in values.items()})

    def to_dict(self) -> Dict[str, float]:
        """Serialize the weights as a plain ``{name: weight}`` dictionary."""
        return {name: getattr(self, name) for name in self._FIELDS}

    @classmethod
    def from_dict(cls, data: Dict[str, float]) -> 'EmbeddingWeights':
        """Build an instance from a dictionary, missing keys fall back to defaults."""
        defaults = cls()
        return cls(**{
            name: data.get(name, getattr(defaults, name))
            for name in cls._FIELDS
        })


class MultiModalEmbeddingManager:
    """Multi-modal embedding manager.

    Fuses the embeddings of 5 modalities:
    1. Global image (screenshot)
    2. Detected text (OCR/VLM)
    3. Window title
    4. UI elements (mean of the important elements)
    5. Workflow context

    The fusion is a normalized weighted combination.
    """
Éléments UI (moyenne des éléments importants) 5. Contexte workflow La fusion est une combinaison pondérée normalisée. """ def __init__( self, embedding_manager: Optional[BaseEmbeddingManager] = None, logger: Optional[Logger] = None, data_dir: str = "data", config: Optional[Dict[str, Any]] = None ): """ Initialise le gestionnaire d'embeddings multi-modaux. Args: embedding_manager: Gestionnaire d'embeddings existant logger: Logger data_dir: Répertoire de données config: Configuration """ self.embedding_manager = embedding_manager self.logger = logger self.data_dir = Path(data_dir) self.config = config or {} # Configuration self.embedding_dim = self.config.get("embedding_dim", 512) self.fusion_method = self.config.get("fusion_method", "weighted_average") self.use_cache = self.config.get("use_cache", True) # Poids par défaut weights_config = self.config.get("weights", {}) self.default_weights = EmbeddingWeights.from_dict(weights_config).normalize() # Poids de fusion (pour compatibilité) self.weights = { 'image': self.default_weights.image, 'text': self.default_weights.text, 'title': self.default_weights.title, 'ui': self.default_weights.ui, 'context': self.default_weights.context } # Cache des embeddings self._embedding_cache = {} if self.use_cache else None # Créer les répertoires self.embeddings_dir = self.data_dir / "embeddings" / "multimodal" self.embeddings_dir.mkdir(parents=True, exist_ok=True) if self.logger: self.logger.log_action({ "action": "multimodal_embedding_manager_initialized", "embedding_dim": self.embedding_dim, "fusion_method": self.fusion_method, "default_weights": self.default_weights.to_dict() }) def create_state_embedding( self, screenshot: np.ndarray, detected_text: List[str], window_title: str, ui_elements: List[UIElement], context: Optional[Dict[str, Any]] = None, data_dir: str = "data" ) -> StateEmbedding: """ Crée un embedding d'état unifié en fusionnant toutes les modalités. 
Args: screenshot: Screenshot numpy array detected_text: Liste de texte détecté window_title: Titre de la fenêtre ui_elements: Liste des éléments UI context: Contexte workflow (optionnel) data_dir: Répertoire de données Returns: StateEmbedding avec fusion multi-modale """ # Créer le répertoire pour les embeddings embeddings_dir = Path(data_dir) / "embeddings" / "multimodal" embeddings_dir.mkdir(parents=True, exist_ok=True) # Générer un ID unique pour cet état import time state_id = f"state_{int(time.time() * 1000000)}" # Composante 1: Image globale image_emb, image_path = self._compute_image_embedding( screenshot, state_id, embeddings_dir ) image_emb_norm = self._normalize(image_emb) # Composante 2: Texte concaténé text_emb, text_path = self._compute_text_embedding( detected_text, state_id, embeddings_dir ) text_emb_norm = self._normalize(text_emb) # Composante 3: Titre de fenêtre title_emb, title_path = self._compute_title_embedding( window_title, state_id, embeddings_dir ) title_emb_norm = self._normalize(title_emb) # Composante 4: UI éléments ui_emb, ui_path = self._compute_ui_embedding( ui_elements, state_id, embeddings_dir ) ui_emb_norm = self._normalize(ui_emb) # Composante 5: Contexte context_emb, context_path = self._compute_context_embedding( context, state_id, embeddings_dir ) context_emb_norm = self._normalize(context_emb) # Fusion pondérée state_emb = ( self.weights['image'] * image_emb_norm + self.weights['text'] * text_emb_norm + self.weights['title'] * title_emb_norm + self.weights['ui'] * ui_emb_norm + self.weights['context'] * context_emb_norm ) # Normalisation finale state_emb_final = self._normalize(state_emb) # Sauvegarder l'embedding fusionné fused_path = embeddings_dir / f"{state_id}_fused.npy" np.save(fused_path, state_emb_final) # Créer les composantes components = EmbeddingComponents( image_embedding=ComponentInfo( provider="openclip_ViT-B-32", vector_id=str(image_path) ), text_embedding=ComponentInfo( provider="clip_text", 
vector_id=str(text_path) ), title_embedding=ComponentInfo( provider="clip_text", vector_id=str(title_path) ), ui_embedding=ComponentInfo( provider="openclip_ViT-B-32", vector_id=str(ui_path) ), context_embedding=ComponentInfo( provider="numeric_context_v1", vector_id=str(context_path) ) if self.weights['context'] > 0 else None ) # Créer le StateEmbedding state_embedding = StateEmbedding( provider="multimodal_fusion_v1", vector_id=str(fused_path), components=components ) if self.logger: self.logger.log_action({ "action": "state_embedding_created", "state_id": state_id, "components": { "image": image_emb.shape, "text": text_emb.shape, "title": title_emb.shape, "ui": ui_emb.shape, "context": context_emb.shape } }) return state_embedding def _compute_image_embedding( self, screenshot: np.ndarray, state_id: str, embeddings_dir: Path ) -> tuple: """Calcule l'embedding de l'image globale.""" try: # Convertir en PIL Image from PIL import Image if screenshot.shape[2] == 3: # BGR to RGB screenshot_rgb = screenshot[:, :, ::-1] else: screenshot_rgb = screenshot pil_image = Image.fromarray(screenshot_rgb.astype(np.uint8)) # Générer l'embedding embedding = self.image_embedder.embed(pil_image) # Sauvegarder path = embeddings_dir / f"{state_id}_image.npy" np.save(path, embedding) return embedding, path except Exception as e: if self.logger: self.logger.log_action({ "action": "image_embedding_error", "error": str(e) }) # Retourner un vecteur zéro embedding = np.zeros(self.embedding_dim) path = embeddings_dir / f"{state_id}_image.npy" np.save(path, embedding) return embedding, path def _compute_text_embedding( self, detected_text: List[str], state_id: str, embeddings_dir: Path ) -> tuple: """Calcule l'embedding du texte concaténé.""" try: # Concaténer le texte text_concat = " ".join(detected_text) if detected_text else "" if not text_concat: # Pas de texte, retourner vecteur zéro embedding = np.zeros(self.embedding_dim) else: # Pour l'instant, utiliser un embedding simple # TODO: 
Intégrer avec un vrai text embedder embedding = self._simple_text_embedding(text_concat) # Sauvegarder path = embeddings_dir / f"{state_id}_text.npy" np.save(path, embedding) return embedding, path except Exception as e: if self.logger: self.logger.log_action({ "action": "text_embedding_error", "error": str(e) }) embedding = np.zeros(self.embedding_dim) path = embeddings_dir / f"{state_id}_text.npy" np.save(path, embedding) return embedding, path def _compute_title_embedding( self, window_title: str, state_id: str, embeddings_dir: Path ) -> tuple: """Calcule l'embedding du titre de fenêtre.""" try: if not window_title: embedding = np.zeros(self.embedding_dim) else: embedding = self._simple_text_embedding(window_title) # Sauvegarder path = embeddings_dir / f"{state_id}_title.npy" np.save(path, embedding) return embedding, path except Exception as e: if self.logger: self.logger.log_action({ "action": "title_embedding_error", "error": str(e) }) embedding = np.zeros(self.embedding_dim) path = embeddings_dir / f"{state_id}_title.npy" np.save(path, embedding) return embedding, path def _compute_ui_embedding( self, ui_elements: List[UIElement], state_id: str, embeddings_dir: Path ) -> tuple: """Calcule l'embedding des éléments UI (moyenne des éléments importants).""" try: if not ui_elements: embedding = np.zeros(self.embedding_dim) else: # Filtrer les éléments importants important_elements = [ elem for elem in ui_elements if elem.properties.is_clickable or 'primary_action' in elem.tags ] if not important_elements: # Prendre les 5 premiers éléments important_elements = ui_elements[:5] # Charger et moyenner les embeddings embeddings = [] for elem in important_elements: try: emb = np.load(elem.visual.embedding_vector_id) embeddings.append(emb) except: continue if embeddings: embedding = np.mean(embeddings, axis=0) else: embedding = np.zeros(self.embedding_dim) # Sauvegarder path = embeddings_dir / f"{state_id}_ui.npy" np.save(path, embedding) return embedding, path except 
Exception as e: if self.logger: self.logger.log_action({ "action": "ui_embedding_error", "error": str(e) }) embedding = np.zeros(self.embedding_dim) path = embeddings_dir / f"{state_id}_ui.npy" np.save(path, embedding) return embedding, path def _compute_context_embedding( self, context: Optional[Dict[str, Any]], state_id: str, embeddings_dir: Path ) -> tuple: """Calcule l'embedding du contexte workflow.""" try: if not context or self.weights['context'] == 0: embedding = np.zeros(self.embedding_dim) else: # Encoder les métadonnées de contexte en vecteur embedding = self._encode_context(context) # Sauvegarder path = embeddings_dir / f"{state_id}_context.npy" np.save(path, embedding) return embedding, path except Exception as e: if self.logger: self.logger.log_action({ "action": "context_embedding_error", "error": str(e) }) embedding = np.zeros(self.embedding_dim) path = embeddings_dir / f"{state_id}_context.npy" np.save(path, embedding) return embedding, path def _simple_text_embedding(self, text: str) -> np.ndarray: """ Crée un embedding simple de texte. TODO: Remplacer par un vrai text embedder (CLIP text, Sentence-BERT, etc.) """ # Pour l'instant, utiliser un hash simple import hashlib hash_obj = hashlib.sha256(text.encode('utf-8')) hash_bytes = hash_obj.digest() # Convertir en vecteur de dimension embedding_dim embedding = np.zeros(self.embedding_dim) for i in range(min(len(hash_bytes), self.embedding_dim)): embedding[i] = hash_bytes[i] / 255.0 return embedding def _encode_context(self, context: Dict[str, Any]) -> np.ndarray: """ Encode le contexte en vecteur numérique. TODO: Améliorer l'encodage du contexte. 
""" # Pour l'instant, encoder simplement les clés/valeurs context_str = str(context) return self._simple_text_embedding(context_str) def _normalize(self, vector: np.ndarray) -> np.ndarray: """Normalise un vecteur (norme L2 = 1.0).""" norm = np.linalg.norm(vector) if norm > 0: return vector / norm return vector def get_weights(self) -> Dict[str, float]: """Retourne les poids de fusion actuels.""" return self.weights.copy() def set_weights(self, weights: Dict[str, float]): """ Modifie les poids de fusion. Args: weights: Dictionnaire des nouveaux poids """ self.weights.update(weights) if self.logger: self.logger.log_action({ "action": "weights_updated", "new_weights": self.weights }) def compute_similarity( self, embedding1: np.ndarray, embedding2: np.ndarray, metric: str = "cosine" ) -> float: """ Calcule la similarité entre deux embeddings. Args: embedding1: Premier embedding embedding2: Deuxième embedding metric: Métrique de similarité ("cosine" ou "euclidean") Returns: Score de similarité entre 0.0 et 1.0 """ try: if metric == "cosine": # Similarité cosinus dot_product = np.dot(embedding1, embedding2) norm1 = np.linalg.norm(embedding1) norm2 = np.linalg.norm(embedding2) if norm1 == 0 or norm2 == 0: return 0.0 return float(dot_product / (norm1 * norm2)) elif metric == "euclidean": # Distance euclidienne (convertie en similarité) distance = np.linalg.norm(embedding1 - embedding2) return float(1.0 / (1.0 + distance)) else: raise ValueError(f"Métrique non supportée: {metric}") except Exception as e: if self.logger: self.logger.log_action({ "action": "similarity_computation_error", "metric": metric, "error": str(e) }) return 0.0 def load_fused_embedding(self, vector_id: str) -> Optional[np.ndarray]: """ Charge un embedding fusionné depuis son vector_id. 
Args: vector_id: ID du vecteur (chemin de fichier ou ID temporaire) Returns: Embedding numpy array ou None si non trouvé """ try: if vector_id.startswith("temp_"): # Embedding temporaire, générer un embedding aléatoire return np.random.rand(self.embedding_dim) # Charger depuis le fichier path = Path(vector_id) if path.exists(): return np.load(path) else: # Fichier non trouvé, générer un embedding par défaut if self.logger: self.logger.log_action({ "action": "fused_embedding_not_found", "vector_id": vector_id }) return np.random.rand(self.embedding_dim) except Exception as e: if self.logger: self.logger.log_action({ "action": "fused_embedding_load_error", "vector_id": vector_id, "error": str(e) }) return None def generate_multimodal_embedding( self, screen_state, screenshot: Optional[np.ndarray] = None, weights: Optional[EmbeddingWeights] = None, save: bool = True ) -> StateEmbedding: """ Génère un embedding multi-modal complet pour un état d'écran. Args: screen_state: EnrichedScreenState screenshot: Screenshot numpy array (optionnel) weights: Poids de fusion (utilise les poids par défaut si None) save: Sauvegarder les embeddings Returns: StateEmbedding avec composantes et embedding fusionné """ if weights is None: # Utiliser les poids de configuration weights = EmbeddingWeights( image=self.weights.get('image', 0.4), text=self.weights.get('text', 0.2), title=self.weights.get('title', 0.1), ui=self.weights.get('ui', 0.2), context=self.weights.get('context', 0.1) ).normalize() else: weights = weights.normalize() try: # Pour l'instant, générer un embedding simulé # TODO: Implémenter la vraie génération avec les embedders # Créer les composantes components = EmbeddingComponents() # Image embedding if screenshot is not None: components.image_embedding = ComponentInfo( provider="openclip_ViT-B-32", vector_id=f"temp_{screen_state.screen_state_id}_image" ) # Text embedding if screen_state.perception.detected_text: components.text_embedding = ComponentInfo( 
provider="clip_text", vector_id=f"temp_{screen_state.screen_state_id}_text" ) # Title embedding if screen_state.window.window_title: components.title_embedding = ComponentInfo( provider="clip_text", vector_id=f"temp_{screen_state.screen_state_id}_title" ) # UI embedding if screen_state.ui_elements: components.ui_embedding = ComponentInfo( provider="ui_aggregation_v1", vector_id=f"temp_{screen_state.screen_state_id}_ui" ) # Context embedding if screen_state.context.current_workflow_candidate or screen_state.context.tags: components.context_embedding = ComponentInfo( provider="context_embedding_v1", vector_id=f"temp_{screen_state.screen_state_id}_context" ) # Créer le StateEmbedding state_embedding = StateEmbedding( provider="multimodal_fusion_v1", vector_id=f"temp_{screen_state.screen_state_id}_fused", components=components ) if self.logger: self.logger.log_action({ "action": "multimodal_embedding_generated", "screen_state_id": screen_state.screen_state_id, "provider": state_embedding.provider }) return state_embedding except Exception as e: if self.logger: self.logger.log_action({ "action": "multimodal_embedding_error", "screen_state_id": screen_state.screen_state_id, "error": str(e) }) raise if __name__ == "__main__": # Tests basiques (sans dépendances lourdes) print("MultiModalEmbeddingManager - Tests basiques") print("=" * 50) # Test normalisation (pas besoin de logger ou embedder) print("\n1. 
if __name__ == "__main__":
    # Basic self-tests (no heavy dependencies needed).
    # BUG FIX: the original file re-declared compute_similarity,
    # load_fused_embedding and generate_multimodal_embedding after this
    # block — byte-for-byte duplicates of the class methods above, left over
    # from a bad paste and dead at module scope (they take `self`). They are
    # removed here; the canonical implementations live on
    # MultiModalEmbeddingManager.
    print("MultiModalEmbeddingManager - Tests basiques")
    print("=" * 50)

    # Normalization test (no logger or embedder required)
    print("\n1. Test normalisation:")

    # Minimal stand-in so we can exercise _normalize in isolation
    class MinimalManager:
        def _normalize(self, vector):
            norm = np.linalg.norm(vector)
            if norm > 0:
                return vector / norm
            return vector

    manager = MinimalManager()
    vector = np.array([3.0, 4.0, 0.0])
    normalized = manager._normalize(vector)
    norm = np.linalg.norm(normalized)
    print(f" Vecteur original: {vector}")
    print(f" Vecteur normalisé: {normalized}")
    print(f" Norme: {norm:.6f}")
    assert abs(norm - 1.0) < 0.001, "La norme doit être 1.0"
    print(f" ✓ Normalisation correcte")

    # Weight-configuration sanity check
    print("\n2. Test configuration des poids:")
    default_weights = {
        'image': 0.5,
        'text': 0.3,
        'title': 0.1,
        'ui': 0.1,
        'context': 0.0
    }
    print(f" Poids par défaut: {default_weights}")
    total = sum(default_weights.values())
    print(f" Somme des poids: {total}")
    print(f" ✓ Configuration valide")

    print("\n✓ Tous les tests basiques réussis!")