""" Module d'intégration pour la capture d'écran enrichie avec détection d'éléments UI. Intègre UIElementDetector avec ScreenStateManager pour le mode enrichi. """ from typing import Optional, List import numpy as np from .ui_element_detector import UIElementDetector from .screen_state_manager import ScreenStateManager from .ui_element_models import EnrichedScreenState, WindowInfo from .multimodal_embedding_manager import MultiModalEmbeddingManager from .enhanced_workflow_matcher import EnhancedWorkflowMatcher from .llm_manager import LLMManager from .logger import Logger class EnrichedScreenCapture: """ Gestionnaire de capture d'écran enrichie. Combine: - Capture d'écran - Détection d'éléments UI (mode enrichi) - Création d'EnrichedScreenState - Sauvegarde """ def __init__( self, llm_manager: Optional[LLMManager] = None, logger: Optional[Logger] = None, data_dir: str = "data", mode: str = "enriched", config: Optional[dict] = None ): """ Initialise le gestionnaire de capture enrichie. Args: llm_manager: Gestionnaire LLM pour VLM logger: Logger data_dir: Répertoire de données mode: Mode de traitement ("light", "enriched", "complete") config: Configuration """ self.llm = llm_manager self.logger = logger self.data_dir = data_dir self.mode = mode self.config = config or {} # Créer le ScreenStateManager self.screen_state_manager = ScreenStateManager( logger=logger, data_dir=data_dir, mode=mode ) # Créer le UIElementDetector (seulement en mode enrichi ou complet) self.ui_detector = None if mode in ["enriched", "complete"]: self.ui_detector = UIElementDetector( llm_manager=llm_manager, logger=logger, config=self.config.get("ui_detector", {}) ) # Créer le MultiModalEmbeddingManager (seulement en mode complet) self.multimodal_manager = None if mode == "complete": self.multimodal_manager = MultiModalEmbeddingManager( logger=logger, data_dir=data_dir, config=self.config.get("multimodal_embedding", {}) ) # Créer l'EnhancedWorkflowMatcher (seulement en mode complet) self.enhanced_matcher = None if mode == "complete" and self.multimodal_manager: self.enhanced_matcher = EnhancedWorkflowMatcher( multimodal_manager=self.multimodal_manager, logger=logger, config=self.config.get("enhanced_matcher", {}) ) if self.logger: self.logger.log_action({ "action": "enriched_screen_capture_initialized", "mode": mode, "ui_detection_enabled": self.ui_detector is not None, "multimodal_embedding_enabled": self.multimodal_manager is not None, "enhanced_matching_enabled": self.enhanced_matcher is not None }) def capture_and_enrich( self, screenshot: np.ndarray, session_id: str, window_title: str, app_name: str, screen_resolution: tuple, detected_text: Optional[List[str]] = None, context_tags: Optional[List[str]] = None, workflow_candidate: Optional[str] = None, save: bool = True ) -> EnrichedScreenState: """ Capture et enrichit un screenshot avec détection d'éléments. Args: screenshot: Screenshot numpy array session_id: ID de session window_title: Titre de la fenêtre app_name: Nom de l'application screen_resolution: Résolution d'écran detected_text: Texte détecté (optionnel) context_tags: Tags de contexte workflow_candidate: Workflow candidat save: Sauvegarder sur disque Returns: EnrichedScreenState créé """ # Créer les informations de fenêtre window_info = WindowInfo( app_name=app_name, window_title=window_title, screen_resolution=screen_resolution ) # Détecter les éléments UI (si mode enrichi/complet) ui_elements = [] if self.ui_detector: try: if self.logger: self.logger.log_action({ "action": "ui_detection_started", "app_name": app_name }) ui_elements = self.ui_detector.detect_elements( screenshot=screenshot, window_info=window_info, data_dir=self.data_dir ) if self.logger: self.logger.log_action({ "action": "ui_detection_completed", "elements_count": len(ui_elements) }) except Exception as e: if self.logger: self.logger.log_action({ "action": "ui_detection_error", "error": str(e) }) # Continuer sans éléments UI ui_elements = [] # Créer l'EnrichedScreenState # Pour l'instant, on utilise le ScreenStateManager pour créer la base screen_state = self.screen_state_manager.create_screen_state( session_id=session_id, window_title=window_title, app_name=app_name, screenshot_path=f"{self.data_dir}/screens/temp_screenshot.png", screen_resolution=screen_resolution, detected_text=detected_text, context_tags=context_tags, workflow_candidate=workflow_candidate ) # Ajouter les éléments UI détectés screen_state.ui_elements = ui_elements screen_state.mode = self.mode # Mode complet: générer l'embedding multi-modal if self.mode == "complete" and self.multimodal_manager: try: if self.logger: self.logger.log_action({ "action": "multimodal_embedding_generation_started", "screen_state_id": screen_state.screen_state_id }) # Générer l'embedding multi-modal multimodal_embedding = self.multimodal_manager.generate_multimodal_embedding( screen_state=screen_state, screenshot=screenshot, save=save ) # Remplacer l'embedding simple par l'embedding multi-modal screen_state.state_embedding = multimodal_embedding if self.logger: self.logger.log_action({ "action": "multimodal_embedding_generated", "screen_state_id": screen_state.screen_state_id, "provider": multimodal_embedding.provider }) except Exception as e: if self.logger: self.logger.log_action({ "action": "multimodal_embedding_error", "screen_state_id": screen_state.screen_state_id, "error": str(e) }) # Continuer avec l'embedding simple # Sauvegarder si demandé if save: # Sauvegarder le screenshot import cv2 from pathlib import Path screenshot_path = Path(self.data_dir) / "screens" / f"{screen_state.screen_state_id}.png" screenshot_path.parent.mkdir(parents=True, exist_ok=True) cv2.imwrite(str(screenshot_path), screenshot) # Mettre à jour le chemin screen_state.raw.screenshot_path = str(screenshot_path) # Sauvegarder l'état self.screen_state_manager.save_screen_state(screen_state) return screen_state def get_mode(self) -> str: """Retourne le mode actuel.""" return self.mode def set_mode(self, mode: str): """ Change le mode de traitement. Args: mode: Nouveau mode ("light", "enriched", "complete") """ self.mode = mode self.screen_state_manager.mode = mode # Créer/détruire le UIElementDetector selon le mode if mode in ["enriched", "complete"] and self.ui_detector is None: self.ui_detector = UIElementDetector( llm_manager=self.llm, logger=self.logger, config=self.config.get("ui_detector", {}) ) elif mode == "light": self.ui_detector = None # Créer/détruire le MultiModalEmbeddingManager selon le mode if mode == "complete" and self.multimodal_manager is None: self.multimodal_manager = MultiModalEmbeddingManager( logger=self.logger, data_dir=self.data_dir, config=self.config.get("multimodal_embedding", {}) ) elif mode != "complete": self.multimodal_manager = None # Créer/détruire l'EnhancedWorkflowMatcher selon le mode if mode == "complete" and self.multimodal_manager and self.enhanced_matcher is None: self.enhanced_matcher = EnhancedWorkflowMatcher( multimodal_manager=self.multimodal_manager, logger=self.logger, config=self.config.get("enhanced_matcher", {}) ) elif mode != "complete": self.enhanced_matcher = None if self.logger: self.logger.log_action({ "action": "mode_changed", "new_mode": mode, "ui_detection_enabled": self.ui_detector is not None, "multimodal_embedding_enabled": self.multimodal_manager is not None, "enhanced_matching_enabled": self.enhanced_matcher is not None }) def find_matching_workflows( self, screen_state: EnrichedScreenState, screenshot: Optional[np.ndarray] = None, workflows: Optional[List] = None, top_k: int = 5 ): """ Trouve les workflows qui matchent le mieux avec l'écran actuel. Utilise l'EnhancedWorkflowMatcher en mode complet, sinon retourne None. Args: screen_state: État d'écran enrichi screenshot: Screenshot numpy array (optionnel) workflows: Liste de workflows à comparer (charge tous si None) top_k: Nombre de meilleurs matches à retourner Returns: Liste des meilleurs WorkflowMatch ou None si pas en mode complet """ if self.mode == "complete" and self.enhanced_matcher: return self.enhanced_matcher.find_matching_workflows( screen_state=screen_state, screenshot=screenshot, workflows=workflows, top_k=top_k ) else: if self.logger: self.logger.log_action({ "action": "enhanced_matching_not_available", "current_mode": self.mode, "reason": "Enhanced matching requires 'complete' mode" }) return None if __name__ == "__main__": # Tests basiques from .logger import Logger import shutil from pathlib import Path print("EnrichedScreenCapture - Tests basiques") print("=" * 50) # Créer un logger de test logger = Logger(log_dir="test_logs") # Test mode light print("\n1. Test mode light:") capture_light = EnrichedScreenCapture( logger=logger, data_dir="test_data", mode="light" ) print(f" Mode: {capture_light.get_mode()}") print(f" UI Detector: {capture_light.ui_detector is not None}") # Test mode enriched print("\n2. Test mode enriched:") capture_enriched = EnrichedScreenCapture( logger=logger, data_dir="test_data", mode="enriched" ) print(f" Mode: {capture_enriched.get_mode()}") print(f" UI Detector: {capture_enriched.ui_detector is not None}") # Test changement de mode print("\n3. Test changement de mode:") capture_enriched.set_mode("light") print(f" Nouveau mode: {capture_enriched.get_mode()}") print(f" UI Detector après changement: {capture_enriched.ui_detector is not None}") print("\n✓ Tests basiques réussis!") # Nettoyage if Path("test_data").exists(): shutil.rmtree("test_data") if Path("test_logs").exists(): shutil.rmtree("test_logs")