Files
Geniusia_v2/geniusia2/core/enriched_screen_capture.py
2026-03-05 00:20:25 +01:00

368 lines
13 KiB
Python

"""
Module d'intégration pour la capture d'écran enrichie avec détection d'éléments UI.
Intègre UIElementDetector avec ScreenStateManager pour le mode enrichi.
"""
from typing import Optional, List
import numpy as np
from .ui_element_detector import UIElementDetector
from .screen_state_manager import ScreenStateManager
from .ui_element_models import EnrichedScreenState, WindowInfo
from .multimodal_embedding_manager import MultiModalEmbeddingManager
from .enhanced_workflow_matcher import EnhancedWorkflowMatcher
from .llm_manager import LLMManager
from .logger import Logger
class EnrichedScreenCapture:
    """
    Enriched screen-capture manager.

    For one screenshot it combines:
    - UI element detection ("enriched" and "complete" modes)
    - creation of an EnrichedScreenState through ScreenStateManager
    - multi-modal embedding generation ("complete" mode only)
    - optional saving of the screenshot and of the resulting state
    """

    # Modes in which a UIElementDetector is kept alive. Every other mode runs
    # without UI detection, both at construction time and after set_mode().
    _UI_DETECTION_MODES = ("enriched", "complete")

    def __init__(
        self,
        llm_manager: Optional[LLMManager] = None,
        logger: Optional[Logger] = None,
        data_dir: str = "data",
        mode: str = "enriched",
        config: Optional[dict] = None
    ):
        """
        Initialise the enriched capture manager.

        Args:
            llm_manager: LLM manager handed to the UIElementDetector
            logger: Logger; when falsy, nothing is logged
            data_dir: Data directory (screenshots go to <data_dir>/screens)
            mode: Processing mode ("light", "enriched", "complete")
            config: Configuration dict; the "ui_detector",
                "multimodal_embedding" and "enhanced_matcher" sub-dicts are
                forwarded to the matching component
        """
        self.llm = llm_manager
        self.logger = logger
        self.data_dir = data_dir
        self.mode = mode
        self.config = config or {}

        # The ScreenStateManager exists in every mode.
        self.screen_state_manager = ScreenStateManager(
            logger=logger,
            data_dir=data_dir,
            mode=mode
        )

        # Optional components start absent; the helper creates the ones the
        # requested mode needs (same rules as set_mode()).
        self.ui_detector = None
        self.multimodal_manager = None
        self.enhanced_matcher = None
        self._sync_components(mode)

        if self.logger:
            self.logger.log_action({
                "action": "enriched_screen_capture_initialized",
                "mode": mode,
                "ui_detection_enabled": self.ui_detector is not None,
                "multimodal_embedding_enabled": self.multimodal_manager is not None,
                "enhanced_matching_enabled": self.enhanced_matcher is not None
            })

    def _sync_components(self, mode: str) -> None:
        """Create or drop the optional components so that they match *mode*."""
        # UI detector: "enriched" and "complete" only. Any other mode drops
        # it, so an instance switched to a mode has the same components as
        # one constructed directly in that mode.
        if mode in self._UI_DETECTION_MODES:
            if self.ui_detector is None:
                self.ui_detector = UIElementDetector(
                    llm_manager=self.llm,
                    logger=self.logger,
                    config=self.config.get("ui_detector", {})
                )
        else:
            self.ui_detector = None

        # Embedding manager and workflow matcher: "complete" only.
        if mode == "complete":
            if self.multimodal_manager is None:
                self.multimodal_manager = MultiModalEmbeddingManager(
                    logger=self.logger,
                    data_dir=self.data_dir,
                    config=self.config.get("multimodal_embedding", {})
                )
            # The matcher is built on top of the embedding manager.
            if self.multimodal_manager and self.enhanced_matcher is None:
                self.enhanced_matcher = EnhancedWorkflowMatcher(
                    multimodal_manager=self.multimodal_manager,
                    logger=self.logger,
                    config=self.config.get("enhanced_matcher", {})
                )
        else:
            self.multimodal_manager = None
            self.enhanced_matcher = None

    def capture_and_enrich(
        self,
        screenshot: np.ndarray,
        session_id: str,
        window_title: str,
        app_name: str,
        screen_resolution: tuple,
        detected_text: Optional[List[str]] = None,
        context_tags: Optional[List[str]] = None,
        workflow_candidate: Optional[str] = None,
        save: bool = True
    ) -> EnrichedScreenState:
        """
        Build an enriched screen state from a screenshot.

        Args:
            screenshot: Screenshot as a numpy array
            session_id: Session ID
            window_title: Window title
            app_name: Application name
            screen_resolution: Screen resolution
            detected_text: Detected text (optional)
            context_tags: Context tags
            workflow_candidate: Candidate workflow
            save: Write the screenshot and the state to disk

        Returns:
            The created EnrichedScreenState. UI detection and embedding
            failures are logged and do not abort the capture.
        """
        window_info = WindowInfo(
            app_name=app_name,
            window_title=window_title,
            screen_resolution=screen_resolution
        )

        ui_elements = self._detect_ui_elements(screenshot, window_info, app_name)

        # The ScreenStateManager builds the base state. The screenshot path
        # is a placeholder here; it is replaced in _save_capture() when
        # save is True.
        screen_state = self.screen_state_manager.create_screen_state(
            session_id=session_id,
            window_title=window_title,
            app_name=app_name,
            screenshot_path=f"{self.data_dir}/screens/temp_screenshot.png",
            screen_resolution=screen_resolution,
            detected_text=detected_text,
            context_tags=context_tags,
            workflow_candidate=workflow_candidate
        )
        screen_state.ui_elements = ui_elements
        screen_state.mode = self.mode

        # NOTE(review): the embedding is generated while the state still
        # carries the placeholder screenshot path - confirm the embedding
        # manager does not rely on that path.
        self._attach_multimodal_embedding(screen_state, screenshot, save)

        if save:
            self._save_capture(screen_state, screenshot)
        return screen_state

    def _detect_ui_elements(self, screenshot, window_info, app_name):
        """Run UI detection when a detector is active; return [] otherwise or on error."""
        if not self.ui_detector:
            return []
        try:
            if self.logger:
                self.logger.log_action({
                    "action": "ui_detection_started",
                    "app_name": app_name
                })
            ui_elements = self.ui_detector.detect_elements(
                screenshot=screenshot,
                window_info=window_info,
                data_dir=self.data_dir
            )
            if self.logger:
                self.logger.log_action({
                    "action": "ui_detection_completed",
                    "elements_count": len(ui_elements)
                })
            return ui_elements
        except Exception as e:
            # Deliberate best effort: the capture continues without UI elements.
            if self.logger:
                self.logger.log_action({
                    "action": "ui_detection_error",
                    "error": str(e)
                })
            return []

    def _attach_multimodal_embedding(self, screen_state, screenshot, save):
        """In "complete" mode, replace the state's embedding with a multi-modal one."""
        if not (self.mode == "complete" and self.multimodal_manager):
            return
        try:
            if self.logger:
                self.logger.log_action({
                    "action": "multimodal_embedding_generation_started",
                    "screen_state_id": screen_state.screen_state_id
                })
            multimodal_embedding = self.multimodal_manager.generate_multimodal_embedding(
                screen_state=screen_state,
                screenshot=screenshot,
                save=save
            )
            # The multi-modal embedding takes the place of the simple one.
            screen_state.state_embedding = multimodal_embedding
            if self.logger:
                self.logger.log_action({
                    "action": "multimodal_embedding_generated",
                    "screen_state_id": screen_state.screen_state_id,
                    "provider": multimodal_embedding.provider
                })
        except Exception as e:
            # Deliberate best effort: the capture continues after logging.
            if self.logger:
                self.logger.log_action({
                    "action": "multimodal_embedding_error",
                    "screen_state_id": screen_state.screen_state_id,
                    "error": str(e)
                })

    def _save_capture(self, screen_state, screenshot):
        """Write the screenshot as a PNG, record its path on the state, and save the state."""
        # Imported here, as in the original code, so that cv2 is only needed
        # when something is actually written.
        import cv2
        from pathlib import Path

        screenshot_path = Path(self.data_dir) / "screens" / f"{screen_state.screen_state_id}.png"
        screenshot_path.parent.mkdir(parents=True, exist_ok=True)

        # cv2.imwrite reports failure through its boolean return value, so a
        # failed write would otherwise go completely unnoticed.
        written = cv2.imwrite(str(screenshot_path), screenshot)
        if not written and self.logger:
            self.logger.log_action({
                "action": "screenshot_save_error",
                "screen_state_id": screen_state.screen_state_id,
                "path": str(screenshot_path)
            })

        # The state is saved with the final path, replacing the placeholder.
        screen_state.raw.screenshot_path = str(screenshot_path)
        self.screen_state_manager.save_screen_state(screen_state)

    def get_mode(self) -> str:
        """Return the current mode."""
        return self.mode

    def set_mode(self, mode: str):
        """
        Change the processing mode.

        The optional components are created or dropped to match the new
        mode; components that already exist and are still needed are kept.

        Args:
            mode: New mode ("light", "enriched", "complete")
        """
        self.mode = mode
        self.screen_state_manager.mode = mode
        self._sync_components(mode)

        if self.logger:
            self.logger.log_action({
                "action": "mode_changed",
                "new_mode": mode,
                "ui_detection_enabled": self.ui_detector is not None,
                "multimodal_embedding_enabled": self.multimodal_manager is not None,
                "enhanced_matching_enabled": self.enhanced_matcher is not None
            })

    def find_matching_workflows(
        self,
        screen_state: EnrichedScreenState,
        screenshot: Optional[np.ndarray] = None,
        workflows: Optional[List] = None,
        top_k: int = 5
    ) -> Optional[List]:
        """
        Find the workflows that best match the current screen.

        Delegates to the EnhancedWorkflowMatcher in "complete" mode.

        Args:
            screen_state: Enriched screen state
            screenshot: Screenshot as a numpy array (optional)
            workflows: Workflows to compare against, forwarded as-is to the
                matcher (None is allowed)
            top_k: Number of best matches to return

        Returns:
            Whatever the matcher returns, or None when not in "complete"
            mode or when no matcher is available.
        """
        if self.mode == "complete" and self.enhanced_matcher:
            return self.enhanced_matcher.find_matching_workflows(
                screen_state=screen_state,
                screenshot=screenshot,
                workflows=workflows,
                top_k=top_k
            )

        if self.logger:
            self.logger.log_action({
                "action": "enhanced_matching_not_available",
                "current_mode": self.mode,
                "reason": "Enhanced matching requires 'complete' mode"
            })
        return None
if __name__ == "__main__":
    # Basic smoke tests for mode handling.
    # NOTE(review): this module uses relative imports, so it presumably has
    # to be started as a module of its package (python -m ...) - confirm.
    # Logger is already imported at the top of the file.
    import shutil
    from pathlib import Path

    print("EnrichedScreenCapture - Tests basiques")
    print("=" * 50)

    try:
        # Test logger
        logger = Logger(log_dir="test_logs")

        # "light" mode: no UI detector expected
        print("\n1. Test mode light:")
        capture_light = EnrichedScreenCapture(
            logger=logger,
            data_dir="test_data",
            mode="light"
        )
        print(f" Mode: {capture_light.get_mode()}")
        print(f" UI Detector: {capture_light.ui_detector is not None}")

        # "enriched" mode: UI detector expected
        print("\n2. Test mode enriched:")
        capture_enriched = EnrichedScreenCapture(
            logger=logger,
            data_dir="test_data",
            mode="enriched"
        )
        print(f" Mode: {capture_enriched.get_mode()}")
        print(f" UI Detector: {capture_enriched.ui_detector is not None}")

        # Mode switch: going back to "light" drops the detector
        print("\n3. Test changement de mode:")
        capture_enriched.set_mode("light")
        print(f" Nouveau mode: {capture_enriched.get_mode()}")
        print(f" UI Detector après changement: {capture_enriched.ui_detector is not None}")

        print("\n✓ Tests basiques réussis!")
    finally:
        # Remove the scratch directories even when a step above raised, so
        # a failed run does not leave test_data/test_logs behind.
        for scratch_dir in ("test_data", "test_logs"):
            if Path(scratch_dir).exists():
                shutil.rmtree(scratch_dir)