""" Modèles de données pour la détection d'éléments UI et l'état d'écran enrichi. Implémente les structures UIElement et EnrichedScreenState pour le système RPA Vision V2. Phase 1 - Mode Light: Structures de base avec compatibilité arrière complète. """ from dataclasses import dataclass, field, asdict from datetime import datetime from typing import List, Dict, Any, Tuple, Optional from enum import Enum import json import hashlib import numpy as np class UIElementType(Enum): """Types d'éléments UI supportés.""" BUTTON = "button" TEXT_INPUT = "text_input" DROPDOWN = "dropdown" TAB = "tab" CHECKBOX = "checkbox" RADIO_BUTTON = "radio_button" LINK = "link" GENERIC_INTERACTIVE = "generic_interactive" @dataclass class VisualData: """Données visuelles d'un élément UI.""" screenshot_path: str embedding_provider: str # ex: "openclip_ViT-B-32" embedding_vector_id: str # chemin vers le fichier .npy def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" return { "screenshot_path": self.screenshot_path, "embedding": { "provider": self.embedding_provider, "vector_id": self.embedding_vector_id } } @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'VisualData': """Crée une instance depuis un dictionnaire.""" if "embedding" in data: # Nouveau format return cls( screenshot_path=data["screenshot_path"], embedding_provider=data["embedding"]["provider"], embedding_vector_id=data["embedding"]["vector_id"] ) else: # Format legacy return cls( screenshot_path=data["screenshot_path"], embedding_provider=data.get("embedding_provider", ""), embedding_vector_id=data.get("embedding_vector_id", "") ) @dataclass class TextData: """Données textuelles d'un élément UI.""" raw: str normalized: str embedding_provider: str # ex: "clip_text" embedding_vector_id: str # chemin vers le fichier .npy def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" return { "raw": self.raw, "normalized": self.normalized, "embedding": { "provider": self.embedding_provider, "vector_id": self.embedding_vector_id } } @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'TextData': """Crée une instance depuis un dictionnaire.""" if "embedding" in data: # Nouveau format return cls( raw=data["raw"], normalized=data["normalized"], embedding_provider=data["embedding"]["provider"], embedding_vector_id=data["embedding"]["vector_id"] ) else: # Format legacy return cls( raw=data.get("raw", ""), normalized=data.get("normalized", ""), embedding_provider=data.get("embedding_provider", ""), embedding_vector_id=data.get("embedding_vector_id", "") ) @dataclass class ElementProperties: """Propriétés d'un élément UI.""" is_clickable: bool = False is_focusable: bool = False is_dangerous: bool = False def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" return { "is_clickable": self.is_clickable, "is_focusable": self.is_focusable, "is_dangerous": self.is_dangerous } @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'ElementProperties': """Crée une instance depuis un dictionnaire.""" return cls( is_clickable=data.get("is_clickable", False), is_focusable=data.get("is_focusable", False), is_dangerous=data.get("is_dangerous", False) ) @dataclass class ElementContext: """Contexte d'un élément UI.""" app_name: str window_title: str workflow_hint: Optional[str] = None def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" return { "app_name": self.app_name, "window_title": self.window_title, "workflow_hint": self.workflow_hint } @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'ElementContext': """Crée une instance depuis un dictionnaire.""" return cls( app_name=data["app_name"], window_title=data["window_title"], workflow_hint=data.get("workflow_hint") ) @dataclass class UIElement: """ Représente un élément d'interface utilisateur détecté. Attributes: element_id: Identifiant stable basé sur hash(app_name + center_bbox + label_normalized) type: Type d'élément (button, text_input, etc.) role: Rôle sémantique (validate_invoice, search_field, etc.) bbox: Bounding box (x1, y1, x2, y2) label: Texte visible de l'élément visual: Données visuelles (screenshot, embedding) text: Données textuelles (raw, normalized, embedding) properties: Propriétés (is_clickable, is_focusable, is_dangerous) context: Contexte (app_name, window_title, workflow_hint) tags: Tags additionnels confidence: Score de confiance de la détection (0.0-1.0) detection_method: Méthode de détection utilisée """ element_id: str type: UIElementType role: str bbox: Tuple[int, int, int, int] # (x1, y1, x2, y2) label: str visual: VisualData text: TextData properties: ElementProperties context: ElementContext tags: List[str] = field(default_factory=list) confidence: float = 1.0 detection_method: str = "unknown" @staticmethod def generate_element_id(app_name: str, bbox: Tuple[int, int, int, int], label: str) -> str: """ Génère un identifiant stable pour un élément UI. Args: app_name: Nom de l'application bbox: Bounding box (x1, y1, x2, y2) label: Label de l'élément Returns: Identifiant stable basé sur hash """ # Calculer le centre de la bbox center_x = (bbox[0] + bbox[2]) // 2 center_y = (bbox[1] + bbox[3]) // 2 # Normaliser le label (lowercase, strip whitespace) label_normalized = label.lower().strip() # Créer la chaîne à hasher hash_input = f"{app_name}_{center_x}_{center_y}_{label_normalized}" # Générer le hash hash_obj = hashlib.sha256(hash_input.encode('utf-8')) hash_hex = hash_obj.hexdigest()[:16] # Prendre les 16 premiers caractères return f"el_{hash_hex}" def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" return { "schema_version": "uielement_v1", "element_id": self.element_id, "type": self.type.value, "role": self.role, "bbox": list(self.bbox), "label": self.label, "confidence": float(self.confidence), "detection_method": self.detection_method, "visual": self.visual.to_dict(), "text": self.text.to_dict(), "properties": self.properties.to_dict(), "context": self.context.to_dict(), "tags": self.tags } @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'UIElement': """Crée une instance depuis un dictionnaire.""" # Gérer la compatibilité avec différentes versions de schéma schema_version = data.get("schema_version", "uielement_v1") # Parser le type element_type = UIElementType(data["type"]) # Reconstruire les sous-structures visual = VisualData.from_dict(data["visual"]) text = TextData.from_dict(data["text"]) properties = ElementProperties.from_dict(data["properties"]) context = ElementContext.from_dict(data["context"]) return cls( element_id=data["element_id"], type=element_type, role=data["role"], bbox=tuple(data["bbox"]), label=data["label"], visual=visual, text=text, properties=properties, context=context, tags=data.get("tags", []), confidence=data.get("confidence", 1.0), detection_method=data.get("detection_method", "unknown") ) def to_json(self) -> str: """Sérialise en JSON.""" return json.dumps(self.to_dict(), indent=2, ensure_ascii=False) @classmethod def from_json(cls, json_str: str) -> 'UIElement': """Désérialise depuis JSON.""" data = json.loads(json_str) return cls.from_dict(data) if __name__ == "__main__": # Tests basiques print("Test des modèles UIElement") print("=" * 50) # Test génération d'element_id print("\n1. Test génération d'element_id:") element_id = UIElement.generate_element_id( app_name="test_app", bbox=(100, 200, 300, 250), label="Valider" ) print(f" Element ID: {element_id}") # Test création d'un UIElement print("\n2. Test création UIElement:") element = UIElement( element_id=element_id, type=UIElementType.BUTTON, role="validate_action", bbox=(100, 200, 300, 250), label="Valider", visual=VisualData( screenshot_path="data/elements/el_001.png", embedding_provider="openclip_ViT-B-32", embedding_vector_id="data/embeddings/el_001.npy" ), text=TextData( raw="Valider", normalized="valider", embedding_provider="clip_text", embedding_vector_id="data/embeddings/el_001_text.npy" ), properties=ElementProperties( is_clickable=True, is_focusable=True, is_dangerous=False ), context=ElementContext( app_name="test_app", window_title="Test Window", workflow_hint="WF_test" ), tags=["primary_action"], confidence=0.95, detection_method="heuristic_rectangle" ) print(f" Element ID: {element.element_id}") print(f" Type: {element.type.value}") print(f" Role: {element.role}") print(f" Label: {element.label}") print(f" Confidence: {element.confidence}") # Test sérialisation print("\n3. Test sérialisation JSON:") json_str = element.to_json() print(f" JSON length: {len(json_str)} chars") print(f" Schema version: uielement_v1") # Test désérialisation print("\n4. Test désérialisation:") element_restored = UIElement.from_json(json_str) print(f" Restored element_id: {element_restored.element_id}") print(f" Restored type: {element_restored.type.value}") print(f" Restored label: {element_restored.label}") # Test stabilité de l'ID print("\n5. Test stabilité de l'element_id:") element_id_2 = UIElement.generate_element_id( app_name="test_app", bbox=(100, 200, 300, 250), label="Valider" ) print(f" ID 1: {element_id}") print(f" ID 2: {element_id_2}") print(f" IDs identiques: {element_id == element_id_2}") print("\n✓ Tous les tests basiques réussis!") # ============================================================================ # EnrichedScreenState and related structures # ============================================================================ @dataclass class WindowInfo: """Informations sur la fenêtre active.""" app_name: str window_title: str screen_resolution: Tuple[int, int] # (width, height) def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" return { "app_name": self.app_name, "window_title": self.window_title, "screen_resolution": list(self.screen_resolution) } @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'WindowInfo': """Crée une instance depuis un dictionnaire.""" return cls( app_name=data["app_name"], window_title=data["window_title"], screen_resolution=tuple(data["screen_resolution"]) ) @dataclass class RawData: """Données brutes de capture d'écran.""" screenshot_path: str def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" return { "screenshot_path": self.screenshot_path } @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'RawData': """Crée une instance depuis un dictionnaire.""" return cls(screenshot_path=data["screenshot_path"]) @dataclass class PerceptionData: """Données de perception (texte détecté, OCR, etc.).""" detected_text: List[str] = field(default_factory=list) ocr_results: Optional[Dict[str, Any]] = None def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" return { "detected_text": self.detected_text, "ocr_results": self.ocr_results } @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'PerceptionData': """Crée une instance depuis un dictionnaire.""" return cls( detected_text=data.get("detected_text", []), ocr_results=data.get("ocr_results") ) @dataclass class ComponentInfo: """Informations sur une composante d'embedding.""" provider: str vector_id: str def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" return { "provider": self.provider, "vector_id": self.vector_id } @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'ComponentInfo': """Crée une instance depuis un dictionnaire.""" return cls( provider=data["provider"], vector_id=data["vector_id"] ) @dataclass class EmbeddingComponents: """Composantes individuelles d'un state embedding multi-modal.""" image_embedding: Optional[ComponentInfo] = None text_embedding: Optional[ComponentInfo] = None title_embedding: Optional[ComponentInfo] = None ui_embedding: Optional[ComponentInfo] = None context_embedding: Optional[ComponentInfo] = None def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" result = {} if self.image_embedding: result["image_embedding"] = self.image_embedding.to_dict() if self.text_embedding: result["text_embedding"] = self.text_embedding.to_dict() if self.title_embedding: result["title_embedding"] = self.title_embedding.to_dict() if self.ui_embedding: result["ui_embedding"] = self.ui_embedding.to_dict() if self.context_embedding: result["context_embedding"] = self.context_embedding.to_dict() return result @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'EmbeddingComponents': """Crée une instance depuis un dictionnaire.""" return cls( image_embedding=ComponentInfo.from_dict(data["image_embedding"]) if "image_embedding" in data else None, text_embedding=ComponentInfo.from_dict(data["text_embedding"]) if "text_embedding" in data else None, title_embedding=ComponentInfo.from_dict(data["title_embedding"]) if "title_embedding" in data else None, ui_embedding=ComponentInfo.from_dict(data["ui_embedding"]) if "ui_embedding" in data else None, context_embedding=ComponentInfo.from_dict(data["context_embedding"]) if "context_embedding" in data else None ) @dataclass class StateEmbedding: """Embedding d'état unifié (multi-modal ou simple).""" provider: str vector_id: str components: Optional[EmbeddingComponents] = None def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" result = { "provider": self.provider, "vector_id": self.vector_id } if self.components: result["components"] = self.components.to_dict() return result @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'StateEmbedding': """Crée une instance depuis un dictionnaire.""" components = None if "components" in data and data["components"]: components = EmbeddingComponents.from_dict(data["components"]) return cls( provider=data["provider"], vector_id=data["vector_id"], components=components ) @dataclass class ContextData: """Données de contexte workflow.""" current_workflow_candidate: Optional[str] = None tags: List[str] = field(default_factory=list) metadata: Dict[str, Any] = field(default_factory=dict) def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" return { "current_workflow_candidate": self.current_workflow_candidate, "tags": self.tags, "metadata": self.metadata } @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'ContextData': """Crée une instance depuis un dictionnaire.""" return cls( current_workflow_candidate=data.get("current_workflow_candidate"), tags=data.get("tags", []), metadata=data.get("metadata", {}) ) @dataclass class EnrichedScreenState: """ ScreenState enrichi avec éléments d'UI et embedding multi-modal. Attributes: screen_state_id: Identifiant unique de l'état d'écran timestamp: Horodatage de la capture session_id: Identifiant de session window: Informations sur la fenêtre raw: Données brutes (screenshot_path) perception: Données de perception (texte détecté) ui_elements: Liste des éléments UI détectés state_embedding: Embedding d'état unifié context: Contexte workflow mode: Mode de traitement ("light", "enriched", "complete") processing_metadata: Métadonnées de traitement (optionnel) """ screen_state_id: str timestamp: datetime session_id: str window: WindowInfo raw: RawData perception: PerceptionData ui_elements: List[UIElement] state_embedding: StateEmbedding context: ContextData mode: str = "light" processing_metadata: Optional[Dict[str, Any]] = None def to_dict(self) -> Dict[str, Any]: """Convertit en dictionnaire pour sérialisation JSON.""" result = { "schema_version": "screenstate_v1", "mode": self.mode, "screen_state_id": self.screen_state_id, "timestamp": self.timestamp.isoformat(), "session_id": self.session_id, "window": self.window.to_dict(), "raw": self.raw.to_dict(), "perception": self.perception.to_dict(), "ui_elements": [elem.to_dict() for elem in self.ui_elements], "state_embedding": self.state_embedding.to_dict(), "context": self.context.to_dict() } if self.processing_metadata: result["processing_metadata"] = self.processing_metadata return result @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'EnrichedScreenState': """Crée une instance depuis un dictionnaire.""" # Gérer la compatibilité avec différentes versions de schéma schema_version = data.get("schema_version", "screenstate_v1") # Parser le timestamp timestamp = datetime.fromisoformat(data["timestamp"]) # Reconstruire les sous-structures window = WindowInfo.from_dict(data["window"]) raw = RawData.from_dict(data["raw"]) perception = PerceptionData.from_dict(data["perception"]) # Reconstruire les UI elements ui_elements = [UIElement.from_dict(elem_data) for elem_data in data.get("ui_elements", [])] # Reconstruire le state embedding state_embedding = StateEmbedding.from_dict(data["state_embedding"]) # Reconstruire le contexte context = ContextData.from_dict(data["context"]) return cls( screen_state_id=data["screen_state_id"], timestamp=timestamp, session_id=data["session_id"], window=window, raw=raw, perception=perception, ui_elements=ui_elements, state_embedding=state_embedding, context=context, mode=data.get("mode", "light"), processing_metadata=data.get("processing_metadata") ) def to_json(self) -> str: """Sérialise en JSON.""" return json.dumps(self.to_dict(), indent=2, ensure_ascii=False) @classmethod def from_json(cls, json_str: str) -> 'EnrichedScreenState': """Désérialise depuis JSON.""" data = json.loads(json_str) return cls.from_dict(data) @classmethod def create_light_mode( cls, screen_state_id: str, session_id: str, window: WindowInfo, screenshot_path: str, image_embedding_provider: str, image_embedding_vector_id: str ) -> 'EnrichedScreenState': """ Crée un EnrichedScreenState en mode light (compatibilité arrière). Args: screen_state_id: ID de l'état d'écran session_id: ID de session window: Informations sur la fenêtre screenshot_path: Chemin vers le screenshot image_embedding_provider: Provider de l'embedding image image_embedding_vector_id: ID du vecteur d'embedding image Returns: EnrichedScreenState en mode light """ return cls( screen_state_id=screen_state_id, timestamp=datetime.now(), session_id=session_id, window=window, raw=RawData(screenshot_path=screenshot_path), perception=PerceptionData(detected_text=[]), ui_elements=[], # Vide en mode light state_embedding=StateEmbedding( provider=image_embedding_provider, vector_id=image_embedding_vector_id, components=None # Pas de composantes en mode light ), context=ContextData(), mode="light" ) # Tests pour EnrichedScreenState def test_enriched_screen_state(): """Tests basiques pour EnrichedScreenState.""" print("\n" + "=" * 50) print("Test des modèles EnrichedScreenState") print("=" * 50) # Test mode light print("\n1. Test création en mode light:") window = WindowInfo( app_name="test_app", window_title="Test Window", screen_resolution=(1920, 1080) ) screen_state_light = EnrichedScreenState.create_light_mode( screen_state_id="screen_001", session_id="session_001", window=window, screenshot_path="data/screens/screen_001.png", image_embedding_provider="openclip_ViT-B-32", image_embedding_vector_id="data/embeddings/screen_001.npy" ) print(f" Screen State ID: {screen_state_light.screen_state_id}") print(f" Mode: {screen_state_light.mode}") print(f" UI Elements: {len(screen_state_light.ui_elements)}") print(f" State Embedding Provider: {screen_state_light.state_embedding.provider}") print(f" Has Components: {screen_state_light.state_embedding.components is not None}") # Test sérialisation mode light print("\n2. Test sérialisation JSON (mode light):") json_str = screen_state_light.to_json() print(f" JSON length: {len(json_str)} chars") # Test désérialisation mode light print("\n3. Test désérialisation (mode light):") screen_state_restored = EnrichedScreenState.from_json(json_str) print(f" Restored screen_state_id: {screen_state_restored.screen_state_id}") print(f" Restored mode: {screen_state_restored.mode}") print(f" Restored UI elements count: {len(screen_state_restored.ui_elements)}") # Test mode enriched avec éléments print("\n4. Test création en mode enriched:") element = UIElement( element_id="el_test_001", type=UIElementType.BUTTON, role="validate_action", bbox=(100, 200, 300, 250), label="Valider", visual=VisualData( screenshot_path="data/elements/el_001.png", embedding_provider="openclip_ViT-B-32", embedding_vector_id="data/embeddings/el_001.npy" ), text=TextData( raw="Valider", normalized="valider", embedding_provider="clip_text", embedding_vector_id="data/embeddings/el_001_text.npy" ), properties=ElementProperties(is_clickable=True), context=ElementContext( app_name="test_app", window_title="Test Window" ), tags=["primary_action"], confidence=0.95 ) screen_state_enriched = EnrichedScreenState( screen_state_id="screen_002", timestamp=datetime.now(), session_id="session_001", window=window, raw=RawData(screenshot_path="data/screens/screen_002.png"), perception=PerceptionData(detected_text=["Valider", "Annuler"]), ui_elements=[element], state_embedding=StateEmbedding( provider="openclip_ViT-B-32", vector_id="data/embeddings/screen_002.npy", components=None ), context=ContextData(tags=["test"]), mode="enriched" ) print(f" Screen State ID: {screen_state_enriched.screen_state_id}") print(f" Mode: {screen_state_enriched.mode}") print(f" UI Elements: {len(screen_state_enriched.ui_elements)}") print(f" Detected Text: {screen_state_enriched.perception.detected_text}") # Test mode complete avec composantes print("\n5. Test création en mode complete:") components = EmbeddingComponents( image_embedding=ComponentInfo( provider="openclip_ViT-B-32", vector_id="data/embeddings/screen_003_image.npy" ), text_embedding=ComponentInfo( provider="clip_text", vector_id="data/embeddings/screen_003_text.npy" ), title_embedding=ComponentInfo( provider="clip_text", vector_id="data/embeddings/screen_003_title.npy" ) ) screen_state_complete = EnrichedScreenState( screen_state_id="screen_003", timestamp=datetime.now(), session_id="session_001", window=window, raw=RawData(screenshot_path="data/screens/screen_003.png"), perception=PerceptionData(detected_text=["Valider", "Annuler"]), ui_elements=[element], state_embedding=StateEmbedding( provider="multimodal_fusion_v1", vector_id="data/embeddings/screen_003_fused.npy", components=components ), context=ContextData(tags=["test"]), mode="complete" ) print(f" Screen State ID: {screen_state_complete.screen_state_id}") print(f" Mode: {screen_state_complete.mode}") print(f" State Embedding Provider: {screen_state_complete.state_embedding.provider}") print(f" Has Components: {screen_state_complete.state_embedding.components is not None}") # Test sérialisation mode complete print("\n6. Test sérialisation JSON (mode complete):") json_str_complete = screen_state_complete.to_json() print(f" JSON length: {len(json_str_complete)} chars") # Test désérialisation mode complete print("\n7. Test désérialisation (mode complete):") screen_state_complete_restored = EnrichedScreenState.from_json(json_str_complete) print(f" Restored screen_state_id: {screen_state_complete_restored.screen_state_id}") print(f" Restored mode: {screen_state_complete_restored.mode}") print(f" Restored components: {screen_state_complete_restored.state_embedding.components is not None}") print("\n✓ Tous les tests EnrichedScreenState réussis!") if __name__ == "__main__": # Exécuter les tests test_enriched_screen_state()