828 lines
29 KiB
Python
828 lines
29 KiB
Python
"""
|
|
Data models for UI element detection and the enriched screen state.

Implements the UIElement and EnrichedScreenState structures for the RPA Vision V2 system.

Phase 1 - Light mode: basic structures with full backward compatibility.
|
|
"""
|
|
|
|
from dataclasses import dataclass, field, asdict
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any, Tuple, Optional
|
|
from enum import Enum
|
|
import json
|
|
import hashlib
|
|
import numpy as np
|
|
|
|
|
|
class UIElementType(Enum):
    """Supported UI element types.

    Each member's value is the lowercase string form of its name.
    """
    BUTTON = "button"
    TEXT_INPUT = "text_input"
    DROPDOWN = "dropdown"
    TAB = "tab"
    CHECKBOX = "checkbox"
    RADIO_BUTTON = "radio_button"
    LINK = "link"
    # NOTE(review): presumably the catch-all for interactive elements that fit
    # no more specific type - confirm against the detector.
    GENERIC_INTERACTIVE = "generic_interactive"
|
|
|
|
|
|
@dataclass
class VisualData:
    """Visual data of a UI element: screenshot path plus embedding reference."""
    screenshot_path: str
    embedding_provider: str  # e.g. "openclip_ViT-B-32"
    embedding_vector_id: str  # path to the .npy file

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dictionary using the nested ``embedding`` layout."""
        embedding_section = {
            "provider": self.embedding_provider,
            "vector_id": self.embedding_vector_id,
        }
        return {
            "screenshot_path": self.screenshot_path,
            "embedding": embedding_section,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'VisualData':
        """Build an instance from a dictionary.

        Two layouts are accepted: the nested one written by ``to_dict`` and
        the legacy flat one, whose embedding keys default to "" when absent.
        """
        if "embedding" not in data:
            # Legacy flat layout
            return cls(
                data["screenshot_path"],
                data.get("embedding_provider", ""),
                data.get("embedding_vector_id", ""),
            )

        # Current nested layout
        path = data["screenshot_path"]
        nested = data["embedding"]
        return cls(path, nested["provider"], nested["vector_id"])
|
|
|
|
|
|
@dataclass
class TextData:
    """Text data of a UI element: raw and normalized text plus embedding reference."""
    raw: str
    normalized: str
    embedding_provider: str  # e.g. "clip_text"
    embedding_vector_id: str  # path to the .npy file

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dictionary using the nested ``embedding`` layout."""
        embedding_section = {
            "provider": self.embedding_provider,
            "vector_id": self.embedding_vector_id,
        }
        return {
            "raw": self.raw,
            "normalized": self.normalized,
            "embedding": embedding_section,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'TextData':
        """Build an instance from a dictionary.

        The nested layout (with an ``embedding`` key) requires ``raw`` and
        ``normalized``; the legacy flat layout defaults every field to "".
        """
        if "embedding" not in data:
            # Legacy flat layout: every key is optional
            legacy_keys = ("raw", "normalized", "embedding_provider", "embedding_vector_id")
            return cls(*(data.get(key, "") for key in legacy_keys))

        # Current nested layout
        raw_text = data["raw"]
        normalized_text = data["normalized"]
        nested = data["embedding"]
        return cls(raw_text, normalized_text, nested["provider"], nested["vector_id"])
|
|
|
|
|
|
@dataclass
class ElementProperties:
    """Boolean interaction flags of a UI element; each defaults to False."""
    is_clickable: bool = False
    is_focusable: bool = False
    is_dangerous: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return the three flags as a JSON-serializable dictionary."""
        flags: Dict[str, Any] = {}
        flags["is_clickable"] = self.is_clickable
        flags["is_focusable"] = self.is_focusable
        flags["is_dangerous"] = self.is_dangerous
        return flags

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ElementProperties':
        """Build an instance from a dictionary; missing flags default to False."""
        flag_names = ("is_clickable", "is_focusable", "is_dangerous")
        return cls(**{name: data.get(name, False) for name in flag_names})
|
|
|
|
|
|
@dataclass
class ElementContext:
    """Context of a UI element: owning application, window title, optional workflow hint."""
    app_name: str
    window_title: str
    workflow_hint: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the context as a JSON-serializable dictionary."""
        serialized: Dict[str, Any] = {}
        serialized["app_name"] = self.app_name
        serialized["window_title"] = self.window_title
        serialized["workflow_hint"] = self.workflow_hint
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ElementContext':
        """Build an instance from a dictionary.

        ``app_name`` and ``window_title`` are required; ``workflow_hint``
        falls back to None when absent.
        """
        app = data["app_name"]
        title = data["window_title"]
        return cls(app, title, data.get("workflow_hint"))
|
|
|
|
|
|
@dataclass
class UIElement:
    """
    A user-interface element detected on screen.

    Attributes:
        element_id: Stable identifier; see ``generate_element_id``.
        type: Kind of element (button, text_input, etc.).
        role: Semantic role (validate_invoice, search_field, etc.).
        bbox: Bounding box as (x1, y1, x2, y2).
        label: Visible text of the element.
        visual: Visual data (screenshot, embedding).
        text: Text data (raw, normalized, embedding).
        properties: Flags (is_clickable, is_focusable, is_dangerous).
        context: Context (app_name, window_title, workflow_hint).
        tags: Additional tags.
        confidence: Detection confidence score (intended range 0.0-1.0; not
            validated here).
        detection_method: Name of the detection method used.
    """
    element_id: str
    type: UIElementType
    role: str
    bbox: Tuple[int, int, int, int]  # (x1, y1, x2, y2)
    label: str
    visual: VisualData
    text: TextData
    properties: ElementProperties
    context: ElementContext
    tags: List[str] = field(default_factory=list)
    confidence: float = 1.0
    detection_method: str = "unknown"

    @staticmethod
    def generate_element_id(app_name: str, bbox: Tuple[int, int, int, int], label: str) -> str:
        """
        Generate a stable identifier for a UI element.

        The identifier is derived from the application name, the integer
        center of the bounding box and the lowercased, stripped label, so the
        same inputs always produce the same identifier.

        Args:
            app_name: Name of the application.
            bbox: Bounding box (x1, y1, x2, y2).
            label: Label of the element.

        Returns:
            ``"el_"`` followed by the first 16 hex characters of the SHA-256
            digest of the combined inputs.
        """
        # Center of the bounding box (floor division keeps it an integer)
        center_x = (bbox[0] + bbox[2]) // 2
        center_y = (bbox[1] + bbox[3]) // 2

        # Normalize the label (lowercase, strip surrounding whitespace)
        label_normalized = label.lower().strip()

        # The exact layout of this string defines the identifier; changing it
        # would change every generated identifier.
        hash_input = f"{app_name}_{center_x}_{center_y}_{label_normalized}"

        digest = hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
        return f"el_{digest[:16]}"  # keep the first 16 hex characters

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary.

        ``bbox`` and ``tags`` are emitted as fresh lists so that mutating the
        returned dictionary cannot alter this element.
        """
        return {
            "schema_version": "uielement_v1",
            "element_id": self.element_id,
            "type": self.type.value,
            "role": self.role,
            "bbox": list(self.bbox),
            "label": self.label,
            "confidence": float(self.confidence),
            "detection_method": self.detection_method,
            "visual": self.visual.to_dict(),
            "text": self.text.to_dict(),
            "properties": self.properties.to_dict(),
            "context": self.context.to_dict(),
            # Copy: do not hand out a reference to the internal list
            "tags": list(self.tags),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'UIElement':
        """Build an instance from a dictionary.

        The ``schema_version`` key written by ``to_dict`` is not consulted.
        ``tags``, ``confidence`` and ``detection_method`` are optional; a
        missing or null ``tags`` yields an empty list.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``type`` is not a known ``UIElementType`` value.
        """
        # Parse the element type
        element_type = UIElementType(data["type"])

        # Rebuild the nested structures
        visual = VisualData.from_dict(data["visual"])
        text = TextData.from_dict(data["text"])
        properties = ElementProperties.from_dict(data["properties"])
        context = ElementContext.from_dict(data["context"])

        return cls(
            element_id=data["element_id"],
            type=element_type,
            role=data["role"],
            bbox=tuple(data["bbox"]),
            label=data["label"],
            visual=visual,
            text=text,
            properties=properties,
            context=context,
            # Copy so the element does not share its list with the input
            # dictionary; a null value is treated like a missing key.
            tags=list(data.get("tags") or []),
            confidence=data.get("confidence", 1.0),
            detection_method=data.get("detection_method", "unknown"),
        )

    def to_json(self) -> str:
        """Serialize to an indented JSON string (non-ASCII characters kept as-is)."""
        return json.dumps(self.to_dict(), indent=2, ensure_ascii=False)

    @classmethod
    def from_json(cls, json_str: str) -> 'UIElement':
        """Deserialize from a JSON string produced by ``to_json``."""
        data = json.loads(json_str)
        return cls.from_dict(data)
|
|
|
|
|
|
if __name__ == "__main__":
    # Basic smoke tests for UIElement (printed report, nothing is asserted)
    print("Test des modèles UIElement")
    print("=" * 50)

    sample_app = "test_app"
    sample_bbox = (100, 200, 300, 250)
    sample_label = "Valider"

    # 1. element_id generation
    print("\n1. Test génération d'element_id:")
    element_id = UIElement.generate_element_id(
        app_name=sample_app, bbox=sample_bbox, label=sample_label
    )
    print(f" Element ID: {element_id}")

    # 2. UIElement construction
    print("\n2. Test création UIElement:")
    sample_visual = VisualData(
        screenshot_path="data/elements/el_001.png",
        embedding_provider="openclip_ViT-B-32",
        embedding_vector_id="data/embeddings/el_001.npy",
    )
    sample_text = TextData(
        raw="Valider",
        normalized="valider",
        embedding_provider="clip_text",
        embedding_vector_id="data/embeddings/el_001_text.npy",
    )
    sample_properties = ElementProperties(
        is_clickable=True, is_focusable=True, is_dangerous=False
    )
    sample_context = ElementContext(
        app_name=sample_app, window_title="Test Window", workflow_hint="WF_test"
    )
    element = UIElement(
        element_id=element_id,
        type=UIElementType.BUTTON,
        role="validate_action",
        bbox=sample_bbox,
        label=sample_label,
        visual=sample_visual,
        text=sample_text,
        properties=sample_properties,
        context=sample_context,
        tags=["primary_action"],
        confidence=0.95,
        detection_method="heuristic_rectangle",
    )

    for caption, value in (
        ("Element ID", element.element_id),
        ("Type", element.type.value),
        ("Role", element.role),
        ("Label", element.label),
        ("Confidence", element.confidence),
    ):
        print(f" {caption}: {value}")

    # 3. Serialization
    print("\n3. Test sérialisation JSON:")
    json_str = element.to_json()
    print(f" JSON length: {len(json_str)} chars")
    print(" Schema version: uielement_v1")

    # 4. Deserialization
    print("\n4. Test désérialisation:")
    element_restored = UIElement.from_json(json_str)
    for caption, value in (
        ("element_id", element_restored.element_id),
        ("type", element_restored.type.value),
        ("label", element_restored.label),
    ):
        print(f" Restored {caption}: {value}")

    # 5. Identifier stability: same inputs must give the same identifier
    print("\n5. Test stabilité de l'element_id:")
    element_id_2 = UIElement.generate_element_id(
        app_name=sample_app, bbox=sample_bbox, label=sample_label
    )
    print(f" ID 1: {element_id}")
    print(f" ID 2: {element_id_2}")
    ids_match = element_id == element_id_2
    print(f" IDs identiques: {ids_match}")

    print("\n✓ Tous les tests basiques réussis!")
|
|
|
|
|
|
# ============================================================================
|
|
# EnrichedScreenState and related structures
|
|
# ============================================================================
|
|
|
|
|
|
@dataclass
class WindowInfo:
    """Information about the active window."""
    app_name: str
    window_title: str
    screen_resolution: Tuple[int, int]  # (width, height)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dictionary; the resolution becomes a list."""
        resolution_as_list = list(self.screen_resolution)
        serialized: Dict[str, Any] = {}
        serialized["app_name"] = self.app_name
        serialized["window_title"] = self.window_title
        serialized["screen_resolution"] = resolution_as_list
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'WindowInfo':
        """Build an instance from a dictionary; the resolution is converted to a tuple."""
        app = data["app_name"]
        title = data["window_title"]
        resolution = tuple(data["screen_resolution"])
        return cls(app, title, resolution)
|
|
|
|
|
|
@dataclass
class RawData:
    """Raw screen-capture data (currently only the screenshot path)."""
    screenshot_path: str

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dictionary."""
        serialized: Dict[str, Any] = {}
        serialized["screenshot_path"] = self.screenshot_path
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'RawData':
        """Build an instance from a dictionary; ``screenshot_path`` is required."""
        path = data["screenshot_path"]
        return cls(path)
|
|
|
|
|
|
@dataclass
class PerceptionData:
    """Perception data (detected text, OCR results, etc.)."""
    detected_text: List[str] = field(default_factory=list)
    ocr_results: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dictionary."""
        serialized: Dict[str, Any] = {}
        serialized["detected_text"] = self.detected_text
        serialized["ocr_results"] = self.ocr_results
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'PerceptionData':
        """Build an instance from a dictionary.

        Both keys are optional: ``detected_text`` defaults to an empty list
        and ``ocr_results`` to None.
        """
        texts = data.get("detected_text", [])
        ocr = data.get("ocr_results")
        return cls(texts, ocr)
|
|
|
|
|
|
@dataclass
class ComponentInfo:
    """Reference to one embedding component: provider name and vector identifier."""
    provider: str
    vector_id: str

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dictionary."""
        serialized: Dict[str, Any] = {}
        serialized["provider"] = self.provider
        serialized["vector_id"] = self.vector_id
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ComponentInfo':
        """Build an instance from a dictionary; both keys are required."""
        required_keys = ("provider", "vector_id")
        return cls(**{key: data[key] for key in required_keys})
|
|
|
|
|
|
@dataclass
class EmbeddingComponents:
    """Individual components of a multi-modal state embedding.

    Every slot is optional; None means the component is absent.
    """
    image_embedding: Optional[ComponentInfo] = None
    text_embedding: Optional[ComponentInfo] = None
    title_embedding: Optional[ComponentInfo] = None
    ui_embedding: Optional[ComponentInfo] = None
    context_embedding: Optional[ComponentInfo] = None

    # Serialization order of the component slots. Not annotated, so the
    # dataclass machinery does not treat it as a field.
    _COMPONENT_NAMES = (
        "image_embedding",
        "text_embedding",
        "title_embedding",
        "ui_embedding",
        "context_embedding",
    )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary.

        Only the components that are set are emitted; absent ones are left
        out of the result entirely.
        """
        result: Dict[str, Any] = {}
        for name in self._COMPONENT_NAMES:
            component = getattr(self, name)
            if component is not None:
                result[name] = component.to_dict()
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'EmbeddingComponents':
        """Build an instance from a dictionary.

        A component whose key is missing or whose value is null is loaded as
        None, so a JSON document carrying explicit nulls no longer fails.
        """
        parsed: Dict[str, Optional[ComponentInfo]] = {}
        for name in cls._COMPONENT_NAMES:
            raw = data.get(name)
            parsed[name] = ComponentInfo.from_dict(raw) if raw is not None else None
        return cls(**parsed)
|
|
|
|
|
|
@dataclass
class StateEmbedding:
    """Unified state embedding (multi-modal or single), with optional components."""
    provider: str
    vector_id: str
    components: Optional[EmbeddingComponents] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dictionary.

        The ``components`` key is only present when components are set.
        """
        payload = {"provider": self.provider, "vector_id": self.vector_id}
        if not self.components:
            return payload
        payload["components"] = self.components.to_dict()
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'StateEmbedding':
        """Build an instance from a dictionary.

        A missing, null or empty ``components`` entry yields ``components=None``.
        """
        raw_components = data["components"] if "components" in data else None
        parsed = EmbeddingComponents.from_dict(raw_components) if raw_components else None
        return cls(data["provider"], data["vector_id"], parsed)
|
|
|
|
|
|
@dataclass
class ContextData:
    """Workflow context data: candidate workflow, tags and free-form metadata."""
    current_workflow_candidate: Optional[str] = None
    tags: List[str] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dictionary."""
        serialized: Dict[str, Any] = {}
        serialized["current_workflow_candidate"] = self.current_workflow_candidate
        serialized["tags"] = self.tags
        serialized["metadata"] = self.metadata
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ContextData':
        """Build an instance from a dictionary; every key is optional."""
        candidate = data.get("current_workflow_candidate")
        tag_list = data.get("tags", [])
        extra = data.get("metadata", {})
        return cls(candidate, tag_list, extra)
|
|
|
|
|
|
@dataclass
class EnrichedScreenState:
    """
    Screen state enriched with UI elements and a multi-modal embedding.

    Attributes:
        screen_state_id: Unique identifier of the screen state.
        timestamp: Capture timestamp.
        session_id: Session identifier.
        window: Information about the window.
        raw: Raw data (screenshot_path).
        perception: Perception data (detected text).
        ui_elements: List of detected UI elements.
        state_embedding: Unified state embedding.
        context: Workflow context.
        mode: Processing mode ("light", "enriched", "complete").
        processing_metadata: Processing metadata (optional).
    """
    screen_state_id: str
    timestamp: datetime
    session_id: str
    window: WindowInfo
    raw: RawData
    perception: PerceptionData
    ui_elements: List[UIElement]
    state_embedding: StateEmbedding
    context: ContextData
    mode: str = "light"
    processing_metadata: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary.

        The timestamp is written in ISO 8601 form. ``processing_metadata`` is
        emitted whenever it is not None, so an empty dictionary survives a
        round trip through ``from_dict`` instead of coming back as None.
        """
        result = {
            "schema_version": "screenstate_v1",
            "mode": self.mode,
            "screen_state_id": self.screen_state_id,
            "timestamp": self.timestamp.isoformat(),
            "session_id": self.session_id,
            "window": self.window.to_dict(),
            "raw": self.raw.to_dict(),
            "perception": self.perception.to_dict(),
            "ui_elements": [elem.to_dict() for elem in self.ui_elements],
            "state_embedding": self.state_embedding.to_dict(),
            "context": self.context.to_dict(),
        }

        # Test against None rather than truthiness: {} is a legitimate value
        if self.processing_metadata is not None:
            result["processing_metadata"] = self.processing_metadata

        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'EnrichedScreenState':
        """Build an instance from a dictionary.

        The ``schema_version`` key written by ``to_dict`` is not consulted.
        ``ui_elements``, ``mode`` and ``processing_metadata`` are optional.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``timestamp`` is not a valid ISO 8601 string.
        """
        # Parse the timestamp
        timestamp = datetime.fromisoformat(data["timestamp"])

        # Rebuild the nested structures
        window = WindowInfo.from_dict(data["window"])
        raw = RawData.from_dict(data["raw"])
        perception = PerceptionData.from_dict(data["perception"])
        ui_elements = [UIElement.from_dict(elem_data) for elem_data in data.get("ui_elements", [])]
        state_embedding = StateEmbedding.from_dict(data["state_embedding"])
        context = ContextData.from_dict(data["context"])

        return cls(
            screen_state_id=data["screen_state_id"],
            timestamp=timestamp,
            session_id=data["session_id"],
            window=window,
            raw=raw,
            perception=perception,
            ui_elements=ui_elements,
            state_embedding=state_embedding,
            context=context,
            mode=data.get("mode", "light"),
            processing_metadata=data.get("processing_metadata"),
        )

    def to_json(self) -> str:
        """Serialize to an indented JSON string (non-ASCII characters kept as-is)."""
        return json.dumps(self.to_dict(), indent=2, ensure_ascii=False)

    @classmethod
    def from_json(cls, json_str: str) -> 'EnrichedScreenState':
        """Deserialize from a JSON string produced by ``to_json``."""
        data = json.loads(json_str)
        return cls.from_dict(data)

    @classmethod
    def create_light_mode(
        cls,
        screen_state_id: str,
        session_id: str,
        window: WindowInfo,
        screenshot_path: str,
        image_embedding_provider: str,
        image_embedding_vector_id: str
    ) -> 'EnrichedScreenState':
        """
        Create an EnrichedScreenState in light mode (backward compatibility).

        The result has no UI elements, no detected text, an empty context and
        a state embedding without components. The timestamp is taken from
        ``datetime.now()`` (naive local time).

        Args:
            screen_state_id: ID of the screen state.
            session_id: Session ID.
            window: Information about the window.
            screenshot_path: Path to the screenshot.
            image_embedding_provider: Provider of the image embedding.
            image_embedding_vector_id: ID of the image embedding vector.

        Returns:
            EnrichedScreenState in light mode.
        """
        return cls(
            screen_state_id=screen_state_id,
            timestamp=datetime.now(),
            session_id=session_id,
            window=window,
            raw=RawData(screenshot_path=screenshot_path),
            perception=PerceptionData(detected_text=[]),
            ui_elements=[],  # empty in light mode
            state_embedding=StateEmbedding(
                provider=image_embedding_provider,
                vector_id=image_embedding_vector_id,
                components=None,  # no components in light mode
            ),
            context=ContextData(),
            mode="light",
        )
|
|
|
|
|
|
# Tests pour EnrichedScreenState
|
|
def test_enriched_screen_state():
    """Smoke-test EnrichedScreenState in light, enriched and complete modes.

    Builds one state per mode, round-trips the light and complete states
    through JSON and prints a short report. Nothing is asserted.
    """
    separator = "=" * 50
    print("\n" + separator)
    print("Test des modèles EnrichedScreenState")
    print(separator)

    # 1. Light mode creation
    print("\n1. Test création en mode light:")
    window = WindowInfo(
        app_name="test_app",
        window_title="Test Window",
        screen_resolution=(1920, 1080),
    )
    light_state = EnrichedScreenState.create_light_mode(
        screen_state_id="screen_001",
        session_id="session_001",
        window=window,
        screenshot_path="data/screens/screen_001.png",
        image_embedding_provider="openclip_ViT-B-32",
        image_embedding_vector_id="data/embeddings/screen_001.npy",
    )
    light_embedding = light_state.state_embedding
    for caption, value in (
        ("Screen State ID", light_state.screen_state_id),
        ("Mode", light_state.mode),
        ("UI Elements", len(light_state.ui_elements)),
        ("State Embedding Provider", light_embedding.provider),
        ("Has Components", light_embedding.components is not None),
    ):
        print(f" {caption}: {value}")

    # 2. Light mode serialization
    print("\n2. Test sérialisation JSON (mode light):")
    light_json = light_state.to_json()
    print(f" JSON length: {len(light_json)} chars")

    # 3. Light mode deserialization
    print("\n3. Test désérialisation (mode light):")
    light_restored = EnrichedScreenState.from_json(light_json)
    for caption, value in (
        ("screen_state_id", light_restored.screen_state_id),
        ("mode", light_restored.mode),
        ("UI elements count", len(light_restored.ui_elements)),
    ):
        print(f" Restored {caption}: {value}")

    # 4. Enriched mode with one UI element
    print("\n4. Test création en mode enriched:")
    button_visual = VisualData(
        screenshot_path="data/elements/el_001.png",
        embedding_provider="openclip_ViT-B-32",
        embedding_vector_id="data/embeddings/el_001.npy",
    )
    button_text = TextData(
        raw="Valider",
        normalized="valider",
        embedding_provider="clip_text",
        embedding_vector_id="data/embeddings/el_001_text.npy",
    )
    button_context = ElementContext(app_name="test_app", window_title="Test Window")
    element = UIElement(
        element_id="el_test_001",
        type=UIElementType.BUTTON,
        role="validate_action",
        bbox=(100, 200, 300, 250),
        label="Valider",
        visual=button_visual,
        text=button_text,
        properties=ElementProperties(is_clickable=True),
        context=button_context,
        tags=["primary_action"],
        confidence=0.95,
    )

    enriched_embedding = StateEmbedding(
        provider="openclip_ViT-B-32",
        vector_id="data/embeddings/screen_002.npy",
        components=None,
    )
    enriched_state = EnrichedScreenState(
        screen_state_id="screen_002",
        timestamp=datetime.now(),
        session_id="session_001",
        window=window,
        raw=RawData(screenshot_path="data/screens/screen_002.png"),
        perception=PerceptionData(detected_text=["Valider", "Annuler"]),
        ui_elements=[element],
        state_embedding=enriched_embedding,
        context=ContextData(tags=["test"]),
        mode="enriched",
    )
    for caption, value in (
        ("Screen State ID", enriched_state.screen_state_id),
        ("Mode", enriched_state.mode),
        ("UI Elements", len(enriched_state.ui_elements)),
        ("Detected Text", enriched_state.perception.detected_text),
    ):
        print(f" {caption}: {value}")

    # 5. Complete mode with embedding components
    print("\n5. Test création en mode complete:")
    components = EmbeddingComponents(
        image_embedding=ComponentInfo(
            provider="openclip_ViT-B-32",
            vector_id="data/embeddings/screen_003_image.npy",
        ),
        text_embedding=ComponentInfo(
            provider="clip_text",
            vector_id="data/embeddings/screen_003_text.npy",
        ),
        title_embedding=ComponentInfo(
            provider="clip_text",
            vector_id="data/embeddings/screen_003_title.npy",
        ),
    )
    fused_embedding = StateEmbedding(
        provider="multimodal_fusion_v1",
        vector_id="data/embeddings/screen_003_fused.npy",
        components=components,
    )
    complete_state = EnrichedScreenState(
        screen_state_id="screen_003",
        timestamp=datetime.now(),
        session_id="session_001",
        window=window,
        raw=RawData(screenshot_path="data/screens/screen_003.png"),
        perception=PerceptionData(detected_text=["Valider", "Annuler"]),
        ui_elements=[element],
        state_embedding=fused_embedding,
        context=ContextData(tags=["test"]),
        mode="complete",
    )
    for caption, value in (
        ("Screen State ID", complete_state.screen_state_id),
        ("Mode", complete_state.mode),
        ("State Embedding Provider", complete_state.state_embedding.provider),
        ("Has Components", complete_state.state_embedding.components is not None),
    ):
        print(f" {caption}: {value}")

    # 6. Complete mode serialization
    print("\n6. Test sérialisation JSON (mode complete):")
    complete_json = complete_state.to_json()
    print(f" JSON length: {len(complete_json)} chars")

    # 7. Complete mode deserialization
    print("\n7. Test désérialisation (mode complete):")
    complete_restored = EnrichedScreenState.from_json(complete_json)
    for caption, value in (
        ("screen_state_id", complete_restored.screen_state_id),
        ("mode", complete_restored.mode),
        ("components", complete_restored.state_embedding.components is not None),
    ):
        print(f" Restored {caption}: {value}")

    print("\n✓ Tous les tests EnrichedScreenState réussis!")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the EnrichedScreenState smoke tests when the file is executed as a script
    test_enriched_screen_state()
|