"""
|
|
Analyse visuelle des actions utilisateur.
|
|
Extrait et analyse la région autour d'une action pour créer une signature visuelle.
|
|
"""

from pathlib import Path
from typing import Dict, Any, Optional, Tuple, Union

import cv2
import numpy as np
from PIL import Image

from .embeddings_manager import EmbeddingsManager
from .embedders import EmbeddingManager as NewEmbeddingManager
from .utils.vision_utils import VisionUtils
from .llm_manager import LLMManager
from .logger import Logger


class VisionAnalysis:
    """
    Visual analysis of user actions to build reusable visual signatures.

    A signature bundles the cropped region around an action, its embedding,
    and a best-effort identification of the UI element that was interacted
    with (vision LLM first, classical detection as fallback).
    """

    def __init__(
        self,
        embeddings_manager: Union[EmbeddingsManager, NewEmbeddingManager],
        vision_utils: VisionUtils,
        llm_manager: Optional[LLMManager] = None,
        logger: Optional[Logger] = None
    ):
        """
        Initialize the visual analyzer.

        Args:
            embeddings_manager: Creates region embeddings (old or new system)
            vision_utils: Classical UI-element detection fallback
            llm_manager: Vision LLM used to describe elements (optional)
            logger: Action logger (optional)
        """
        self.embeddings = embeddings_manager
        self.vision = vision_utils
        self.llm = llm_manager
        self.logger = logger

        # The two embedding systems expose different APIs (embed() taking a
        # PIL image vs. encode_image() taking a numpy array); remember which
        # one was injected so analyze_action can dispatch correctly.
        self._use_new_system = isinstance(embeddings_manager, NewEmbeddingManager)

    def analyze_action(
        self,
        screenshot: np.ndarray,
        x: int,
        y: int,
        action_type: str,
        window: str
    ) -> Dict[str, Any]:
        """
        Analyze a user action and build its visual signature.

        Args:
            screenshot: Full screen image (BGR, as produced by OpenCV)
            x, y: Position of the action
            action_type: Action kind (mouse_click, key_press, etc.)
            window: Active window

        Returns:
            Visual signature of the action. Always returns a valid signature:
            any step that fails is logged and its field keeps its default.
        """
        # Defaults used whenever a step below fails.
        element_type = "unknown"
        element_description = ""
        region = None
        region_coords = (0, 0, 0, 0)
        embedding = None

        try:
            # 1. Extract the region around the action.
            region, region_coords = self._extract_region(screenshot, x, y, size=100)

            # 2. Embed the region.
            if self._use_new_system:
                # New system: expects a PIL RGB image.
                region_rgb = cv2.cvtColor(region, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(region_rgb.astype(np.uint8))
                embedding = self.embeddings.embed(pil_image)
            else:
                # Old system: accepts the BGR numpy array directly.
                embedding = self.embeddings.encode_image(region)

        except Exception as e:
            if self.logger:
                self.logger.log_action({
                    "action": "region_extraction_failed",
                    "error": str(e),
                    "position": (x, y)
                })
            # Continue with defaults.

        # 3. Identify the element type, preferring the vision LLM.
        try:
            if self.llm and region is not None:
                try:
                    # NOTE: the prompt is a runtime string sent to the model
                    # (Qwen3-VL) and is intentionally kept in French.
                    prompt = f"""Analyse cette région d'interface utilisateur où l'utilisateur a cliqué.

Position du clic: ({x}, {y})
Type d'action: {action_type}

Identifie l'élément UI en une courte phrase (max 30 caractères).
Exemples: "Bouton Rafraîchir", "Icône Paramètres", "Champ de texte"

Réponds UNIQUEMENT avec l'identification, sans explication."""

                    response = self.llm.generate_with_vision(
                        prompt=prompt,
                        images=[region]
                    )

                    element_description = response.strip()[:50]
                    element_type = self._infer_element_type(element_description)

                except Exception as llm_error:
                    if self.logger:
                        self.logger.log_action({
                            "action": "llm_analysis_failed",
                            "error": str(llm_error),
                            "position": (x, y)
                        })

            # Fallback: classical visual detection on the full screenshot.
            if element_type == "unknown" and region is not None:
                try:
                    all_detections = []
                    for elem_type in ["button", "icon", "text field"]:
                        detections = self.vision.detect(elem_type, screenshot)
                        all_detections.extend(detections)

                    if all_detections:
                        closest = self._find_closest_detection(all_detections, x, y)
                        if closest:
                            element_type = closest.label

                except Exception as vision_error:
                    if self.logger:
                        self.logger.log_action({
                            "action": "vision_detection_failed",
                            "error": str(vision_error),
                            "position": (x, y)
                        })

        except Exception as e:
            if self.logger:
                self.logger.log_action({
                    "action": "element_detection_failed",
                    "error": str(e),
                    "position": (x, y)
                })

        # 4. Build the signature (always returns a valid signature).
        signature = {
            "position": (x, y),
            "region_coords": region_coords,
            "region_image": region,
            "embedding": embedding,
            "element_type": element_type,
            "element_description": element_description,
            "action_type": action_type,
            "window": window,
            "screenshot_shape": screenshot.shape
        }

        return signature

    def _infer_element_type(self, description: str) -> str:
        """
        Map a free-text element description to a coarse element type.

        Matches French and English keywords; falls back to the first word of
        the description, or "unknown" for an empty description.
        """
        words = description.lower().split()
        if not words:
            return "unknown"
        if "bouton" in words or "button" in words:
            return "button"
        if "icône" in words or "icon" in words:
            return "icon"
        if "champ" in words or "field" in words or "input" in words:
            return "text_field"
        if "lien" in words or "link" in words:
            return "link"
        return words[0]

    def _extract_region(
        self,
        image: np.ndarray,
        x: int,
        y: int,
        size: int = 100
    ) -> Tuple[np.ndarray, Tuple[int, int, int, int]]:
        """
        Extract a square region centered on a point.

        Returns:
            (extracted region of size x size, coordinates (x1, y1, x2, y2))
        """
        h, w = image.shape[:2]
        half_size = size // 2

        # Clamp the center into the frame: a click reported just outside the
        # screenshot (e.g. multi-monitor offsets) would otherwise produce an
        # empty slice and make cv2.resize raise.
        x = min(max(0, x), w - 1)
        y = min(max(0, y), h - 1)

        # Compute coordinates, staying inside the image.
        x1 = max(0, x - half_size)
        y1 = max(0, y - half_size)
        x2 = min(w, x + half_size)
        y2 = min(h, y + half_size)

        region = image[y1:y2, x1:x2].copy()

        # Resize to size x size if the crop was clipped by an image border.
        if region.shape[0] != size or region.shape[1] != size:
            region = cv2.resize(region, (size, size))

        return region, (x1, y1, x2, y2)

    def _find_closest_detection(
        self,
        detections: list,
        x: int,
        y: int,
        max_distance: int = 50
    ):
        """
        Find the detection closest to a point.

        Args:
            detections: Detection objects (with .bbox) or dicts (with "bbox")
            x, y: Reference point
            max_distance: Detections farther than this are ignored

        Returns:
            Closest detection, or None if none is within max_distance
        """
        if not detections:
            return None

        closest = None
        min_dist = float('inf')

        for det in detections:
            # Handle both Detection objects and plain dicts.
            if hasattr(det, 'bbox'):
                bbox = det.bbox  # Detection object
            else:
                bbox = det.get("bbox", [])  # dict

            if bbox and len(bbox) >= 4:
                # bbox format: (x, y, w, h) -> center of the box
                cx = bbox[0] + bbox[2] / 2
                cy = bbox[1] + bbox[3] / 2

                # Distance from the box center to the point.
                dist = np.sqrt((cx - x)**2 + (cy - y)**2)

                if dist < min_dist and dist < max_distance:
                    min_dist = dist
                    closest = det

        return closest

    def compare_signatures(
        self,
        sig1: Dict[str, Any],
        sig2: Dict[str, Any]
    ) -> float:
        """
        Compare two visual signatures by their embeddings.

        Returns:
            Cosine similarity score (0.0 when either embedding is missing
            or has zero norm, so the result is never NaN).
        """
        emb1 = sig1.get("embedding")
        emb2 = sig2.get("embedding")

        if emb1 is None or emb2 is None:
            return 0.0

        # Guard against zero-norm embeddings: the original division would
        # produce NaN (0/0) instead of a usable score.
        norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
        if norm_product == 0:
            return 0.0

        # Cosine similarity.
        return float(np.dot(emb1, emb2) / norm_product)
|