Initial commit
This commit is contained in:
279
geniusia2/core/vision_analysis.py
Normal file
279
geniusia2/core/vision_analysis.py
Normal file
@@ -0,0 +1,279 @@
|
||||
"""
|
||||
Analyse visuelle des actions utilisateur.
|
||||
Extrait et analyse la région autour d'une action pour créer une signature visuelle.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
from typing import Dict, Any, Optional, Tuple, Union
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
|
||||
from .embeddings_manager import EmbeddingsManager
|
||||
from .embedders import EmbeddingManager as NewEmbeddingManager
|
||||
from .utils.vision_utils import VisionUtils
|
||||
from .llm_manager import LLMManager
|
||||
from .logger import Logger
|
||||
|
||||
|
||||
class VisionAnalysis:
    """
    Visual analysis of user actions to create reusable signatures.

    Extracts the screen region around an action, embeds it, and asks an
    (optional) vision LLM — with a classical-detection fallback — to
    identify the UI element, producing a "visual signature" dict that can
    later be compared against other actions via cosine similarity.
    """

    def __init__(
        self,
        embeddings_manager: Union[EmbeddingsManager, NewEmbeddingManager],
        vision_utils: VisionUtils,
        llm_manager: Optional[LLMManager] = None,
        logger: Optional[Logger] = None
    ):
        """
        Initialise the visual analyser.

        Args:
            embeddings_manager: Creates region embeddings (old or new system).
            vision_utils: Classical UI-element detection fallback.
            llm_manager: Vision LLM for element identification (optional).
            logger: Structured action logger (optional).
        """
        self.embeddings = embeddings_manager
        self.vision = vision_utils
        self.llm = llm_manager
        self.logger = logger

        # The new system exposes embed(PIL.Image); the old one exposes
        # encode_image(np.ndarray). Detect once which API we must use.
        self._use_new_system = isinstance(embeddings_manager, NewEmbeddingManager)

    def analyze_action(
        self,
        screenshot: np.ndarray,
        x: int,
        y: int,
        action_type: str,
        window: str
    ) -> Dict[str, Any]:
        """
        Analyse a user action and build its visual signature.

        Every step is best-effort: failures are logged (when a logger is
        available) and replaced by defaults, so a valid signature dict is
        always returned.

        Args:
            screenshot: Full screen image (BGR ndarray).
            x: Horizontal position of the action.
            y: Vertical position of the action.
            action_type: Kind of action (mouse_click, key_press, ...).
            window: Title of the active window.

        Returns:
            Visual signature dict (always valid, fields may hold defaults).
        """
        # Defaults used whenever a step below fails.
        element_type = "unknown"
        element_description = ""
        region = None
        region_coords = (0, 0, 0, 0)
        embedding = None

        try:
            # 1. Extract the region around the action.
            region, region_coords = self._extract_region(screenshot, x, y, size=100)

            # 2. Embed the region with whichever embedding system we hold.
            if self._use_new_system:
                # New system expects a PIL image; cv2 arrays are BGR.
                region_rgb = cv2.cvtColor(region, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(region_rgb.astype(np.uint8))
                embedding = self.embeddings.embed(pil_image)
            else:
                # Old system consumes the ndarray directly.
                embedding = self.embeddings.encode_image(region)

        except Exception as e:
            if self.logger:
                self.logger.log_action({
                    "action": "region_extraction_failed",
                    "error": str(e),
                    "position": (x, y)
                })
            # Continue with defaults.

        # 3. Identify the element, preferably with the vision LLM.
        try:
            if self.llm and region is not None:
                try:
                    prompt = f"""Analyse cette région d'interface utilisateur où l'utilisateur a cliqué.

Position du clic: ({x}, {y})
Type d'action: {action_type}

Identifie l'élément UI en une courte phrase (max 30 caractères).
Exemples: "Bouton Rafraîchir", "Icône Paramètres", "Champ de texte"

Réponds UNIQUEMENT avec l'identification, sans explication."""

                    response = self.llm.generate_with_vision(
                        prompt=prompt,
                        images=[region]
                    )

                    element_description = response.strip()[:50]

                    # Derive a coarse element type from the description
                    # (French and English keywords are both recognised).
                    words = element_description.lower().split()
                    if words:
                        if "bouton" in words or "button" in words:
                            element_type = "button"
                        elif "icône" in words or "icon" in words:
                            element_type = "icon"
                        elif "champ" in words or "field" in words or "input" in words:
                            element_type = "text_field"
                        elif "lien" in words or "link" in words:
                            element_type = "link"
                        else:
                            element_type = words[0]

                except Exception as llm_error:
                    if self.logger:
                        self.logger.log_action({
                            "action": "llm_analysis_failed",
                            "error": str(llm_error),
                            "position": (x, y)
                        })

            # Fallback: classical visual detection over the whole screenshot,
            # keeping the detection closest to the click point.
            if element_type == "unknown" and region is not None:
                try:
                    all_detections = []
                    for elem_type in ["button", "icon", "text field"]:
                        detections = self.vision.detect(elem_type, screenshot)
                        all_detections.extend(detections)

                    if all_detections:
                        closest = self._find_closest_detection(all_detections, x, y)
                        if closest:
                            element_type = closest.label

                except Exception as vision_error:
                    if self.logger:
                        self.logger.log_action({
                            "action": "vision_detection_failed",
                            "error": str(vision_error),
                            "position": (x, y)
                        })

        except Exception as e:
            if self.logger:
                self.logger.log_action({
                    "action": "element_detection_failed",
                    "error": str(e),
                    "position": (x, y)
                })

        # 4. Build the signature (always returns a valid dict).
        signature = {
            "position": (x, y),
            "region_coords": region_coords,
            "region_image": region,
            "embedding": embedding,
            "element_type": element_type,
            "element_description": element_description,
            "action_type": action_type,
            "window": window,
            "screenshot_shape": screenshot.shape
        }

        return signature

    def _extract_region(
        self,
        image: np.ndarray,
        x: int,
        y: int,
        size: int = 100
    ) -> Tuple[np.ndarray, Tuple[int, int, int, int]]:
        """
        Extract a square region centred on a point, clipped to the image.

        Args:
            image: Source image (ndarray, 2-D or 3-D).
            x: Centre x coordinate.
            y: Centre y coordinate.
            size: Side length of the returned square region.

        Returns:
            (extracted region resized to size x size,
             clipped coordinates (x1, y1, x2, y2))
        """
        h, w = image.shape[:2]
        half_size = size // 2

        # Clamp the window to the image bounds.
        x1 = max(0, x - half_size)
        y1 = max(0, y - half_size)
        x2 = min(w, x + half_size)
        y2 = min(h, y + half_size)

        region = image[y1:y2, x1:x2].copy()

        # Bug fix: a click outside the image yields an empty crop, and
        # cv2.resize raises on empty input — fall back to a black patch.
        if region.size == 0:
            region = np.zeros((size, size, *image.shape[2:]), dtype=image.dtype)
        elif region.shape[0] != size or region.shape[1] != size:
            # Pad/stretch border crops back to the canonical size.
            region = cv2.resize(region, (size, size))

        return region, (x1, y1, x2, y2)

    def _find_closest_detection(
        self,
        detections: list,
        x: int,
        y: int,
        max_distance: int = 50
    ):
        """
        Find the detection whose bbox centre is closest to a point.

        Args:
            detections: Detection objects (with .bbox) or dicts (with "bbox").
            x: Reference x coordinate.
            y: Reference y coordinate.
            max_distance: Detections farther than this are ignored.

        Returns:
            The closest detection within max_distance, or None.
        """
        if not detections:
            return None

        closest = None
        min_dist = float('inf')

        for det in detections:
            # Handle both Detection objects and plain dicts.
            if hasattr(det, 'bbox'):
                bbox = det.bbox  # Detection object
            else:
                bbox = det.get("bbox", [])  # dict

            if bbox and len(bbox) >= 4:
                # bbox format: (x, y, w, h) -> centre of the box.
                cx = bbox[0] + bbox[2] / 2
                cy = bbox[1] + bbox[3] / 2

                # Euclidean distance to the reference point.
                dist = np.sqrt((cx - x)**2 + (cy - y)**2)

                if dist < min_dist and dist < max_distance:
                    min_dist = dist
                    closest = det

        return closest

    def compare_signatures(
        self,
        sig1: Dict[str, Any],
        sig2: Dict[str, Any]
    ) -> float:
        """
        Compare two visual signatures by embedding cosine similarity.

        Args:
            sig1: First signature dict (uses its "embedding" entry).
            sig2: Second signature dict (uses its "embedding" entry).

        Returns:
            Cosine similarity score (0.0 when either embedding is missing
            or degenerate).
        """
        emb1 = sig1.get("embedding")
        emb2 = sig2.get("embedding")

        if emb1 is None or emb2 is None:
            return 0.0

        # Bug fix: guard against zero-norm embeddings, which previously
        # produced a division by zero (nan) instead of a usable score.
        norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
        if norm_product == 0:
            return 0.0

        return float(np.dot(emb1, emb2) / norm_product)
|
||||
Reference in New Issue
Block a user