# Geniusia_v2/geniusia2/core/vision_analysis.py
"""
Analyse visuelle des actions utilisateur.
Extrait et analyse la région autour d'une action pour créer une signature visuelle.
"""
import numpy as np
import cv2
from typing import Dict, Any, Optional, Tuple, Union
from pathlib import Path
from PIL import Image
from .embeddings_manager import EmbeddingsManager
from .embedders import EmbeddingManager as NewEmbeddingManager
from .utils.vision_utils import VisionUtils
from .llm_manager import LLMManager
from .logger import Logger

class VisionAnalysis:
    """
    Visual analysis of actions to create reusable signatures.
    """

    def __init__(
        self,
        embeddings_manager: Union[EmbeddingsManager, NewEmbeddingManager],
        vision_utils: VisionUtils,
        llm_manager: Optional[LLMManager] = None,
        logger: Optional[Logger] = None
    ):
        """
        Initializes the visual analyzer.

        Args:
            embeddings_manager: Creates the embeddings (old or new system)
            vision_utils: Detects UI elements
            llm_manager: Provides context (optional)
            logger: Handles logging
        """
        self.embeddings = embeddings_manager
        self.vision = vision_utils
        self.llm = llm_manager
        self.logger = logger
        # Detect whether the new embedding system is in use
        self._use_new_system = isinstance(embeddings_manager, NewEmbeddingManager)

    def analyze_action(
        self,
        screenshot: np.ndarray,
        x: int,
        y: int,
        action_type: str,
        window: str
    ) -> Dict[str, Any]:
        """
        Analyzes a user action and creates its visual signature.

        Args:
            screenshot: Full-screen image
            x, y: Position of the action
            action_type: Action type (mouse_click, key_press, etc.)
            window: Active window

        Returns:
            Visual signature of the action (always returns a valid signature)
        """
        # Initialize defaults
        element_type = "unknown"
        element_description = ""
        region = None
        region_coords = (0, 0, 0, 0)
        embedding = None

        try:
            # 1. Extract the region around the action
            region, region_coords = self._extract_region(screenshot, x, y, size=100)
            # 2. Create the embedding of the region
            if self._use_new_system:
                # New system: convert numpy (BGR) to PIL (RGB), then embed
                region_rgb = cv2.cvtColor(region, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(region_rgb.astype(np.uint8))
                embedding = self.embeddings.embed(pil_image)
            else:
                # Old system: use encode_image directly
                embedding = self.embeddings.encode_image(region)
        except Exception as e:
            if self.logger:
                self.logger.log_action({
                    "action": "region_extraction_failed",
                    "error": str(e),
                    "position": (x, y)
                })
            # Continue with defaults
        # 3. Detect the element type with the LLM
        try:
            # Use Qwen3-VL to identify the element
            if self.llm and region is not None:
                try:
                    # Prompt kept in French (user-facing app language); the keyword
                    # matching below handles both French and English labels.
                    prompt = f"""Analyse cette région d'interface utilisateur où l'utilisateur a cliqué.
Position du clic: ({x}, {y})
Type d'action: {action_type}
Identifie l'élément UI en une courte phrase (max 30 caractères).
Exemples: "Bouton Rafraîchir", "Icône Paramètres", "Champ de texte"
Réponds UNIQUEMENT avec l'identification, sans explication."""
                    response = self.llm.generate_with_vision(
                        prompt=prompt,
                        images=[region]
                    )
                    element_description = response.strip()[:50]
                    # Extract the type (usually the first word)
                    words = element_description.lower().split()
                    if words:
                        if "bouton" in words or "button" in words:
                            element_type = "button"
                        elif "icône" in words or "icon" in words:
                            element_type = "icon"
                        elif "champ" in words or "field" in words or "input" in words:
                            element_type = "text_field"
                        elif "lien" in words or "link" in words:
                            element_type = "link"
                        else:
                            element_type = words[0]
                except Exception as llm_error:
                    if self.logger:
                        self.logger.log_action({
                            "action": "llm_analysis_failed",
                            "error": str(llm_error),
                            "position": (x, y)
                        })

            # Fallback: classic visual detection
            if element_type == "unknown" and region is not None:
                try:
                    all_detections = []
                    for elem_type in ["button", "icon", "text field"]:
                        detections = self.vision.detect(elem_type, screenshot)
                        all_detections.extend(detections)
                    if all_detections:
                        closest = self._find_closest_detection(all_detections, x, y)
                        if closest:
                            # Handle both Detection objects and dicts, mirroring
                            # _find_closest_detection's duck typing
                            if hasattr(closest, "label"):
                                element_type = closest.label
                            else:
                                element_type = closest.get("label", element_type)
                except Exception as vision_error:
                    if self.logger:
                        self.logger.log_action({
                            "action": "vision_detection_failed",
                            "error": str(vision_error),
                            "position": (x, y)
                        })
        except Exception as e:
            if self.logger:
                self.logger.log_action({
                    "action": "element_detection_failed",
                    "error": str(e),
                    "position": (x, y)
                })
        # 4. Create the signature (always return a valid signature)
        signature = {
            "position": (x, y),
            "region_coords": region_coords,
            "region_image": region,
            "embedding": embedding,
            "element_type": element_type,
            "element_description": element_description,
            "action_type": action_type,
            "window": window,
            "screenshot_shape": screenshot.shape
        }
        return signature

    def _extract_region(
        self,
        image: np.ndarray,
        x: int,
        y: int,
        size: int = 100
    ) -> Tuple[np.ndarray, Tuple[int, int, int, int]]:
        """
        Extracts a square region around a point.

        Returns:
            (extracted region, coordinates (x1, y1, x2, y2))
        """
        h, w = image.shape[:2]
        half_size = size // 2
        # Compute coordinates clamped to the image bounds
        x1 = max(0, x - half_size)
        y1 = max(0, y - half_size)
        x2 = min(w, x + half_size)
        y2 = min(h, y + half_size)
        region = image[y1:y2, x1:x2].copy()
        # Resize to size x size if needed (crops clipped at a border get stretched)
        if region.shape[0] != size or region.shape[1] != size:
            region = cv2.resize(region, (size, size))
        return region, (x1, y1, x2, y2)

    def _find_closest_detection(
        self,
        detections: list,
        x: int,
        y: int,
        max_distance: int = 50
    ):
        """
        Finds the detection closest to a point.

        Returns:
            Closest detection, or None if none is within max_distance
        """
        if not detections:
            return None
        closest = None
        min_dist = float('inf')
        for det in detections:
            # Handle both Detection objects and dicts
            if hasattr(det, 'bbox'):
                bbox = det.bbox  # Detection object
            else:
                bbox = det.get("bbox", [])  # Dict
            if bbox and len(bbox) >= 4:
                # bbox format: (x, y, w, h)
                cx = bbox[0] + bbox[2] / 2
                cy = bbox[1] + bbox[3] / 2
                # Distance from the detection center to the point
                dist = np.sqrt((cx - x)**2 + (cy - y)**2)
                if dist < min_dist and dist < max_distance:
                    min_dist = dist
                    closest = det
        return closest

    def compare_signatures(
        self,
        sig1: Dict[str, Any],
        sig2: Dict[str, Any]
    ) -> float:
        """
        Compares two visual signatures.

        Returns:
            Similarity score (0-1)
        """
        # Compare the embeddings
        emb1 = sig1.get("embedding")
        emb2 = sig2.get("embedding")
        if emb1 is None or emb2 is None:
            return 0.0
        # Cosine similarity; guard against zero-norm embeddings
        norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
        if norm_product == 0:
            return 0.0
        similarity = np.dot(emb1, emb2) / norm_product
        # Clamp to [0, 1] to honor the documented score range
        return float(np.clip(similarity, 0.0, 1.0))
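
if __name__ == "__main__":
    # Minimal smoke test: a hedged sketch, not part of the production pipeline.
    # It exercises only the helpers that need no manager (region extraction,
    # closest-detection matching on dict detections, signature comparison),
    # so the managers are stubbed with None. Note: because this module uses
    # relative imports, run it as a package module, e.g.
    #   python -m geniusia2.core.vision_analysis
    va = VisionAnalysis(embeddings_manager=None, vision_utils=None)

    # Region extraction near a corner: coordinates are clamped to the image,
    # then the 60x60 clipped crop is resized back to 100x100.
    img = np.zeros((480, 640, 3), dtype=np.uint8)
    region, coords = va._extract_region(img, x=10, y=10, size=100)
    print(region.shape, coords)  # (100, 100, 3) (0, 0, 60, 60)

    # Closest detection using the dict form, with (x, y, w, h) bboxes.
    detections = [{"bbox": (0, 0, 40, 40), "label": "button"},
                  {"bbox": (300, 300, 40, 40), "label": "icon"}]
    closest = va._find_closest_detection(detections, x=15, y=15)
    print(closest["label"] if closest else None)  # button

    # Identical embeddings compare to ~1.0.
    emb = np.random.rand(512).astype(np.float32)
    print(va.compare_signatures({"embedding": emb}, {"embedding": emb}))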