Files
Geniusia_v2/geniusia2/core/task_replay.py
2026-03-05 00:20:25 +01:00

499 lines
16 KiB
Python

"""
Système de rejeu intelligent de tâches apprises avec reconnaissance visuelle.
Permet de rejouer des tâches en s'adaptant aux variations d'interface.
"""
import asyncio
import json
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple

import numpy as np
from PIL import Image

from .models import TaskProfile, Action
from .learning_manager import LearningManager
from .embeddings_manager import EmbeddingsManager
from .utils.vision_utils import VisionUtils
from .utils.input_utils import InputUtils
from .logger import Logger
class TaskReplayEngine:
    """
    Intelligent replay engine that uses vision to locate elements on
    screen and replay previously learned tasks, adapting to variations
    in the user interface.
    """

    def __init__(
        self,
        learning_manager: LearningManager,
        embeddings_manager: EmbeddingsManager,
        vision_utils: VisionUtils,
        input_utils: InputUtils,
        logger: Logger,
        config: Dict[str, Any]
    ):
        """
        Initialize the replay engine.

        Args:
            learning_manager: Manager used to load learned task profiles.
            embeddings_manager: Manager for stored embeddings.
            vision_utils: Vision helpers (CLIP embedding generation).
            input_utils: Input helpers (click / type / scroll / drag).
            logger: Structured action logger.
            config: Global configuration; only the "replay" section is read here.
        """
        self.learning_manager = learning_manager
        self.embeddings_manager = embeddings_manager
        self.vision_utils = vision_utils
        self.input_utils = input_utils
        self.logger = logger
        self.config = config

        # Read the "replay" section once instead of re-fetching it per key.
        replay_cfg = config.get("replay", {})
        self.similarity_threshold = replay_cfg.get("similarity_threshold", 0.75)
        self.max_search_attempts = replay_cfg.get("max_search_attempts", 3)
        self.delay_between_actions = replay_cfg.get("delay_between_actions", 0.5)

        self.logger.log_action({
            "action": "task_replay_engine_initialized",
            "similarity_threshold": self.similarity_threshold
        })

    async def replay_task(
        self,
        task_id: str,
        interactive: bool = False
    ) -> Dict[str, Any]:
        """
        Replay a learned task.

        Args:
            task_id: ID of the task to replay.
            interactive: If True, ask for confirmation before each action.

        Returns:
            Replay results: success flag, per-action details and counters.
        """
        # Load the task profile.
        task = self.learning_manager.load_task(task_id)
        if not task:
            return {
                "success": False,
                "error": "task_not_found",
                "task_id": task_id
            }

        self.logger.log_action({
            "action": "task_replay_started",
            "task_id": task_id,
            "interactive": interactive
        })

        # Retrieve the recorded visual signatures.
        signatures = task.metadata.get("signatures", [])
        if not signatures:
            return {
                "success": False,
                "error": "no_signatures",
                "task_id": task_id
            }

        results = {
            "task_id": task_id,
            "total_actions": len(signatures),
            "executed_actions": 0,
            "failed_actions": 0,
            "actions": []
        }

        # Replay each recorded action in order.
        for i, signature in enumerate(signatures):
            self.logger.log_action({
                "action": "replaying_step",
                "step": i + 1,
                "total": len(signatures)
            })

            # Locate the target element visually on the current screen.
            location = await self._find_element_visually(signature)
            if not location:
                self.logger.log_action({
                    "action": "element_not_found",
                    "step": i + 1,
                    "signature": signature.get("description", "Unknown")
                })
                results["failed_actions"] += 1
                results["actions"].append({
                    "step": i + 1,
                    "success": False,
                    "error": "element_not_found"
                })
                continue

            # Execute the recorded action at the located position.
            success = await self._execute_action_at_location(
                signature,
                location,
                interactive
            )
            results["actions"].append({
                "step": i + 1,
                "success": success,
                "location": location,
                "action_type": signature.get("action_type")
            })
            if success:
                results["executed_actions"] += 1
            else:
                results["failed_actions"] += 1

            # Pause between actions (but not after the last one).
            if i < len(signatures) - 1:
                await asyncio.sleep(self.delay_between_actions)

        results["success"] = results["failed_actions"] == 0
        self.logger.log_action({
            "action": "task_replay_completed",
            "task_id": task_id,
            "success": results["success"],
            "executed": results["executed_actions"],
            "failed": results["failed_actions"]
        })
        return results

    async def _find_element_visually(
        self,
        signature: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """
        Locate an element on screen using its stored visual embedding.

        Args:
            signature: Visual signature of the element; must contain
                an "embedding" entry.

        Returns:
            Dict with x, y, confidence and optional bbox, or None if
            the element could not be found.
        """
        embedding = signature.get("embedding")
        if embedding is None:
            return None
        # Signatures loaded from JSON may hold the embedding as a plain list.
        embedding = np.asarray(embedding)

        # Local import: pyautogui is only needed at actual replay time.
        import pyautogui

        for attempt in range(self.max_search_attempts):
            # BUGFIX: re-capture the screen on EVERY attempt. The previous
            # version captured once before the loop, so retries kept
            # searching the same stale screenshot and could never observe
            # UI changes.
            screenshot_np = np.array(pyautogui.screenshot())

            # Search for grid cells similar to the target embedding.
            similar_regions = await self._search_similar_regions(
                screenshot_np,
                embedding
            )
            if similar_regions:
                best_match = similar_regions[0]
                if best_match["similarity"] >= self.similarity_threshold:
                    self.logger.log_action({
                        "action": "element_found",
                        "similarity": best_match["similarity"],
                        "attempt": attempt + 1
                    })
                    return {
                        "x": best_match["x"],
                        "y": best_match["y"],
                        "confidence": best_match["similarity"],
                        "bbox": best_match.get("bbox")
                    }

            # Wait a little before retrying.
            if attempt < self.max_search_attempts - 1:
                await asyncio.sleep(0.5)
        return None

    async def _search_similar_regions(
        self,
        screenshot: np.ndarray,
        target_embedding: np.ndarray,
        grid_size: int = 4
    ) -> List[Dict[str, Any]]:
        """
        Search a screenshot for regions similar to a target embedding.

        The screenshot is divided into a grid_size x grid_size grid; each
        cell is embedded via the vision utilities and compared against the
        target with cosine similarity.

        Args:
            screenshot: Screenshot as an (H, W, ...) array.
            target_embedding: Embedding to match against.
            grid_size: Number of grid cells per axis.

        Returns:
            Regions sorted by decreasing similarity; each entry holds the
            cell center ("x", "y"), its "bbox" and the "similarity" score.
        """
        height, width = screenshot.shape[:2]
        cell_height = height // grid_size
        cell_width = width // grid_size
        regions = []

        for row in range(grid_size):
            for col in range(grid_size):
                y1 = row * cell_height
                x1 = col * cell_width
                y2 = min((row + 1) * cell_height, height)
                x2 = min((col + 1) * cell_width, width)

                region = screenshot[y1:y2, x1:x2]
                # Guard: skip degenerate cells (screenshot smaller than grid).
                if region.size == 0:
                    continue

                region_embedding = self.vision_utils.generate_clip_embedding(region)
                similarity = self._cosine_similarity(
                    target_embedding,
                    region_embedding
                )

                # Use the cell center as the click target.
                center_x = (x1 + x2) // 2
                center_y = (y1 + y2) // 2
                regions.append({
                    "x": center_x,
                    "y": center_y,
                    "bbox": (x1, y1, x2, y2),
                    "similarity": similarity
                })

        # Best match first.
        regions.sort(key=lambda r: r["similarity"], reverse=True)
        return regions

    def _cosine_similarity(
        self,
        emb1: np.ndarray,
        emb2: np.ndarray
    ) -> float:
        """Return the cosine similarity between two embeddings (0.0 if either is zero)."""
        dot_product = np.dot(emb1, emb2)
        norm1 = np.linalg.norm(emb1)
        norm2 = np.linalg.norm(emb2)
        # Avoid division by zero for all-zero vectors.
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return float(dot_product / (norm1 * norm2))

    async def _execute_action_at_location(
        self,
        signature: Dict[str, Any],
        location: Dict[str, Any],
        interactive: bool
    ) -> bool:
        """
        Execute an action at a given location.

        Args:
            signature: Action signature (action_type plus type-specific fields).
            location: Element location ("x", "y").
            interactive: If True, ask for confirmation first.

        Returns:
            True on success, False on refusal, unknown action type or error.
        """
        action_type = signature.get("action_type", "click")
        x = location["x"]
        y = location["y"]

        # Ask for confirmation in interactive mode.
        if interactive:
            confirmed = await self._ask_confirmation(signature, location)
            if not confirmed:
                return False

        try:
            if action_type == "click":
                self.input_utils.click(x, y)
                return True
            elif action_type == "type":
                text = signature.get("text", "")
                self.input_utils.type_text(text)
                return True
            elif action_type == "scroll":
                direction = signature.get("direction", "down")
                amount = signature.get("amount", 3)
                self.input_utils.scroll(direction, amount, x, y)
                return True
            elif action_type == "drag":
                # Drag needs a destination; default to 100px to the right.
                end_x = signature.get("end_x", x + 100)
                end_y = signature.get("end_y", y)
                self.input_utils.drag(x, y, end_x, end_y)
                return True
            else:
                self.logger.log_action({
                    "action": "unknown_action_type",
                    "type": action_type
                })
                return False
        except Exception as e:
            # Log and report failure instead of aborting the whole replay.
            self.logger.log_action({
                "action": "execute_action_error",
                "error": str(e),
                "action_type": action_type
            })
            return False

    async def _ask_confirmation(
        self,
        signature: Dict[str, Any],
        location: Dict[str, Any]
    ) -> bool:
        """
        Ask the user for confirmation (interactive mode).

        Args:
            signature: Action signature.
            location: Element location.

        Returns:
            True if confirmed.
        """
        # TODO: implement a real confirmation interface.
        # For now, auto-accept.
        return True

    def list_available_tasks(self) -> List[Dict[str, Any]]:
        """
        List all tasks available for replay.

        Returns:
            List of task metadata dicts (task_id, task_name,
            observation_count, confidence).
        """
        tasks: List[Dict[str, Any]] = []
        profiles_path = Path(self.learning_manager.profiles_path)
        # A missing profiles directory simply means no tasks yet
        # (iterdir() would raise FileNotFoundError otherwise).
        if not profiles_path.is_dir():
            return tasks

        for task_dir in profiles_path.iterdir():
            if not task_dir.is_dir():
                continue
            metadata_file = task_dir / "metadata.json"
            if not metadata_file.exists():
                continue
            try:
                with open(metadata_file, "r") as f:
                    metadata = json.load(f)
                tasks.append({
                    "task_id": metadata.get("task_id"),
                    "task_name": metadata.get("task_name", metadata.get("description")),
                    "observation_count": metadata.get("observation_count", metadata.get("observations")),
                    "confidence": metadata.get("confidence_score", 0.0)
                })
            except Exception as e:
                # Corrupt metadata for one task must not hide the others.
                self.logger.log_action({
                    "action": "task_list_error",
                    "task_dir": str(task_dir),
                    "error": str(e)
                })
        return tasks

    async def replay_task_with_monitoring(
        self,
        task_id: str,
        on_step_completed: Optional[Callable[[Dict[str, Any]], None]] = None
    ) -> Dict[str, Any]:
        """
        Replay a task with real-time monitoring.

        Args:
            task_id: Task ID.
            on_step_completed: Callback invoked after each step with the
                step-result dict.

        Returns:
            Replay results with per-step statuses.
        """
        task = self.learning_manager.load_task(task_id)
        if not task:
            return {
                "success": False,
                "error": "task_not_found",
                "task_id": task_id
            }

        signatures = task.metadata.get("signatures", [])
        # Consistent with replay_task: an empty task is an error, not a
        # vacuous success (all() over an empty list would be True).
        if not signatures:
            return {
                "success": False,
                "error": "no_signatures",
                "task_id": task_id
            }

        results = {
            "task_id": task_id,
            "steps": []
        }

        for i, signature in enumerate(signatures):
            step_result = {
                "step": i + 1,
                "description": signature.get("description", "Unknown"),
                "status": "pending"
            }

            # Locate, then execute.
            location = await self._find_element_visually(signature)
            if location:
                success = await self._execute_action_at_location(
                    signature,
                    location,
                    False
                )
                step_result["status"] = "success" if success else "failed"
                step_result["location"] = location
            else:
                step_result["status"] = "not_found"

            results["steps"].append(step_result)

            # Notify the monitor.
            if on_step_completed:
                on_step_completed(step_result)

            # Pause between steps, but not after the last one
            # (matches replay_task).
            if i < len(signatures) - 1:
                await asyncio.sleep(self.delay_between_actions)

        results["success"] = all(
            s["status"] == "success" for s in results["steps"]
        )
        return results