diff --git a/agent_rust/src/executor.rs b/agent_rust/src/executor.rs index 011b82a9d..52fef6881 100644 --- a/agent_rust/src/executor.rs +++ b/agent_rust/src/executor.rs @@ -4,7 +4,9 @@ //! Utilise enigo pour la simulation, compatible Windows et Linux. //! Reproduit le comportement de agent_v1/core/executor.py. +use crate::config::Config; use crate::network::{Action, ActionResult}; +use crate::visual; use enigo::{ Coordinate, Direction, Enigo, Key, Keyboard, Mouse, Settings, }; @@ -16,14 +18,16 @@ use std::time::Duration; /// Dispatche vers le bon handler selon le type d'action. /// Les coordonnées x_pct/y_pct (0.0-1.0) sont converties en pixels /// à partir des dimensions de l'écran. +/// Si visual_mode est activé, résout d'abord la cible via le serveur. pub fn execute_action( action: &Action, screen_width: u32, screen_height: u32, + config: &Config, ) -> ActionResult { match action.action_type.as_str() { - "click" => execute_click(action, screen_width, screen_height), - "type" => execute_type(action, screen_width, screen_height), + "click" => execute_click(action, screen_width, screen_height, config), + "type" => execute_type(action, screen_width, screen_height, config), "key_combo" => execute_key_combo(action), "scroll" => execute_scroll(action, screen_width, screen_height), "wait" => execute_wait(action), @@ -34,14 +38,59 @@ pub fn execute_action( } } +/// Résout les coordonnées visuellement si visual_mode est activé. +/// +/// Si la résolution échoue, retourne les coordonnées de fallback (blind). +/// Si visual_mode est désactivé ou target_spec absent, retourne les coordonnées originales. +fn resolve_coordinates( + action: &Action, + screen_width: u32, + screen_height: u32, + config: &Config, +) -> (f64, f64) { + let mut x_pct = action.x_pct; + let mut y_pct = action.y_pct; + + if action.visual_mode && !action.target_spec.is_null() { + println!( + " [VISUAL] Mode visuel active — resolution de la cible..." + ); + match visual::resolve_target_visual( + config, + &action.target_spec, + x_pct, + y_pct, + screen_width, + screen_height, + ) { + Some((rx, ry)) => { + println!(" [VISUAL] Resolu : ({:.4}, {:.4})", rx, ry); + x_pct = rx; + y_pct = ry; + } + None => { + println!( + " [VISUAL] Echec — fallback coordonnees aveugles ({:.4}, {:.4})", + x_pct, y_pct + ); + } + } + } + + (x_pct, y_pct) +} + /// Exécute un clic souris aux coordonnées normalisées. -fn execute_click(action: &Action, screen_width: u32, screen_height: u32) -> ActionResult { - let real_x = (action.x_pct * screen_width as f64) as i32; - let real_y = (action.y_pct * screen_height as f64) as i32; +/// Résout visuellement la cible si visual_mode est activé. +fn execute_click(action: &Action, screen_width: u32, screen_height: u32, config: &Config) -> ActionResult { + let (x_pct, y_pct) = resolve_coordinates(action, screen_width, screen_height, config); + let real_x = (x_pct * screen_width as f64) as i32; + let real_y = (y_pct * screen_height as f64) as i32; println!( - " [CLICK] ({:.3}, {:.3}) -> ({}, {}) sur ({}x{}), bouton={}", - action.x_pct, action.y_pct, real_x, real_y, screen_width, screen_height, action.button + " [CLICK] ({:.4}, {:.4}) -> ({}, {}) sur ({}x{}), bouton={}{}", + x_pct, y_pct, real_x, real_y, screen_width, screen_height, action.button, + if action.visual_mode { " [VISUAL]" } else { "" } ); let mut enigo = match Enigo::new(&Settings::default()) { @@ -93,7 +142,7 @@ fn execute_click(action: &Action, screen_width: u32, screen_height: u32) -> Acti /// /// Si des coordonnées sont fournies (x_pct > 0), clique d'abord /// sur le champ avant de taper (comme en Python). -fn execute_type(action: &Action, screen_width: u32, screen_height: u32) -> ActionResult { +fn execute_type(action: &Action, screen_width: u32, screen_height: u32, config: &Config) -> ActionResult { let text = &action.text; println!( " [TYPE] Texte: '{}' ({} chars)", @@ -101,6 +150,9 @@ fn execute_type(action: &Action, screen_width: u32, screen_height: u32) -> Actio text.len() ); + // Résoudre visuellement les coordonnées si visual_mode est activé + let (x_pct, y_pct) = resolve_coordinates(action, screen_width, screen_height, config); + let mut enigo = match Enigo::new(&Settings::default()) { Ok(e) => e, Err(e) => { @@ -112,10 +164,11 @@ fn execute_type(action: &Action, screen_width: u32, screen_height: u32) -> Actio }; // Clic préalable sur le champ si coordonnées disponibles - if action.x_pct > 0.0 && action.y_pct > 0.0 { - let real_x = (action.x_pct * screen_width as f64) as i32; - let real_y = (action.y_pct * screen_height as f64) as i32; - println!(" [TYPE] Clic prealable sur ({}, {})", real_x, real_y); + if x_pct > 0.0 && y_pct > 0.0 { + let real_x = (x_pct * screen_width as f64) as i32; + let real_y = (y_pct * screen_height as f64) as i32; + println!(" [TYPE] Clic prealable sur ({}, {}){}", real_x, real_y, + if action.visual_mode { " [VISUAL]" } else { "" }); if let Err(e) = enigo.move_mouse(real_x, real_y, Coordinate::Abs) { eprintln!(" [TYPE] Erreur deplacement souris : {}", e); diff --git a/agent_rust/src/main.rs b/agent_rust/src/main.rs index d852a43fd..9ec449b77 100644 --- a/agent_rust/src/main.rs +++ b/agent_rust/src/main.rs @@ -14,6 +14,7 @@ mod executor; mod network; mod replay; mod server; +mod visual; use config::Config; use reqwest::blocking::Client; diff --git a/agent_rust/src/replay.rs b/agent_rust/src/replay.rs index c5ed4c676..fe0d1e8ac 100644 --- a/agent_rust/src/replay.rs +++ b/agent_rust/src/replay.rs @@ -70,9 +70,9 @@ pub fn replay_poll_loop(config: &Config) { // Obtenir les dimensions de l'écran let (sw, sh) = capture::screen_dimensions().unwrap_or((1920, 1080)); - // Exécuter l'action + // Exécuter l'action (avec config pour la résolution visuelle) println!(">>> Execution de l'action {}...", action_type); - let mut result = executor::execute_action(&action, sw, sh); + let mut result = executor::execute_action(&action, sw, sh, config); println!( ">>> Resultat execution : success={}, error={:?}", result.success, result.error diff --git a/agent_rust/src/visual.rs b/agent_rust/src/visual.rs new file mode 100644 index 000000000..5bf75c195 --- /dev/null +++ b/agent_rust/src/visual.rs @@ -0,0 +1,110 @@ +//! Résolution visuelle des cibles via le serveur. +//! +//! Envoie un screenshot + target_spec au serveur qui effectue le template +//! matching OpenCV et retourne les coordonnées résolues (x_pct, y_pct). +//! Approche server-side : pas de dépendance OpenCV dans le binaire Rust. + +use crate::capture; +use crate::config::Config; +use reqwest::blocking::Client; + +/// Résout visuellement une cible en envoyant le screenshot courant au serveur. +/// +/// Capture l'écran, l'encode en JPEG base64, envoie au endpoint +/// `/traces/stream/replay/resolve_target` qui fait le template matching. +/// +/// Retourne Some((x_pct, y_pct)) si la cible est trouvée, None sinon. +pub fn resolve_target_visual( + config: &Config, + target_spec: &serde_json::Value, + fallback_x: f64, + fallback_y: f64, + screen_width: u32, + screen_height: u32, +) -> Option<(f64, f64)> { + // 1. Capturer le screenshot actuel + let screenshot = match capture::capture_screenshot() { + Some(img) => img, + None => { + eprintln!(" [VISUAL] Echec capture screenshot pour résolution visuelle"); + return None; + } + }; + + // Encoder en JPEG base64 (qualité 75 — bon compromis taille/précision) + let screenshot_b64 = capture::screenshot_to_jpeg_base64(&screenshot, 75); + if screenshot_b64.is_empty() { + eprintln!(" [VISUAL] Echec encodage JPEG"); + return None; + } + + println!( + " [VISUAL] Screenshot capture ({}x{}), envoi au serveur...", + screen_width, screen_height + ); + + // 2. Envoyer au serveur /replay/resolve_target + let client = Client::new(); + let payload = serde_json::json!({ + "session_id": config.agent_session_id(), + "screenshot_b64": screenshot_b64, + "target_spec": target_spec, + "fallback_x_pct": fallback_x, + "fallback_y_pct": fallback_y, + "screen_width": screen_width, + "screen_height": screen_height, + }); + + let url = format!("{}/traces/stream/replay/resolve_target", config.server_url); + + let resp = match client + .post(&url) + .json(&payload) + .timeout(std::time::Duration::from_secs(30)) + .send() + { + Ok(r) => r, + Err(e) => { + eprintln!(" [VISUAL] Erreur reseau vers {} : {}", url, e); + return None; + } + }; + + if !resp.status().is_success() { + eprintln!( + " [VISUAL] Serveur a repondu HTTP {}", + resp.status() + ); + return None; + } + + // 3. Parser la réponse + let data: serde_json::Value = match resp.json() { + Ok(d) => d, + Err(e) => { + eprintln!(" [VISUAL] Erreur parsing reponse JSON : {}", e); + return None; + } + }; + + let resolved = data["resolved"].as_bool().unwrap_or(false); + if resolved { + let x = data["x_pct"].as_f64()?; + let y = data["y_pct"].as_f64()?; + let method = data["method"].as_str().unwrap_or("?"); + let score = data["score"].as_f64().unwrap_or(0.0); + println!( + " [VISUAL] Resolu par {} (score={:.3}) : ({:.4}, {:.4})", + method, score, x, y + ); + Some((x, y)) + } else { + let reason = data["reason"].as_str().unwrap_or("inconnu"); + let method = data["method"].as_str().unwrap_or("?"); + println!( + " [VISUAL] Non resolu (methode={}, raison={})", + method, reason + ); + None + } +}