feat: agent Rust Phase 2 — visual mode (template matching serveur)
- visual.rs : resolve via POST /replay/resolve_target
- executor.rs : resolve avant chaque clic si visual_mode=true
- Fallback blind si matching échoue
- Binaire toujours 1.8 MB (pas de nouvelle dépendance)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -4,7 +4,9 @@
|
|||||||
//! Utilise enigo pour la simulation, compatible Windows et Linux.
|
//! Utilise enigo pour la simulation, compatible Windows et Linux.
|
||||||
//! Reproduit le comportement de agent_v1/core/executor.py.
|
//! Reproduit le comportement de agent_v1/core/executor.py.
|
||||||
|
|
||||||
|
use crate::config::Config;
|
||||||
use crate::network::{Action, ActionResult};
|
use crate::network::{Action, ActionResult};
|
||||||
|
use crate::visual;
|
||||||
use enigo::{
|
use enigo::{
|
||||||
Coordinate, Direction, Enigo, Key, Keyboard, Mouse, Settings,
|
Coordinate, Direction, Enigo, Key, Keyboard, Mouse, Settings,
|
||||||
};
|
};
|
||||||
@@ -16,14 +18,16 @@ use std::time::Duration;
|
|||||||
/// Dispatche vers le bon handler selon le type d'action.
|
/// Dispatche vers le bon handler selon le type d'action.
|
||||||
/// Les coordonnées x_pct/y_pct (0.0-1.0) sont converties en pixels
|
/// Les coordonnées x_pct/y_pct (0.0-1.0) sont converties en pixels
|
||||||
/// à partir des dimensions de l'écran.
|
/// à partir des dimensions de l'écran.
|
||||||
|
/// Si visual_mode est activé, résout d'abord la cible via le serveur.
|
||||||
pub fn execute_action(
|
pub fn execute_action(
|
||||||
action: &Action,
|
action: &Action,
|
||||||
screen_width: u32,
|
screen_width: u32,
|
||||||
screen_height: u32,
|
screen_height: u32,
|
||||||
|
config: &Config,
|
||||||
) -> ActionResult {
|
) -> ActionResult {
|
||||||
match action.action_type.as_str() {
|
match action.action_type.as_str() {
|
||||||
"click" => execute_click(action, screen_width, screen_height),
|
"click" => execute_click(action, screen_width, screen_height, config),
|
||||||
"type" => execute_type(action, screen_width, screen_height),
|
"type" => execute_type(action, screen_width, screen_height, config),
|
||||||
"key_combo" => execute_key_combo(action),
|
"key_combo" => execute_key_combo(action),
|
||||||
"scroll" => execute_scroll(action, screen_width, screen_height),
|
"scroll" => execute_scroll(action, screen_width, screen_height),
|
||||||
"wait" => execute_wait(action),
|
"wait" => execute_wait(action),
|
||||||
@@ -34,14 +38,59 @@ pub fn execute_action(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Résout les coordonnées visuellement si visual_mode est activé.
|
||||||
|
///
|
||||||
|
/// Si la résolution échoue, retourne les coordonnées de fallback (blind).
|
||||||
|
/// Si visual_mode est désactivé ou target_spec absent, retourne les coordonnées originales.
|
||||||
|
fn resolve_coordinates(
|
||||||
|
action: &Action,
|
||||||
|
screen_width: u32,
|
||||||
|
screen_height: u32,
|
||||||
|
config: &Config,
|
||||||
|
) -> (f64, f64) {
|
||||||
|
let mut x_pct = action.x_pct;
|
||||||
|
let mut y_pct = action.y_pct;
|
||||||
|
|
||||||
|
if action.visual_mode && !action.target_spec.is_null() {
|
||||||
|
println!(
|
||||||
|
" [VISUAL] Mode visuel active — resolution de la cible..."
|
||||||
|
);
|
||||||
|
match visual::resolve_target_visual(
|
||||||
|
config,
|
||||||
|
&action.target_spec,
|
||||||
|
x_pct,
|
||||||
|
y_pct,
|
||||||
|
screen_width,
|
||||||
|
screen_height,
|
||||||
|
) {
|
||||||
|
Some((rx, ry)) => {
|
||||||
|
println!(" [VISUAL] Resolu : ({:.4}, {:.4})", rx, ry);
|
||||||
|
x_pct = rx;
|
||||||
|
y_pct = ry;
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
println!(
|
||||||
|
" [VISUAL] Echec — fallback coordonnees aveugles ({:.4}, {:.4})",
|
||||||
|
x_pct, y_pct
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
(x_pct, y_pct)
|
||||||
|
}
|
||||||
|
|
||||||
/// Exécute un clic souris aux coordonnées normalisées.
|
/// Exécute un clic souris aux coordonnées normalisées.
|
||||||
fn execute_click(action: &Action, screen_width: u32, screen_height: u32) -> ActionResult {
|
/// Résout visuellement la cible si visual_mode est activé.
|
||||||
let real_x = (action.x_pct * screen_width as f64) as i32;
|
fn execute_click(action: &Action, screen_width: u32, screen_height: u32, config: &Config) -> ActionResult {
|
||||||
let real_y = (action.y_pct * screen_height as f64) as i32;
|
let (x_pct, y_pct) = resolve_coordinates(action, screen_width, screen_height, config);
|
||||||
|
let real_x = (x_pct * screen_width as f64) as i32;
|
||||||
|
let real_y = (y_pct * screen_height as f64) as i32;
|
||||||
|
|
||||||
println!(
|
println!(
|
||||||
" [CLICK] ({:.3}, {:.3}) -> ({}, {}) sur ({}x{}), bouton={}",
|
" [CLICK] ({:.4}, {:.4}) -> ({}, {}) sur ({}x{}), bouton={}{}",
|
||||||
action.x_pct, action.y_pct, real_x, real_y, screen_width, screen_height, action.button
|
x_pct, y_pct, real_x, real_y, screen_width, screen_height, action.button,
|
||||||
|
if action.visual_mode { " [VISUAL]" } else { "" }
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut enigo = match Enigo::new(&Settings::default()) {
|
let mut enigo = match Enigo::new(&Settings::default()) {
|
||||||
@@ -93,7 +142,7 @@ fn execute_click(action: &Action, screen_width: u32, screen_height: u32) -> Acti
|
|||||||
///
|
///
|
||||||
/// Si des coordonnées sont fournies (x_pct > 0), clique d'abord
|
/// Si des coordonnées sont fournies (x_pct > 0), clique d'abord
|
||||||
/// sur le champ avant de taper (comme en Python).
|
/// sur le champ avant de taper (comme en Python).
|
||||||
fn execute_type(action: &Action, screen_width: u32, screen_height: u32) -> ActionResult {
|
fn execute_type(action: &Action, screen_width: u32, screen_height: u32, config: &Config) -> ActionResult {
|
||||||
let text = &action.text;
|
let text = &action.text;
|
||||||
println!(
|
println!(
|
||||||
" [TYPE] Texte: '{}' ({} chars)",
|
" [TYPE] Texte: '{}' ({} chars)",
|
||||||
@@ -101,6 +150,9 @@ fn execute_type(action: &Action, screen_width: u32, screen_height: u32) -> Actio
|
|||||||
text.len()
|
text.len()
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Résoudre visuellement les coordonnées si visual_mode est activé
|
||||||
|
let (x_pct, y_pct) = resolve_coordinates(action, screen_width, screen_height, config);
|
||||||
|
|
||||||
let mut enigo = match Enigo::new(&Settings::default()) {
|
let mut enigo = match Enigo::new(&Settings::default()) {
|
||||||
Ok(e) => e,
|
Ok(e) => e,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
@@ -112,10 +164,11 @@ fn execute_type(action: &Action, screen_width: u32, screen_height: u32) -> Actio
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Clic préalable sur le champ si coordonnées disponibles
|
// Clic préalable sur le champ si coordonnées disponibles
|
||||||
if action.x_pct > 0.0 && action.y_pct > 0.0 {
|
if x_pct > 0.0 && y_pct > 0.0 {
|
||||||
let real_x = (action.x_pct * screen_width as f64) as i32;
|
let real_x = (x_pct * screen_width as f64) as i32;
|
||||||
let real_y = (action.y_pct * screen_height as f64) as i32;
|
let real_y = (y_pct * screen_height as f64) as i32;
|
||||||
println!(" [TYPE] Clic prealable sur ({}, {})", real_x, real_y);
|
println!(" [TYPE] Clic prealable sur ({}, {}){}", real_x, real_y,
|
||||||
|
if action.visual_mode { " [VISUAL]" } else { "" });
|
||||||
|
|
||||||
if let Err(e) = enigo.move_mouse(real_x, real_y, Coordinate::Abs) {
|
if let Err(e) = enigo.move_mouse(real_x, real_y, Coordinate::Abs) {
|
||||||
eprintln!(" [TYPE] Erreur deplacement souris : {}", e);
|
eprintln!(" [TYPE] Erreur deplacement souris : {}", e);
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ mod executor;
|
|||||||
mod network;
|
mod network;
|
||||||
mod replay;
|
mod replay;
|
||||||
mod server;
|
mod server;
|
||||||
|
mod visual;
|
||||||
|
|
||||||
use config::Config;
|
use config::Config;
|
||||||
use reqwest::blocking::Client;
|
use reqwest::blocking::Client;
|
||||||
|
|||||||
@@ -70,9 +70,9 @@ pub fn replay_poll_loop(config: &Config) {
|
|||||||
// Obtenir les dimensions de l'écran
|
// Obtenir les dimensions de l'écran
|
||||||
let (sw, sh) = capture::screen_dimensions().unwrap_or((1920, 1080));
|
let (sw, sh) = capture::screen_dimensions().unwrap_or((1920, 1080));
|
||||||
|
|
||||||
// Exécuter l'action
|
// Exécuter l'action (avec config pour la résolution visuelle)
|
||||||
println!(">>> Execution de l'action {}...", action_type);
|
println!(">>> Execution de l'action {}...", action_type);
|
||||||
let mut result = executor::execute_action(&action, sw, sh);
|
let mut result = executor::execute_action(&action, sw, sh, config);
|
||||||
println!(
|
println!(
|
||||||
">>> Resultat execution : success={}, error={:?}",
|
">>> Resultat execution : success={}, error={:?}",
|
||||||
result.success, result.error
|
result.success, result.error
|
||||||
|
|||||||
110
agent_rust/src/visual.rs
Normal file
110
agent_rust/src/visual.rs
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
//! Résolution visuelle des cibles via le serveur.
|
||||||
|
//!
|
||||||
|
//! Envoie un screenshot + target_spec au serveur qui effectue le template
|
||||||
|
//! matching OpenCV et retourne les coordonnées résolues (x_pct, y_pct).
|
||||||
|
//! Approche server-side : pas de dépendance OpenCV dans le binaire Rust.
|
||||||
|
|
||||||
|
use crate::capture;
|
||||||
|
use crate::config::Config;
|
||||||
|
use reqwest::blocking::Client;
|
||||||
|
|
||||||
|
/// Résout visuellement une cible en envoyant le screenshot courant au serveur.
|
||||||
|
///
|
||||||
|
/// Capture l'écran, l'encode en JPEG base64, envoie au endpoint
|
||||||
|
/// `/traces/stream/replay/resolve_target` qui fait le template matching.
|
||||||
|
///
|
||||||
|
/// Retourne Some((x_pct, y_pct)) si la cible est trouvée, None sinon.
|
||||||
|
pub fn resolve_target_visual(
|
||||||
|
config: &Config,
|
||||||
|
target_spec: &serde_json::Value,
|
||||||
|
fallback_x: f64,
|
||||||
|
fallback_y: f64,
|
||||||
|
screen_width: u32,
|
||||||
|
screen_height: u32,
|
||||||
|
) -> Option<(f64, f64)> {
|
||||||
|
// 1. Capturer le screenshot actuel
|
||||||
|
let screenshot = match capture::capture_screenshot() {
|
||||||
|
Some(img) => img,
|
||||||
|
None => {
|
||||||
|
eprintln!(" [VISUAL] Echec capture screenshot pour résolution visuelle");
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Encoder en JPEG base64 (qualité 75 — bon compromis taille/précision)
|
||||||
|
let screenshot_b64 = capture::screenshot_to_jpeg_base64(&screenshot, 75);
|
||||||
|
if screenshot_b64.is_empty() {
|
||||||
|
eprintln!(" [VISUAL] Echec encodage JPEG");
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
println!(
|
||||||
|
" [VISUAL] Screenshot capture ({}x{}), envoi au serveur...",
|
||||||
|
screen_width, screen_height
|
||||||
|
);
|
||||||
|
|
||||||
|
// 2. Envoyer au serveur /replay/resolve_target
|
||||||
|
let client = Client::new();
|
||||||
|
let payload = serde_json::json!({
|
||||||
|
"session_id": config.agent_session_id(),
|
||||||
|
"screenshot_b64": screenshot_b64,
|
||||||
|
"target_spec": target_spec,
|
||||||
|
"fallback_x_pct": fallback_x,
|
||||||
|
"fallback_y_pct": fallback_y,
|
||||||
|
"screen_width": screen_width,
|
||||||
|
"screen_height": screen_height,
|
||||||
|
});
|
||||||
|
|
||||||
|
let url = format!("{}/traces/stream/replay/resolve_target", config.server_url);
|
||||||
|
|
||||||
|
let resp = match client
|
||||||
|
.post(&url)
|
||||||
|
.json(&payload)
|
||||||
|
.timeout(std::time::Duration::from_secs(30))
|
||||||
|
.send()
|
||||||
|
{
|
||||||
|
Ok(r) => r,
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!(" [VISUAL] Erreur reseau vers {} : {}", url, e);
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if !resp.status().is_success() {
|
||||||
|
eprintln!(
|
||||||
|
" [VISUAL] Serveur a repondu HTTP {}",
|
||||||
|
resp.status()
|
||||||
|
);
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. Parser la réponse
|
||||||
|
let data: serde_json::Value = match resp.json() {
|
||||||
|
Ok(d) => d,
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!(" [VISUAL] Erreur parsing reponse JSON : {}", e);
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let resolved = data["resolved"].as_bool().unwrap_or(false);
|
||||||
|
if resolved {
|
||||||
|
let x = data["x_pct"].as_f64()?;
|
||||||
|
let y = data["y_pct"].as_f64()?;
|
||||||
|
let method = data["method"].as_str().unwrap_or("?");
|
||||||
|
let score = data["score"].as_f64().unwrap_or(0.0);
|
||||||
|
println!(
|
||||||
|
" [VISUAL] Resolu par {} (score={:.3}) : ({:.4}, {:.4})",
|
||||||
|
method, score, x, y
|
||||||
|
);
|
||||||
|
Some((x, y))
|
||||||
|
} else {
|
||||||
|
let reason = data["reason"].as_str().unwrap_or("inconnu");
|
||||||
|
let method = data["method"].as_str().unwrap_or("?");
|
||||||
|
println!(
|
||||||
|
" [VISUAL] Non resolu (methode={}, raison={})",
|
||||||
|
method, reason
|
||||||
|
);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user