feat: monitor module sysinfo + evaluation seuils
This commit is contained in:
451
src/monitor.rs
451
src/monitor.rs
@@ -1 +1,450 @@
|
||||
// Monitor module — Task 4
|
||||
use crate::alerter::Alerter;
|
||||
use crate::config::{Alert, ConfigManager, ProcessConfig};
|
||||
use chrono::{Duration, Local};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex, RwLock};
|
||||
use std::time::Duration as StdDuration;
|
||||
use sysinfo::{Disks, System};
|
||||
use tokio::sync::Mutex as AsyncMutex;
|
||||
|
||||
/// Classify a measured `value` against an alert `threshold`.
///
/// Returns `"critical"` when the value has reached the threshold,
/// `"warning"` when it is at 80% of the threshold or more, and `"ok"`
/// otherwise. A non-positive threshold disables the check entirely
/// (the result is always `"ok"`).
pub fn eval_status(value: f64, threshold: f64) -> &'static str {
    // A zero or negative threshold means this metric is not monitored.
    if threshold > 0.0 {
        match value / threshold {
            r if r >= 1.0 => "critical",
            r if r >= 0.80 => "warning",
            _ => "ok",
        }
    } else {
        "ok"
    }
}
|
||||
|
||||
/// Snapshot of global CPU utilisation plus its alerting state.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuMetrics {
    /// Global CPU usage in percent, rounded to one decimal by the collector.
    pub percent: f64,
    /// Number of logical CPUs reported by sysinfo.
    pub cores: usize,
    /// Configured alert threshold in percent.
    pub threshold: f64,
    /// "ok" | "warning" | "critical", as computed by `eval_status`.
    pub status: String,
}
|
||||
|
||||
/// Snapshot of physical memory usage plus its alerting state.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RamMetrics {
    /// used / total, in percent (one decimal).
    pub percent: f64,
    /// Total RAM in GiB (one decimal).
    pub total_gb: f64,
    /// Used RAM in GiB (one decimal).
    pub used_gb: f64,
    /// Available RAM in GiB (one decimal).
    pub available_gb: f64,
    /// Configured alert threshold in percent.
    pub threshold: f64,
    /// "ok" | "warning" | "critical", as computed by `eval_status`.
    pub status: String,
}
|
||||
|
||||
/// Usage snapshot for one mounted disk/volume.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiskMetrics {
    /// Device/volume name, with any trailing '\' stripped (Windows drive names).
    pub drive: String,
    /// Mount point path.
    pub mountpoint: String,
    /// Used space in percent (one decimal).
    pub percent: f64,
    /// Total capacity in GiB (one decimal).
    pub total_gb: f64,
    /// Used space in GiB (one decimal).
    pub used_gb: f64,
    /// Available space in GiB (one decimal).
    pub free_gb: f64,
    /// Alert threshold in percent (shared by all disks).
    pub threshold: f64,
    /// "ok" | "warning" | "critical", as computed by `eval_status`.
    pub status: String,
}
|
||||
|
||||
/// Aggregated state of all processes matching one configured pattern.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessMetrics {
    /// Display name taken from the configuration entry.
    pub name: String,
    /// Case-insensitive substring matched against process names and command lines.
    pub pattern: String,
    /// True when at least one matching process was found.
    pub running: bool,
    /// Copied from the config; disabled entries are never scanned.
    pub enabled: bool,
    /// Whether an alert should fire when `running` is false.
    pub alert_on_down: bool,
    /// Number of matching processes.
    pub instance_count: usize,
    /// Sum of memory usage across matching instances, in MiB (one decimal).
    pub total_memory_mb: f64,
    /// Sum of per-process CPU usage across instances (one decimal);
    /// can exceed 100 since it is a sum over processes.
    pub total_cpu_percent: f64,
    /// Memory alert threshold in MiB; a non-positive value disables the check.
    pub memory_threshold_mb: f64,
    /// "ok" | "warning" | "critical" for the memory check.
    pub memory_status: String,
    /// PIDs of the matching processes.
    pub pids: Vec<u32>,
}
|
||||
|
||||
/// Full host snapshot produced by `SystemMonitor::collect`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Metrics {
    /// Collection time, RFC 3339, local timezone.
    pub timestamp: String,
    /// Host name, or "inconnu" when sysinfo cannot determine it.
    pub hostname: String,
    /// "<OS name> <OS version>".
    pub os: String,
    pub cpu: CpuMetrics,
    pub ram: RamMetrics,
    /// One entry per real disk (pseudo filesystems and volumes < 1 GiB are skipped).
    pub disks: Vec<DiskMetrics>,
    /// One entry per configured process pattern.
    pub processes: Vec<ProcessMetrics>,
    /// Time since boot formatted as "H:MM:SS".
    pub uptime: String,
    /// Boot time, RFC 3339, UTC.
    pub boot_time: String,
    /// Mirrors the monitor's running flag at collection time.
    pub monitoring_active: bool,
    /// Same instant as `timestamp`.
    pub last_check: String,
    /// `last_check` plus the configured check interval.
    pub next_check: String,
}
|
||||
|
||||
/// Periodic system monitor: collects metrics via sysinfo on a background
/// task and raises threshold/process alerts with a per-key cooldown.
pub struct SystemMonitor {
    /// Shared configuration (thresholds, process list, SMTP, intervals).
    config_manager: Arc<AsyncMutex<ConfigManager>>,
    /// Alert dispatcher (sends e-mail via the configured SMTP settings).
    alerter: Arc<Alerter>,
    /// Most recent snapshot; `None` until the first collection completes.
    pub metrics: Arc<RwLock<Option<Metrics>>>,
    /// Toggled by `start`/`stop`; the background loop polls this flag.
    pub running: Arc<std::sync::atomic::AtomicBool>,
    /// Timestamp of the last alert sent, per alert key, for cooldown checks.
    last_alerts: Arc<Mutex<HashMap<String, chrono::DateTime<Local>>>>,
}
|
||||
|
||||
impl SystemMonitor {
|
||||
pub fn new(
|
||||
config_manager: Arc<AsyncMutex<ConfigManager>>,
|
||||
alerter: Arc<Alerter>,
|
||||
) -> Self {
|
||||
SystemMonitor {
|
||||
config_manager,
|
||||
alerter,
|
||||
metrics: Arc::new(RwLock::new(None)),
|
||||
running: Arc::new(std::sync::atomic::AtomicBool::new(false)),
|
||||
last_alerts: Arc::new(Mutex::new(HashMap::new())),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn collect(&self) -> Metrics {
|
||||
let config = {
|
||||
let cm = self.config_manager.lock().await;
|
||||
cm.config.clone()
|
||||
};
|
||||
|
||||
let mut sys = System::new_all();
|
||||
// Deux mesures pour CPU précis
|
||||
std::thread::sleep(StdDuration::from_millis(500));
|
||||
sys.refresh_all();
|
||||
|
||||
let cpu_percent = sys.global_cpu_usage() as f64;
|
||||
let cpu_status = eval_status(cpu_percent, config.thresholds.cpu_percent).to_string();
|
||||
|
||||
let ram_total = sys.total_memory() as f64;
|
||||
let ram_used = sys.used_memory() as f64;
|
||||
let ram_available = sys.available_memory() as f64;
|
||||
let ram_percent = if ram_total > 0.0 {
|
||||
ram_used / ram_total * 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let ram_status = eval_status(ram_percent, config.thresholds.ram_percent).to_string();
|
||||
|
||||
let mut disks = Vec::new();
|
||||
let disk_list = Disks::new_with_refreshed_list();
|
||||
let ignored_fs = ["squashfs", "tmpfs", "devtmpfs", "overlay", "iso9660"];
|
||||
for disk in &disk_list {
|
||||
let fs = disk.file_system().to_string_lossy().to_lowercase();
|
||||
if ignored_fs.iter().any(|&f| fs.contains(f)) {
|
||||
continue;
|
||||
}
|
||||
let total = disk.total_space() as f64;
|
||||
if total < 1_073_741_824.0 {
|
||||
continue; // < 1 GB
|
||||
}
|
||||
let available = disk.available_space() as f64;
|
||||
let used = total - available;
|
||||
let percent = (used / total * 1000.0).round() / 10.0;
|
||||
let status = eval_status(percent, config.thresholds.disk_percent).to_string();
|
||||
disks.push(DiskMetrics {
|
||||
drive: disk
|
||||
.name()
|
||||
.to_string_lossy()
|
||||
.trim_end_matches('\\')
|
||||
.to_string(),
|
||||
mountpoint: disk.mount_point().to_string_lossy().to_string(),
|
||||
percent,
|
||||
total_gb: (total / 1_073_741_824.0 * 10.0).round() / 10.0,
|
||||
used_gb: (used / 1_073_741_824.0 * 10.0).round() / 10.0,
|
||||
free_gb: (available / 1_073_741_824.0 * 10.0).round() / 10.0,
|
||||
threshold: config.thresholds.disk_percent,
|
||||
status,
|
||||
});
|
||||
}
|
||||
|
||||
let processes = self.check_processes(&sys, &config.processes);
|
||||
|
||||
let boot_time_unix = System::boot_time();
|
||||
let now_unix = Local::now().timestamp() as u64;
|
||||
let uptime_secs = now_unix.saturating_sub(boot_time_unix);
|
||||
let uptime = format!(
|
||||
"{}:{:02}:{:02}",
|
||||
uptime_secs / 3600,
|
||||
(uptime_secs % 3600) / 60,
|
||||
uptime_secs % 60
|
||||
);
|
||||
|
||||
let now = Local::now();
|
||||
let interval = config.check_interval_minutes;
|
||||
|
||||
Metrics {
|
||||
timestamp: now.to_rfc3339(),
|
||||
hostname: System::host_name().unwrap_or_else(|| "inconnu".into()),
|
||||
os: format!(
|
||||
"{} {}",
|
||||
System::name().unwrap_or_default(),
|
||||
System::os_version().unwrap_or_default()
|
||||
),
|
||||
cpu: CpuMetrics {
|
||||
percent: (cpu_percent * 10.0).round() / 10.0,
|
||||
cores: sys.cpus().len(),
|
||||
threshold: config.thresholds.cpu_percent,
|
||||
status: cpu_status,
|
||||
},
|
||||
ram: RamMetrics {
|
||||
percent: (ram_percent * 10.0).round() / 10.0,
|
||||
total_gb: (ram_total / 1_073_741_824.0 * 10.0).round() / 10.0,
|
||||
used_gb: (ram_used / 1_073_741_824.0 * 10.0).round() / 10.0,
|
||||
available_gb: (ram_available / 1_073_741_824.0 * 10.0).round() / 10.0,
|
||||
threshold: config.thresholds.ram_percent,
|
||||
status: ram_status,
|
||||
},
|
||||
disks,
|
||||
processes,
|
||||
uptime,
|
||||
boot_time: chrono::DateTime::from_timestamp(boot_time_unix as i64, 0)
|
||||
.map(|dt: chrono::DateTime<chrono::Utc>| dt.to_rfc3339())
|
||||
.unwrap_or_default(),
|
||||
monitoring_active: self
|
||||
.running
|
||||
.load(std::sync::atomic::Ordering::Relaxed),
|
||||
last_check: now.to_rfc3339(),
|
||||
next_check: (now + Duration::minutes(interval as i64)).to_rfc3339(),
|
||||
}
|
||||
}
|
||||
|
||||
fn check_processes(
|
||||
&self,
|
||||
sys: &System,
|
||||
process_configs: &[ProcessConfig],
|
||||
) -> Vec<ProcessMetrics> {
|
||||
let mut results = Vec::new();
|
||||
for pc in process_configs {
|
||||
let pattern = pc.pattern.to_lowercase();
|
||||
let mut found_pids: Vec<u32> = Vec::new();
|
||||
let mut total_mem: f64 = 0.0;
|
||||
let mut total_cpu: f64 = 0.0;
|
||||
|
||||
if pc.enabled {
|
||||
for (pid, proc) in sys.processes() {
|
||||
let name = proc.name().to_string_lossy().to_lowercase();
|
||||
let cmd = proc
|
||||
.cmd()
|
||||
.iter()
|
||||
.map(|s| s.to_string_lossy().to_lowercase())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ");
|
||||
if name.contains(&pattern) || cmd.contains(&pattern) {
|
||||
found_pids.push(pid.as_u32());
|
||||
total_mem += proc.memory() as f64 / 1_048_576.0;
|
||||
total_cpu += proc.cpu_usage() as f64;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mem_status = if pc.memory_threshold_mb > 0.0 && total_mem > 0.0 {
|
||||
eval_status(total_mem, pc.memory_threshold_mb).to_string()
|
||||
} else {
|
||||
"ok".to_string()
|
||||
};
|
||||
|
||||
results.push(ProcessMetrics {
|
||||
name: pc.name.clone(),
|
||||
pattern: pc.pattern.clone(),
|
||||
running: !found_pids.is_empty(),
|
||||
enabled: pc.enabled,
|
||||
alert_on_down: pc.alert_on_down,
|
||||
instance_count: found_pids.len(),
|
||||
total_memory_mb: (total_mem * 10.0).round() / 10.0,
|
||||
total_cpu_percent: (total_cpu * 10.0).round() / 10.0,
|
||||
memory_threshold_mb: pc.memory_threshold_mb,
|
||||
memory_status: mem_status,
|
||||
pids: found_pids,
|
||||
});
|
||||
}
|
||||
results
|
||||
}
|
||||
|
||||
    /// Inspect a metrics snapshot and send an alert for every "critical"
    /// condition (CPU, RAM, per-disk, process down, process memory) whose
    /// per-key cooldown has expired.
    ///
    /// Alerting is done in two phases: first, candidate alerts are gathered
    /// while holding the `last_alerts` std mutex (no `.await` inside that
    /// scope); second, after the guard is dropped, each alert is persisted
    /// and e-mailed under the async config lock.
    pub async fn check_and_alert(&self, metrics: &Metrics) {
        // Cooldown (minutes) between two alerts with the same key.
        let cooldown = {
            let cm = self.config_manager.lock().await;
            cm.config.alert_cooldown_minutes
        };
        let hostname = metrics.hostname.clone();

        // (key, message, value, threshold, alert_type) tuples to send.
        let mut to_alert: Vec<(String, String, f64, f64, String)> = Vec::new();

        // Scope block: the MutexGuard on `last_alerts` must be dropped
        // before the `.await` calls in the send loop below.
        {
            let mut last = self.last_alerts.lock().unwrap();
            let now = Local::now();

            // Queue an alert unless the same key fired within the cooldown
            // window; on queueing, the key's timestamp is refreshed.
            // NOTE(review): the timestamp is updated even if the later send
            // fails, so a failed delivery still consumes the cooldown.
            let mut maybe_alert =
                |key: String, msg: String, val: f64, thr: f64, typ: String| {
                    let should = match last.get(&key) {
                        Some(t) => (now - *t) >= Duration::minutes(cooldown as i64),
                        None => true,
                    };
                    if should {
                        last.insert(key.clone(), now);
                        to_alert.push((key, msg, val, thr, typ));
                    }
                };

            if metrics.cpu.status == "critical" {
                maybe_alert(
                    "cpu".into(),
                    format!(
                        "CPU a {}% (seuil: {}%)",
                        metrics.cpu.percent, metrics.cpu.threshold
                    ),
                    metrics.cpu.percent,
                    metrics.cpu.threshold,
                    "threshold".into(),
                );
            }
            if metrics.ram.status == "critical" {
                maybe_alert(
                    "ram".into(),
                    format!(
                        "RAM a {}% (seuil: {}%)",
                        metrics.ram.percent, metrics.ram.threshold
                    ),
                    metrics.ram.percent,
                    metrics.ram.threshold,
                    "threshold".into(),
                );
            }
            // One alert key per drive, so disks cool down independently.
            for disk in &metrics.disks {
                if disk.status == "critical" {
                    maybe_alert(
                        format!("disk_{}", disk.drive),
                        format!(
                            "Disque {} a {}% (seuil: {}%)",
                            disk.drive, disk.percent, disk.threshold
                        ),
                        disk.percent,
                        disk.threshold,
                        "threshold".into(),
                    );
                }
            }
            for proc in &metrics.processes {
                if !proc.enabled {
                    continue;
                }
                // Process not found although configured to be watched.
                if proc.alert_on_down && !proc.running {
                    maybe_alert(
                        format!("process_down_{}", proc.name),
                        format!(
                            "Processus '{}' non detecte (pattern: {})",
                            proc.name, proc.pattern
                        ),
                        0.0,
                        0.0,
                        "process_down".into(),
                    );
                }
                // Process memory above its configured threshold.
                if proc.memory_threshold_mb > 0.0 && proc.memory_status == "critical" {
                    maybe_alert(
                        format!("process_mem_{}", proc.name),
                        format!(
                            "Processus '{}' utilise {} Mo (seuil: {} Mo)",
                            proc.name, proc.total_memory_mb, proc.memory_threshold_mb
                        ),
                        proc.total_memory_mb,
                        proc.memory_threshold_mb,
                        "threshold".into(),
                    );
                }
            }
        }

        // Phase two: persist and e-mail each queued alert.
        for (key, message, value, threshold, alert_type) in to_alert {
            let alert = Alert {
                timestamp: Local::now().to_rfc3339(),
                alert_type: alert_type.clone(),
                key,
                message: message.clone(),
                value,
                threshold,
                hostname: hostname.clone(),
            };
            {
                let cm = self.config_manager.lock().await;
                cm.save_alert(alert);
                let subject = format!("[ALERTE] {} - {}", hostname, message);
                let body = format!(
                    "Alerte de supervision\n{sep}\n\nServeur : {host}\nDate : {date}\nType : {typ}\n\nMessage : {msg}\n\n{sep}\nSupervision - Monitoring automatique",
                    sep = "=".repeat(40),
                    host = hostname,
                    date = Local::now().to_rfc3339(),
                    typ = alert_type,
                    msg = message
                );
                self.alerter.send(&cm.config.smtp, &subject, &body).await;
            }
        }
    }
|
||||
|
||||
pub async fn start(self: Arc<Self>) {
|
||||
self.running
|
||||
.store(true, std::sync::atomic::Ordering::Relaxed);
|
||||
let monitor = self.clone();
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
if !monitor
|
||||
.running
|
||||
.load(std::sync::atomic::Ordering::Relaxed)
|
||||
{
|
||||
break;
|
||||
}
|
||||
let metrics = monitor.collect().await;
|
||||
{
|
||||
let mut m = monitor.metrics.write().unwrap();
|
||||
*m = Some(metrics.clone());
|
||||
}
|
||||
monitor.check_and_alert(&metrics).await;
|
||||
|
||||
let interval = {
|
||||
let cm = monitor.config_manager.lock().await;
|
||||
cm.config.check_interval_minutes
|
||||
};
|
||||
tokio::time::sleep(StdDuration::from_secs(interval * 60)).await;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
self.running
|
||||
.store(false, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // eval_status contract: ratio = value / threshold;
    // ratio >= 1.0 -> "critical", ratio >= 0.8 -> "warning", else "ok";
    // a non-positive threshold disables the check ("ok").

    #[test]
    fn eval_status_ok_below_80_percent() {
        assert_eq!(eval_status(70.0, 90.0), "ok");
    }

    #[test]
    fn eval_status_ok_just_below_warning_band() {
        assert_eq!(eval_status(71.9, 90.0), "ok"); // 71.9/90 < 0.8
    }

    #[test]
    fn eval_status_warning_at_80_percent_of_threshold() {
        assert_eq!(eval_status(72.0, 90.0), "warning"); // 72/90 = 0.8
    }

    #[test]
    fn eval_status_critical_at_threshold() {
        assert_eq!(eval_status(90.0, 90.0), "critical");
    }

    #[test]
    fn eval_status_critical_above_threshold() {
        assert_eq!(eval_status(95.0, 90.0), "critical");
    }

    #[test]
    fn eval_status_ok_with_zero_threshold() {
        assert_eq!(eval_status(50.0, 0.0), "ok");
    }

    #[test]
    fn eval_status_ok_with_negative_threshold() {
        // Negative thresholds are treated as "monitoring disabled",
        // same as zero.
        assert_eq!(eval_status(50.0, -10.0), "ok");
    }
}
|
||||
|
||||
Reference in New Issue
Block a user