Interface web Flask sécurisée pour surveiller CPU, RAM, disques et processus (JVM, Nginx, Amadea Web 8 x64). Alertes email SMTP configurables, seuils réglables, compilation PyInstaller en .exe, installation service Windows via NSSM. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
292 lines
11 KiB
Python
292 lines
11 KiB
Python
"""Collecte des metriques systeme et surveillance des seuils."""
|
|
|
|
import platform
|
|
import threading
|
|
import time
|
|
from datetime import datetime, timedelta
|
|
|
|
import psutil
|
|
|
|
|
|
class SystemMonitor:
    """Collect system metrics (CPU, RAM, disks, processes) and raise
    threshold alerts through the configured alerter.

    A background daemon thread (see :meth:`start`) periodically calls
    :meth:`collect_metrics` then :meth:`check_and_alert`; the latest
    snapshot is exposed via :attr:`metrics` behind a lock so it can be
    read safely from other threads (e.g. a web request handler).
    """

    # Pseudo-filesystems that must never be reported as real disks.
    _IGNORED_FS = {"squashfs", "tmpfs", "devtmpfs", "overlay", "iso9660"}

    def __init__(self, config_manager, alerter):
        """
        Args:
            config_manager: object exposing a ``config`` dict (expected keys:
                ``thresholds``, ``processes``, ``check_interval_minutes``,
                ``alert_cooldown_minutes``) and a ``save_alert(alert)`` method.
            alerter: object exposing ``send_alert(subject, body)``.
        """
        self.config = config_manager
        self.alerter = alerter
        self._metrics = {}      # latest snapshot; guarded by self._lock
        self._lock = threading.Lock()
        self._running = False
        self._thread = None
        self._last_alerts = {}  # alert key -> datetime of last send (cooldown)

    @property
    def metrics(self):
        """Shallow copy of the latest metrics snapshot (thread-safe)."""
        with self._lock:
            return dict(self._metrics)

    def collect_metrics(self):
        """Collect all system metrics and store them as the latest snapshot.

        Returns:
            dict: the freshly collected metrics; also available afterwards
            through :attr:`metrics`.
        """
        cfg = self.config.config
        thresholds = cfg["thresholds"]

        # CPU (interval=1 blocks ~1 second to produce a meaningful sample)
        cpu_percent = psutil.cpu_percent(interval=1)
        cpu_status = self._eval_status(cpu_percent, thresholds["cpu_percent"])

        # RAM
        ram = psutil.virtual_memory()
        ram_status = self._eval_status(ram.percent, thresholds["ram_percent"])

        # Disks
        disks = self._collect_disks(thresholds["disk_percent"])

        # Monitored processes
        processes = self._check_processes(cfg.get("processes", []))

        # System info
        boot_time = datetime.fromtimestamp(psutil.boot_time())
        uptime = datetime.now() - boot_time

        now = datetime.now()
        interval = cfg.get("check_interval_minutes", 1)

        metrics = {
            "timestamp": now.isoformat(),
            "hostname": platform.node(),
            "os": f"{platform.system()} {platform.release()}",
            "cpu": {
                "percent": cpu_percent,
                "cores": psutil.cpu_count(),
                "threshold": thresholds["cpu_percent"],
                "status": cpu_status,
            },
            "ram": {
                "percent": round(ram.percent, 1),
                "total_gb": round(ram.total / (1024 ** 3), 1),
                "used_gb": round(ram.used / (1024 ** 3), 1),
                "available_gb": round(ram.available / (1024 ** 3), 1),
                "threshold": thresholds["ram_percent"],
                "status": ram_status,
            },
            "disks": disks,
            "processes": processes,
            # str(timedelta) is "H:MM:SS.ffffff"; drop the microseconds part
            "uptime": str(uptime).split(".")[0],
            "boot_time": boot_time.isoformat(),
            "monitoring_active": self._running,
            "last_check": now.isoformat(),
            "next_check": (now + timedelta(minutes=interval)).isoformat(),
        }

        with self._lock:
            self._metrics = metrics

        return metrics

    def _collect_disks(self, disk_threshold):
        """Return usage entries for real disks, skipping pseudo/tiny filesystems."""
        disks = []
        for part in psutil.disk_partitions():
            # Skip pseudo-filesystems (snap loop devices, tmpfs, overlays, ...)
            if part.fstype in self._IGNORED_FS:
                continue
            if part.device.startswith("/dev/loop"):
                continue
            try:
                usage = psutil.disk_usage(part.mountpoint)
            except (PermissionError, OSError):
                # Mountpoint unreadable (empty CD-ROM drive, access denied, ...)
                continue
            # Skip tiny partitions (< 1 GiB)
            if usage.total < 1024 ** 3:
                continue
            disks.append({
                "drive": part.device.rstrip("\\"),
                "mountpoint": part.mountpoint,
                "percent": round(usage.percent, 1),
                "total_gb": round(usage.total / (1024 ** 3), 1),
                "used_gb": round(usage.used / (1024 ** 3), 1),
                "free_gb": round(usage.free / (1024 ** 3), 1),
                "threshold": disk_threshold,
                "status": self._eval_status(usage.percent, disk_threshold),
            })
        return disks

    def _check_processes(self, process_configs):
        """Check the state of each configured process pattern.

        Args:
            process_configs: list of dicts with at least ``name`` and
                ``pattern``; optional ``enabled``, ``memory_threshold_mb``
                and ``alert_on_down``.

        Returns:
            list[dict]: one summary entry per configured process (instance
            count, aggregated memory/CPU, status flags, matching PIDs).
        """
        results = []
        for proc_cfg in process_configs:
            pattern = proc_cfg["pattern"].lower()
            name = proc_cfg["name"]
            enabled = proc_cfg.get("enabled", True)
            mem_threshold = proc_cfg.get("memory_threshold_mb", 0)

            found = []
            if enabled:
                # Case-insensitive substring match on process name or cmdline.
                for proc in psutil.process_iter(["pid", "name", "cmdline", "memory_info", "cpu_percent"]):
                    try:
                        pname = (proc.info["name"] or "").lower()
                        cmdline = " ".join(proc.info["cmdline"] or []).lower()
                        if pattern in pname or pattern in cmdline:
                            mem_mb = round(proc.info["memory_info"].rss / (1024 ** 2), 1) if proc.info["memory_info"] else 0
                            found.append({
                                "pid": proc.info["pid"],
                                "memory_mb": mem_mb,
                                "cpu_percent": proc.info["cpu_percent"] or 0,
                            })
                    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                        # Process vanished or is inaccessible: skip it.
                        continue

            total_memory = sum(p["memory_mb"] for p in found)
            total_cpu = sum(p["cpu_percent"] for p in found)
            running = len(found) > 0

            # Aggregate memory status (a threshold of 0 disables the check)
            mem_status = "ok"
            if mem_threshold > 0 and total_memory > 0:
                mem_status = self._eval_status(total_memory, mem_threshold, is_mb=True)

            results.append({
                "name": name,
                "pattern": proc_cfg["pattern"],
                "running": running,
                "enabled": enabled,
                "alert_on_down": proc_cfg.get("alert_on_down", True),
                "instance_count": len(found),
                "total_memory_mb": round(total_memory, 1),
                "total_cpu_percent": round(total_cpu, 1),
                "memory_threshold_mb": mem_threshold,
                "memory_status": mem_status,
                "pids": [p["pid"] for p in found],
            })

        return results

    def _eval_status(self, value, threshold, is_mb=False):
        """Classify *value* against *threshold*: "ok", "warning" or "critical".

        "warning" starts at 80% of the threshold, "critical" at 100%. A
        non-positive threshold disables the check (always "ok").

        Args:
            is_mb: kept for backward compatibility with existing callers;
                the ratio computation is identical whatever the unit.
        """
        # Bug fix: the original duplicated the exact same expression in both
        # branches of an if/else on is_mb; a single computation is equivalent.
        ratio = value / threshold if threshold > 0 else 0
        if ratio >= 1.0:
            return "critical"
        if ratio >= 0.80:
            return "warning"
        return "ok"

    def check_and_alert(self, metrics):
        """Check thresholds in *metrics* and send the alerts that are due.

        Returns:
            list[str]: the alert messages actually sent (cooldown-filtered).
        """
        cfg = self.config.config
        cooldown = cfg.get("alert_cooldown_minutes", 30)
        alerts_sent = []

        # CPU
        if metrics["cpu"]["status"] == "critical":
            msg = f"CPU a {metrics['cpu']['percent']}% (seuil: {metrics['cpu']['threshold']}%)"
            self._maybe_alert("cpu", msg, metrics["cpu"]["percent"],
                              metrics["cpu"]["threshold"], cooldown, alerts_sent)

        # RAM
        if metrics["ram"]["status"] == "critical":
            msg = f"RAM a {metrics['ram']['percent']}% (seuil: {metrics['ram']['threshold']}%)"
            self._maybe_alert("ram", msg, metrics["ram"]["percent"],
                              metrics["ram"]["threshold"], cooldown, alerts_sent)

        # Disks
        for disk in metrics["disks"]:
            if disk["status"] == "critical":
                key = f"disk_{disk['drive']}"
                msg = f"Disque {disk['drive']} a {disk['percent']}% (seuil: {disk['threshold']}%)"
                self._maybe_alert(key, msg, disk["percent"], disk["threshold"],
                                  cooldown, alerts_sent)

        # Processes
        for proc in metrics["processes"]:
            if not proc["enabled"]:
                continue

            # Process-down alert
            if proc["alert_on_down"] and not proc["running"]:
                key = f"process_down_{proc['name']}"
                msg = f"Processus '{proc['name']}' non detecte (pattern: {proc['pattern']})"
                self._maybe_alert(key, msg, 0, 0, cooldown, alerts_sent,
                                  alert_type="process_down")

            # Process-memory alert
            if proc["memory_threshold_mb"] > 0 and proc["memory_status"] == "critical":
                key = f"process_mem_{proc['name']}"
                msg = (
                    f"Processus '{proc['name']}' utilise {proc['total_memory_mb']} Mo "
                    f"(seuil: {proc['memory_threshold_mb']} Mo)"
                )
                self._maybe_alert(key, msg, proc["total_memory_mb"],
                                  proc["memory_threshold_mb"], cooldown, alerts_sent)

        return alerts_sent

    def _maybe_alert(self, key, message, value, threshold, cooldown, alerts_sent,
                     alert_type="threshold"):
        """Send and record the alert for *key* unless it is still in cooldown."""
        if self._should_alert(key, cooldown):
            self._send_and_log(key, message, value, threshold, alert_type=alert_type)
            alerts_sent.append(message)

    def _should_alert(self, key, cooldown_minutes):
        """Return True if no alert for *key* was sent within the cooldown window."""
        now = datetime.now()
        last = self._last_alerts.get(key)
        if last and (now - last) < timedelta(minutes=cooldown_minutes):
            return False
        return True

    def _send_and_log(self, key, message, value, threshold, alert_type="threshold"):
        """Persist the alert via the config manager, email it, start the cooldown."""
        now = datetime.now()
        # Thread-safety fix: read the snapshot under the lock, since this may
        # run concurrently with collect_metrics() replacing self._metrics.
        with self._lock:
            hostname = self._metrics.get("hostname", platform.node())

        # Record the alert
        alert = {
            "timestamp": now.isoformat(),
            "type": alert_type,
            "key": key,
            "message": message,
            "value": value,
            "threshold": threshold,
            "hostname": hostname,
        }
        self.config.save_alert(alert)

        # Email notification
        subject = f"[ALERTE] {hostname} - {message}"
        self.alerter.send_alert(subject, self._format_alert_body(alert))

        # Start/refresh the cooldown for this alert key
        self._last_alerts[key] = now

    def _format_alert_body(self, alert):
        """Render the plain-text email body for *alert*."""
        return (
            f"Alerte de supervision\n"
            f"{'=' * 40}\n\n"
            f"Serveur : {alert['hostname']}\n"
            f"Date : {alert['timestamp']}\n"
            f"Type : {alert['type']}\n\n"
            f"Message : {alert['message']}\n\n"
            f"{'=' * 40}\n"
            f"Supervision - Monitoring automatique"
        )

    # --- Monitoring thread ---

    def start(self):
        """Start the background monitoring thread (no-op if already running)."""
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(target=self._monitoring_loop, daemon=True)
        self._thread.start()

    def stop(self):
        """Ask the monitoring loop to exit (the daemon thread wakes within ~5s)."""
        self._running = False

    def _monitoring_loop(self):
        """Poll on the configured interval until :meth:`stop` is called."""
        last_check = 0  # epoch seconds of last check; 0 forces an immediate one
        while self._running:
            # Re-read the interval each pass so config changes apply live.
            # Consistency fix: go through .config like every other method,
            # instead of assuming the config manager itself exposes get().
            interval = self.config.config.get("check_interval_minutes", 1) * 60
            elapsed = time.time() - last_check
            if elapsed >= interval:
                try:
                    metrics = self.collect_metrics()
                    self.check_and_alert(metrics)
                except Exception as e:
                    # Never let one failed cycle kill the monitoring thread.
                    print(f"[Monitoring] Erreur: {e}")
                last_check = time.time()
            time.sleep(5)  # wake every 5s to see whether it is time to poll
|