feat: init projet supervision — monitoring systeme Windows
Interface web Flask securisee pour surveiller CPU, RAM, disques et processus (JVM, Nginx, Amadea Web 8 x64). Alertes email SMTP configurables, seuils reglables, compilation PyInstaller en .exe, installation service Windows via NSSM.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
291
monitor.py
Normal file
291
monitor.py
Normal file
@@ -0,0 +1,291 @@
|
||||
"""Collecte des metriques systeme et surveillance des seuils."""
|
||||
|
||||
import platform
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import psutil
|
||||
|
||||
|
||||
class SystemMonitor:
    """Collect system metrics, compare them to configured thresholds and send alerts.

    The monitor reads its settings from ``config_manager`` (its ``config``
    mapping, plus its ``get`` and ``save_alert`` methods) and delivers
    notifications through ``alerter.send_alert(subject, body)``.

    Metrics can be collected on demand with :meth:`collect_metrics`, or
    periodically by a background thread controlled with :meth:`start` and
    :meth:`stop`.
    """

    # Filesystem types that are skipped when listing disks.
    _IGNORED_FS = frozenset({"squashfs", "tmpfs", "devtmpfs", "overlay", "iso9660"})
    _BYTES_PER_MB = 1024 ** 2
    _BYTES_PER_GB = 1024 ** 3
    # How often the background loop wakes up to see whether a check is due.
    _POLL_SECONDS = 5

    def __init__(self, config_manager, alerter):
        """Store the collaborators and initialise the internal state.

        Args:
            config_manager: Object exposing ``config`` (a mapping),
                ``get(key, default)`` and ``save_alert(alert)``.
            alerter: Object exposing ``send_alert(subject, body)``.
        """
        self.config = config_manager
        self.alerter = alerter
        self._metrics = {}
        self._lock = threading.Lock()  # guards self._metrics
        self._running = False
        self._thread = None
        # Replaced by a fresh event on every start(); see start().
        self._stop_event = threading.Event()
        self._last_alerts = {}  # alert key -> datetime of the last alert

    @property
    def metrics(self):
        """Return a shallow copy of the most recently collected metrics."""
        with self._lock:
            return dict(self._metrics)

    def collect_metrics(self):
        """Collect all system metrics, cache them and return them.

        CPU usage is sampled with ``psutil.cpu_percent(interval=1)``, so this
        call blocks for about one second.

        Returns:
            dict: The metrics snapshot (also stored for the ``metrics``
            property).

        Raises:
            KeyError: If the configuration lacks the ``thresholds`` section
                or one of its ``cpu_percent`` / ``ram_percent`` /
                ``disk_percent`` entries.
        """
        cfg = self.config.config
        thresholds = cfg["thresholds"]
        gb = self._BYTES_PER_GB

        # CPU
        cpu_percent = psutil.cpu_percent(interval=1)
        cpu_status = self._eval_status(cpu_percent, thresholds["cpu_percent"])

        # RAM
        ram = psutil.virtual_memory()
        ram_status = self._eval_status(ram.percent, thresholds["ram_percent"])

        # Disks and watched processes
        disks = self._collect_disks(thresholds["disk_percent"])
        processes = self._check_processes(cfg.get("processes", []))

        # System information
        now = datetime.now()
        boot_time = datetime.fromtimestamp(psutil.boot_time())
        uptime = now - boot_time
        interval = cfg.get("check_interval_minutes", 1)

        metrics = {
            "timestamp": now.isoformat(),
            "hostname": platform.node(),
            "os": f"{platform.system()} {platform.release()}",
            "cpu": {
                "percent": cpu_percent,
                "cores": psutil.cpu_count(),
                "threshold": thresholds["cpu_percent"],
                "status": cpu_status,
            },
            "ram": {
                "percent": round(ram.percent, 1),
                "total_gb": round(ram.total / gb, 1),
                "used_gb": round(ram.used / gb, 1),
                "available_gb": round(ram.available / gb, 1),
                "threshold": thresholds["ram_percent"],
                "status": ram_status,
            },
            "disks": disks,
            "processes": processes,
            # str(timedelta) may end with ".microseconds"; keep whole seconds.
            "uptime": str(uptime).split(".")[0],
            "boot_time": boot_time.isoformat(),
            "monitoring_active": self._running,
            "last_check": now.isoformat(),
            "next_check": (now + timedelta(minutes=interval)).isoformat(),
        }

        with self._lock:
            self._metrics = metrics

        return metrics

    def _collect_disks(self, disk_threshold):
        """Return usage information for every relevant mounted partition.

        Pseudo filesystems, loop devices, partitions whose usage cannot be
        read and partitions smaller than 1 GB are skipped.

        Args:
            disk_threshold: Usage percentage above which a disk is critical.

        Returns:
            list[dict]: One entry per retained partition.
        """
        gb = self._BYTES_PER_GB
        disks = []
        for part in psutil.disk_partitions():
            if part.fstype in self._IGNORED_FS or part.device.startswith("/dev/loop"):
                continue
            try:
                usage = psutil.disk_usage(part.mountpoint)
            except OSError:  # includes PermissionError
                continue
            if usage.total < gb:
                continue
            disks.append({
                # Strip trailing backslashes from the device name
                # (presumably Windows drive names such as "C:\").
                "drive": part.device.rstrip("\\"),
                "mountpoint": part.mountpoint,
                "percent": round(usage.percent, 1),
                "total_gb": round(usage.total / gb, 1),
                "used_gb": round(usage.used / gb, 1),
                "free_gb": round(usage.free / gb, 1),
                "threshold": disk_threshold,
                "status": self._eval_status(usage.percent, disk_threshold),
            })
        return disks

    def _snapshot_processes(self):
        """Return one lightweight record per running process.

        The process table is walked a single time. Names and command lines
        are lower-cased so that pattern matching is case-insensitive.
        """
        attrs = ["pid", "name", "cmdline", "memory_info", "cpu_percent"]
        snapshot = []
        for proc in psutil.process_iter(attrs):
            try:
                info = proc.info
                mem_info = info["memory_info"]
                snapshot.append({
                    "pid": info["pid"],
                    "name": (info["name"] or "").lower(),
                    "cmdline": " ".join(info["cmdline"] or []).lower(),
                    "memory_mb": round(mem_info.rss / self._BYTES_PER_MB, 1) if mem_info else 0,
                    "cpu_percent": info["cpu_percent"] or 0,
                })
            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                continue
        return snapshot

    def _check_processes(self, process_configs):
        """Check the state of the watched processes.

        Args:
            process_configs: Iterable of dicts with the mandatory keys
                ``name`` and ``pattern`` and the optional keys ``enabled``
                (default True), ``memory_threshold_mb`` (default 0) and
                ``alert_on_down`` (default True).

        Returns:
            list[dict]: One status entry per configuration entry. A process
            matches when the lower-cased pattern is a substring of its name
            or of its command line.
        """
        process_configs = list(process_configs or [])

        # Scan the process table at most once per call, and only when some
        # entry can actually match. Rescanning per entry is wasteful, and
        # psutil's non-blocking cpu_percent is inaccurate when read again
        # almost immediately. An empty/blank pattern is ignored because it
        # would otherwise match every process ("" is in every string).
        needs_scan = any(
            cfg.get("enabled", True) and cfg["pattern"].strip()
            for cfg in process_configs
        )
        snapshot = self._snapshot_processes() if needs_scan else []

        results = []
        for proc_cfg in process_configs:
            pattern = proc_cfg["pattern"].lower()
            name = proc_cfg["name"]
            enabled = proc_cfg.get("enabled", True)
            mem_threshold = proc_cfg.get("memory_threshold_mb", 0)

            found = []
            if enabled and pattern.strip():
                found = [
                    entry for entry in snapshot
                    if pattern in entry["name"] or pattern in entry["cmdline"]
                ]

            total_memory = sum(entry["memory_mb"] for entry in found)
            total_cpu = sum(entry["cpu_percent"] for entry in found)

            # Memory status (only evaluated when a threshold is configured
            # and at least some memory usage was measured).
            mem_status = "ok"
            if mem_threshold > 0 and total_memory > 0:
                mem_status = self._eval_status(total_memory, mem_threshold, is_mb=True)

            results.append({
                "name": name,
                "pattern": proc_cfg["pattern"],
                "running": bool(found),
                "enabled": enabled,
                "alert_on_down": proc_cfg.get("alert_on_down", True),
                "instance_count": len(found),
                "total_memory_mb": round(total_memory, 1),
                "total_cpu_percent": round(total_cpu, 1),
                "memory_threshold_mb": mem_threshold,
                "memory_status": mem_status,
                "pids": [entry["pid"] for entry in found],
            })

        return results

    def _eval_status(self, value, threshold, is_mb=False):
        """Classify ``value`` relative to ``threshold``.

        Args:
            value: Measured value.
            threshold: Limit expressed in the same unit as ``value``.
            is_mb: Accepted for backward compatibility; it does not change
                the result.

        Returns:
            str: ``"critical"`` at or above the threshold, ``"warning"`` at
            or above 80 % of it, ``"ok"`` otherwise or when the threshold is
            not positive.
        """
        if threshold <= 0:
            return "ok"
        ratio = value / threshold
        if ratio >= 1.0:
            return "critical"
        if ratio >= 0.80:
            return "warning"
        return "ok"

    def check_and_alert(self, metrics):
        """Check the thresholds and send alerts where necessary.

        Only ``"critical"`` statuses (and watched processes that are down)
        trigger an alert, and each alert key is rate-limited by the
        ``alert_cooldown_minutes`` setting (default 30).

        Args:
            metrics: A snapshot shaped like the return value of
                :meth:`collect_metrics`.

        Returns:
            list[str]: The messages of the alerts sent during this call.
        """
        cooldown = self.config.config.get("alert_cooldown_minutes", 30)
        alerts_sent = []

        # CPU
        cpu = metrics["cpu"]
        if cpu["status"] == "critical":
            msg = f"CPU a {cpu['percent']}% (seuil: {cpu['threshold']}%)"
            self._alert_if_due("cpu", msg, cpu["percent"], cpu["threshold"], cooldown, alerts_sent)

        # RAM
        ram = metrics["ram"]
        if ram["status"] == "critical":
            msg = f"RAM a {ram['percent']}% (seuil: {ram['threshold']}%)"
            self._alert_if_due("ram", msg, ram["percent"], ram["threshold"], cooldown, alerts_sent)

        # Disks
        for disk in metrics["disks"]:
            if disk["status"] != "critical":
                continue
            key = f"disk_{disk['drive']}"
            msg = f"Disque {disk['drive']} a {disk['percent']}% (seuil: {disk['threshold']}%)"
            self._alert_if_due(key, msg, disk["percent"], disk["threshold"], cooldown, alerts_sent)

        # Processes
        for proc in metrics["processes"]:
            if not proc["enabled"]:
                continue

            # Process-down alert
            if proc["alert_on_down"] and not proc["running"]:
                key = f"process_down_{proc['name']}"
                msg = f"Processus '{proc['name']}' non detecte (pattern: {proc['pattern']})"
                self._alert_if_due(key, msg, 0, 0, cooldown, alerts_sent, alert_type="process_down")

            # Process memory alert
            if proc["memory_threshold_mb"] > 0 and proc["memory_status"] == "critical":
                key = f"process_mem_{proc['name']}"
                msg = (
                    f"Processus '{proc['name']}' utilise {proc['total_memory_mb']} Mo "
                    f"(seuil: {proc['memory_threshold_mb']} Mo)"
                )
                self._alert_if_due(
                    key, msg, proc["total_memory_mb"], proc["memory_threshold_mb"],
                    cooldown, alerts_sent,
                )

        return alerts_sent

    def _alert_if_due(self, key, message, value, threshold, cooldown, alerts_sent,
                      alert_type="threshold"):
        """Send the alert unless ``key`` is in cooldown; record it in ``alerts_sent``."""
        if self._should_alert(key, cooldown):
            self._send_and_log(key, message, value, threshold, alert_type=alert_type)
            alerts_sent.append(message)

    def _should_alert(self, key, cooldown_minutes):
        """Return True when no alert for ``key`` was sent within the cooldown."""
        last = self._last_alerts.get(key)
        if last and (datetime.now() - last) < timedelta(minutes=cooldown_minutes):
            return False
        return True

    def _send_and_log(self, key, message, value, threshold, alert_type="threshold"):
        """Persist the alert, email it and start the cooldown for ``key``.

        Args:
            key: Cooldown key identifying the alert source.
            message: Human-readable alert text.
            value: Measured value that triggered the alert.
            threshold: Configured limit for that value.
            alert_type: Alert category stored in the record.
        """
        now = datetime.now()
        with self._lock:
            # Fall back to the live hostname when nothing was collected yet.
            hostname = self._metrics.get("hostname", platform.node())

        # Record the alert
        alert = {
            "timestamp": now.isoformat(),
            "type": alert_type,
            "key": key,
            "message": message,
            "value": value,
            "threshold": threshold,
            "hostname": hostname,
        }
        self.config.save_alert(alert)

        # Send the email
        subject = f"[ALERTE] {hostname} - {message}"
        self.alerter.send_alert(subject, self._format_alert_body(alert))

        # Update the cooldown
        self._last_alerts[key] = now

    def _format_alert_body(self, alert):
        """Build the plain-text email body for ``alert``."""
        return (
            f"Alerte de supervision\n"
            f"{'=' * 40}\n\n"
            f"Serveur : {alert['hostname']}\n"
            f"Date : {alert['timestamp']}\n"
            f"Type : {alert['type']}\n\n"
            f"Message : {alert['message']}\n\n"
            f"{'=' * 40}\n"
            f"Supervision - Monitoring automatique"
        )

    # --- Monitoring thread ---

    def start(self):
        """Start the background monitoring thread if it is not already running."""
        thread = self._thread
        if self._running and thread is not None and thread.is_alive():
            return
        self._running = True
        # A fresh event per run: a thread from an earlier start() that is
        # still winding down keeps seeing its own (already set) event and
        # exits, instead of resuming because _running became True again.
        self._stop_event = threading.Event()
        self._thread = threading.Thread(
            target=self._monitoring_loop,
            args=(self._stop_event,),
            daemon=True,
        )
        self._thread.start()

    def stop(self):
        """Ask the background monitoring thread to stop (does not wait for it)."""
        self._running = False
        self._stop_event.set()

    def _monitoring_loop(self, stop_event=None):
        """Run checks every ``check_interval_minutes`` until asked to stop.

        Args:
            stop_event: Event that ends this loop when set. Defaults to the
                monitor's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event
        last_check = None  # monotonic time of the last check; None = never
        while self._running and not stop_event.is_set():
            interval = self.config.get("check_interval_minutes", 1) * 60
            # Monotonic clock: unaffected by wall-clock adjustments.
            if last_check is None or time.monotonic() - last_check >= interval:
                try:
                    metrics = self.collect_metrics()
                    self.check_and_alert(metrics)
                except Exception as e:
                    # Top-level boundary of the thread: report and keep going.
                    print(f"[Monitoring] Erreur: {e}")
                last_check = time.monotonic()
            # Wake up regularly to see whether a check is due; returns
            # immediately once stop() has been called.
            stop_event.wait(self._POLL_SECONDS)
|
||||
Reference in New Issue
Block a user