feat: init projet supervision — monitoring systeme Windows

Interface web Flask securisee pour surveiller CPU, RAM, disques
et processus (JVM, Nginx, Amadea Web 8 x64).
Alertes email SMTP configurables, seuils reglables, compilation
PyInstaller en .exe, installation service Windows via NSSM.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-03-26 09:48:35 +01:00
commit 61d17968a0
17 changed files with 1958 additions and 0 deletions

291
monitor.py Normal file
View File

@@ -0,0 +1,291 @@
"""Collecte des metriques systeme et surveillance des seuils."""
import platform
import threading
import time
from datetime import datetime, timedelta
import psutil
class SystemMonitor:
def __init__(self, config_manager, alerter):
self.config = config_manager
self.alerter = alerter
self._metrics = {}
self._lock = threading.Lock()
self._running = False
self._thread = None
self._last_alerts = {} # cle -> datetime derniere alerte
@property
def metrics(self):
with self._lock:
return dict(self._metrics)
def collect_metrics(self):
"""Collecte toutes les metriques systeme."""
cfg = self.config.config
thresholds = cfg["thresholds"]
# CPU
cpu_percent = psutil.cpu_percent(interval=1)
cpu_status = self._eval_status(cpu_percent, thresholds["cpu_percent"])
# RAM
ram = psutil.virtual_memory()
ram_status = self._eval_status(ram.percent, thresholds["ram_percent"])
# Disques
disks = []
IGNORED_FS = {"squashfs", "tmpfs", "devtmpfs", "overlay", "iso9660"}
for part in psutil.disk_partitions():
# Ignorer les pseudo-filesystems (loop, snap, tmpfs, etc.)
if part.fstype in IGNORED_FS:
continue
if part.device.startswith("/dev/loop"):
continue
try:
usage = psutil.disk_usage(part.mountpoint)
except (PermissionError, OSError):
continue
# Ignorer les partitions minuscules (< 1 Go)
if usage.total < 1024 ** 3:
continue
disk_status = self._eval_status(usage.percent, thresholds["disk_percent"])
disks.append({
"drive": part.device.rstrip("\\"),
"mountpoint": part.mountpoint,
"percent": round(usage.percent, 1),
"total_gb": round(usage.total / (1024 ** 3), 1),
"used_gb": round(usage.used / (1024 ** 3), 1),
"free_gb": round(usage.free / (1024 ** 3), 1),
"threshold": thresholds["disk_percent"],
"status": disk_status,
})
# Processus surveilles
processes = self._check_processes(cfg.get("processes", []))
# Infos systeme
boot_time = datetime.fromtimestamp(psutil.boot_time())
uptime = datetime.now() - boot_time
now = datetime.now()
interval = cfg.get("check_interval_minutes", 1)
metrics = {
"timestamp": now.isoformat(),
"hostname": platform.node(),
"os": f"{platform.system()} {platform.release()}",
"cpu": {
"percent": cpu_percent,
"cores": psutil.cpu_count(),
"threshold": thresholds["cpu_percent"],
"status": cpu_status,
},
"ram": {
"percent": round(ram.percent, 1),
"total_gb": round(ram.total / (1024 ** 3), 1),
"used_gb": round(ram.used / (1024 ** 3), 1),
"available_gb": round(ram.available / (1024 ** 3), 1),
"threshold": thresholds["ram_percent"],
"status": ram_status,
},
"disks": disks,
"processes": processes,
"uptime": str(uptime).split(".")[0],
"boot_time": boot_time.isoformat(),
"monitoring_active": self._running,
"last_check": now.isoformat(),
"next_check": (now + timedelta(minutes=interval)).isoformat(),
}
with self._lock:
self._metrics = metrics
return metrics
def _check_processes(self, process_configs):
"""Verifie l'etat des processus surveilles."""
results = []
for proc_cfg in process_configs:
pattern = proc_cfg["pattern"].lower()
name = proc_cfg["name"]
enabled = proc_cfg.get("enabled", True)
mem_threshold = proc_cfg.get("memory_threshold_mb", 0)
found = []
if enabled:
for proc in psutil.process_iter(["pid", "name", "cmdline", "memory_info", "cpu_percent"]):
try:
pname = (proc.info["name"] or "").lower()
cmdline = " ".join(proc.info["cmdline"] or []).lower()
if pattern in pname or pattern in cmdline:
mem_mb = round(proc.info["memory_info"].rss / (1024 ** 2), 1) if proc.info["memory_info"] else 0
found.append({
"pid": proc.info["pid"],
"memory_mb": mem_mb,
"cpu_percent": proc.info["cpu_percent"] or 0,
})
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
continue
total_memory = sum(p["memory_mb"] for p in found)
total_cpu = sum(p["cpu_percent"] for p in found)
running = len(found) > 0
# Statut memoire
mem_status = "ok"
if mem_threshold > 0 and total_memory > 0:
mem_status = self._eval_status(total_memory, mem_threshold, is_mb=True)
results.append({
"name": name,
"pattern": proc_cfg["pattern"],
"running": running,
"enabled": enabled,
"alert_on_down": proc_cfg.get("alert_on_down", True),
"instance_count": len(found),
"total_memory_mb": round(total_memory, 1),
"total_cpu_percent": round(total_cpu, 1),
"memory_threshold_mb": mem_threshold,
"memory_status": mem_status,
"pids": [p["pid"] for p in found],
})
return results
def _eval_status(self, value, threshold, is_mb=False):
if is_mb:
ratio = value / threshold if threshold > 0 else 0
else:
ratio = value / threshold if threshold > 0 else 0
if ratio >= 1.0:
return "critical"
elif ratio >= 0.80:
return "warning"
return "ok"
def check_and_alert(self, metrics):
"""Verifie les seuils et envoie des alertes si necessaire."""
cfg = self.config.config
cooldown = cfg.get("alert_cooldown_minutes", 30)
alerts_sent = []
# CPU
if metrics["cpu"]["status"] == "critical":
msg = f"CPU a {metrics['cpu']['percent']}% (seuil: {metrics['cpu']['threshold']}%)"
if self._should_alert("cpu", cooldown):
self._send_and_log("cpu", msg, metrics["cpu"]["percent"], metrics["cpu"]["threshold"])
alerts_sent.append(msg)
# RAM
if metrics["ram"]["status"] == "critical":
msg = f"RAM a {metrics['ram']['percent']}% (seuil: {metrics['ram']['threshold']}%)"
if self._should_alert("ram", cooldown):
self._send_and_log("ram", msg, metrics["ram"]["percent"], metrics["ram"]["threshold"])
alerts_sent.append(msg)
# Disques
for disk in metrics["disks"]:
key = f"disk_{disk['drive']}"
if disk["status"] == "critical":
msg = f"Disque {disk['drive']} a {disk['percent']}% (seuil: {disk['threshold']}%)"
if self._should_alert(key, cooldown):
self._send_and_log(key, msg, disk["percent"], disk["threshold"])
alerts_sent.append(msg)
# Processus
for proc in metrics["processes"]:
if not proc["enabled"]:
continue
# Alerte processus arrete
if proc["alert_on_down"] and not proc["running"]:
key = f"process_down_{proc['name']}"
msg = f"Processus '{proc['name']}' non detecte (pattern: {proc['pattern']})"
if self._should_alert(key, cooldown):
self._send_and_log(key, msg, 0, 0, alert_type="process_down")
alerts_sent.append(msg)
# Alerte memoire processus
if proc["memory_threshold_mb"] > 0 and proc["memory_status"] == "critical":
key = f"process_mem_{proc['name']}"
msg = (
f"Processus '{proc['name']}' utilise {proc['total_memory_mb']} Mo "
f"(seuil: {proc['memory_threshold_mb']} Mo)"
)
if self._should_alert(key, cooldown):
self._send_and_log(key, msg, proc["total_memory_mb"], proc["memory_threshold_mb"])
alerts_sent.append(msg)
return alerts_sent
def _should_alert(self, key, cooldown_minutes):
now = datetime.now()
last = self._last_alerts.get(key)
if last and (now - last) < timedelta(minutes=cooldown_minutes):
return False
return True
def _send_and_log(self, key, message, value, threshold, alert_type="threshold"):
now = datetime.now()
hostname = self._metrics.get("hostname", platform.node())
# Enregistrer l'alerte
alert = {
"timestamp": now.isoformat(),
"type": alert_type,
"key": key,
"message": message,
"value": value,
"threshold": threshold,
"hostname": hostname,
}
self.config.save_alert(alert)
# Envoyer l'email
subject = f"[ALERTE] {hostname} - {message}"
self.alerter.send_alert(subject, self._format_alert_body(alert))
# Mettre a jour le cooldown
self._last_alerts[key] = now
def _format_alert_body(self, alert):
return (
f"Alerte de supervision\n"
f"{'=' * 40}\n\n"
f"Serveur : {alert['hostname']}\n"
f"Date : {alert['timestamp']}\n"
f"Type : {alert['type']}\n\n"
f"Message : {alert['message']}\n\n"
f"{'=' * 40}\n"
f"Supervision - Monitoring automatique"
)
# --- Thread de monitoring ---
def start(self):
if self._running:
return
self._running = True
self._thread = threading.Thread(target=self._monitoring_loop, daemon=True)
self._thread.start()
def stop(self):
self._running = False
def _monitoring_loop(self):
last_check = 0
while self._running:
interval = self.config.get("check_interval_minutes", 1) * 60
elapsed = time.time() - last_check
if elapsed >= interval:
try:
metrics = self.collect_metrics()
self.check_and_alert(metrics)
except Exception as e:
print(f"[Monitoring] Erreur: {e}")
last_check = time.time()
time.sleep(5) # Verifie toutes les 5s si c'est le moment