feat: init projet supervision — monitoring systeme Windows

Interface web Flask securisee pour surveiller CPU, RAM, disques et processus (JVM, Nginx, Amadea Web 8 x64). Alertes email SMTP configurables, seuils reglables, compilation PyInstaller en .exe, installation service Windows via NSSM.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 09:48:35 +01:00
commit 61d17968a0
17 changed files with 1958 additions and 0 deletions
--- a/monitor.py
+++ b/monitor.py
@@ -0,0 +1,291 @@
+"""Collecte des metriques systeme et surveillance des seuils."""
+
+import platform
+import threading
+import time
+from datetime import datetime, timedelta
+
+import psutil
+
+
+class SystemMonitor:
+    """Collect system metrics and raise alerts when thresholds are exceeded."""
+
+    def __init__(self, config_manager, alerter):
+        """Store the collaborators and start in the stopped state.
+
+        Args:
+            config_manager: configuration object, kept as ``self.config``.
+            alerter: object used to send alerts, kept as ``self.alerter``.
+        """
+        # Collaborators supplied by the caller
+        self.alerter = alerter
+        self.config = config_manager
+
+        # Background thread state: nothing runs until start() is called
+        self._thread = None
+        self._running = False
+
+        # Latest metrics snapshot and the lock guarding access to it
+        self._lock = threading.Lock()
+        self._metrics = {}
+
+        # Maps an alert key to the datetime of the last alert for that key
+        self._last_alerts = {}
+
+    @property
+    def metrics(self):
+        """Return a shallow copy of the most recent metrics snapshot.
+
+        The copy is taken while holding the lock; nested values are shared
+        with the stored snapshot.
+        """
+        with self._lock:
+            snapshot = self._metrics.copy()
+        return snapshot
+
+    def collect_metrics(self):
+        """Collect a snapshot of all system metrics and publish it.
+
+        Samples CPU, RAM, disk partitions and the configured processes, builds
+        the snapshot dictionary, stores it as the current metrics under the
+        lock and returns it.
+
+        Returns:
+            dict: snapshot with the keys ``timestamp``, ``hostname``, ``os``,
+            ``cpu``, ``ram``, ``disks``, ``processes``, ``uptime``,
+            ``boot_time``, ``monitoring_active``, ``last_check`` and
+            ``next_check``.
+
+        Raises:
+            KeyError: if the configuration has no ``thresholds`` section or it
+                lacks ``cpu_percent``, ``ram_percent`` or ``disk_percent``.
+        """
+        cfg = self.config.config
+        thresholds = cfg["thresholds"]
+        gib = 1024 ** 3
+
+        # interval=1 asks psutil to measure CPU usage over one second, so this
+        # call blocks for that long.
+        cpu_percent = psutil.cpu_percent(interval=1)
+        ram = psutil.virtual_memory()
+        disks = self._collect_disks(thresholds["disk_percent"])
+        processes = self._check_processes(cfg.get("processes", []))
+
+        # Read the clock once so that uptime, timestamp, last_check and
+        # next_check all describe the same instant.
+        now = datetime.now()
+        boot_time = datetime.fromtimestamp(psutil.boot_time())
+        interval = cfg.get("check_interval_minutes", 1)
+
+        metrics = {
+            "timestamp": now.isoformat(),
+            "hostname": platform.node(),
+            "os": f"{platform.system()} {platform.release()}",
+            "cpu": {
+                "percent": cpu_percent,
+                "cores": psutil.cpu_count(),
+                "threshold": thresholds["cpu_percent"],
+                "status": self._eval_status(cpu_percent, thresholds["cpu_percent"]),
+            },
+            "ram": {
+                "percent": round(ram.percent, 1),
+                "total_gb": round(ram.total / gib, 1),
+                "used_gb": round(ram.used / gib, 1),
+                "available_gb": round(ram.available / gib, 1),
+                "threshold": thresholds["ram_percent"],
+                "status": self._eval_status(ram.percent, thresholds["ram_percent"]),
+            },
+            "disks": disks,
+            "processes": processes,
+            # Drop the fractional seconds from the timedelta representation
+            "uptime": str(now - boot_time).split(".")[0],
+            "boot_time": boot_time.isoformat(),
+            "monitoring_active": self._running,
+            "last_check": now.isoformat(),
+            "next_check": (now + timedelta(minutes=interval)).isoformat(),
+        }
+
+        with self._lock:
+            self._metrics = metrics
+
+        return metrics
+
+    def _collect_disks(self, threshold):
+        """Return usage details for every real partition of at least 1 GiB.
+
+        Args:
+            threshold: disk usage percentage used to evaluate each status.
+
+        Returns:
+            list[dict]: one entry per retained partition.
+        """
+        # Pseudo-filesystems (loop, snap, tmpfs, ...) are not real storage
+        ignored_fs = {"squashfs", "tmpfs", "devtmpfs", "overlay", "iso9660"}
+        gib = 1024 ** 3
+
+        disks = []
+        for part in psutil.disk_partitions():
+            if part.fstype in ignored_fs or part.device.startswith("/dev/loop"):
+                continue
+            try:
+                usage = psutil.disk_usage(part.mountpoint)
+            except OSError:
+                # Unreadable mount point (PermissionError is an OSError too)
+                continue
+            # Skip tiny partitions (< 1 GiB)
+            if usage.total < gib:
+                continue
+            disks.append({
+                "drive": part.device.rstrip("\\"),
+                "mountpoint": part.mountpoint,
+                "percent": round(usage.percent, 1),
+                "total_gb": round(usage.total / gib, 1),
+                "used_gb": round(usage.used / gib, 1),
+                "free_gb": round(usage.free / gib, 1),
+                "threshold": threshold,
+                "status": self._eval_status(usage.percent, threshold),
+            })
+        return disks
+
+    def _check_processes(self, process_configs):
+        """Report the state of every configured process.
+
+        A process matches an entry when the entry's lowercase ``pattern`` is a
+        substring of its lowercase name or command line. The process table is
+        scanned at most once per call and shared by all entries, so the
+        per-process CPU figures are not re-sampled a few milliseconds apart
+        for each additional entry.
+
+        Args:
+            process_configs: list of dicts with the keys ``name`` and
+                ``pattern`` and the optional keys ``enabled`` (default True),
+                ``memory_threshold_mb`` (default 0) and ``alert_on_down``
+                (default True).
+
+        Returns:
+            list[dict]: one result per entry, in the same order.
+        """
+        # Scan only when at least one enabled entry has a usable pattern
+        scan_needed = any(
+            cfg.get("enabled", True) and cfg["pattern"].strip()
+            for cfg in process_configs
+        )
+        snapshot = self._snapshot_processes() if scan_needed else []
+
+        results = []
+        for proc_cfg in process_configs:
+            pattern = proc_cfg["pattern"].lower()
+            enabled = proc_cfg.get("enabled", True)
+            mem_threshold = proc_cfg.get("memory_threshold_mb", 0)
+
+            # A blank pattern is a substring of every name, so it would match
+            # the whole process table; treat it as matching nothing instead.
+            if enabled and pattern.strip():
+                found = [
+                    p for p in snapshot
+                    if pattern in p["name"] or pattern in p["cmdline"]
+                ]
+            else:
+                found = []
+
+            total_memory = sum(p["memory_mb"] for p in found)
+            total_cpu = sum(p["cpu_percent"] for p in found)
+
+            # Memory status is only evaluated when a threshold is configured
+            # and at least some memory is in use.
+            mem_status = "ok"
+            if mem_threshold > 0 and total_memory > 0:
+                mem_status = self._eval_status(total_memory, mem_threshold, is_mb=True)
+
+            results.append({
+                "name": proc_cfg["name"],
+                "pattern": proc_cfg["pattern"],
+                "running": len(found) > 0,
+                "enabled": enabled,
+                "alert_on_down": proc_cfg.get("alert_on_down", True),
+                "instance_count": len(found),
+                "total_memory_mb": round(total_memory, 1),
+                "total_cpu_percent": round(total_cpu, 1),
+                "memory_threshold_mb": mem_threshold,
+                "memory_status": mem_status,
+                "pids": [p["pid"] for p in found],
+            })
+
+        return results
+
+    @staticmethod
+    def _snapshot_processes():
+        """Return one record per running process, with lowercase name/cmdline."""
+        attrs = ["pid", "name", "cmdline", "memory_info", "cpu_percent"]
+        snapshot = []
+        for proc in psutil.process_iter(attrs):
+            try:
+                info = proc.info
+                mem_info = info["memory_info"]
+                snapshot.append({
+                    "pid": info["pid"],
+                    "name": (info["name"] or "").lower(),
+                    "cmdline": " ".join(info["cmdline"] or []).lower(),
+                    # RSS converted from bytes to MiB; 0 when unavailable
+                    "memory_mb": round(mem_info.rss / (1024 ** 2), 1) if mem_info else 0,
+                    "cpu_percent": info["cpu_percent"] or 0,
+                })
+            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+                # The process vanished or cannot be inspected; skip it
+                continue
+        return snapshot
+
+    def _eval_status(self, value, threshold, is_mb=False):
+        """Classify ``value`` against ``threshold``.
+
+        Args:
+            value: measured value, in the same unit as ``threshold``.
+            threshold: limit at which the value becomes critical. ``None`` or
+                a non-positive number disables the check.
+            is_mb: kept for backward compatibility; the unit has no effect on
+                the ratio, so the flag is ignored.
+
+        Returns:
+            str: ``"critical"`` when the value reaches the threshold,
+            ``"warning"`` from 80 % of the threshold, ``"ok"`` otherwise.
+        """
+        # No usable threshold means there is nothing to compare against
+        if threshold is None or threshold <= 0:
+            return "ok"
+        ratio = value / threshold
+        if ratio >= 1.0:
+            return "critical"
+        if ratio >= 0.80:
+            return "warning"
+        return "ok"
+
+    def check_and_alert(self, metrics):
+        """Send an alert for every critical metric that is not in cooldown.
+
+        Args:
+            metrics: snapshot with the keys ``cpu``, ``ram``, ``disks`` and
+                ``processes``, as produced by ``collect_metrics``.
+
+        Returns:
+            list[str]: messages of the alerts sent during this call.
+        """
+        cooldown = self.config.config.get("alert_cooldown_minutes", 30)
+        sent = []
+
+        def raise_alert(key, text, value, threshold, **extra):
+            # Nothing is logged or sent while the key is still in cooldown
+            if not self._should_alert(key, cooldown):
+                return
+            self._send_and_log(key, text, value, threshold, **extra)
+            sent.append(text)
+
+        # CPU and RAM entries have the same shape and message layout
+        for key, label in (("cpu", "CPU"), ("ram", "RAM")):
+            entry = metrics[key]
+            if entry["status"] != "critical":
+                continue
+            raise_alert(
+                key,
+                f"{label} a {entry['percent']}% (seuil: {entry['threshold']}%)",
+                entry["percent"],
+                entry["threshold"],
+            )
+
+        # Disks: one cooldown key per drive
+        for disk in metrics["disks"]:
+            disk_key = f"disk_{disk['drive']}"
+            if disk["status"] != "critical":
+                continue
+            raise_alert(
+                disk_key,
+                f"Disque {disk['drive']} a {disk['percent']}% (seuil: {disk['threshold']}%)",
+                disk["percent"],
+                disk["threshold"],
+            )
+
+        # Processes: disabled entries never alert
+        for proc in metrics["processes"]:
+            if not proc["enabled"]:
+                continue
+
+            # Process expected to run but no instance was found
+            if proc["alert_on_down"] and not proc["running"]:
+                raise_alert(
+                    f"process_down_{proc['name']}",
+                    f"Processus '{proc['name']}' non detecte (pattern: {proc['pattern']})",
+                    0,
+                    0,
+                    alert_type="process_down",
+                )
+
+            # Total process memory reached its configured limit
+            if proc["memory_threshold_mb"] > 0 and proc["memory_status"] == "critical":
+                raise_alert(
+                    f"process_mem_{proc['name']}",
+                    f"Processus '{proc['name']}' utilise {proc['total_memory_mb']} Mo "
+                    f"(seuil: {proc['memory_threshold_mb']} Mo)",
+                    proc["total_memory_mb"],
+                    proc["memory_threshold_mb"],
+                )
+
+        return sent
+
+    def _should_alert(self, key, cooldown_minutes):
+        """Tell whether an alert for ``key`` may be sent now.
+
+        Returns True when no alert was recorded for the key yet, or when at
+        least ``cooldown_minutes`` have passed since the last one.
+        """
+        current = datetime.now()
+        previous = self._last_alerts.get(key)
+        if not previous:
+            return True
+        return (current - previous) >= timedelta(minutes=cooldown_minutes)
+
+    def _send_and_log(self, key, message, value, threshold, alert_type="threshold"):
+        """Record an alert, send it by email and start its cooldown.
+
+        Args:
+            key: cooldown key identifying the alert source.
+            message: alert text, also used in the email subject.
+            value: measured value stored with the alert.
+            threshold: limit stored with the alert.
+            alert_type: category stored in the alert's ``type`` field.
+        """
+        sent_at = datetime.now()
+        # Prefer the hostname of the current snapshot, else ask the platform
+        host = self._metrics.get("hostname", platform.node())
+
+        record = dict(
+            timestamp=sent_at.isoformat(),
+            type=alert_type,
+            key=key,
+            message=message,
+            value=value,
+            threshold=threshold,
+            hostname=host,
+        )
+
+        # Persist first, then notify
+        self.config.save_alert(record)
+        body = self._format_alert_body(record)
+        self.alerter.send_alert(f"[ALERTE] {host} - {message}", body)
+
+        # The cooldown for this key starts once the alert has been sent
+        self._last_alerts[key] = sent_at
+
+    def _format_alert_body(self, alert):
+        """Build the plain-text email body for an alert record.
+
+        Args:
+            alert: dict with the keys ``hostname``, ``timestamp``, ``type``
+                and ``message``.
+
+        Returns:
+            str: the multi-line body, without a trailing newline.
+        """
+        separator = "=" * 40
+        lines = [
+            "Alerte de supervision",
+            separator,
+            "",
+            f"Serveur : {alert['hostname']}",
+            f"Date    : {alert['timestamp']}",
+            f"Type    : {alert['type']}",
+            "",
+            f"Message : {alert['message']}",
+            "",
+            separator,
+            "Supervision - Monitoring automatique",
+        ]
+        return "\n".join(lines)
+
+    # --- Thread de monitoring ---
+
+    def start(self):
+        """Start the background monitoring thread; no-op if already running."""
+        if self._running:
+            return
+        self._running = True
+        # Each run gets its own stop event. A loop left over from an earlier
+        # start()/stop() cycle keeps its own (already set) event, so setting
+        # _running back to True here cannot revive it.
+        self._stop_event = threading.Event()
+        self._thread = threading.Thread(
+            target=self._monitoring_loop,
+            args=(self._stop_event,),
+            daemon=True,
+        )
+        self._thread.start()
+
+    def stop(self):
+        """Ask the monitoring thread to stop; returns without joining it."""
+        self._running = False
+        # _stop_event only exists once start() has been called
+        stop_event = getattr(self, "_stop_event", None)
+        if stop_event is not None:
+            stop_event.set()
+
+    def _monitoring_loop(self, stop_event=None):
+        """Collect metrics and send alerts at the configured interval.
+
+        Runs until ``stop_event`` is set or ``self._running`` becomes False.
+        Errors raised by a monitoring cycle are printed and the loop goes on.
+
+        Args:
+            stop_event: ``threading.Event`` that ends this particular loop.
+                When omitted, only ``self._running`` controls the loop.
+        """
+        if stop_event is None:
+            stop_event = threading.Event()
+
+        # None forces a check on the first pass. Elapsed time is measured on
+        # the monotonic clock so wall-clock adjustments cannot stall or
+        # bunch up the checks.
+        last_check = None
+        while self._running and not stop_event.is_set():
+            interval = self.config.get("check_interval_minutes", 1) * 60
+            if last_check is None or time.monotonic() - last_check >= interval:
+                try:
+                    metrics = self.collect_metrics()
+                    self.check_and_alert(metrics)
+                except Exception as e:
+                    # Top-level boundary of the thread: report and keep going
+                    print(f"[Monitoring] Erreur: {e}")
+                last_check = time.monotonic()
+            # Re-check every 5 s whether a cycle is due; wakes up at once
+            # when stop() sets the event.
+            stop_event.wait(5)