Interface web Flask sécurisée pour surveiller CPU, RAM, disques et processus (JVM, Nginx, Amadea Web 8 x64). Alertes email SMTP configurables, seuils réglables, compilation PyInstaller en .exe, installation service Windows via NSSM. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
292 lines
11 KiB
Python
292 lines
11 KiB
Python
"""Collecte des metriques systeme et surveillance des seuils."""
|
|
|
|
import platform
|
|
import threading
|
|
import time
|
|
from datetime import datetime, timedelta
|
|
|
|
import psutil
|
|
|
|
|
|
class SystemMonitor:
    """Collect system metrics (CPU, RAM, disks, processes) and raise
    threshold alerts through the configured alerter.

    A background daemon thread (see :meth:`start`) periodically calls
    :meth:`collect_metrics` then :meth:`check_and_alert`; the latest
    snapshot is exposed via :attr:`metrics` behind a lock so it can be
    read safely from other threads (e.g. a web request handler).
    """

    # Pseudo-filesystems that must never be reported as real disks.
    _IGNORED_FS = {"squashfs", "tmpfs", "devtmpfs", "overlay", "iso9660"}

    def __init__(self, config_manager, alerter):
        """
        Args:
            config_manager: object exposing a ``config`` dict (expected keys:
                ``thresholds``, ``processes``, ``check_interval_minutes``,
                ``alert_cooldown_minutes``) and a ``save_alert(alert)`` method.
            alerter: object exposing ``send_alert(subject, body)``.
        """
        self.config = config_manager
        self.alerter = alerter
        self._metrics = {}      # latest snapshot; guarded by self._lock
        self._lock = threading.Lock()
        self._running = False
        self._thread = None
        self._last_alerts = {}  # alert key -> datetime of last send (cooldown)

    @property
    def metrics(self):
        """Shallow copy of the latest metrics snapshot (thread-safe)."""
        with self._lock:
            return dict(self._metrics)

    def collect_metrics(self):
        """Collect all system metrics and store them as the latest snapshot.

        Returns:
            dict: the freshly collected metrics; also available afterwards
            through :attr:`metrics`.
        """
        cfg = self.config.config
        thresholds = cfg["thresholds"]

        # CPU (interval=1 blocks ~1 second to produce a meaningful sample)
        cpu_percent = psutil.cpu_percent(interval=1)
        cpu_status = self._eval_status(cpu_percent, thresholds["cpu_percent"])

        # RAM
        ram = psutil.virtual_memory()
        ram_status = self._eval_status(ram.percent, thresholds["ram_percent"])

        # Disks
        disks = self._collect_disks(thresholds["disk_percent"])

        # Monitored processes
        processes = self._check_processes(cfg.get("processes", []))

        # System info
        boot_time = datetime.fromtimestamp(psutil.boot_time())
        uptime = datetime.now() - boot_time

        now = datetime.now()
        interval = cfg.get("check_interval_minutes", 1)

        metrics = {
            "timestamp": now.isoformat(),
            "hostname": platform.node(),
            "os": f"{platform.system()} {platform.release()}",
            "cpu": {
                "percent": cpu_percent,
                "cores": psutil.cpu_count(),
                "threshold": thresholds["cpu_percent"],
                "status": cpu_status,
            },
            "ram": {
                "percent": round(ram.percent, 1),
                "total_gb": round(ram.total / (1024 ** 3), 1),
                "used_gb": round(ram.used / (1024 ** 3), 1),
                "available_gb": round(ram.available / (1024 ** 3), 1),
                "threshold": thresholds["ram_percent"],
                "status": ram_status,
            },
            "disks": disks,
            "processes": processes,
            # str(timedelta) is "H:MM:SS.ffffff"; drop the microseconds part
            "uptime": str(uptime).split(".")[0],
            "boot_time": boot_time.isoformat(),
            "monitoring_active": self._running,
            "last_check": now.isoformat(),
            "next_check": (now + timedelta(minutes=interval)).isoformat(),
        }

        with self._lock:
            self._metrics = metrics

        return metrics

    def _collect_disks(self, disk_threshold):
        """Return usage entries for real disks, skipping pseudo/tiny filesystems."""
        disks = []
        for part in psutil.disk_partitions():
            # Skip pseudo-filesystems (snap loop devices, tmpfs, overlays, ...)
            if part.fstype in self._IGNORED_FS:
                continue
            if part.device.startswith("/dev/loop"):
                continue
            try:
                usage = psutil.disk_usage(part.mountpoint)
            except (PermissionError, OSError):
                # Mountpoint unreadable (empty CD-ROM drive, access denied, ...)
                continue
            # Skip tiny partitions (< 1 GiB)
            if usage.total < 1024 ** 3:
                continue
            disks.append({
                "drive": part.device.rstrip("\\"),
                "mountpoint": part.mountpoint,
                "percent": round(usage.percent, 1),
                "total_gb": round(usage.total / (1024 ** 3), 1),
                "used_gb": round(usage.used / (1024 ** 3), 1),
                "free_gb": round(usage.free / (1024 ** 3), 1),
                "threshold": disk_threshold,
                "status": self._eval_status(usage.percent, disk_threshold),
            })
        return disks

    def _check_processes(self, process_configs):
        """Check the state of each configured process pattern.

        Args:
            process_configs: list of dicts with at least ``name`` and
                ``pattern``; optional ``enabled``, ``memory_threshold_mb``
                and ``alert_on_down``.

        Returns:
            list[dict]: one summary entry per configured process (instance
            count, aggregated memory/CPU, status flags, matching PIDs).
        """
        results = []
        for proc_cfg in process_configs:
            pattern = proc_cfg["pattern"].lower()
            name = proc_cfg["name"]
            enabled = proc_cfg.get("enabled", True)
            mem_threshold = proc_cfg.get("memory_threshold_mb", 0)

            found = []
            if enabled:
                # Case-insensitive substring match on process name or cmdline.
                for proc in psutil.process_iter(["pid", "name", "cmdline", "memory_info", "cpu_percent"]):
                    try:
                        pname = (proc.info["name"] or "").lower()
                        cmdline = " ".join(proc.info["cmdline"] or []).lower()
                        if pattern in pname or pattern in cmdline:
                            mem_mb = round(proc.info["memory_info"].rss / (1024 ** 2), 1) if proc.info["memory_info"] else 0
                            found.append({
                                "pid": proc.info["pid"],
                                "memory_mb": mem_mb,
                                "cpu_percent": proc.info["cpu_percent"] or 0,
                            })
                    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                        # Process vanished or is inaccessible: skip it.
                        continue

            total_memory = sum(p["memory_mb"] for p in found)
            total_cpu = sum(p["cpu_percent"] for p in found)
            running = len(found) > 0

            # Aggregate memory status (a threshold of 0 disables the check)
            mem_status = "ok"
            if mem_threshold > 0 and total_memory > 0:
                mem_status = self._eval_status(total_memory, mem_threshold, is_mb=True)

            results.append({
                "name": name,
                "pattern": proc_cfg["pattern"],
                "running": running,
                "enabled": enabled,
                "alert_on_down": proc_cfg.get("alert_on_down", True),
                "instance_count": len(found),
                "total_memory_mb": round(total_memory, 1),
                "total_cpu_percent": round(total_cpu, 1),
                "memory_threshold_mb": mem_threshold,
                "memory_status": mem_status,
                "pids": [p["pid"] for p in found],
            })

        return results

    def _eval_status(self, value, threshold, is_mb=False):
        """Classify *value* against *threshold*: "ok", "warning" or "critical".

        "warning" starts at 80% of the threshold, "critical" at 100%. A
        non-positive threshold disables the check (always "ok").

        Args:
            is_mb: kept for backward compatibility with existing callers;
                the ratio computation is identical whatever the unit.
        """
        # Bug fix: the original duplicated the exact same expression in both
        # branches of an if/else on is_mb; a single computation is equivalent.
        ratio = value / threshold if threshold > 0 else 0
        if ratio >= 1.0:
            return "critical"
        if ratio >= 0.80:
            return "warning"
        return "ok"

    def check_and_alert(self, metrics):
        """Check thresholds in *metrics* and send the alerts that are due.

        Returns:
            list[str]: the alert messages actually sent (cooldown-filtered).
        """
        cfg = self.config.config
        cooldown = cfg.get("alert_cooldown_minutes", 30)
        alerts_sent = []

        # CPU
        if metrics["cpu"]["status"] == "critical":
            msg = f"CPU a {metrics['cpu']['percent']}% (seuil: {metrics['cpu']['threshold']}%)"
            self._maybe_alert("cpu", msg, metrics["cpu"]["percent"],
                              metrics["cpu"]["threshold"], cooldown, alerts_sent)

        # RAM
        if metrics["ram"]["status"] == "critical":
            msg = f"RAM a {metrics['ram']['percent']}% (seuil: {metrics['ram']['threshold']}%)"
            self._maybe_alert("ram", msg, metrics["ram"]["percent"],
                              metrics["ram"]["threshold"], cooldown, alerts_sent)

        # Disks
        for disk in metrics["disks"]:
            if disk["status"] == "critical":
                key = f"disk_{disk['drive']}"
                msg = f"Disque {disk['drive']} a {disk['percent']}% (seuil: {disk['threshold']}%)"
                self._maybe_alert(key, msg, disk["percent"], disk["threshold"],
                                  cooldown, alerts_sent)

        # Processes
        for proc in metrics["processes"]:
            if not proc["enabled"]:
                continue

            # Process-down alert
            if proc["alert_on_down"] and not proc["running"]:
                key = f"process_down_{proc['name']}"
                msg = f"Processus '{proc['name']}' non detecte (pattern: {proc['pattern']})"
                self._maybe_alert(key, msg, 0, 0, cooldown, alerts_sent,
                                  alert_type="process_down")

            # Process-memory alert
            if proc["memory_threshold_mb"] > 0 and proc["memory_status"] == "critical":
                key = f"process_mem_{proc['name']}"
                msg = (
                    f"Processus '{proc['name']}' utilise {proc['total_memory_mb']} Mo "
                    f"(seuil: {proc['memory_threshold_mb']} Mo)"
                )
                self._maybe_alert(key, msg, proc["total_memory_mb"],
                                  proc["memory_threshold_mb"], cooldown, alerts_sent)

        return alerts_sent

    def _maybe_alert(self, key, message, value, threshold, cooldown, alerts_sent,
                     alert_type="threshold"):
        """Send and record the alert for *key* unless it is still in cooldown."""
        if self._should_alert(key, cooldown):
            self._send_and_log(key, message, value, threshold, alert_type=alert_type)
            alerts_sent.append(message)

    def _should_alert(self, key, cooldown_minutes):
        """Return True if no alert for *key* was sent within the cooldown window."""
        now = datetime.now()
        last = self._last_alerts.get(key)
        if last and (now - last) < timedelta(minutes=cooldown_minutes):
            return False
        return True

    def _send_and_log(self, key, message, value, threshold, alert_type="threshold"):
        """Persist the alert via the config manager, email it, start the cooldown."""
        now = datetime.now()
        # Thread-safety fix: read the snapshot under the lock, since this may
        # run concurrently with collect_metrics() replacing self._metrics.
        with self._lock:
            hostname = self._metrics.get("hostname", platform.node())

        # Record the alert
        alert = {
            "timestamp": now.isoformat(),
            "type": alert_type,
            "key": key,
            "message": message,
            "value": value,
            "threshold": threshold,
            "hostname": hostname,
        }
        self.config.save_alert(alert)

        # Email notification
        subject = f"[ALERTE] {hostname} - {message}"
        self.alerter.send_alert(subject, self._format_alert_body(alert))

        # Start/refresh the cooldown for this alert key
        self._last_alerts[key] = now

    def _format_alert_body(self, alert):
        """Render the plain-text email body for *alert*."""
        return (
            f"Alerte de supervision\n"
            f"{'=' * 40}\n\n"
            f"Serveur : {alert['hostname']}\n"
            f"Date : {alert['timestamp']}\n"
            f"Type : {alert['type']}\n\n"
            f"Message : {alert['message']}\n\n"
            f"{'=' * 40}\n"
            f"Supervision - Monitoring automatique"
        )

    # --- Monitoring thread ---

    def start(self):
        """Start the background monitoring thread (no-op if already running)."""
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(target=self._monitoring_loop, daemon=True)
        self._thread.start()

    def stop(self):
        """Ask the monitoring loop to exit (the daemon thread wakes within ~5s)."""
        self._running = False

    def _monitoring_loop(self):
        """Poll on the configured interval until :meth:`stop` is called."""
        last_check = 0  # epoch seconds of last check; 0 forces an immediate one
        while self._running:
            # Re-read the interval each pass so config changes apply live.
            # Consistency fix: go through .config like every other method,
            # instead of assuming the config manager itself exposes get().
            interval = self.config.config.get("check_interval_minutes", 1) * 60
            elapsed = time.time() - last_check
            if elapsed >= interval:
                try:
                    metrics = self.collect_metrics()
                    self.check_and_alert(metrics)
                except Exception as e:
                    # Never let one failed cycle kill the monitoring thread.
                    print(f"[Monitoring] Erreur: {e}")
                last_check = time.time()
            time.sleep(5)  # wake every 5s to see whether it is time to poll
|