"""System metrics collection and threshold monitoring."""

import platform
import threading
import time
from datetime import datetime, timedelta

import psutil

_GIB = 1024 ** 3
_MIB = 1024 ** 2


class SystemMonitor:
    """Collect host metrics with psutil, evaluate thresholds and raise alerts.

    Collaborators (duck-typed, as used by this class):

    * ``config_manager`` must expose a ``config`` mapping (with a
      ``"thresholds"`` entry holding ``cpu_percent``, ``ram_percent`` and
      ``disk_percent``, plus optional ``processes``,
      ``check_interval_minutes`` and ``alert_cooldown_minutes``) and a
      ``save_alert(alert_dict)`` method.
    * ``alerter`` must expose ``send_alert(subject, body)``.
    """

    # Pseudo-filesystems (loop, snap, tmpfs, ...) that are never reported.
    _IGNORED_FS = frozenset(
        {"squashfs", "tmpfs", "devtmpfs", "overlay", "iso9660"}
    )
    # How often the background loop wakes up to see whether a check is due.
    _POLL_SECONDS = 5

    def __init__(self, config_manager, alerter):
        self.config = config_manager
        self.alerter = alerter
        self._metrics = {}
        self._lock = threading.Lock()
        self._running = False
        self._thread = None
        # One event per started thread; replaced on every start() so that a
        # stopped thread can never be revived by a later start().
        self._stop_event = threading.Event()
        self._last_alerts = {}  # alert key -> datetime of the last alert

    @property
    def metrics(self):
        """Return a shallow copy of the most recently collected metrics."""
        with self._lock:
            return dict(self._metrics)

    def collect_metrics(self):
        """Collect all system metrics, cache them and return them.

        Returns:
            dict: snapshot with ``cpu``, ``ram``, ``disks``, ``processes``
            and host/timing information. The same dict is stored as the
            current metrics under the instance lock.

        Raises:
            KeyError: if the configuration lacks ``"thresholds"`` or one of
                its ``cpu_percent`` / ``ram_percent`` / ``disk_percent``
                entries.
        """
        cfg = self.config.config
        thresholds = cfg["thresholds"]

        # CPU -- a positive interval makes psutil sample over that many
        # seconds, so this call blocks for about one second.
        cpu_percent = psutil.cpu_percent(interval=1)
        cpu_status = self._eval_status(cpu_percent, thresholds["cpu_percent"])

        # RAM
        ram = psutil.virtual_memory()
        ram_status = self._eval_status(ram.percent, thresholds["ram_percent"])

        # Disks
        disks = self._collect_disks(thresholds["disk_percent"])

        # Watched processes
        processes = self._check_processes(cfg.get("processes", []))

        # System information; a single clock reading keeps uptime,
        # timestamp and next_check mutually consistent.
        boot_time = datetime.fromtimestamp(psutil.boot_time())
        now = datetime.now()
        uptime = now - boot_time
        interval = cfg.get("check_interval_minutes", 1)

        metrics = {
            "timestamp": now.isoformat(),
            "hostname": platform.node(),
            "os": f"{platform.system()} {platform.release()}",
            "cpu": {
                "percent": cpu_percent,
                "cores": psutil.cpu_count(),
                "threshold": thresholds["cpu_percent"],
                "status": cpu_status,
            },
            "ram": {
                "percent": round(ram.percent, 1),
                "total_gb": round(ram.total / _GIB, 1),
                "used_gb": round(ram.used / _GIB, 1),
                "available_gb": round(ram.available / _GIB, 1),
                "threshold": thresholds["ram_percent"],
                "status": ram_status,
            },
            "disks": disks,
            "processes": processes,
            # Drop the fractional seconds from the timedelta representation.
            "uptime": str(uptime).split(".")[0],
            "boot_time": boot_time.isoformat(),
            "monitoring_active": self._running,
            "last_check": now.isoformat(),
            "next_check": (now + timedelta(minutes=interval)).isoformat(),
        }

        with self._lock:
            self._metrics = metrics
        return metrics

    def _collect_disks(self, disk_threshold):
        """Return usage entries for every real partition of at least 1 GiB."""
        disks = []
        for part in psutil.disk_partitions():
            # Skip pseudo-filesystems and loop devices.
            if part.fstype in self._IGNORED_FS:
                continue
            if part.device.startswith("/dev/loop"):
                continue
            try:
                usage = psutil.disk_usage(part.mountpoint)
            except (PermissionError, OSError):
                # Unreadable mount point: leave it out of the report.
                continue
            # Skip tiny partitions (< 1 GiB).
            if usage.total < _GIB:
                continue
            disks.append({
                # Strip trailing backslashes from the device name
                # (e.g. "C:\\" becomes "C:").
                "drive": part.device.rstrip("\\"),
                "mountpoint": part.mountpoint,
                "percent": round(usage.percent, 1),
                "total_gb": round(usage.total / _GIB, 1),
                "used_gb": round(usage.used / _GIB, 1),
                "free_gb": round(usage.free / _GIB, 1),
                "threshold": disk_threshold,
                "status": self._eval_status(usage.percent, disk_threshold),
            })
        return disks

    def _check_processes(self, process_configs):
        """Check the state of every watched process.

        Args:
            process_configs: iterable of dicts with mandatory ``"pattern"``
                and ``"name"`` keys and optional ``"enabled"``,
                ``"memory_threshold_mb"`` and ``"alert_on_down"`` keys.

        Returns:
            list[dict]: one summary per configured process. Matching is a
            case-insensitive substring test against the process name and
            its joined command line; disabled entries are never scanned and
            are therefore reported as not running.
        """
        results = []
        for proc_cfg in process_configs:
            pattern = proc_cfg["pattern"].lower()
            name = proc_cfg["name"]
            enabled = proc_cfg.get("enabled", True)
            mem_threshold = proc_cfg.get("memory_threshold_mb", 0)

            found = []
            if enabled:
                attrs = ["pid", "name", "cmdline", "memory_info", "cpu_percent"]
                for proc in psutil.process_iter(attrs):
                    try:
                        info = proc.info
                        pname = (info["name"] or "").lower()
                        cmdline = " ".join(info["cmdline"] or []).lower()
                        if pattern in pname or pattern in cmdline:
                            mem_info = info["memory_info"]
                            mem_mb = round(mem_info.rss / _MIB, 1) if mem_info else 0
                            found.append({
                                "pid": info["pid"],
                                "memory_mb": mem_mb,
                                "cpu_percent": info["cpu_percent"] or 0,
                            })
                    except (psutil.NoSuchProcess, psutil.AccessDenied,
                            psutil.ZombieProcess):
                        continue

            total_memory = sum(p["memory_mb"] for p in found)
            total_cpu = sum(p["cpu_percent"] for p in found)

            # Memory status is only evaluated when a threshold is configured
            # and at least some memory usage was measured.
            mem_status = "ok"
            if mem_threshold > 0 and total_memory > 0:
                mem_status = self._eval_status(
                    total_memory, mem_threshold, is_mb=True
                )

            results.append({
                "name": name,
                "pattern": proc_cfg["pattern"],
                "running": bool(found),
                "enabled": enabled,
                "alert_on_down": proc_cfg.get("alert_on_down", True),
                "instance_count": len(found),
                "total_memory_mb": round(total_memory, 1),
                "total_cpu_percent": round(total_cpu, 1),
                "memory_threshold_mb": mem_threshold,
                "memory_status": mem_status,
                "pids": [p["pid"] for p in found],
            })
        return results

    def _eval_status(self, value, threshold, is_mb=False):
        """Classify ``value`` against ``threshold``.

        Returns ``"critical"`` at or above the threshold, ``"warning"`` from
        80 % of the threshold, ``"ok"`` otherwise. A threshold that is zero
        or negative always yields ``"ok"``.

        ``is_mb`` is accepted for backward compatibility only; the
        evaluation is the same ratio test whatever the unit.
        """
        ratio = value / threshold if threshold > 0 else 0
        if ratio >= 1.0:
            return "critical"
        if ratio >= 0.80:
            return "warning"
        return "ok"

    def check_and_alert(self, metrics):
        """Check thresholds in ``metrics`` and send alerts where needed.

        Args:
            metrics: a snapshot shaped like the return value of
                ``collect_metrics``.

        Returns:
            list[str]: the messages of the alerts actually sent, i.e. those
            that were critical and not suppressed by the cooldown
            (``alert_cooldown_minutes``, default 30).
        """
        cfg = self.config.config
        cooldown = cfg.get("alert_cooldown_minutes", 30)
        alerts_sent = []

        def notify(key, msg, value, threshold, alert_type="threshold"):
            # Send and record the alert unless it is still in cooldown.
            if self._should_alert(key, cooldown):
                self._send_and_log(key, msg, value, threshold,
                                   alert_type=alert_type)
                alerts_sent.append(msg)

        # CPU
        cpu = metrics["cpu"]
        if cpu["status"] == "critical":
            notify(
                "cpu",
                f"CPU a {cpu['percent']}% (seuil: {cpu['threshold']}%)",
                cpu["percent"],
                cpu["threshold"],
            )

        # RAM
        ram = metrics["ram"]
        if ram["status"] == "critical":
            notify(
                "ram",
                f"RAM a {ram['percent']}% (seuil: {ram['threshold']}%)",
                ram["percent"],
                ram["threshold"],
            )

        # Disks
        for disk in metrics["disks"]:
            if disk["status"] != "critical":
                continue
            notify(
                f"disk_{disk['drive']}",
                f"Disque {disk['drive']} a {disk['percent']}% "
                f"(seuil: {disk['threshold']}%)",
                disk["percent"],
                disk["threshold"],
            )

        # Processes
        for proc in metrics["processes"]:
            if not proc["enabled"]:
                continue

            # Process-down alert
            if proc["alert_on_down"] and not proc["running"]:
                notify(
                    f"process_down_{proc['name']}",
                    f"Processus '{proc['name']}' non detecte "
                    f"(pattern: {proc['pattern']})",
                    0,
                    0,
                    alert_type="process_down",
                )

            # Process memory alert
            if proc["memory_threshold_mb"] > 0 and proc["memory_status"] == "critical":
                notify(
                    f"process_mem_{proc['name']}",
                    f"Processus '{proc['name']}' utilise {proc['total_memory_mb']} Mo "
                    f"(seuil: {proc['memory_threshold_mb']} Mo)",
                    proc["total_memory_mb"],
                    proc["memory_threshold_mb"],
                )

        return alerts_sent

    def _should_alert(self, key, cooldown_minutes):
        """Return False while ``key`` is still inside its cooldown window."""
        last = self._last_alerts.get(key)
        if last and (datetime.now() - last) < timedelta(minutes=cooldown_minutes):
            return False
        return True

    def _send_and_log(self, key, message, value, threshold, alert_type="threshold"):
        """Persist the alert, e-mail it, then start the cooldown for ``key``.

        The cooldown timestamp is written last, so it is only recorded when
        both ``save_alert`` and ``send_alert`` returned without raising.
        """
        now = datetime.now()
        hostname = self._metrics.get("hostname", platform.node())

        # Record the alert
        alert = {
            "timestamp": now.isoformat(),
            "type": alert_type,
            "key": key,
            "message": message,
            "value": value,
            "threshold": threshold,
            "hostname": hostname,
        }
        self.config.save_alert(alert)

        # Send the e-mail
        subject = f"[ALERTE] {hostname} - {message}"
        self.alerter.send_alert(subject, self._format_alert_body(alert))

        # Update the cooldown
        self._last_alerts[key] = now

    def _format_alert_body(self, alert):
        """Build the plain-text e-mail body for ``alert``."""
        return (
            f"Alerte de supervision\n"
            f"{'=' * 40}\n\n"
            f"Serveur : {alert['hostname']}\n"
            f"Date : {alert['timestamp']}\n"
            f"Type : {alert['type']}\n\n"
            f"Message : {alert['message']}\n\n"
            f"{'=' * 40}\n"
            f"Supervision - Monitoring automatique"
        )

    # --- Monitoring thread ---

    def start(self):
        """Start the background monitoring thread if it is not running."""
        if self._running:
            return
        self._running = True
        # A fresh event per thread: a previous thread that has not yet
        # noticed stop() keeps its own (already set) event and still exits,
        # instead of being revived by _running flipping back to True.
        self._stop_event = threading.Event()
        self._thread = threading.Thread(
            target=self._monitoring_loop,
            args=(self._stop_event,),
            daemon=True,
        )
        self._thread.start()

    def stop(self):
        """Ask the monitoring thread to exit; does not wait for it."""
        self._running = False
        # Wake the loop immediately rather than after its poll delay.
        self._stop_event.set()

    def _monitoring_loop(self, stop_event=None):
        """Run collect + alert every ``check_interval_minutes`` until stopped.

        Args:
            stop_event: ``threading.Event`` that ends this loop when set.
                Defaults to the instance's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event

        # None means "no check done yet": the first pass always runs. A
        # monotonic clock keeps the schedule immune to wall-clock changes.
        last_check = None
        while self._running and not stop_event.is_set():
            # Same accessor as collect_metrics, so the schedule matches the
            # "next_check" value reported in the metrics.
            interval = self.config.config.get("check_interval_minutes", 1) * 60
            if last_check is None or time.monotonic() - last_check >= interval:
                try:
                    metrics = self.collect_metrics()
                    self.check_and_alert(metrics)
                except Exception as e:
                    # Top-level boundary of the worker thread: report and
                    # keep monitoring rather than let the thread die.
                    print(f"[Monitoring] Erreur: {e}")
                last_check = time.monotonic()
            # Re-evaluate every few seconds; returns early when stop() is called.
            stop_event.wait(self._POLL_SECONDS)