feat(vwb): add dashboard competence testing and health tools

This commit is contained in:
Dom
2026-06-02 16:27:19 +02:00
parent d38f0b0f2f
commit 18ed6cb751
23 changed files with 2769 additions and 27 deletions

View File

@@ -0,0 +1,101 @@
import json
import subprocess
import os
import hashlib
def get_hash(file_path):
    """Return the hex SHA-256 digest of *file_path*, or "MISSING" if it does not exist.

    The file is read in 4096-byte chunks so large files are never held in memory.
    """
    if os.path.exists(file_path):
        digest = hashlib.sha256()
        with open(file_path, "rb") as stream:
            while True:
                chunk = stream.read(4096)
                if chunk == b"":
                    break
                digest.update(chunk)
        return digest.hexdigest()
    return "MISSING"
def _manifest_path_for(tag, manifests_dir="/var/lib/ollama/.ollama/models/manifests"):
    """Map a tag printed by `ollama list` to its manifest file path.

    Layout: <manifests_dir>/<registry host>/<namespace>/<name>/<version>.
    Short tags default to registry "registry.ollama.ai" and namespace "library".
    """
    segments = tag.split("/")
    # Only the last segment carries the ":version" suffix; a registry host may
    # itself contain a colon (host:port), so earlier segments are left intact.
    name, _, version = segments[-1].partition(":")
    version = version or "latest"
    if len(segments) == 1:
        prefix = ["registry.ollama.ai", "library"]
    elif len(segments) == 2:
        prefix = ["registry.ollama.ai", segments[0]]
    else:
        # host/namespace/name:version (e.g. hf.co/<user>/<repo>:<quant>) lives
        # under its own registry directory, not under registry.ollama.ai.
        prefix = segments[:-1]
    return os.path.join(manifests_dir, *prefix, name, version)


def main():
    """Print a JSON inventory of the local Ollama tags.

    For every row of `ollama list` the report records the tag, its id and size,
    the manifest path and SHA-256, the manifest's config digest and layers, and
    the FROM line of the Modelfile. Failures to run `ollama` or to read a
    manifest are tolerated: the affected fields are simply left empty.
    """
    from datetime import datetime  # local import keeps this block self-contained

    size_units = {"B", "KB", "MB", "GB", "TB", "PB"}

    # 1. Get ollama list
    try:
        raw_list = subprocess.check_output(["ollama", "list"]).decode("utf-8", errors="replace")
    except (OSError, subprocess.CalledProcessError):
        raw_list = ""

    inventory = []
    # First line is the column header.
    for line in raw_list.strip().split("\n")[1:]:
        parts = line.split()
        if len(parts) < 3:
            continue
        tag, tag_id = parts[0], parts[1]
        # SIZE is printed as "<number> <unit>"; keep the unit so the value is unambiguous.
        if len(parts) > 3 and parts[3].upper() in size_units:
            size = f"{parts[2]} {parts[3]}"
        else:
            size = parts[2]

        # 2. Find manifest path
        manifest_path = _manifest_path_for(tag)
        manifest_hash = get_hash(manifest_path)

        # 3. Read manifest content (best effort: a broken manifest leaves these empty)
        layers = []
        config_digest = ""
        if os.path.exists(manifest_path):
            try:
                with open(manifest_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                config_digest = data.get("config", {}).get("digest", "")
                for layer in data.get("layers", []):
                    layers.append({
                        "mediaType": layer.get("mediaType"),
                        "digest": layer.get("digest"),
                        "size": layer.get("size"),
                        "from": layer.get("from"),
                    })
            except (OSError, ValueError, AttributeError, TypeError):
                # Unreadable file, invalid JSON, or unexpected JSON shape.
                pass

        # 4. Get Modelfile for critical params
        try:
            modelfile = subprocess.check_output(
                ["ollama", "show", tag, "--modelfile"]
            ).decode("utf-8", errors="replace")
        except (OSError, subprocess.CalledProcessError):
            modelfile = ""
        from_line = next(
            (text for text in modelfile.split("\n") if text.startswith("FROM ")),
            "",
        )

        inventory.append({
            "tag": tag,
            "ollama_list_id": tag_id,
            "ollama_list_size": size,
            "manifest_path": manifest_path,
            "manifest_hash": manifest_hash,
            "config_digest": config_digest,
            "blob_digest_from": from_line.replace("FROM ", "").strip(),
            "layers": layers,
            "reconstructible": "YES" if from_line and manifest_hash != "MISSING" else "UNKNOWN",
        })

    output = {
        # Stamp the report with the time it was actually generated.
        "inventory_date": datetime.now().isoformat(timespec="seconds"),
        "total_tags": len(inventory),
        "models": inventory,
    }
    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()

401
tools/lea_healthcheck.py Normal file
View File

@@ -0,0 +1,401 @@
#!/usr/bin/env python3
"""Read-only healthcheck for the Lea demo stack.
This script does not start, stop, restart, delete, or restore anything.
It is intended as a daily proof artifact before the 2026-06-01 demo.
"""
from __future__ import annotations
import argparse
import base64
import json
import os
import shlex
import socket
import subprocess
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path
from typing import Any
# Directory one level above the directory that contains this file; used as the
# working directory for every command started by run_command().
REPO_ROOT = Path(__file__).resolve().parents[1]
# Root of the on-disk Ollama store; check_model_store() looks for the
# "manifests" and "blobs" sub-directories beneath it.
OLLAMA_MODELS_DIR = Path("/var/lib/ollama/.ollama/models")
# Tags that check_ollama() requires in the /api/tags response.
CRITICAL_MODELS = (
    "qwen2.5vl:7b-rpa",
    "t2a-gemma3-27b:latest",
    "t2a-gemma3-27b-q4:latest",
    "thiagomoraes/medgemma-27b-it:Q4_K_S",
)
# Blob file name that check_model_store() expects under OLLAMA_MODELS_DIR / "blobs"
# for each listed tag.
CRITICAL_BLOBS = {
    "t2a-gemma3-27b:latest": "sha256-2f2509e30b0d07db517b82e62404194ef355846f08ac287775ff363693086818",
    "t2a-gemma3-27b-q4:latest": "sha256-0139f42273d53348fa0d24daae016b7231e1310258bbbaa7e38a1af703217c1a",
    "thiagomoraes/medgemma-27b-it:Q4_K_S": "sha256-7cb6ff10942c8ccf370e274daafaf56da3fff318f40a355df331d8783c6c11f3",
}
def run_command(args: list[str], timeout: float = 5.0) -> tuple[int, str, str]:
    """Run *args* with REPO_ROOT as working directory, without raising.

    Returns ``(exit_code, stdout, stderr)`` with both streams decoded as UTF-8
    (undecodable bytes replaced) and stripped. A missing executable is reported
    as exit code 127 and a timeout as 124; on timeout, whatever the child had
    already written is returned, and stderr falls back to a timeout message.
    """

    def _captured_text(captured: Any) -> str:
        # subprocess documents TimeoutExpired output as bytes whenever anything
        # was captured, even with text=True, so decode it here instead of
        # discarding it.
        if isinstance(captured, bytes):
            captured = captured.decode("utf-8", errors="replace")
        return (captured or "").strip()

    try:
        proc = subprocess.run(
            args,
            cwd=REPO_ROOT,
            text=True,
            encoding="utf-8",
            errors="replace",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
            check=False,
        )
        return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
    except FileNotFoundError as exc:
        return 127, "", str(exc)
    except subprocess.TimeoutExpired as exc:
        stdout = _captured_text(exc.stdout)
        stderr = _captured_text(exc.stderr)
        return 124, stdout, stderr or f"timeout after {timeout}s"
def http_json(url: str, timeout: float = 2.0) -> tuple[bool, Any, str]:
    """GET *url* and decode the response body as JSON, without raising.

    Returns ``(True, payload, "")`` on success and ``(False, None, reason)`` on
    any failure: connection or HTTP error, timeout, invalid JSON, malformed URL
    or a protocol-level HTTP failure.
    """
    import http.client  # local import: only needed for the exception type below

    try:
        req = urllib.request.Request(url, headers={"User-Agent": "lea-healthcheck/1.0"})
        with urllib.request.urlopen(req, timeout=timeout) as response:
            body = response.read().decode("utf-8", errors="replace")
            if response.status >= 400:
                return False, None, f"http {response.status}: {body[:300]}"
        return True, json.loads(body), ""
    except json.JSONDecodeError as exc:
        # Must precede the ValueError clause: JSONDecodeError subclasses ValueError.
        return False, None, f"invalid json: {exc}"
    except (urllib.error.URLError, TimeoutError, OSError) as exc:
        return False, None, str(exc)
    except (http.client.HTTPException, ValueError) as exc:
        # ValueError: URL without a usable scheme; HTTPException: e.g. a
        # non-numeric port (InvalidURL) or a truncated body (IncompleteRead).
        return False, None, f"{type(exc).__name__}: {exc}"
def tcp_open(host: str, port: int, timeout: float = 1.0) -> tuple[bool, str]:
    """Report whether a TCP connection to ``host:port`` can be established.

    Returns ``(True, "")`` when the connection succeeds within *timeout*,
    otherwise ``(False, message)`` carrying the text of the OSError.
    """
    try:
        connection = socket.create_connection((host, port), timeout=timeout)
    except OSError as failure:
        return False, str(failure)
    connection.close()
    return True, ""
def add_check(
    checks: list[dict[str, Any]],
    name: str,
    status: str,
    summary: str,
    details: Any | None = None,
) -> None:
    """Append one result record (name, status, summary, details) to *checks*."""
    record: dict[str, Any] = dict(name=name, status=status, summary=summary, details=details)
    checks.append(record)
def check_systemd(checks: list[dict[str, Any]]) -> None:
    """Record the `systemctl --user is-active` state of each watched unit.

    An inactive required unit is a failure; an inactive optional unit is a warning.
    """
    watched = {
        "rpa-streaming.service": True,
        "rpa-agent-chat.service": False,
    }
    for unit, required in watched.items():
        check_name = f"systemd:{unit}"
        code, stdout, stderr = run_command(["systemctl", "--user", "is-active", unit])
        reported = stdout or stderr

        if code == 0 and stdout == "active":
            add_check(checks, check_name, "ok", "active")
            continue

        if required:
            add_check(checks, check_name, "fail", reported or f"is-active returned {code}")
            continue

        add_check(
            checks,
            check_name,
            "warn",
            reported or "inactive optional service",
            {"note": "5004 narration bus is optional, but should be fixed before demo if enabled on Windows."},
        )
def check_ports(checks: list[dict[str, Any]], host: str) -> None:
    """Probe the stack's TCP ports on *host* and record one check per port.

    A closed required port is a failure; a closed optional port is a warning.
    """
    probes = (
        (5005, "streaming-http", True),
        (11434, "ollama-api", True),
        (5004, "feedbackbus-socketio", False),
    )
    for port, label, required in probes:
        reachable, error = tcp_open(host, port)
        if reachable:
            status = "ok"
            summary = f"{host}:{port} accepts TCP"
        else:
            status = "fail" if required else "warn"
            summary = f"{host}:{port} closed: {error}"
        add_check(checks, f"tcp:{label}:{port}", status, summary)
def check_http_services(checks: list[dict[str, Any]]) -> None:
    """Query the local streaming and feedback-bus HTTP endpoints.

    The streaming service must answer ``{"status": "healthy"}`` or the check
    fails; the feedback bus only ever produces a warning when it is not
    ``{"status": "online"}``.
    """
    # Streaming service on port 5005.
    ok, data, error = http_json("http://127.0.0.1:5005/health")
    healthy = ok and isinstance(data, dict) and data.get("status") == "healthy"
    if not healthy:
        add_check(checks, "http:rpa-streaming:/health", "fail", error or "unexpected health response", data)
    else:
        add_check(checks, "http:rpa-streaming:/health", "ok", "healthy", data)

    # Feedback bus on port 5004.
    ok, data, error = http_json("http://127.0.0.1:5004/api/status")
    if not ok:
        add_check(checks, "http:feedbackbus:/api/status", "warn", error or "not responding")
        return
    if isinstance(data, dict) and data.get("status") == "online":
        add_check(checks, "http:feedbackbus:/api/status", "ok", "online", data)
    else:
        add_check(checks, "http:feedbackbus:/api/status", "warn", "unexpected status response", data)
def check_ollama(checks: list[dict[str, Any]]) -> None:
    """Check the local Ollama API for the critical tags and the resident VLM.

    Stops after a single failed check when /api/tags is unreadable. Otherwise it
    records whether every CRITICAL_MODELS tag is listed, then whether
    qwen2.5vl:7b-rpa appears in /api/ps (a warning if not, or if /api/ps is
    unreadable).
    """
    ok, tags_data, error = http_json("http://127.0.0.1:11434/api/tags", timeout=4.0)
    if not (ok and isinstance(tags_data, dict)):
        add_check(checks, "ollama:tags", "fail", error or "cannot read /api/tags")
        return

    names = set()
    for entry in tags_data.get("models") or []:
        if isinstance(entry, dict):
            names.add(entry.get("name") or entry.get("model"))

    missing = [tag for tag in CRITICAL_MODELS if tag not in names]
    if not missing:
        add_check(checks, "ollama:critical-tags", "ok", f"{len(CRITICAL_MODELS)} critical tags present")
    else:
        add_check(checks, "ollama:critical-tags", "fail", f"missing critical tags: {', '.join(missing)}")

    ok, ps_data, error = http_json("http://127.0.0.1:11434/api/ps", timeout=4.0)
    if not (ok and isinstance(ps_data, dict)):
        add_check(checks, "ollama:ps", "warn", error or "cannot read /api/ps")
        return

    loaded = ps_data.get("models") or []
    vlm = None
    for entry in loaded:
        if isinstance(entry, dict) and entry.get("name") == "qwen2.5vl:7b-rpa":
            vlm = entry
            break

    if not vlm:
        add_check(checks, "ollama:resident-vlm", "warn", "qwen2.5vl:7b-rpa is not currently resident", loaded)
        return

    residency = {
        field: vlm.get(field)
        for field in ("context_length", "size", "size_vram", "expires_at")
    }
    add_check(checks, "ollama:resident-vlm", "ok", "qwen2.5vl:7b-rpa resident", residency)
def check_model_store(checks: list[dict[str, Any]]) -> None:
    """Inspect the on-disk Ollama store: manifest/blob counts and critical blobs.

    Adds a failed "ollama:store" check when the store directory is missing or
    cannot be inspected. Otherwise adds "ollama:store-counts" (a warning below
    38 manifests or 100 blobs) and "ollama:critical-blobs" (a failure when any
    CRITICAL_BLOBS file is absent).
    """
    manifests_dir = OLLAMA_MODELS_DIR / "manifests"
    blobs_dir = OLLAMA_MODELS_DIR / "blobs"

    try:
        if not OLLAMA_MODELS_DIR.exists():
            add_check(checks, "ollama:store", "fail", f"missing {OLLAMA_MODELS_DIR}")
            return
        manifest_count = (
            sum(1 for path in manifests_dir.rglob("*") if path.is_file())
            if manifests_dir.exists()
            else 0
        )
        blob_count = (
            sum(1 for path in blobs_dir.iterdir() if path.is_file())
            if blobs_dir.exists()
            else 0
        )
        missing_blobs = [
            {"model": model, "blob": blob_name}
            for model, blob_name in CRITICAL_BLOBS.items()
            if not (blobs_dir / blob_name).exists()
        ]
    except OSError as exc:
        # E.g. PermissionError when the store belongs to another user: report
        # it as a failed check instead of aborting the whole healthcheck.
        add_check(checks, "ollama:store", "fail", f"cannot inspect {OLLAMA_MODELS_DIR}: {exc}")
        return

    # NOTE(review): the 38/100 thresholds are hard-coded; their origin is not visible here.
    add_check(
        checks,
        "ollama:store-counts",
        "ok" if manifest_count >= 38 and blob_count >= 100 else "warn",
        f"{manifest_count} manifests, {blob_count} blobs",
        {"path": str(OLLAMA_MODELS_DIR)},
    )
    if missing_blobs:
        add_check(checks, "ollama:critical-blobs", "fail", "missing critical blobs", missing_blobs)
    else:
        add_check(checks, "ollama:critical-blobs", "ok", f"{len(CRITICAL_BLOBS)} critical blobs present")
def check_windows(checks: list[dict[str, Any]], host: str, user: str, ssh_command: str) -> None:
    """Inspect the Windows agent host over SSH and record the resulting checks.

    Runs one PowerShell script remotely as ``user@host`` and parses its JSON
    output. On SSH failure or unparsable output a single "windows:ssh" warning
    is added and the function returns. Otherwise it adds "windows:ssh",
    "windows:LeaInteractive", "windows:agent-process" and
    "windows:LEA_FEEDBACK_BUS".

    Args:
        checks: list the result records are appended to.
        host: Windows host name or address.
        user: remote user name.
        ssh_command: command prefix, split with shlex (e.g. "ssh" or "sshpass -e ssh").
    """
    # The script collects: the LeaInteractive scheduled-task state, python(w).exe
    # processes whose command line contains run_agent_v1.py, the content of the
    # agent lock file, and three environment variables. It prints one compressed
    # JSON object.
    powershell = r"""
$ErrorActionPreference = "SilentlyContinue"
$task = schtasks /query /tn LeaInteractive /fo LIST /v | Out-String
$taskState = $null
try {
$taskState = (Get-ScheduledTask -TaskName 'LeaInteractive').State.ToString()
} catch {
$taskState = $null
}
$procs = Get-CimInstance Win32_Process -Filter "name = 'pythonw.exe' or name = 'python.exe'" |
Where-Object { $_.CommandLine -like '*run_agent_v1.py*' } |
Select-Object ProcessId,ParentProcessId,CommandLine
$lock = $null
if (Test-Path 'C:\rpa_vision\lea_agent.lock') {
$lock = (Get-Content 'C:\rpa_vision\lea_agent.lock' -Raw).Trim()
}
[pscustomobject]@{
lea_feedback_bus_user = [Environment]::GetEnvironmentVariable('LEA_FEEDBACK_BUS', 'User')
lea_feedback_bus_machine = [Environment]::GetEnvironmentVariable('LEA_FEEDBACK_BUS', 'Machine')
rpa_capture_bind_user = [Environment]::GetEnvironmentVariable('RPA_CAPTURE_BIND', 'User')
task_state = $taskState
task_running = ($taskState -eq 'Running')
task_raw = $task
agent_processes = @($procs)
lock_pid = $lock
} | ConvertTo-Json -Compress -Depth 5
""".strip()
    command_parts = shlex.split(ssh_command)
    if not command_parts:
        add_check(checks, "windows:ssh", "skip", "empty ssh command")
        return
    target = f"{user}@{host}"
    # NOTE(review): StrictHostKeyChecking=no turns off host-key verification for
    # this connection; confirm that is acceptable on the demo network.
    args = command_parts + ["-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5"]
    # BatchMode=yes is added unless the first command token mentions sshpass;
    # presumably because batch mode would defeat password authentication.
    if "sshpass" not in command_parts[0]:
        args += ["-o", "BatchMode=yes"]
    # The script is passed base64-encoded as UTF-16LE via -EncodedCommand, which
    # avoids quoting it for the remote shell.
    encoded = base64.b64encode(powershell.encode("utf-16le")).decode("ascii")
    args += [target, "powershell", "-NoProfile", "-EncodedCommand", encoded]
    code, stdout, stderr = run_command(args, timeout=12.0)
    if code != 0:
        add_check(checks, "windows:ssh", "warn", stderr or stdout or f"ssh returned {code}")
        return
    try:
        data = json.loads(stdout)
    except json.JSONDecodeError as exc:
        add_check(checks, "windows:ssh", "warn", f"invalid powershell json: {exc}", stdout[:1000])
        return
    processes = data.get("agent_processes") or []
    # A lone JSON object is treated as a one-element list; presumably
    # ConvertTo-Json can emit that shape for a single process - TODO confirm.
    if isinstance(processes, dict):
        processes = [processes]
    process_ids = {
        int(proc.get("ProcessId"))
        for proc in processes
        if proc.get("ProcessId") is not None
    }
    # Roots are matched processes whose parent is unknown or is not itself a
    # matched process; each root stands for one agent instance.
    process_roots = [
        proc for proc in processes
        if proc.get("ParentProcessId") is None
        or int(proc.get("ParentProcessId")) not in process_ids
    ]
    # If every process has a matched parent, fall back to counting all of them.
    if processes and not process_roots:
        process_roots = processes
    instance_count = len(process_roots)
    task_running = bool(data.get("task_running"))
    task_state = data.get("task_state")
    # Task status: ok when running, warn when not running but an agent is
    # alive, fail when neither.
    if task_running:
        task_status = "ok"
        task_summary = f"task state={task_state!r}"
    elif instance_count:
        task_status = "warn"
        task_summary = f"task state={task_state!r}, but {instance_count} agent instance tree(s) are alive"
    else:
        task_status = "fail"
        task_summary = f"task state={task_state!r}"
    add_check(checks, "windows:ssh", "ok", f"reachable as {target}")
    add_check(checks, "windows:LeaInteractive", task_status, task_summary)
    # Exactly one instance tree is ok, more than one is a warning, none is a failure.
    add_check(
        checks,
        "windows:agent-process",
        "ok" if instance_count == 1 else ("warn" if instance_count > 1 else "fail"),
        f"{instance_count} Lea instance tree(s), {len(processes)} run_agent_v1.py process(es)",
        {
            "roots": process_roots,
            "processes": processes,
            "note": "pythonw.exe from a venv can spawn a child pythonw.exe; count root process trees, not raw processes.",
        },
    )
    # The user-scope value wins over the machine-scope value. This check is
    # informational: its status is always "ok".
    feedback_bus = data.get("lea_feedback_bus_user") or data.get("lea_feedback_bus_machine")
    add_check(
        checks,
        "windows:LEA_FEEDBACK_BUS",
        "ok",
        f"LEA_FEEDBACK_BUS={feedback_bus!r}",
        {
            "note": "If set to '1', Windows will try port 5004; local TCP/HTTP checks report whether that service is available.",
            "rpa_capture_bind_user": data.get("rpa_capture_bind_user"),
            "lock_pid": data.get("lock_pid"),
        },
    )
def summarize(checks: list[dict[str, Any]]) -> str:
    """Collapse all check statuses into one: "fail" beats "warn" beats "ok"."""
    warned = False
    for check in checks:
        status = check["status"]
        if status == "fail":
            return "fail"
        if status == "warn":
            warned = True
    return "warn" if warned else "ok"
def print_text(report: dict[str, Any]) -> None:
    """Print the report as text: overall status, timestamp, then one line per check."""
    lines = [
        f"Lea healthcheck: {report['overall'].upper()}",
        f"Timestamp: {report['timestamp']}",
        "",
    ]
    lines.extend(
        f"[{entry['status'].upper():4}] {entry['name']} - {entry['summary']}"
        for entry in report["checks"]
    )
    print("\n".join(lines))
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the healthcheck command line.

    Defaults for the Windows host, Windows user and ssh command come from the
    LEA_WINDOWS_HOST, LEA_WINDOWS_USER and LEA_SSH_COMMAND environment variables.
    """
    env = os.environ
    cli = argparse.ArgumentParser(description=__doc__)

    switches = (
        ("--json", "print machine-readable JSON"),
        ("--strict", "exit non-zero on warnings"),
    )
    for flag, text in switches:
        cli.add_argument(flag, action="store_true", help=text)

    cli.add_argument("--host", default="127.0.0.1", help="local service host for TCP checks")
    cli.add_argument("--windows-host", default=env.get("LEA_WINDOWS_HOST", ""))
    cli.add_argument("--windows-user", default=env.get("LEA_WINDOWS_USER", "dom"))
    cli.add_argument(
        "--ssh-command",
        default=env.get("LEA_SSH_COMMAND", "ssh"),
        help="ssh command prefix; for password auth use LEA_SSH_COMMAND='sshpass -e ssh' and SSHPASS externally",
    )
    return cli.parse_args(argv)
def main(argv: list[str]) -> int:
    """Run every healthcheck, print the report and return the process exit code.

    Exit code 2 means at least one check failed, 1 means warnings under
    --strict, and 0 otherwise.
    """
    args = parse_args(argv)
    checks: list[dict[str, Any]] = []

    check_systemd(checks)
    check_ports(checks, args.host)
    check_http_services(checks)
    check_ollama(checks)
    check_model_store(checks)
    if not args.windows_host:
        add_check(checks, "windows", "skip", "not requested; pass --windows-host or LEA_WINDOWS_HOST")
    else:
        check_windows(checks, args.windows_host, args.windows_user, args.ssh_command)

    stamp = time.strftime("%Y-%m-%dT%H:%M:%S%z")
    overall = summarize(checks)
    report = {
        "timestamp": stamp,
        "overall": overall,
        "repo": str(REPO_ROOT),
        "checks": checks,
    }

    if not args.json:
        print_text(report)
    else:
        print(json.dumps(report, indent=2, sort_keys=True))

    if overall == "fail":
        return 2
    return 1 if (args.strict and overall == "warn") else 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))

View File

@@ -0,0 +1,409 @@
#!/usr/bin/env python3
"""Read-only preflight for Lea micro-learning prerequisites.
The script performs fast checks only. It does not warm up models, pull models,
start services, stop replays, restart processes, or modify files.
"""
from __future__ import annotations
import argparse
import csv
import json
import os
import re
import subprocess
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path
from typing import Any
try:  # Script execution from tools/
    from lea_healthcheck import REPO_ROOT, add_check, http_json, run_command, summarize
except ImportError:  # Test/import execution from repository root
    try:
        from tools.lea_healthcheck import REPO_ROOT, add_check, http_json, run_command, summarize
    except ImportError:
        # Neither import path is available: define local copies of the helpers
        # so this script can still run on its own.
        REPO_ROOT = Path(__file__).resolve().parents[1]

        def run_command(args: list[str], timeout: float = 5.0) -> tuple[int, str, str]:
            """Run *args* with REPO_ROOT as working directory, without raising.

            Returns ``(exit_code, stdout, stderr)``, decoded as UTF-8 and
            stripped. A missing executable yields 127 and a timeout 124; on
            timeout the output captured so far is returned.
            """

            def _captured_text(captured: Any) -> str:
                # subprocess documents TimeoutExpired output as bytes whenever
                # anything was captured, even with text=True; decode it here
                # instead of discarding it.
                if isinstance(captured, bytes):
                    captured = captured.decode("utf-8", errors="replace")
                return (captured or "").strip()

            try:
                proc = subprocess.run(
                    args,
                    cwd=REPO_ROOT,
                    text=True,
                    encoding="utf-8",
                    errors="replace",
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    timeout=timeout,
                    check=False,
                )
                return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
            except FileNotFoundError as exc:
                return 127, "", str(exc)
            except subprocess.TimeoutExpired as exc:
                stdout = _captured_text(exc.stdout)
                stderr = _captured_text(exc.stderr)
                return 124, stdout, stderr or f"timeout after {timeout}s"

        def http_json(url: str, timeout: float = 2.0) -> tuple[bool, Any, str]:
            """GET *url* and decode the body as JSON, without raising.

            Returns ``(True, payload, "")`` on success and
            ``(False, None, reason)`` on any failure.
            """
            import http.client  # local import: only needed for the exception type

            try:
                req = urllib.request.Request(url, headers={"User-Agent": "lea-micro-preflight/1.0"})
                with urllib.request.urlopen(req, timeout=timeout) as response:
                    body = response.read().decode("utf-8", errors="replace")
                    if response.status >= 400:
                        return False, None, f"http {response.status}: {body[:300]}"
                return True, json.loads(body), ""
            except json.JSONDecodeError as exc:
                # Must precede ValueError: JSONDecodeError subclasses it.
                return False, None, f"invalid json: {exc}"
            except (urllib.error.URLError, TimeoutError, OSError) as exc:
                return False, None, str(exc)
            except (http.client.HTTPException, ValueError) as exc:
                # The base URL is user-supplied (--ollama-url): a URL without a
                # scheme raises ValueError and a non-numeric port raises
                # InvalidURL; report both as a failed request.
                return False, None, f"{type(exc).__name__}: {exc}"

        def add_check(
            checks: list[dict[str, Any]],
            name: str,
            status: str,
            summary: str,
            details: Any | None = None,
        ) -> None:
            """Append one result record to *checks*."""
            checks.append({"name": name, "status": status, "summary": summary, "details": details})

        def summarize(checks: list[dict[str, Any]]) -> str:
            """Collapse all check statuses into one: "fail" beats "warn" beats "ok"."""
            if any(check["status"] == "fail" for check in checks):
                return "fail"
            if any(check["status"] == "warn" for check in checks):
                return "warn"
            return "ok"
# Default thresholds; parse_args() uses them as the defaults of the matching
# --min-*/--max-* command-line options.
DEFAULT_MIN_VRAM_FREE_MIB = 4000
DEFAULT_MIN_RAM_AVAILABLE_MIB = 8192
DEFAULT_MAX_SWAP_USED_MIB = 4096
DEFAULT_MAX_SWAP_USED_PCT = 70.0
# Tags required in /api/tags when no --required-model option is given.
REQUIRED_MODELS = ("qwen2.5vl:7b-rpa", "qwen2.5:7b")
# Default for --resident-warn-model: the tag whose absence from /api/ps
# produces a warning rather than a failure.
DEFAULT_RESIDENT_WARN_MODEL = "qwen2.5vl:7b-rpa"
def _parse_mib_int(value: str) -> int:
    """Return the first (optionally negative) integer found in *value*.

    Non-breaking spaces are treated as ordinary spaces before searching.

    Raises:
        ValueError: if *value* contains no digits.
    """
    cleaned = value.replace("\u00a0", " ")
    found = re.compile(r"-?\d+").search(cleaned)
    if found is None:
        raise ValueError(f"cannot parse integer from {value!r}")
    return int(found.group())
def parse_free_m(output: str) -> dict[str, dict[str, int]]:
    """Parse `free -m` output into ``{"mem": {...}, "swap": {...}}``.

    Known French column names and the French swap row label are normalized to
    their English keys. When the Mem row has no "available" column, the "free"
    value is used in its place.

    Raises:
        ValueError: if the Mem or Swap row is missing, if neither "available"
            nor "free" can be determined, or if a value is not an integer.
    """
    # Localization map: known foreign column names -> English
    english_by_localized = {
        "disponible": "available",
        "utilisé": "used",
        "libre": "free",
        "partagé": "shared",
        "tamp/cache": "buff/cache",  # French truncation of "tampon/cache"
    }
    # Column order assumed when no header line was recognized.
    positional_columns = {
        "mem": ["total", "used", "free", "shared", "buff/cache", "available"],
        "swap": ["total", "used", "free"],
    }
    # Localized row labels: "échange" = "swap" (French)
    row_aliases = {"échange": "swap"}

    header_columns: list[str] = []
    result: dict[str, dict[str, int]] = {}

    for line in output.splitlines():
        tokens = line.strip().split()
        if not tokens:
            continue

        first = tokens[0].lower()
        if first == "total":
            # Header line: it starts with the "total" column name.
            header_columns = [
                english_by_localized.get(token.lower(), token.lower())
                for token in tokens
            ]
            continue

        row = first.rstrip(":")
        row = row_aliases.get(row, row)
        if row not in positional_columns:
            continue

        # Only the Mem row uses the parsed header; Swap always uses the
        # positional names.
        if row == "mem" and header_columns:
            names = header_columns
        else:
            names = positional_columns[row]
        result[row] = {name: int(number) for name, number in zip(names, tokens[1:])}

    mem = result.get("mem")
    if mem is None:
        raise ValueError("missing Mem line in free output")
    if "available" not in mem:
        if "free" not in mem:
            raise ValueError("missing available memory in free output")
        mem["available"] = mem["free"]
    if "swap" not in result:
        raise ValueError("missing Swap line in free output")
    return result
def parse_nvidia_smi_memory(output: str) -> list[dict[str, int]]:
    """Parse `nvidia-smi --query-gpu=memory.free,memory.total` CSV output.

    Returns one ``{"free_mib": ..., "total_mib": ...}`` dict per non-empty row.

    Raises:
        ValueError: if a row has fewer than two non-empty cells, a cell holds
            no integer, or the output contains no rows at all.
    """
    gpus: list[dict[str, int]] = []
    for row in csv.reader(output.splitlines()):
        cells = [text for text in map(str.strip, row) if text]
        if not cells:
            continue
        if len(cells) < 2:
            raise ValueError(f"expected two CSV columns, got {cells!r}")
        free_text, total_text = cells[0], cells[1]
        gpus.append(
            {
                "free_mib": _parse_mib_int(free_text),
                "total_mib": _parse_mib_int(total_text),
            }
        )
    if gpus:
        return gpus
    raise ValueError("nvidia-smi returned no GPU memory rows")
def extract_ollama_tags(data: Any) -> set[str]:
    """Extract model tags from Ollama `/api/tags` or `/api/ps` style JSON.

    Accepts either a dict with a "models" list or a bare list of entries, and
    collects every non-empty string stored under "name" or "model". Any other
    input yields an empty set.
    """
    if isinstance(data, dict):
        entries = data.get("models") or []
    elif isinstance(data, list):
        entries = data
    else:
        return set()

    return {
        entry[key]
        for entry in entries
        if isinstance(entry, dict)
        for key in ("name", "model")
        if isinstance(entry.get(key), str) and entry.get(key)
    }
def check_gpu(checks: list[dict[str, Any]], min_vram_free_mib: int, timeout: float) -> None:
    """Query nvidia-smi and check that some GPU has enough free VRAM.

    Adds "gpu:nvidia-smi" for the query itself and, when it succeeds,
    "gpu:vram-free" comparing the largest free amount with *min_vram_free_mib*.
    """
    query = [
        "nvidia-smi",
        "--query-gpu=memory.free,memory.total",
        "--format=csv,noheader,nounits",
    ]
    code, stdout, stderr = run_command(query, timeout=timeout)
    if code != 0:
        if code == 127:
            summary = "nvidia-smi not available"
        else:
            summary = stderr or stdout or f"exit {code}"
        add_check(checks, "gpu:nvidia-smi", "fail", summary)
        return

    try:
        gpus = parse_nvidia_smi_memory(stdout)
    except ValueError as exc:
        add_check(checks, "gpu:nvidia-smi", "fail", f"cannot parse nvidia-smi output: {exc}", stdout)
        return
    add_check(checks, "gpu:nvidia-smi", "ok", f"query ok, {len(gpus)} GPU(s)", {"gpus": gpus})

    # Only the GPU with the most free memory has to meet the threshold.
    best_free = max(gpu["free_mib"] for gpu in gpus)
    enough = best_free >= min_vram_free_mib
    add_check(
        checks,
        "gpu:vram-free",
        "ok" if enough else "fail",
        f"best free VRAM {best_free} MiB / required {min_vram_free_mib} MiB",
        {"threshold_mib": min_vram_free_mib, "gpus": gpus},
    )
def check_memory(
    checks: list[dict[str, Any]],
    min_ram_available_mib: int,
    max_swap_used_mib: int,
    max_swap_used_pct: float,
    timeout: float,
) -> None:
    """Run `free -m` and check available RAM and swap usage against the limits.

    Adds a failed "memory:free" check when the command fails or cannot be
    parsed. Otherwise adds "ram:available" and "swap:usage"; swap passes only
    when both the absolute and the percentage limit hold, and a swap total of
    zero is a failure.
    """
    code, stdout, stderr = run_command(["free", "-m"], timeout=timeout)
    if code != 0:
        add_check(checks, "memory:free", "fail", stderr or stdout or f"exit {code}")
        return

    try:
        memory = parse_free_m(stdout)
    except ValueError as exc:
        add_check(checks, "memory:free", "fail", f"cannot parse free -m output: {exc}", stdout)
        return

    mem = memory["mem"]
    available = mem["available"]
    ram_status = "fail" if available < min_ram_available_mib else "ok"
    add_check(
        checks,
        "ram:available",
        ram_status,
        f"available RAM {available} MiB / required {min_ram_available_mib} MiB",
        {"threshold_mib": min_ram_available_mib, "mem": mem},
    )

    swap = memory["swap"]
    swap_total = swap.get("total", 0)
    swap_used = swap.get("used", 0)
    if swap_total <= 0:
        add_check(checks, "swap:usage", "fail", "swap total is 0 MiB", {"swap": swap})
        return

    swap_used_pct = (swap_used / swap_total) * 100.0
    over_limit = swap_used > max_swap_used_mib or swap_used_pct > max_swap_used_pct
    swap_details = {
        "max_used_mib": max_swap_used_mib,
        "max_used_pct": max_swap_used_pct,
        "used_pct": round(swap_used_pct, 2),
        "swap": swap,
    }
    add_check(
        checks,
        "swap:usage",
        "fail" if over_limit else "ok",
        f"swap used {swap_used} MiB ({swap_used_pct:.1f}%) / limits {max_swap_used_mib} MiB and {max_swap_used_pct:.1f}%",
        swap_details,
    )
def check_ollama(
    checks: list[dict[str, Any]],
    base_url: str,
    required_models: tuple[str, ...],
    resident_warn_model: str,
    timeout: float,
) -> None:
    """Check the Ollama API at *base_url* for required and resident models.

    Adds "ollama:tags" and "ollama:required-models" from /api/tags, then
    "ollama:ps" and "ollama:resident-vlm" from /api/ps. An unreadable endpoint
    is a failure and ends the function; a non-resident *resident_warn_model* is
    only a warning. No model is loaded or warmed up.
    """
    root = base_url.rstrip("/")

    tags_ok, tags_payload, tags_error = http_json(f"{root}/api/tags", timeout=timeout)
    if not (tags_ok and isinstance(tags_payload, dict)):
        add_check(checks, "ollama:tags", "fail", tags_error or "cannot read /api/tags")
        return
    available = extract_ollama_tags(tags_payload)
    add_check(checks, "ollama:tags", "ok", f"/api/tags readable, {len(available)} tag(s)")

    absent = [model for model in required_models if model not in available]
    if not absent:
        add_check(
            checks,
            "ollama:required-models",
            "ok",
            f"{len(required_models)} required model(s) present",
            {"required": list(required_models)},
        )
    else:
        add_check(
            checks,
            "ollama:required-models",
            "fail",
            f"missing required model(s): {', '.join(absent)}",
            {"required": list(required_models), "present": sorted(available)},
        )

    ps_ok, ps_payload, ps_error = http_json(f"{root}/api/ps", timeout=timeout)
    if not (ps_ok and isinstance(ps_payload, dict)):
        add_check(checks, "ollama:ps", "fail", ps_error or "cannot read /api/ps")
        return
    resident = extract_ollama_tags(ps_payload)
    add_check(checks, "ollama:ps", "ok", f"/api/ps readable, {len(resident)} resident model(s)")

    if resident_warn_model not in resident:
        add_check(
            checks,
            "ollama:resident-vlm",
            "warn",
            f"{resident_warn_model} is not resident; no warmup was attempted",
            {"resident": sorted(resident)},
        )
        return
    add_check(checks, "ollama:resident-vlm", "ok", f"{resident_warn_model} resident")
def print_text(report: dict[str, Any]) -> None:
    """Print the report as text: overall status, timestamp, warmup note, then one line per check."""
    lines = [
        f"Lea micro preflight: {report['overall'].upper()}",
        f"Timestamp: {report['timestamp']}",
        "Warmup: disabled",
        "",
    ]
    lines.extend(
        f"[{entry['status'].upper():4}] {entry['name']} - {entry['summary']}"
        for entry in report["checks"]
    )
    print("\n".join(lines))
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the preflight command line.

    The Ollama URL defaults to the OLLAMA_BASE_URL environment variable, and
    the thresholds default to the module-level DEFAULT_* constants.
    """
    cli = argparse.ArgumentParser(description=__doc__)

    cli.add_argument("--json", action="store_true", help="print machine-readable JSON")
    cli.add_argument("--strict", action="store_true", help="exit 1 when only warnings are present")
    cli.add_argument("--ollama-url", default=os.environ.get("OLLAMA_BASE_URL", "http://127.0.0.1:11434"))
    cli.add_argument("--timeout", type=float, default=4.0, help="per-check timeout in seconds")

    thresholds = (
        ("--min-vram-free-mib", int, DEFAULT_MIN_VRAM_FREE_MIB),
        ("--min-ram-available-mib", int, DEFAULT_MIN_RAM_AVAILABLE_MIB),
        ("--max-swap-used-mib", int, DEFAULT_MAX_SWAP_USED_MIB),
        ("--max-swap-used-pct", float, DEFAULT_MAX_SWAP_USED_PCT),
    )
    for flag, converter, fallback in thresholds:
        cli.add_argument(flag, type=converter, default=fallback)

    cli.add_argument(
        "--required-model",
        action="append",
        dest="required_models",
        help="required Ollama model tag; may be repeated",
    )
    cli.add_argument("--resident-warn-model", default=DEFAULT_RESIDENT_WARN_MODEL)
    return cli.parse_args(argv)
def build_report(args: argparse.Namespace) -> dict[str, Any]:
    """Run the GPU, memory and Ollama checks and assemble the report dict.

    The required models come from --required-model when given, otherwise from
    REQUIRED_MODELS.
    """
    required_models = tuple(args.required_models or REQUIRED_MODELS)
    thresholds = {
        "min_vram_free_mib": args.min_vram_free_mib,
        "min_ram_available_mib": args.min_ram_available_mib,
        "max_swap_used_mib": args.max_swap_used_mib,
        "max_swap_used_pct": args.max_swap_used_pct,
    }

    checks: list[dict[str, Any]] = []
    check_gpu(checks, thresholds["min_vram_free_mib"], args.timeout)
    check_memory(
        checks,
        thresholds["min_ram_available_mib"],
        thresholds["max_swap_used_mib"],
        thresholds["max_swap_used_pct"],
        args.timeout,
    )
    check_ollama(checks, args.ollama_url, required_models, args.resident_warn_model, args.timeout)

    report: dict[str, Any] = {}
    report["timestamp"] = time.strftime("%Y-%m-%dT%H:%M:%S%z")
    report["overall"] = summarize(checks)
    report["repo"] = str(REPO_ROOT)
    report["warmup"] = "disabled"
    report["thresholds"] = thresholds
    report["ollama_url"] = args.ollama_url
    report["required_models"] = list(required_models)
    report["checks"] = checks
    return report
def main(argv: list[str]) -> int:
    """Build the preflight report, print it and return the process exit code.

    Exit code 2 means at least one check failed, 1 means warnings under
    --strict, and 0 otherwise.
    """
    args = parse_args(argv)
    report = build_report(args)

    if not args.json:
        print_text(report)
    else:
        print(json.dumps(report, indent=2, sort_keys=True))

    overall = report["overall"]
    if overall == "fail":
        return 2
    return 1 if (args.strict and overall == "warn") else 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))

View File

@@ -882,8 +882,8 @@ _SESSION_TEMPLATE = """<!DOCTYPE html>
<td class="mono">{{ a.position }}</td>
<td>{{ a.window_title|truncate(40) }}</td>
<td class="mono">
{% if a.text %}{{ a.text|truncate(60) }}{% endif %}
{% if a.keys %}{{ a.keys }}{% endif %}
{% if a["text"] %}{{ a["text"]|truncate(60) }}{% endif %}
{% if a["keys"] %}{{ a["keys"] }}{% endif %}
</td>
<td>
{% if a.shot_file %}