Path: blob/master/contrib/vm/compute_node/root/healthcheck.py
1075 views
#!/usr/bin/env python312from datetime import datetime3import os4import pwd5import random6import re7import shlex8import shutil9import subprocess10import sys11import time1213import requests141516RETRIES = 317PROBE_TIMEOUT_SECONDS = 1518RETRY_SLEEP_SECONDS = 0.519OOM_LOOKBACK_MINUTES = 520CONTEXT_KEEP_LINES = 72021CONTEXT_LINES_ON_FAILURE = 32223SERVICE_NAME = "sagecell"24CONTEXT_LOG = "/root/healthcheck-context.log"25EVENT_LOG = "/root/healthcheck.log"26WORKER_USER = "{worker}"2728OOM_PATTERNS = (29"Out of memory",30"Killed process",31"oom-kill",32"OOM killer",33"Memory cgroup out of memory",34"oom_reaper",35)36OOM_GREP = "|".join(re.escape(pattern) for pattern in OOM_PATTERNS)37KERNELS_LOOKBACK_MINUTES = 3383940def timestamp():41return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")424344def log_message(message):45with open(EVENT_LOG, "a", encoding="utf-8") as f:46f.write(f"{timestamp()} {message}\n")474849def log_status_line(line):50with open(EVENT_LOG, "a", encoding="utf-8") as f:51f.write(line + "\n")525354def update_context_log(line):55lines = []56if os.path.exists(CONTEXT_LOG):57with open(CONTEXT_LOG, encoding="utf-8") as f:58lines = f.read().splitlines()59previous = lines[-CONTEXT_LINES_ON_FAILURE:]60lines = (lines + [line])[-CONTEXT_KEEP_LINES:]61with open(CONTEXT_LOG, "w", encoding="utf-8") as f:62f.write("\n".join(lines) + "\n")63return previous646566def try_output(command):67"""Return command output; treat errors as empty output."""68try:69return subprocess.check_output(70shlex.split(command),71text=True,72stderr=subprocess.DEVNULL,73)74except (FileNotFoundError, subprocess.CalledProcessError):75return ""767778def human_memory_from_kib(value_kib):79value = float(value_kib) * 102480for unit in ("B", "K", "M", "G", "T", "P"):81if value < 1024 or unit == "P":82if unit in ("B", "K"):83return f"{int(round(value)):4d}{unit}"84return f"{value:4.1f}{unit}"85value /= 102486return f"{value:4.1f}P"878889def memory_topic():90with open("/proc/meminfo", encoding="utf-8") as f:91meminfo = {}92for line in f:93key, value = line.split(":", 1)94meminfo[key] = int(value.strip().split()[0])95used_kib = meminfo["MemTotal"] - meminfo["MemAvailable"]96return f"M:{human_memory_from_kib(used_kib)}"979899def oom_topic():100try:101worker_uid = str(pwd.getpwnam(WORKER_USER).pw_uid)102except KeyError:103return "OOM:?/?"104worker_patterns = (105WORKER_USER,106f" uid {worker_uid}",107f" uid={worker_uid}",108f" UID: {worker_uid}",109f" user {worker_uid}",110)111output = try_output(112f"journalctl -k --since '{OOM_LOOKBACK_MINUTES} minutes ago' --no-pager -o cat --grep '{OOM_GREP}'"113)114total = 0115worker_events = 0116for line in output.splitlines():117if not any(pattern in line for pattern in OOM_PATTERNS):118continue119total += 1120if any(pattern in line for pattern in worker_patterns):121worker_events += 1122return f"OOM:{total}/{worker_events}"123124125def format_float(value, width, decimals):126if value is None:127return "?" * width128return f"{value:{width}.{decimals}f}"129130131def format_integer(value, width):132if value is None:133return "?" * width134return f"{int(round(value)):>{width}d}"135136137def kernels_topic():138output = try_output(139f"journalctl --since '{KERNELS_LOOKBACK_MINUTES} minutes ago' --no-pager -o cat --grep 'tracking [0-9]+ kernels' -n 1"140)141match = re.search(r"tracking ([0-9]+) kernels", output)142tracking = format_integer(int(match.group(1)), 2) if match else "??"143144output = try_output(145f"journalctl --since '{KERNELS_LOOKBACK_MINUTES} minutes ago' --no-pager -o cat --grep '[0-9]+ preforked kernels left' -n 1"146)147match = re.search(r"([0-9]+) preforked kernels left", output)148preforked = format_integer(int(match.group(1)), 2) if match else "??"149150return f"K:{tracking}/{preforked}"151152153def disk_topics():154if shutil.which("iostat") is None:155return ["iostat not installed"]156output = try_output("iostat -dxy 1 1")157lines = output.splitlines()158header = None159devices = []160for raw_line in lines:161line = raw_line.strip()162if not line:163continue164columns = line.split()165if columns[0] == "Device":166header = columns167continue168if header is None:169continue170if columns[0].startswith(("loop", "ram", "sr")):171continue172if len(columns) != len(header):173continue174row = dict(zip(header, columns))175devices.append(row)176if not devices:177return ["error calling iostat"]178179def parse_metric(row, name):180try:181return float(row[name])182except (KeyError, ValueError):183return None184185topics = []186for row in sorted(devices, key=lambda device: device["Device"]):187util = parse_metric(row, "%util")188aqu = parse_metric(row, "aqu-sz")189r_await = parse_metric(row, "r_await")190w_await = parse_metric(row, "w_await")191topics.append(192"{}:{}% {}q {}r {}w".format(193row["Device"],194format_integer(util, 3),195format_float(aqu, 3, 1),196format_integer(r_await, 2),197format_integer(w_await, 2),198)199)200return topics201202203def status_line():204"""Return a compact one-line health snapshot.205206Example:2072026-04-01 03:39:25.925167 K: 2/ 9 L:1.06 0.74 0.84 M: 1.4G OOM:0/0 sda: 1%% 0.1q 6r 13w sdb: 3%% 0.3q 2r 2w208209Topics:210timestamp current local time211K tracked kernels / preforked kernels left212L 1/5/15 minute load averages213M used RAM214OOM all recent OOM events / recent worker-related OOM events215sda, sdb per-disk util%%, queue size, read-await ms, write-await ms216"""217load = os.getloadavg()218topics = [219timestamp(),220kernels_topic(),221"L:{} {} {}".format(222format_float(load[0], 4, 2),223format_float(load[1], 4, 2),224format_float(load[2], 4, 2),225),226memory_topic(),227oom_topic(),228]229topics.extend(disk_topics())230return " ".join(topics)231232233def run_probe(base_url):234a = random.randint(-2**31, 2**31)235b = random.randint(-2**31, 2**31)236# The handling of temporary files in Sage 9.7 does not allow SageMathCell to237# function properly if there are no regular requests producing temporary238# files. To fight it, we'll generate one during health checks. See239# https://groups.google.com/g/sage-devel/c/jpwUb8OCVVc/m/R4r5bnOkBQAJ240code = "show(plot(sin)); print({} + {})".format(a, b)241try:242response = requests.post(243base_url + "/service",244data={"code": code, "accepted_tos": "true"},245timeout=PROBE_TIMEOUT_SECONDS,246)247response.raise_for_status()248reply = response.json()249# Every few hours we have a request that comes back as executed, but the250# stdout is not in the dictionary. It seems that the compute message251# never actually gets sent to the kernel and it appears the problem is252# in the zmq connection between the webserver and the kernel.253#254# Also sometimes reply is unsuccessful, yet the server keeps running255# and other requests are serviced. Since a restart breaks all active256# interacts, better not to restart the server that "mostly works" and257# instead we'll just accumulate statistics on these random errors to258# help resolve them.259if (260reply["success"]261and "stdout" in reply262and int(reply["stdout"].strip()) == a + b263):264return True, None265return False, str(reply)266except Exception as exc: # pylint: disable=broad-except267return False, str(exc)268269270def main():271if len(sys.argv) != 2:272print("usage: healthcheck.py <base_url>")273return 2274275previous_context = update_context_log(status_line())276277if subprocess.call(278["systemctl", "--quiet", "is-active", SERVICE_NAME],279stdout=subprocess.DEVNULL,280stderr=subprocess.DEVNULL,281) != 0:282print(f"{timestamp()} Service is not active, skipping check")283return 0284285remaining_attempts = RETRIES286first_failure = True287while remaining_attempts:288remaining_attempts -= 1289ok, reason = run_probe(sys.argv[1])290if ok:291return 0292293current_line = status_line()294if first_failure:295for context_line in previous_context:296log_status_line(context_line)297first_failure = False298log_status_line(current_line)299log_message(f"healthcheck failed, {remaining_attempts} attempts left: {reason}")300301if remaining_attempts:302time.sleep(RETRY_SLEEP_SECONDS)303304log_message(f"restarting {SERVICE_NAME}")305try:306subprocess.check_call(["systemctl", "restart", SERVICE_NAME])307log_message("restart succeeded")308except (FileNotFoundError, subprocess.CalledProcessError):309log_status_line(status_line())310log_message("restart failed")311return 1312313314if __name__ == "__main__":315sys.exit(main())316317318