Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more, all in one place.
Path: blob/master/src/packages/hub/health-checks.ts
/*
 * This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
 * License: MS-RSL – see LICENSE.md for details
 */

// endpoints for various health checks

import getLogger from "@cocalc/backend/logger";
const { new_counter } = require("@cocalc/hub/metrics-recorder");
import { howLongDisconnectedMins } from "@cocalc/database/postgres/record-connect-error";
import type { PostgreSQL } from "@cocalc/database/postgres/types";
import { seconds2hms } from "@cocalc/util/misc";
import express, { Response } from "express";
import { createServer, Server } from "net";
import { isFloat } from "validator";
import { database_is_working } from "./hub_register";
const logger = getLogger("hub:healthcheck");
const { debug: L } = logger;

const HEALTHCHECKS = new_counter(
  "healthchecks_total",
  "test healthcheck counter",
  ["status"]
);

interface HealthcheckData {
  code: 200 | 404;
  txt: string;
}

// Self-termination is only activated if the COCALC_HUB_SELF_TERMINATE environment variable is set.
// Its value is an interval in hours (minimum and maximum) for how long the hub should stay alive,
// plus a drain period in minutes at the end.
// e.g. "24,48,15" for an uptime between 1 and 2 days and 15 minutes of draining.
function init_self_terminate(): {
  startup: number;
  shutdown?: number; // when to shut down (causes a failed health check)
  drain?: number; // when to start draining, causes a proxy server to no longer send traffic
} {
  const D = logger.extend("init_self_terminate").debug;
  const startup = Date.now();
  const conf = process.env.COCALC_HUB_SELF_TERMINATE;
  if (conf == null) {
    D("COCALC_HUB_SELF_TERMINATE env var not set, hence no self-termination");
    return { startup };
  }
  const [from_str, to_str, drain_str] = conf.trim().split(",");
  if (!isFloat(from_str, { gt: 0 }))
    throw new Error("COCALC_HUB_SELF_TERMINATE/from not a positive float");
  if (!isFloat(to_str, { gt: 0 }))
    throw new Error("COCALC_HUB_SELF_TERMINATE/to not a positive float");
  if (!isFloat(drain_str, { gt: 0 }))
    throw new Error("COCALC_HUB_SELF_TERMINATE/drain not a positive float");
  const from = parseFloat(from_str);
  const to = parseFloat(to_str);
  const drain_h = parseFloat(drain_str) / 60; // minutes to hours
  D("parsed data:", { from, to, drain_h });
  if (from > to)
    throw Error(
      "COCALC_HUB_SELF_TERMINATE 'from' must be smaller than 'to', e.g. '24,48,15'"
    );
  const uptime = Math.random() * (to - from); // hours
  const hours2ms = 1000 * 60 * 60;
  const shutdown = startup + (from + uptime) * hours2ms;
  const drain = shutdown - drain_h * hours2ms;
  if (startup > drain) {
    throw new Error(
      `COCALC_HUB_SELF_TERMINATE: startup must be smaller than drain – ${startup}>${drain}`
    );
  }
  D({
    startup: new Date(startup).toISOString(),
    drain: new Date(drain).toISOString(),
    shutdown: new Date(shutdown).toISOString(),
    uptime: seconds2hms((hours2ms * uptime) / 1000),
    draintime: seconds2hms((drain_h * hours2ms) / 1000),
  });
  return { startup, shutdown, drain };
}

const { startup, shutdown, drain } = init_self_terminate();
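
// For illustration, using the example value "24,48,15" from the comment above:
// uptime is a random number of hours in [0, 24), so shutdown lands at
// startup + (24 + uptime) hours, i.e. somewhere between 1 and 2 days after startup,
// and drain = shutdown - 15 minutes, the final window in which the proxy should
// stop sending new traffic to this hub.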
'24,48,15'"60);61const uptime = Math.random() * (to - from); // hours62const hours2ms = 1000 * 60 * 60;63const shutdown = startup + (from + uptime) * hours2ms;64const drain = shutdown - drain_h * hours2ms;65if (startup > drain) {66throw new Error(67`COCALC_HUB_SELF_TERMINATE: startup must be smaller than drain – ${startup}>${drain}`68);69}70D({71startup: new Date(startup).toISOString(),72drain: new Date(drain).toISOString(),73shutdown: new Date(shutdown).toISOString(),74uptime: seconds2hms((hours2ms * uptime) / 1000),75draintime: seconds2hms((drain_h * hours2ms) / 1000),76});77return { startup, shutdown, drain };78}7980const { startup, shutdown, drain } = init_self_terminate();8182let agent_port = 0;83let agent_host = "0.0.0.0";84export function set_agent_endpoint(port: number, host: string) {85L(`set_agent_endpoint ${agent_host}:${agent_port}`);86agent_port = port;87agent_host = host;88}8990let agent_check_server: Server | undefined;9192// HAProxy agent-check TCP endpoint93// https://cbonte.github.io/haproxy-dconv/2.0/configuration.html#5.2-agent-check94// for development, set the env var in your startup script or terminal init file95// export COCALC_HUB_SELF_TERMINATE=.1,.2,196// and then query it like that97// $ telnet 0.0.0.0 $(cat $COCALC_ROOT/dev/project/ports/agent-port)98function setup_agent_check() {99if (agent_port == 0 || drain == null) {100L("setup_agent_check: agent_port not set, no agent checks");101return;102}103104// TODO this could also return a "weight" for this server, based on load values105// there is also "drain", but we set it to "10%" to avoid a nasty situation, when all endpoints are draining.106// ATTN: weight must be set as well, which is poorly documented here:107// https://cbonte.github.io/haproxy-dconv/2.0/configuration.html#5.2-weight108agent_check_server = createServer((c) => {109let msg = Date.now() < drain ? "ready up 100%" : "10%";110c.write(msg + "\r\n");111c.destroy();112});113114agent_check_server.listen(agent_port, agent_host);115L(`setup_agent_check: listening on ${agent_host}:${agent_port}`);116}117118export interface Check {119status: string;120abort?: boolean;121}122123interface Opts {124db: PostgreSQL;125router: express.Router;126extra?: (() => Promise<Check>)[]; // additional health checks127}128129// this could be directly in setup_health_checks, but we also need it in proxy.coffee130// proxy.coffee must be rewritten and restructured first – just wrapping it with a router131// didn't work at all for me132export function process_alive(): HealthcheckData {133let txt = "alive: YES";134let is_dead = true;135if (!database_is_working()) {136// this will stop haproxy from routing traffic to us137// until db connection starts working again.138txt = "alive: NO – database not working";139} else if (shutdown != null && Date.now() > shutdown) {140txt = "alive: NO – shutdown initiated";141} else {142is_dead = false;143}144const code = is_dead ? 
404 : 200;145return { txt, code };146}147148function checkConcurrent(db: PostgreSQL): Check {149const c = db.concurrent();150if (c >= db._concurrent_warn) {151return {152status: `hub not healthy, since concurrent ${c} >= ${db._concurrent_warn}`,153abort: true,154};155} else {156return { status: `concurrent ${c} < ${db._concurrent_warn}` };157}158}159160function checkUptime(): Check {161const now = Date.now();162const uptime = seconds2hms((now - startup) / 1000);163if (shutdown != null && drain != null) {164if (now >= shutdown) {165const msg = `uptime ${uptime} – expired, terminating now`;166L(msg);167return { status: msg, abort: true };168} else {169const until = seconds2hms((shutdown - now) / 1000);170const drain_str =171drain > now172? `draining in ${seconds2hms((drain - now) / 1000)}`173: "draining now";174const msg = `uptime ${uptime} – ${drain_str} – terminating in ${until}`;175L(msg);176return { status: msg };177}178} else {179const msg = `uptime ${uptime} – no self-termination`;180L(msg);181return { status: msg };182}183}184185// if there are is no connection to the database for that many minutes,186// declare the hub unhealthy187const DB_ERRORS_THRESHOLD_MIN = parseInt(188process.env.COCALC_DB_ERRORS_THRESHOLD_MIN ?? "5"189);190191function checkDBConnectivity(): Check {192if (DB_ERRORS_THRESHOLD_MIN <= 0) {193return { status: "db connectivity check disabled" };194}195const num = howLongDisconnectedMins();196if (num == null) {197return { status: "no DB connection problems", abort: false };198}199// round num to 2 decimal places200const numStr = num.toFixed(2);201const above = num >= DB_ERRORS_THRESHOLD_MIN;202const status = above203? `DB problems for ${numStr} >= ${DB_ERRORS_THRESHOLD_MIN} mins`204: `DB problems for ${numStr} < ${DB_ERRORS_THRESHOLD_MIN} mins`;205return { status, abort: above };206}207208// same note as above for process_alive()209async function process_health_check(210db: PostgreSQL,211extra: (() => Promise<Check>)[] = []212): Promise<HealthcheckData> {213let any_abort = false;214let txt = "healthchecks:\n";215for (const test of [216() => checkConcurrent(db),217checkUptime,218checkDBConnectivity,219...extra,220]) {221try {222const { status, abort = false } = await test();223const statusTxt = abort ? "FAIL" : "OK";224txt += `${status} – ${statusTxt}\n`;225any_abort = any_abort || abort;226L(`process_health_check: ${status} – ${statusTxt}`);227} catch (err) {228L(`process_health_check ERRROR: ${err}`);229HEALTHCHECKS.labels("ERROR").inc();230}231}232const code = any_abort ? 404 : 200;233HEALTHCHECKS.labels(any_abort ? "FAIL" : "OK").inc();234return { code, txt };235}236237export async function setup_health_checks(opts: Opts): Promise<void> {238const { db, extra, router } = opts;239setup_agent_check();240241// used by HAPROXY for testing that this hub is OK to receive traffic242router.get("/alive", (_, res: Response) => {243const { code, txt } = process_alive();244res.type("txt");245res.status(code);246res.send(txt);247});248249// this is a more general check than concurrent-warn250// additionally to checking the database condition, it also self-terminates251// this hub if it is running for quite some time. 
beyond that, in the future252// there could be even more checks on top of that.253router.get("/healthcheck", async (_, res: Response) => {254const { txt, code } = await process_health_check(db, extra);255res.status(code);256res.type("txt");257res.send(txt);258});259260// /concurrent-warn -- could be used by kubernetes to decide whether or not to kill the container; if261// below the warn thresh, returns number of concurrent connection; if hits warn, then262// returns 404 error, meaning hub may be unhealthy. Kubernetes will try a few times before263// killing the container. Will also return 404 if there is no working database connection.264router.get("/concurrent-warn", (_, res) => {265res.type("txt");266if (!database_is_working()) {267L("/concurrent-warn: not healthy, since database connection not working");268res.status(404).end();269return;270}271272const c = db.concurrent();273if (c >= db._concurrent_warn) {274L(275`/concurrent-warn: not healthy, since concurrent ${c} >= ${db._concurrent_warn}`276);277res.status(404).end();278return;279}280res.send(`${c}`);281});282283// Return number of concurrent connections (could be useful)284router.get("/concurrent", (_, res) => {285res.type("txt");286res.send(`${db.concurrent()}`);287});288}289290291
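
For context, below is a minimal sketch of how a hub could wire these endpoints into an Express app and register an additional check through the `extra` option. The `db` accessor imported from "@cocalc/database", the listening port, and the heap-size threshold in the custom check are assumptions made for this example; only setup_health_checks, the Check interface, and the db/router/extra options come from the file above.

// Hypothetical wiring sketch (not part of health-checks.ts).
import express from "express";
import { setup_health_checks, Check } from "./health-checks";
// Assumption: an accessor returning the hub's PostgreSQL instance; adjust to however
// the surrounding code actually obtains its database object.
import { db } from "@cocalc/database";

async function main() {
  const app = express();
  const router = express.Router();

  // Extra check passed via `extra`: report abort (and hence a 404 on /healthcheck)
  // if the process heap exceeds a hypothetical 2 GiB threshold.
  const memoryCheck = async (): Promise<Check> => {
    const heapMB = process.memoryUsage().heapUsed / (1024 * 1024);
    const limitMB = 2048; // hypothetical limit, not from the original code
    return {
      status: `heap ${heapMB.toFixed(0)} MB (threshold ${limitMB} MB)`,
      abort: heapMB >= limitMB,
    };
  };

  await setup_health_checks({ db: db(), router, extra: [memoryCheck] });

  app.use(router);
  app.listen(8080); // e.g. probed by HAProxy or Kubernetes at /alive and /healthcheck
}

main();

Any extra check that resolves with abort: true makes /healthcheck return 404, which the proxy or orchestrator treats as unhealthy, exactly as with the built-in concurrency, uptime, and database-connectivity checks.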