Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Path: blob/master/src/packages/project/kucalc.ts
Views: 687
/*
 * This file is part of CoCalc: Copyright © 2023 Sagemath, Inc.
 * License: MS-RSL – see LICENSE.md for details
 */

/*
Some code specific to running a project in the KuCalc environment.
*/

import { readFile as readFileAsync } from "node:fs/promises";

// Prometheus client setup -- https://github.com/siimon/prom-client
import prom_client from "prom-client";

import { execute_code } from "@cocalc/backend/misc_node";
import { callback2 as cb2 } from "@cocalc/util/async-utils";
import { startswith } from "@cocalc/util/misc";
import get_bugs_total from "./bug-counter";
import { session_id, start_ts } from "./consts";
import { getLogger } from "./logger";

const L = getLogger("kucalc");

/**
 * Snapshot of the project's resource usage, as assembled by `compute_status`.
 *
 * Memory values are stored in KiB (cgroup byte counts are divided by 1024),
 * disk usage in the "M" blocks reported by `df -BM`.
 */
interface Status {
  time: number;
  memory: { limit?: number; rss?: number; cache?: number };
  cpu: { usage?: number };
  disk_MB: number;
  start_ts: number;
  session_id: string;
  // NOTE(review): the only key currently written is `count` (a plain number,
  // see processes_info), which does not match this value type -- confirm
  // the intended shape before tightening it.
  processes: { [key: string]: { cpu: number; memory: number } };
  oom_kills: number;
}

// additionally, record GC statistics
// https://www.npmjs.com/package/prometheus-gc-stats
//# I'm commenting this out because the package prometheus-gc-stats
//# on npm very explicitly says it does not support prom-client
//# version 13, which is what we have installed everywhere. That
//# version is a significant breaking change from version 12, so
//# I'm also not comfortable reverting back. Harald I think force
//# upgraded prom-client to version 13 in this commit: b31e087ea2c640f494db15b652d9d0f86e7bd8a5
// require('prometheus-gc-stats')()()

// collect some recommended default metrics
prom_client.collectDefaultMetrics();

// --- end prometheus setup

// This gets **changed** to true, if a certain
// command line flag is passed in.
export let IN_KUCALC = false;

/** Sets the module-level `IN_KUCALC` flag. */
export function setInKucalc(val: boolean): void {
  IN_KUCALC = val;
}

// status information -- most recent result of compute_status, read by prometheus_metrics
let current_status: Partial<Status> = {};

/**
 * Starts periodic status updates: runs one update immediately and then one
 * every 30 seconds.
 *
 * @param client - object providing `dbg`, `query` and `client_id`
 * @returns the handle returned by `setInterval`
 */
export function init(client) {
  // update project status every 30s
  // TODO: could switch to faster when it's changing and slower when it isn't.
  const f = () => update_project_status(client);
  f();
  return setInterval(f, 30000);
}

/**
 * Computes the current status, caches it in `current_status` and writes it
 * to the `projects` table via `client.query`. Errors are only logged through
 * the client's debug logger.
 */
async function update_project_status(client) {
  const dbg = client.dbg("update_status");
  dbg();

  try {
    const status = await compute_status();
    current_status = status;
    await cb2(client.query, {
      query: {
        projects: { project_id: client.client_id(), status },
      },
    });
  } catch (err) {
    dbg(`ERROR: ${err}`);
  }
}

/** Exposes `compute_status` for testing. */
export async function test_compute_status() {
  return await compute_status();
}

/**
 * Builds a fresh `Status` object and fills it in by running the individual
 * collectors concurrently. Rejects if any collector rejects.
 */
async function compute_status(): Promise<Status> {
  const status: Status = {
    time: Date.now(),
    memory: { rss: 0 },
    disk_MB: 0,
    cpu: {},
    start_ts,
    session_id,
    processes: {},
    oom_kills: 0,
  };
  await Promise.all([
    compute_status_disk(status),
    cgroup_stats(status),
    processes_info(status),
    compute_status_tmp(status),
  ]);
  return status;
}

/** Records the disk usage of `$HOME` in `status.disk_MB`. */
async function compute_status_disk(status) {
  status.disk_MB = await disk_usage("$HOME");
}

/**
 * Runs `ps` and stores the number of non-empty output lines, minus one for
 * the `ps` process itself, in `status.processes.count`.
 *
 * Never rejects: a failing `ps` is logged and the count is left unset.
 */
async function processes_info(status): Promise<void> {
  const cols = ["pid", "lstart", "time", "rss", "args"];

  return new Promise((resolve, _reject) => {
    execute_code({
      command: "ps",
      args: ["--no-header", "-o", cols.join(","), "-u", "user"], // TODO user should be data.username ?
      bash: false,
      cb(err, out) {
        if (err || out?.exit_code !== 0) {
          L.warn(`ps failed: ${err} ${out?.stderr}`);
        } else {
          let cnt = -1; // no need to account for the ps process itself!
          // TODO parsing anything out of ps is really hard :-(
          // but we want to know how many sage, jupyter, console, etc. instances are running.
          for (const line of out.stdout.split("\n")) {
            if (line.length > 0) {
              cnt += 1;
            }
          }
          status.processes.count = cnt;
        }
        resolve();
      },
    });
  });
}

// NOTE: we use tmpfs for /tmp, so RAM usage is the **sum** of /tmp and what
// processes use.
async function compute_status_tmp(status) {
  const tmp_MB: number = await disk_usage("/tmp");
  // `df -BM` counts blocks of 1024*1024 bytes, while memory.rss is kept in
  // KiB (see cgroup_stats), hence the factor 1024 rather than 1000.
  status.memory.rss += 1024 * tmp_MB;
}

// this grabs the memory stats directly from the sysfs cgroup files
// the actual usage is the sum of the rss values plus cache, but we leave cache aside
/**
 * Reads memory, CPU and OOM information from the cgroup (v1) files and
 * stores them in `status`. If any of the files cannot be read, a warning is
 * logged and `status` is left untouched by this collector.
 */
async function cgroup_stats(status) {
  /** Parses the "key value" lines of memory.stat into a key → integer map. */
  async function getMemory(): Promise<{ [key: string]: number }> {
    const data = await readFileAsync(
      "/sys/fs/cgroup/memory/memory.stat",
      "utf8",
    );

    const stats: { [key: string]: number } = {};

    for (const line of data.split("\n")) {
      const [key, value] = line.split(" ");
      if (!key || value == null) {
        // blank or malformed line (e.g. the trailing newline)
        continue;
      }
      // parseInt never throws -- it yields NaN, which we must filter explicitly
      const n = parseInt(value, 10);
      if (Number.isFinite(n)) {
        stats[key] = n;
      }
    }
    return stats;
  }

  /**
   * Returns the content of cpuacct.usage divided by 10^9 (presumably
   * nanoseconds → seconds, matching the exported `cpu_usage_seconds` metric),
   * or 0 if the file content is not a number.
   */
  async function getCPU(): Promise<number> {
    const data = await readFileAsync(
      "/sys/fs/cgroup/cpu,cpuacct/cpuacct.usage",
      "utf8",
    );

    const usage = parseFloat(data) / Math.pow(10, 9);
    return Number.isFinite(usage) ? usage : 0.0;
  }

  /** Returns the `oom_kill` counter from memory.oom_control, or 0 if absent. */
  async function getOOM(): Promise<number> {
    const data = await readFileAsync(
      "/sys/fs/cgroup/memory/memory.oom_control",
      "utf8",
    );

    for (const line of data.split("\n")) {
      // search string includes a trailing space, otherwise it matches 'oom_kill_disable'!
      if (startswith(line, "oom_kill ")) {
        const n = parseInt(line.split(" ")[1], 10);
        return Number.isFinite(n) ? n : 0;
      }
    }
    return 0;
  }

  try {
    const [memory, cpu, oom]: [{ [key: string]: number }, number, number] =
      await Promise.all([getMemory(), getCPU(), getOOM()]);

    const kib = 1024; // convert to kibibyte
    // total_rss includes total_rss_huge
    // Ref: https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
    status.memory.rss += (memory.total_rss ?? 0) / kib;
    status.memory.cache = (memory.total_cache ?? 0) / kib;
    status.memory.limit = (memory.hierarchical_memory_limit ?? 0) / kib;
    status.cpu.usage = cpu;
    status.oom_kills = oom;
  } catch (err) {
    L.warn(`cgroup_stats error: ${err}`);
  }
}

/**
 * Returns the "used" column of `df -BM` for the given path, as an integer.
 *
 * @param path - interpolated verbatim into a bash command line, so shell
 *   expansions such as `$HOME` work; only pass trusted values.
 * @returns the parsed usage, or 0 (with a warning) if the output of the
 *   pipeline is not a number.
 * @throws rejects with the error reported by `execute_code`
 */
async function disk_usage(path): Promise<number> {
  return new Promise((resolve, reject) => {
    execute_code({
      command: `df -BM ${path} | tail -1 | awk '{gsub("M","");print $3}'`,
      bash: true,
      cb(err, out) {
        if (err) {
          reject(err);
          return;
        }
        // The exit status of a pipeline is that of its last command, so a
        // failing `df` still "succeeds" with empty output. parseInt then
        // yields NaN, which would poison every sum it is added to.
        const used = parseInt(out?.stdout ?? "0", 10);
        if (Number.isFinite(used)) {
          resolve(used);
        } else {
          L.warn(
            `disk_usage: unable to parse df output for '${path}': ${JSON.stringify(out?.stdout)}`,
          );
          resolve(0);
        }
      },
    });
  });
}

/**
 * Renders the most recently computed status in the Prometheus text
 * exposition format.
 *
 * @param project_id - used as the value of the `project_id` label
 * @returns the metrics text, terminated by a newline
 */
export function prometheus_metrics(project_id): string {
  const P = "cocalc_project";
  const cs = current_status;
  const labels = `project_id="${project_id}",session_id="${session_id}"`;
  return (
    [
      `# HELP ${P}_bugs_total The total number of caught bugs.`,
      `# TYPE ${P}_bugs_total counter`,
      `${P}_bugs_total{${labels}} ${get_bugs_total()}`,
      `# HELP ${P}_start_time when the project/session started`,
      `# TYPE ${P}_start_time counter`,
      `${P}_start_time{${labels}} ${start_ts}`,
      `# HELP ${P}_cpu_usage_seconds`,
      `# TYPE ${P}_cpu_usage_seconds counter`,
      `${P}_cpu_usage_seconds{${labels}} ${cs.cpu?.usage ?? 0.0}`,
      `# HELP ${P}_disk_usage_mb`,
      `# TYPE ${P}_disk_usage_mb gauge`,
      `${P}_disk_usage_mb{${labels}} ${cs.disk_MB ?? 0.0}`,
      `# HELP ${P}_memory_usage_ki`,
      `# TYPE ${P}_memory_usage_ki gauge`,
      `${P}_memory_usage_ki{${labels}} ${cs.memory?.rss ?? 0.0}`,
      `# HELP ${P}_memory_limit_ki`,
      `# TYPE ${P}_memory_limit_ki gauge`,
      `${P}_memory_limit_ki{${labels}} ${cs.memory?.limit ?? 0.0}`,
      `# HELP ${P}_running_processes_total`,
      `# TYPE ${P}_running_processes_total gauge`,
      `${P}_running_processes_total{${labels}} ${cs.processes?.count ?? 0}`,
      `# HELP ${P}_oom_kills_total`,
      // no trailing whitespace here: the type token runs to the end of the line
      `# TYPE ${P}_oom_kills_total counter`,
      `${P}_oom_kills_total{${labels}} ${cs.oom_kills ?? 0}`,
    ].join("\n") + "\n" // makes sure the response ends with a newline!
  );
}

// called inside raw_server
/**
 * Registers the `/health` and `/metrics` handlers on `raw_server`.
 * Does nothing unless `IN_KUCALC` is true.
 *
 * @param raw_server - server object providing `use(path, handler)`
 * @param project_id - passed through to `prometheus_metrics`
 */
export function init_health_metrics(raw_server, project_id): void {
  if (!IN_KUCALC) {
    return;
  }
  // Setup health and metrics (no url base prefix needed)
  raw_server.use("/health", function (_req, res): void {
    res.setHeader("Content-Type", "text/plain");
    res.setHeader("Cache-Control", "no-cache, no-store");
    res.send("OK");
  });

  // prometheus text format -- https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details
  raw_server.use("/metrics", async function (_req, res): Promise<void> {
    res.setHeader("Content-Type", "text/plain; version=0.0.4");
    res.setHeader("Cache-Control", "no-cache, no-store");
    try {
      const part1 = prometheus_metrics(project_id);
      const part2 = await prom_client.register.metrics();
      res.send(part1 + "\n" + part2 + "\n");
    } catch (err) {
      // Without this, a failure inside the async handler becomes an unhandled
      // rejection and the request never receives a response.
      L.warn(`metrics error: ${err}`);
      res.statusCode = 500;
      res.send(`# ERROR: ${err}\n`);
    }
  });
}