CoCalc -- kucalc.ts

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/project/kucalc.ts
Views: ⁶⁸⁷
1
/*
2
 *  This file is part of CoCalc: Copyright © 2023 Sagemath, Inc.
3
 *  License: MS-RSL – see LICENSE.md for details
4
 */
5

6
/*
7
Some code specific to running a project in the KuCalc environment.
8
*/
9

10
import { readFile as readFileAsync } from "node:fs/promises";
11

12
// Prometheus client setup -- https://github.com/siimon/prom-client
13
import prom_client from "prom-client";
14

15
import { execute_code } from "@cocalc/backend/misc_node";
16
import { callback2 as cb2 } from "@cocalc/util/async-utils";
17
import { startswith } from "@cocalc/util/misc";
18
import get_bugs_total from "./bug-counter";
19
import { session_id, start_ts } from "./consts";
20
import { getLogger } from "./logger";
21

22
// Module-level logger tagged "kucalc"; the status collectors below use it for warnings.
const L = getLogger("kucalc");
23

24
/**
 * One snapshot of the project's resource usage, as assembled by
 * compute_status() and then stored in the `projects` table.
 */
interface Status {
  // when the snapshot was created (Date.now())
  time: number;
  // memory figures in KiB; `limit` and `cache` are filled in by cgroup_stats,
  // `rss` accumulates the cgroup rss plus the usage of /tmp (tmpfs).
  memory: { limit?: number; rss?: number; cache?: number };
  // cumulative CPU time (cpuacct.usage divided by 10^9)
  cpu: { usage?: number };
  // used disk space of $HOME as reported by `df -BM`
  disk_MB: number;
  start_ts: number;
  session_id: string;
  // NOTE(review): processes_info stores a plain number under the key "count",
  // which does not match this value type -- confirm what consumers expect
  // before tightening it.
  processes: { [key: string]: { cpu: number; memory: number } };
  oom_kills: number;
}
34

35
// additionally, record GC statistics
36
// https://www.npmjs.com/package/prometheus-gc-stats
37
//# I'm commenting this out because the package prometheus-gc-stats
38
//# on npm very explicitly says it does not support prom-client
39
//# version 13, which is what we have installed everywhere.  That
40
//# version is a significant breaking change from version 12, so
41
//# I'm also not comfortable reverting back.  Harald I think force
42
//# upgraded prom-client to version 13 in this commit: b31e087ea2c640f494db15b652d9d0f86e7bd8a5
43
// require('prometheus-gc-stats')()()
44

45
// collect some recommended default metrics
46
// Runs once when this module is loaded. The /metrics handler in this file
// appends the output of prom_client.register.metrics() to the project metrics.
prom_client.collectDefaultMetrics();
47

48
// --- end prometheus setup
49

50
/**
 * Flag telling whether the project runs in the KuCalc environment.
 * It starts out as false and gets **changed** to true (via setInKucalc)
 * if a certain command line flag is passed in.
 */
export let IN_KUCALC = false;

/** Overwrite the IN_KUCALC flag with the given value. */
export function setInKucalc(val: boolean): void {
  IN_KUCALC = val;
}
57

58
// status information
59
// Latest result of compute_status(), assigned in update_project_status() and
// read by prometheus_metrics(); remains {} until compute_status() first succeeds.
let current_status: Partial<Status> = {};
60

61
/**
 * Start the periodic project status updates: one update right away, then
 * one every 30s.
 *
 * @param client the project client handed on to update_project_status
 * @returns the timer handle created by setInterval
 */
export function init(client) {
  // TODO: could switch to faster when it's changing and slower when it isn't.
  const intervalMs = 30000;
  const refresh = () => update_project_status(client);
  refresh();
  return setInterval(refresh, intervalMs);
}
68

69
/**
 * Compute a fresh status, remember it in `current_status`, and write it to
 * this project's row in the `projects` table via `client.query`.
 * Any failure is only reported through the client's debug logger.
 */
async function update_project_status(client) {
  const log = client.dbg("update_status");
  log();

  try {
    const snapshot = await compute_status();
    // keep the snapshot around even if saving it below fails
    current_status = snapshot;
    const query = {
      projects: { project_id: client.client_id(), status: snapshot },
    };
    await cb2(client.query, { query });
  } catch (err) {
    log(`ERROR: ${err}`);
  }
}
85

86
/** Exported entry point to the otherwise module-private compute_status(). */
export async function test_compute_status() {
  const status = await compute_status();
  return status;
}
89

90
/**
 * Build a new Status object: start from zeroed defaults, then let all
 * collectors fill in their part concurrently. Rejects if any collector
 * rejects.
 */
async function compute_status(): Promise<Status> {
  const status: Status = {
    time: Date.now(),
    memory: { rss: 0 },
    disk_MB: 0,
    cpu: {},
    start_ts,
    session_id,
    processes: {},
    oom_kills: 0,
  };

  // each collector mutates `status` in place
  const collectors = [
    compute_status_disk,
    cgroup_stats,
    processes_info,
    compute_status_tmp,
  ];
  await Promise.all(collectors.map((collect) => collect(status)));

  return status;
}
109

110
/** Store the disk usage of "$HOME" (as returned by disk_usage) in status.disk_MB. */
async function compute_status_disk(status) {
  const usedMB: number = await disk_usage("$HOME");
  status.disk_MB = usedMB;
}
114

115
/**
 * Count the processes listed by `ps` and store the number in
 * status.processes.count. Never rejects: if `ps` fails, a warning is
 * logged and the status is left untouched.
 */
async function processes_info(status): Promise<void> {
  const columns = ["pid", "lstart", "time", "rss", "args"].join(",");
  // TODO user should be data.username ?
  const psArgs = ["--no-header", "-o", columns, "-u", "user"];

  return new Promise((done) => {
    execute_code({
      command: "ps",
      args: psArgs,
      bash: false,
      cb: (err, out) => {
        if (!err && out?.exit_code === 0) {
          // TODO parsing anything out of ps is really hard :-(
          // but we want to know how many sage, jupyter, console, etc. instances are running.
          const rows = out.stdout.split("\n").filter((row) => row.length > 0);
          // minus one: no need to account for the ps process itself!
          status.processes.count = rows.length - 1;
        } else {
          L.warn(`ps failed: ${err} ${out?.stderr}`);
        }
        done();
      },
    });
  });
}
142

143
// NOTE: we use tmpfs for /tmp, so RAM usage is the **sum** of /tmp and what
144
// processes use.
145
/**
 * Add the space used in /tmp to status.memory.rss.
 *
 * disk_usage() returns the value printed by `df -BM`, i.e. MiB, while
 * memory.rss is kept in KiB (cgroup_stats divides bytes by 1024), hence the
 * factor 1024. A non-finite result is ignored, so that it cannot turn the
 * accumulated rss into NaN.
 */
async function compute_status_tmp(status) {
  const tmpMiB: number = await disk_usage("/tmp");
  if (!Number.isFinite(tmpMiB)) {
    return;
  }
  const KIB_PER_MIB = 1024;
  status.memory.rss += KIB_PER_MIB * tmpMiB;
}
149

150
// this grabs the memory stats directly from the sysfs cgroup files
151
// the actual usage is the sum of the rss values plus cache, but we leave cache aside
152
/**
 * Read memory, CPU and OOM statistics from the cgroup (v1) files below
 * /sys/fs/cgroup and store them in `status`:
 *   - memory.rss is increased by total_rss, memory.cache and memory.limit
 *     are set (all converted from bytes to KiB),
 *   - cpu.usage is set to cpuacct.usage divided by 10^9,
 *   - oom_kills is set to the `oom_kill` counter.
 *
 * Never rejects: if any of the files can't be read, a warning is logged and
 * `status` is left unchanged. Values that can't be parsed count as 0 --
 * parseInt/parseFloat signal bad input by returning NaN, not by throwing,
 * so this is checked explicitly.
 */
async function cgroup_stats(status) {
  // base-10 integer, or undefined if `text` doesn't start with a number
  function toInt(text: string | undefined): number | undefined {
    const n = parseInt(text ?? "", 10);
    return Number.isFinite(n) ? n : undefined;
  }

  // all "<key> <integer>" lines of memory.stat as a key -> value map
  async function getMemory(): Promise<{ [key: string]: number }> {
    const data = await readFileAsync(
      "/sys/fs/cgroup/memory/memory.stat",
      "utf8",
    );

    const stats: { [key: string]: number } = {};
    for (const line of data.split("\n")) {
      const [key, value] = line.split(" ");
      const n = toInt(value);
      // skips empty lines (e.g. after the trailing newline) and bad values
      if (key && n !== undefined) {
        stats[key] = n;
      }
    }
    return stats;
  }

  async function getCPU(): Promise<number> {
    const data = await readFileAsync(
      "/sys/fs/cgroup/cpu,cpuacct/cpuacct.usage",
      "utf8",
    );
    const usage = parseFloat(data);
    return Number.isFinite(usage) ? usage / Math.pow(10, 9) : 0.0;
  }

  async function getOOM(): Promise<number> {
    const data = await readFileAsync(
      "/sys/fs/cgroup/memory/memory.oom_control",
      "utf8",
    );
    for (const line of data.split("\n")) {
      // search string includes a trailing space, otherwise it matches 'oom_kill_disable'!
      if (startswith(line, "oom_kill ")) {
        return toInt(line.split(" ")[1]) ?? 0;
      }
    }
    return 0;
  }

  try {
    const [memory, cpu, oom] = await Promise.all([
      getMemory(),
      getCPU(),
      getOOM(),
    ]);

    const kib = 1024; // convert to kibibyte
    // total_rss includes total_rss_huge
    // Ref: https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
    status.memory.rss += (memory.total_rss ?? 0) / kib;
    status.memory.cache = (memory.total_cache ?? 0) / kib;
    status.memory.limit = (memory.hierarchical_memory_limit ?? 0) / kib;
    status.cpu.usage = cpu;
    status.oom_kills = oom;
  } catch (err) {
    L.warn(`cgroup_stats error: ${err}`);
  }
}
220

221
/**
 * Used space of the filesystem containing `path`, taken from the third
 * column of the last line of `df -BM` (with the "M" suffix stripped).
 *
 * `path` is inserted into a bash command line unquoted, so that e.g.
 * "$HOME" gets expanded by the shell -- only pass trusted, constant paths.
 *
 * Rejects if running the command fails. If the output is not a number
 * (e.g. df itself failed and the pipeline printed nothing), a warning is
 * logged and 0 is returned, so that no NaN ends up in the status.
 */
async function disk_usage(path): Promise<number> {
  return new Promise((resolve, reject) => {
    execute_code({
      command: `df -BM ${path} | tail -1 | awk '{gsub("M","");print $3}'`,
      bash: true,
      cb(err, out) {
        if (err) {
          return reject(err);
        }
        const used = parseInt(out?.stdout ?? "0", 10);
        if (Number.isFinite(used)) {
          return resolve(used);
        }
        L.warn(
          `disk_usage: unable to parse df output for ${path}: '${out?.stdout}'`,
        );
        return resolve(0);
      },
    });
  });
}
236

237
/**
 * Render the project specific metrics in the prometheus text format, based
 * on the most recently computed status. Values that are not known yet are
 * reported as 0.
 *
 * @param project_id used, together with the session_id, as label of each sample
 * @returns the metrics text, terminated by a newline
 */
export function prometheus_metrics(project_id): string {
  const P = "cocalc_project";
  const cs = current_status;
  const labels = `project_id="${project_id}",session_id="${session_id}"`;
  return (
    [
      `# HELP ${P}_bugs_total The total number of caught bugs.`,
      `# TYPE ${P}_bugs_total counter`,
      `${P}_bugs_total{${labels}} ${get_bugs_total()}`,
      `# HELP ${P}_start_time when the project/session started`,
      `# TYPE ${P}_start_time counter`,
      `${P}_start_time{${labels}} ${start_ts}`,
      `# HELP ${P}_cpu_usage_seconds`,
      `# TYPE ${P}_cpu_usage_seconds counter`,
      `${P}_cpu_usage_seconds{${labels}} ${cs.cpu?.usage ?? 0.0}`,
      `# HELP ${P}_disk_usage_mb`,
      `# TYPE ${P}_disk_usage_mb gauge`,
      `${P}_disk_usage_mb{${labels}} ${cs.disk_MB ?? 0.0}`,
      `# HELP ${P}_memory_usage_ki`,
      `# TYPE ${P}_memory_usage_ki gauge`,
      `${P}_memory_usage_ki{${labels}} ${cs.memory?.rss ?? 0.0}`,
      `# HELP ${P}_memory_limit_ki`,
      `# TYPE ${P}_memory_limit_ki gauge`,
      `${P}_memory_limit_ki{${labels}} ${cs.memory?.limit ?? 0.0}`,
      `# HELP ${P}_running_processes_total`,
      `# TYPE ${P}_running_processes_total gauge`,
      `${P}_running_processes_total{${labels}} ${cs.processes?.count ?? 0}`,
      `# HELP ${P}_oom_kills_total`,
      // no trailing whitespace here: the type is the rest of the line
      `# TYPE ${P}_oom_kills_total counter`,
      `${P}_oom_kills_total{${labels}} ${cs.oom_kills ?? 0}`,
    ].join("\n") + "\n" // makes sure the response ends with a newline!
  );
}
270

271
// called inside raw_server
272
/**
 * Register the /health and /metrics endpoints on the given server.
 * Does nothing unless IN_KUCALC is set. Called inside raw_server.
 *
 * @param raw_server the server (with an express-style `use`) to attach to
 * @param project_id passed on to prometheus_metrics for labeling
 */
export function init_health_metrics(raw_server, project_id): void {
  if (!IN_KUCALC) {
    return;
  }
  // Setup health and metrics (no url base prefix needed)
  raw_server.use("/health", function (_req, res): void {
    res.setHeader("Content-Type", "text/plain");
    res.setHeader("Cache-Control", "no-cache, no-store");
    res.send("OK");
  });

  // prometheus text format -- https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details
  raw_server.use("/metrics", async function (_req, res): Promise<void> {
    res.setHeader("Content-Type", "text/plain; version=0.0.4");
    res.setHeader("Cache-Control", "no-cache, no-store");
    try {
      const part1 = prometheus_metrics(project_id);
      const part2 = await prom_client.register.metrics();
      res.send(part1 + "\n" + part2 + "\n");
    } catch (err) {
      // A rejection inside an async handler is not caught by the server;
      // without this the request would never get a response.
      L.warn(`/metrics failed: ${err}`);
      if (!res.headersSent) {
        res.status(500).send("# error collecting metrics\n");
      }
    }
  });
}
291

292
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

Product

Resources

Company

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more, all in one place.

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.