CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
sagemathinc

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/project/kucalc.ts
Views: 687
1
/*
2
* This file is part of CoCalc: Copyright © 2023 Sagemath, Inc.
3
* License: MS-RSL – see LICENSE.md for details
4
*/
5
6
/*
7
Some code specific to running a project in the KuCalc environment.
8
*/
9
10
import { readFile as readFileAsync } from "node:fs/promises";
11
12
// Prometheus client setup -- https://github.com/siimon/prom-client
13
import prom_client from "prom-client";
14
15
import { execute_code } from "@cocalc/backend/misc_node";
16
import { callback2 as cb2 } from "@cocalc/util/async-utils";
17
import { startswith } from "@cocalc/util/misc";
18
import get_bugs_total from "./bug-counter";
19
import { session_id, start_ts } from "./consts";
20
import { getLogger } from "./logger";
21
22
// Module-level logger tagged "kucalc"; used below to emit warnings.
const L = getLogger("kucalc");
23
24
/**
 * Snapshot of a project's resource usage, as assembled by compute_status.
 */
interface Status {
  /** Value of Date.now() taken when the snapshot object was created. */
  time: number;
  /**
   * Memory figures. cgroup_stats fills these from the cgroup values divided
   * by 1024; compute_status_tmp additionally adds the usage of /tmp to rss.
   */
  memory: { limit?: number; rss?: number; cache?: number };
  /** usage is the cgroup cpuacct.usage value divided by 10^9. */
  cpu: { usage?: number };
  /** Used space of $HOME as reported by `df -BM`. */
  disk_MB: number;
  start_ts: number;
  session_id: string;
  // NOTE(review): processes_info stores a plain number under the key "count",
  // which does not match the value type declared here -- confirm and reconcile.
  processes: { [key: string]: { cpu: number; memory: number } };
  /** Value of the "oom_kill" line of the cgroup memory.oom_control file. */
  oom_kills: number;
}
34
35
// additionally, record GC statistics
// https://www.npmjs.com/package/prometheus-gc-stats
//# This is commented out because the prometheus-gc-stats package on npm
//# very explicitly says it does not support prom-client version 13,
//# which is what we have installed everywhere. Version 13 is a
//# significant breaking change from version 12, so I'm also not
//# comfortable reverting back. I think Harald force-upgraded
//# prom-client to version 13 in this commit: b31e087ea2c640f494db15b652d9d0f86e7bd8a5
// require('prometheus-gc-stats')()()
44
45
// collect some recommended default metrics
46
prom_client.collectDefaultMetrics();
47
48
// --- end prometheus setup
49
50
/**
 * Flag telling whether this project runs inside the KuCalc environment.
 * It starts out false and gets **changed** to true if a certain command
 * line flag is passed in.
 */
export let IN_KUCALC = false;

/**
 * Overwrite the IN_KUCALC flag.
 *
 * @param val - the new value of IN_KUCALC
 */
export function setInKucalc(val: boolean): void {
  IN_KUCALC = val;
}
57
58
// status information
59
// Most recent snapshot; replaced by update_project_status and read by
// prometheus_metrics. Empty until the first status computation succeeds.
let current_status: Partial<Status> = {};
60
61
/**
 * Start periodic status reporting: one update is triggered right away and
 * then another one every 30 seconds.
 *
 * @param client - handed unchanged to update_project_status
 * @returns the timer handle returned by setInterval
 */
export function init(client) {
  // TODO: could switch to faster when it's changing and slower when it isn't.
  const UPDATE_INTERVAL_MS = 30000;

  function refresh() {
    return update_project_status(client);
  }

  refresh();
  return setInterval(refresh, UPDATE_INTERVAL_MS);
}
68
69
/**
 * Compute a fresh status, remember it in current_status and submit it via
 * client.query as a "projects" query keyed by client.client_id().
 *
 * Errors from computing or submitting the status are passed to the client's
 * debug logger and are not rethrown.
 */
async function update_project_status(client) {
  const log = client.dbg("update_status");
  log();

  try {
    const status = await compute_status();
    current_status = status;
    const project_id = client.client_id();
    const query = { projects: { project_id, status } };
    await cb2(client.query, { query });
  } catch (err) {
    log(`ERROR: ${err}`);
  }
}
85
86
/**
 * Exported wrapper around the module-private compute_status.
 *
 * @returns the freshly computed status
 */
export async function test_compute_status() {
  const status = await compute_status();
  return status;
}
89
90
/**
 * Build a new status snapshot.
 *
 * A snapshot with zeroed/empty defaults is created first; the four collectors
 * are then started together and each one writes its findings into that
 * shared object. If any collector rejects, this function rejects as well.
 */
async function compute_status(): Promise<Status> {
  const snapshot: Status = {
    time: Date.now(),
    memory: { rss: 0 },
    disk_MB: 0,
    cpu: {},
    start_ts,
    session_id,
    processes: {},
    oom_kills: 0,
  };

  const collectors = [
    compute_status_disk,
    cgroup_stats,
    processes_info,
    compute_status_tmp,
  ];
  await Promise.all(collectors.map((collect) => collect(snapshot)));

  return snapshot;
}
109
110
/**
 * Store the used space of $HOME, as reported by disk_usage, in status.disk_MB.
 */
async function compute_status_disk(status) {
  status.disk_MB = await disk_usage("$HOME");
}
114
115
/**
 * Count the processes listed by `ps` and store the number in
 * status.processes.count.
 *
 * This never rejects: when `ps` fails, a warning is logged and the status
 * is left untouched.
 */
async function processes_info(status): Promise<void> {
  const columns = ["pid", "lstart", "time", "rss", "args"].join(",");

  await new Promise<void>((done) => {
    execute_code({
      command: "ps",
      args: ["--no-header", "-o", columns, "-u", "user"], // TODO user should be data.username ?
      bash: false,
      cb(err, out) {
        if (!err && out?.exit_code === 0) {
          // TODO parsing anything out of ps is really hard :-(
          // but we want to know how many sage, jupyter, console, etc. instances are running.
          const listed = out.stdout
            .split("\n")
            .filter((line: string) => line.length > 0);
          // one less, so that the ps process itself is not accounted for
          status.processes.count = listed.length - 1;
        } else {
          L.warn(`ps failed: ${err} ${out?.stderr}`);
        }
        done();
      },
    });
  });
}
142
143
// NOTE: we use tmpfs for /tmp, so RAM usage is the **sum** of /tmp and what
// processes use.
/**
 * Add the used space of /tmp to status.memory.rss.
 *
 * disk_usage runs `df -BM`, whose unit is 1,048,576 bytes, while cgroup_stats
 * accumulates memory.rss as bytes divided by 1024. The conversion factor is
 * therefore 1024 (not 1000).
 */
async function compute_status_tmp(status) {
  const KIB_PER_MIB = 1024;
  const tmp_MiB: number = await disk_usage("/tmp");
  status.memory.rss += KIB_PER_MIB * tmp_MiB;
}
149
150
// this grabs the memory stats directly from the sysfs cgroup files
// the actual usage is the sum of the rss values plus cache, but we leave cache aside
/**
 * Read memory, CPU and OOM figures from the cgroup files and write them into
 * status.memory, status.cpu.usage and status.oom_kills.
 *
 * This never rejects: if any of the files cannot be read, a warning is
 * logged and the status is left untouched. Values that do not parse as
 * numbers are treated as absent (memory) or 0 (cpu, oom) so that NaN never
 * reaches the status.
 */
async function cgroup_stats(status) {
  // Parse the "<key> <integer>" lines of memory.stat into an object.
  async function getMemory() {
    const data = await readFileAsync(
      "/sys/fs/cgroup/memory/memory.stat",
      "utf8",
    );

    const stats: {
      total_rss?: number;
      total_cache?: number;
      hierarchical_memory_limit?: number;
    } = {};

    for (const line of data.split("\n")) {
      const [key, value] = line.split(" ");
      const parsed = parseInt(value, 10);
      // parseInt does not throw; blank or malformed lines yield NaN and are skipped
      if (key && Number.isFinite(parsed)) {
        stats[key] = parsed;
      }
    }
    return stats;
  }

  // cpuacct.usage divided by 10^9; 0 if the file content is not a number.
  async function getCPU() {
    const data = await readFileAsync(
      "/sys/fs/cgroup/cpu,cpuacct/cpuacct.usage",
      "utf8",
    );

    const usage = parseFloat(data);
    return Number.isFinite(usage) ? usage / Math.pow(10, 9) : 0.0;
  }

  // Value of the "oom_kill" line of memory.oom_control; 0 if missing or malformed.
  async function getOOM() {
    const data = await readFileAsync(
      "/sys/fs/cgroup/memory/memory.oom_control",
      "utf8",
    );

    for (const line of data.split("\n")) {
      // search string includes a trailing space, otherwise it matches 'oom_kill_disable'!
      if (startswith(line, "oom_kill ")) {
        const kills = parseInt(line.split(" ")[1], 10);
        return Number.isFinite(kills) ? kills : 0;
      }
    }
    return 0;
  }

  try {
    const [memory, cpu, oom]: [{ [key: string]: number }, number, number] =
      await Promise.all([getMemory(), getCPU(), getOOM()]);

    const kib = 1024; // convert to kibibyte
    // total_rss includes total_rss_huge
    // Ref: https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
    status.memory.rss += (memory.total_rss ?? 0) / kib;
    status.memory.cache = (memory.total_cache ?? 0) / kib;
    status.memory.limit = (memory.hierarchical_memory_limit ?? 0) / kib;
    status.cpu.usage = cpu;
    status.oom_kills = oom;
  } catch (err) {
    L.warn(`cgroup_stats error: ${err}`);
  }
}
220
221
/**
 * Determine the used space of the filesystem containing `path`.
 *
 * Runs `df -BM` through bash and takes the third column of the last output
 * line with the "M" suffix stripped.
 *
 * @param path - inserted verbatim into the shell command (so "$HOME" gets
 *   expanded by bash); callers must only pass trusted values.
 * @returns the used space in df's "M" unit; 0 (plus a logged warning) if the
 *   command output could not be parsed as a number.
 * @throws rejects with the error reported by execute_code.
 */
async function disk_usage(path): Promise<number> {
  return new Promise((resolve, reject) => {
    execute_code({
      command: `df -BM ${path} | tail -1 | awk '{gsub("M","");print $3}'`,
      bash: true,
      cb(err, out) {
        if (err) {
          reject(err);
          return;
        }
        const used = parseInt(out?.stdout ?? "0", 10);
        if (Number.isNaN(used)) {
          // e.g. df itself failed inside the pipeline and awk printed nothing;
          // do not let NaN leak into the status.
          L.warn(
            `disk_usage: unable to parse df output for ${path}: ${JSON.stringify(out?.stdout)}`,
          );
          resolve(0);
          return;
        }
        resolve(used);
      },
    });
  });
}
236
237
/**
 * Render the project's metrics in the Prometheus text exposition format.
 *
 * Values come from the most recently stored status (current_status), the bug
 * counter and the session constants; fields that have not been computed yet
 * are reported as 0.
 *
 * @param project_id - inserted as the value of the project_id label
 * @returns the metrics text, terminated by a newline
 */
export function prometheus_metrics(project_id): string {
  const P = "cocalc_project";
  const cs = current_status;
  const labels = `project_id="${project_id}",session_id="${session_id}"`;

  const lines = [
    `# HELP ${P}_bugs_total The total number of caught bugs.`,
    `# TYPE ${P}_bugs_total counter`,
    `${P}_bugs_total{${labels}} ${get_bugs_total()}`,
    // NOTE(review): a start timestamp is usually exposed as a gauge -- confirm
    // before changing the declared type.
    `# HELP ${P}_start_time when the project/session started`,
    `# TYPE ${P}_start_time counter`,
    `${P}_start_time{${labels}} ${start_ts}`,
    `# HELP ${P}_cpu_usage_seconds`,
    `# TYPE ${P}_cpu_usage_seconds counter`,
    `${P}_cpu_usage_seconds{${labels}} ${cs.cpu?.usage ?? 0.0}`,
    `# HELP ${P}_disk_usage_mb`,
    `# TYPE ${P}_disk_usage_mb gauge`,
    `${P}_disk_usage_mb{${labels}} ${cs.disk_MB ?? 0.0}`,
    `# HELP ${P}_memory_usage_ki`,
    `# TYPE ${P}_memory_usage_ki gauge`,
    `${P}_memory_usage_ki{${labels}} ${cs.memory?.rss ?? 0.0}`,
    `# HELP ${P}_memory_limit_ki`,
    `# TYPE ${P}_memory_limit_ki gauge`,
    `${P}_memory_limit_ki{${labels}} ${cs.memory?.limit ?? 0.0}`,
    `# HELP ${P}_running_processes_total`,
    `# TYPE ${P}_running_processes_total gauge`,
    `${P}_running_processes_total{${labels}} ${cs.processes?.count ?? 0}`,
    `# HELP ${P}_oom_kills_total`,
    // no trailing whitespace after the type: the rest of the line is the type name
    `# TYPE ${P}_oom_kills_total counter`,
    `${P}_oom_kills_total{${labels}} ${cs.oom_kills ?? 0}`,
  ];

  return lines.join("\n") + "\n"; // makes sure the response ends with a newline!
}
270
271
// called inside raw_server
/**
 * Register the /health and /metrics endpoints on raw_server.
 * Does nothing unless IN_KUCALC is true.
 *
 * @param raw_server - object with a `use(path, handler)` method; handlers
 *   receive (req, res)
 * @param project_id - passed to prometheus_metrics for the project_id label
 */
export function init_health_metrics(raw_server, project_id): void {
  if (!IN_KUCALC) {
    return;
  }
  // Setup health and metrics (no url base prefix needed)
  raw_server.use("/health", function (_req, res): void {
    res.setHeader("Content-Type", "text/plain");
    res.setHeader("Cache-Control", "no-cache, no-store");
    res.send("OK");
  });

  // prometheus text format -- https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details
  raw_server.use("/metrics", async function (_req, res): Promise<void> {
    res.setHeader("Content-Type", "text/plain; version=0.0.4");
    res.setHeader("Cache-Control", "no-cache, no-store");
    try {
      const part1 = prometheus_metrics(project_id);
      const part2 = await prom_client.register.metrics();
      res.send(part1 + "\n" + part2 + "\n");
    } catch (err) {
      // A rejection inside an async handler would otherwise go unhandled and
      // leave the request without a response.
      L.warn(`metrics endpoint error: ${err}`);
      if (!res.headersSent) {
        res.status(500).send("error collecting metrics\n");
      }
    }
  });
}
291
292