Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemathinc
GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/project/project-info/server.ts
5536 views
1
/*
2
* This file is part of CoCalc: Copyright © 2020–2026 Sagemath, Inc.
3
* License: MS-RSL – see LICENSE.md for details
4
*/
5
6
/*
7
Project information server, doing the heavy lifting of telling the client
8
about what's going on in a project.
9
10
This is an event emitter that emits a ProjectInfo object periodically when running.
11
12
One important aspect is that this avoids spawning subprocesses, which could be problematic
13
if there is a limit on the number of processes that can be spawned, or memory pressure, etc.
14
*/
15
16
import { delay } from "awaiting";
17
import type { DiskUsage as DF_DiskUsage } from "diskusage";
18
import { check as df } from "diskusage";
19
import { EventEmitter } from "node:events";
20
import { access, readFile } from "node:fs/promises";
21
22
import { ProcessStats } from "@cocalc/backend/process-stats";
23
import { pidToPath as terminalPidToPath } from "@cocalc/project/conat/terminal/manager";
24
import { getLogger } from "@cocalc/project/logger";
25
import { get_path_for_pid as x11_pid2path } from "@cocalc/project/x11/server";
26
import type {
27
CGroup,
28
CoCalcInfo,
29
DiskUsage,
30
Process,
31
Processes,
32
ProjectInfo,
33
} from "@cocalc/util/types/project-info/types";
34
35
// Debug-level log function of this module's logger ("project-info:server").
const { debug: L } = getLogger("project-info:server");
36
37
/**
 * Convert a byte count into mebibytes (1 MiB = 1024 * 1024 bytes).
 *
 * @param bytes - size in bytes
 * @returns the same size expressed in MiB (not rounded)
 */
const bytes2MiB = (bytes: number): number => bytes / (1024 * 1024);
38
39
/**
 * Report whether /tmp is memory-backed, i.e. mounted as tmpfs.
 *
 * The check scans /proc/mounts for an entry whose mount point is /tmp and
 * whose filesystem type is tmpfs. If /proc/mounts cannot be read, the problem
 * is logged and false is returned, i.e. /tmp is then treated as disk-based
 * (the safer assumption for development environments).
 */
async function isTmpMemoryBased(): Promise<boolean> {
  let mountTable: string;
  try {
    mountTable = await readFile("/proc/mounts", "utf8");
  } catch (error) {
    L("Failed to read /proc/mounts, assuming /tmp is disk-based:", error);
    return false;
  }
  // A matching line looks like:
  // "tmpfs /tmp tmpfs rw,nosuid,nodev,noexec,relatime,size=1024000k 0 0"
  return /^\S+\s+\/tmp\s+tmpfs\s/m.test(mountTable);
}
54
55
/** Paths safeReadFile already reported as missing; each one is warned about only once. */
const safeReadFileWarned = new Set<string>();

/**
 * Read a UTF-8 text file, treating a missing file as a normal outcome.
 *
 * @param path - file to read
 * @returns the file content, or null if the file does not exist (ENOENT)
 * @throws every other read error (e.g. EACCES, EISDIR) is rethrown unchanged
 */
async function safeReadFile(path: string): Promise<string | null> {
  try {
    return await readFile(path, "utf8");
  } catch (error: unknown) {
    // Narrow instead of trusting the shape of the thrown value.
    const code =
      typeof error === "object" && error !== null && "code" in error
        ? (error as { code?: unknown }).code
        : undefined;
    if (code !== "ENOENT") {
      throw error;
    }
    // This helper is called on every update cycle for files that may never
    // exist on a given system; warn only the first time per path so the logs
    // are not flooded with the same message.
    if (!safeReadFileWarned.has(path)) {
      safeReadFileWarned.add(path);
      console.warn(`safeReadFile: ${path} not found, skipping`);
    }
    return null;
  }
}
70
71
export class ProjectInfoServer extends EventEmitter {
72
// Most recent successfully collected snapshot. It is undefined until the
// first update succeeded and is reset once the update loop has been stopped.
private last?: ProjectInfo = undefined;
// Debug logger; typed after the module logger instead of the unchecked `Function` type.
private readonly dbg: typeof L = L;
// True while the update loop is meant to keep going; stop() clears it.
private running = false;
// Enables the single-shot test mode of the update loop.
private readonly testing: boolean;
// Pause between two updates, in seconds.
private delay_s = 2;
// Whether /tmp is tmpfs; determined when the server is started.
private tmpIsMemoryBased?: boolean;
// Set once a cgroup read failed with ENOENT; further cgroup reads are skipped then.
private cgroupFilesAreMissing = false;
// Assigned in _start() rather than in the constructor, hence the definite-assignment assertion.
private processStats!: ProcessStats;
// The cgroup version is detected lazily; null means "not detected yet".
private cgroupVersion: "v1" | "v2" | "unknown" | null = null;

/**
 * @param testing - if true, ProcessStats is switched to testing mode on start
 *   and the update loop ends after one additional update.
 */
constructor(testing = false) {
  super();
  this.testing = testing;
}
90
91
// Process snapshot for the given timestamp, delegated to the ProcessStats
// instance (which is only available after the server has been started).
private async processes(timestamp: number) {
  const stats = await this.processStats.processes(timestamp, "project-info");
  return stats;
}
94
95
/**
 * Time between the previous snapshot and `timestamp`.
 *
 * @param timestamp - time of the current update, in milliseconds
 * @returns elapsed seconds; without a previous snapshot the previous
 *   timestamp counts as 0, so the value is then `timestamp / 1000`.
 */
private dt(timestamp: number): number {
  const previous = this.last?.timestamp ?? 0;
  return (timestamp - previous) / 1000;
}
99
100
// The most recent snapshot, or undefined if there is none (yet).
public latest(): ProjectInfo | undefined {
  const { last } = this;
  return last;
}
103
104
/**
 * Map a known process (pid, command line) to CoCalc-specific information.
 *
 * The checks run in this order: the project server process itself, a Jupyter
 * kernel, a terminal, an X11 session, and an sshd process whose command line
 * contains "-p 2222". Resolves to undefined if none of them applies.
 */
private async cocalc({
  pid,
  cmdline,
}: Pick<Process, "pid" | "cmdline">): Promise<CoCalcInfo | undefined> {
  if (pid === process.pid) {
    return { type: "project" };
  }

  // SPEED: importing @cocalc/jupyter/kernel is slow, so it MUST NOT BE DONE
  // on the top level, especially not in any code that is loaded during
  // project startup
  const jupyter = await import("@cocalc/jupyter/kernel");
  const kernel = jupyter.get_kernel_by_pid(pid);
  if (kernel != null) {
    return { type: "jupyter", path: kernel.get_path() };
  }

  const terminalPath = terminalPidToPath(pid);
  if (terminalPath != null) {
    return { type: "terminal", path: terminalPath };
  }

  const x11Path = x11_pid2path(pid);
  if (x11Path != null) {
    return { type: "x11", path: x11Path };
  }

  // SSHD: strangely, just one long string in cmdline[0]
  const isSshd =
    cmdline.length === 1 &&
    cmdline[0].startsWith("sshd:") &&
    cmdline[0].includes("-p 2222");
  return isSshd ? { type: "sshd" } : undefined;
}
138
139
// Fill in the `cocalc` field of every process: the keys of `processes` are
// the pids, and each entry is classified one after the other via this.cocalc.
private async lookupCoCalcInfo(processes: Processes) {
  for (const [pid, proc] of Object.entries(processes)) {
    proc.cocalc = await this.cocalc({
      pid: parseInt(pid),
      cmdline: proc.cmdline,
    });
  }
}
149
150
/**
 * Determine the cgroup version on first use and cache the answer.
 *
 * Detecting once is enough, since the cgroup version won't change during the
 * process lifetime. The presence of /sys/fs/cgroup/cgroup.controllers means
 * "v2"; if that file does not exist, "v1" is assumed; any other error while
 * checking (e.g. permissions) is reported and yields "unknown".
 */
private async detectCGroupVersion(): Promise<"v1" | "v2" | "unknown" | null> {
  const cached = this.cgroupVersion;
  if (cached !== null) {
    return cached;
  }

  let detected: "v1" | "v2" | "unknown";
  try {
    // this file only exists with cgroup v2
    await access("/sys/fs/cgroup/cgroup.controllers");
    detected = "v2";
  } catch (error: any) {
    if (error.code !== "ENOENT") {
      console.error("Error detecting cgroup version:", error);
      detected = "unknown";
    } else {
      detected = "v1";
    }
  }

  this.cgroupVersion = detected;
  L(`detected cgroup version: ${detected}`);
  return detected;
}
177
178
/**
 * Collect cgroup resource usage, dispatching on the detected cgroup version.
 *
 * This is specific to running a project in a CGroup container.
 * Note (Harald): even without a container this shouldn't fail … it then tells
 * you what the whole system is doing, all your processes.
 * Note (William): it was constantly failing in cocalc-docker every second, so
 * to avoid clogging logs and wasting CPU, the version-specific collectors stop
 * updating once the files turned out to be missing.
 *
 * Resolves to undefined if the version is neither "v1" nor "v2".
 */
private async cgroup({ timestamp }): Promise<CGroup | undefined> {
  const version = await this.detectCGroupVersion();
  if (version === "v1") {
    return this.cgroupV1({ timestamp });
  }
  if (version === "v2") {
    return this.cgroupV2({ timestamp });
  }
  this.dbg("cgroup: unknown version, skipping");
  return undefined;
}
198
199
/**
 * Collect cgroup v1 resource usage information.
 *
 * cgroup v1 uses separate hierarchies for different resource controllers:
 * - /sys/fs/cgroup/memory/memory.stat - memory statistics
 * - /sys/fs/cgroup/cpu,cpuacct/cpuacct.usage - CPU usage in nanoseconds
 * - /sys/fs/cgroup/memory/memory.oom_control - OOM kill information
 * - /sys/fs/cgroup/cpu,cpuacct/cpu.cfs_quota_us - CPU quota
 * - /sys/fs/cgroup/cpu,cpuacct/cpu.cfs_period_us - CPU period
 *
 * @param timestamp - time of the current update in milliseconds; used to
 *   derive the CPU usage rate relative to the previous snapshot
 * @returns the usage information, or undefined if reading failed. After an
 *   ENOENT failure the files are considered missing and all later calls
 *   return undefined without touching the filesystem.
 */
private async cgroupV1({
  timestamp,
}: {
  timestamp: number;
}): Promise<CGroup | undefined> {
  if (this.cgroupFilesAreMissing) {
    return;
  }
  try {
    const [mem_stat_raw, cpu_raw, oom_raw, cfs_quota_raw, cfs_period_raw] =
      await Promise.all([
        readFile("/sys/fs/cgroup/memory/memory.stat", "utf8"),
        readFile("/sys/fs/cgroup/cpu,cpuacct/cpuacct.usage", "utf8"),
        readFile("/sys/fs/cgroup/memory/memory.oom_control", "utf8"),
        readFile("/sys/fs/cgroup/cpu,cpuacct/cpu.cfs_quota_us", "utf8"),
        readFile("/sys/fs/cgroup/cpu,cpuacct/cpu.cfs_period_us", "utf8"),
      ]);

    // memory.stat has one "<key> <bytes>" pair per line; keep the selected
    // keys, converted to MiB.
    const mem_stat_keys = [
      "total_rss",
      "total_cache",
      "hierarchical_memory_limit",
    ];
    const mem_stat: Record<string, number> = {};
    for (const line of mem_stat_raw.split("\n")) {
      const [key, val] = line.split(" ");
      if (mem_stat_keys.includes(key)) {
        mem_stat[key] = bytes2MiB(parseInt(val, 10));
      }
    }

    // cpuacct.usage is in nanoseconds → seconds
    const cpu_usage = parseFloat(cpu_raw) / 1e9;
    const dt = this.dt(timestamp);
    const last_cpu_usage = this.last?.cgroup?.cpu_usage;
    // No rate without a previous sample; dt > 0 avoids a division by zero.
    const cpu_usage_rate =
      last_cpu_usage != null && dt > 0 ? (cpu_usage - last_cpu_usage) / dt : 0;

    // memory.oom_control only has an "oom_kill <n>" line on newer kernels;
    // report 0 instead of undefined when it is absent (same default as v2).
    const oom_kill_match = oom_raw.match(/^oom_kill (\d+)/m);
    const oom_kills = oom_kill_match ? parseInt(oom_kill_match[1], 10) : 0;

    // A quota of -1 means "unlimited", reported as -1; unparsable values are
    // treated the same way instead of producing NaN.
    const cfs_quota = parseInt(cfs_quota_raw, 10);
    const cfs_period = parseInt(cfs_period_raw, 10);
    const cpu_cores_limit =
      cfs_quota > 0 && cfs_period > 0 ? cfs_quota / cfs_period : -1;

    return {
      mem_stat,
      cpu_usage,
      cpu_usage_rate,
      cpu_cores_limit,
      oom_kills,
    };
  } catch (err) {
    this.dbg("cgroup v1: error", err);
    const code = (err as { code?: unknown } | null | undefined)?.code;
    if (code === "ENOENT") {
      // TODO: instead of shutting this down, we could maybe do a better job
      // figuring out what the correct cgroups files are on a given system.
      // E.g., in my cocalc-docker, I do NOT have /sys/fs/cgroup/memory/memory.stat
      // but I do have /sys/fs/cgroup/memory.stat
      this.cgroupFilesAreMissing = true;
      this.dbg(
        "cgroup v1: files are missing so cgroups info will no longer be updated",
      );
    }
    return undefined;
  }
}
275
276
/**
 * Directory below /sys/fs/cgroup that belongs to the current process (v2).
 *
 * The relative path is taken from the "0::<path>" line of /proc/self/cgroup.
 * If that file cannot be read or has no such line, the root cgroup directory
 * /sys/fs/cgroup is used.
 */
private async getCgroupV2Path(): Promise<string> {
  const root = "/sys/fs/cgroup";
  let cgroupData: string;
  try {
    cgroupData = await readFile("/proc/self/cgroup", "utf8");
  } catch (error) {
    console.warn("Failed to read /proc/self/cgroup, using root cgroup");
    return root;
  }
  // v2 format: "0::/path/to/cgroup"
  const match = /^0::(.+)$/m.exec(cgroupData);
  return match ? `${root}${match[1]}` : root;
}
292
293
/**
 * Total system memory in MiB, taken from the MemTotal line of /proc/meminfo.
 * Used as a fallback; resolves to -1 if the value cannot be determined.
 */
private async getSystemTotalMemory(): Promise<number> {
  try {
    const meminfo = await safeReadFile("/proc/meminfo");
    const total_kb = meminfo?.match(/^MemTotal:\s+(\d+)\s+kB$/m)?.[1];
    if (total_kb != null) {
      // kB → MiB
      return parseInt(total_kb) / 1024;
    }
  } catch (error) {
    console.warn("Failed to read system memory info:", error);
  }
  // not available: -1 stands for "unlimited"
  return -1;
}
310
311
/**
 * Number of CPU cores of the system, counted as the "processor :" lines in
 * /proc/cpuinfo. Used as a fallback; resolves to -1 if it cannot be determined.
 */
private async getSystemCpuCores(): Promise<number> {
  let cpuinfo: string | null;
  try {
    cpuinfo = await safeReadFile("/proc/cpuinfo");
  } catch (error) {
    console.warn("Failed to read system CPU info:", error);
    return -1;
  }
  if (!cpuinfo) {
    // file missing or empty: -1 stands for "unlimited"
    return -1;
  }
  const count = cpuinfo.match(/^processor\s*:/gm)?.length;
  return count ?? -1;
}
326
327
/**
 * Collect cgroup v2 resource usage information.
 *
 * cgroup v2 uses a unified hierarchy with process-specific paths:
 * - {cgroup_path}/memory.stat - comprehensive memory statistics
 * - {cgroup_path}/cpu.stat - CPU usage statistics in microseconds
 * - {cgroup_path}/memory.events - memory events including OOM kills
 * - {cgroup_path}/cpu.max - CPU limits in "quota period" format
 * - {cgroup_path}/memory.max - memory limit in bytes or "max"
 *
 * Memory stat mapping from v2 to v1 equivalent:
 * - anon: Anonymous memory (private memory, roughly equivalent to v1 total_rss)
 * - file: Page cache memory (file-backed memory)
 * - kernel: Kernel memory usage
 * - slab: Kernel slab memory (reclaimable + unreclaimable)
 * - total_cache equivalent: file + slab (approximates v1 cached memory)
 *
 * A single missing file is tolerated (its values fall back to 0 or to the
 * system-wide limits). If none of the five files exists, the cgroup files
 * are marked as missing and this and all later calls return undefined.
 *
 * ## Testing different cgroup environments
 *
 * ### Container with limits (CoCalc production scenario):
 * ```bash
 * # Test memory and CPU limits
 * docker run --rm --memory=512m --cpus=0.5 ubuntu:24.04 sh -c "
 *   cat /proc/self/cgroup            # Shows: 0::/
 *   cat /sys/fs/cgroup/memory.max    # Shows: 536870912 (512MB in bytes)
 *   cat /sys/fs/cgroup/cpu.max       # Shows: 50000 100000 (0.5 cores)
 *   cat /sys/fs/cgroup/memory.events # Shows: low 0, high 0, max 0, oom 0, oom_kill 0, oom_group_kill 0
 * "
 * ```
 *
 * ### Container without limits:
 * ```bash
 * docker run --rm ubuntu:24.04 sh -c "
 *   cat /proc/self/cgroup            # Shows: 0::/
 *   cat /sys/fs/cgroup/memory.max    # Shows: max
 *   cat /sys/fs/cgroup/cpu.max       # Shows: max 100000
 * "
 * ```
 *
 * ### Host system (development environment):
 * ```bash
 * cat /proc/self/cgroup              # Shows: 0::/user.slice/user-1000.slice/...
 * # Files exist in /sys/fs/cgroup/user.slice/... but typically show unlimited values
 * # System fallback examples:
 * cat /proc/meminfo | head -1        # MemTotal: 32585044 kB
 * grep -c "^processor" /proc/cpuinfo # 8 (CPU cores)
 * ```
 *
 * Expected file formats:
 * - memory.max: "536870912" (bytes) or "max" (unlimited)
 * - cpu.max: "50000 100000" (quota period) or "max 100000" (unlimited)
 * - memory.events: "low 0\nhigh 0\nmax 0\noom 0\noom_kill 0\noom_group_kill 0"
 * - cpu.stat: "usage_usec 1234567\n..." (usage in microseconds)
 * - memory.stat: "anon 12345\nfile 67890\nkernel 111\nslab 222\n..." (values in bytes)
 *
 * @param timestamp - time of the current update in milliseconds; used to
 *   derive the CPU usage rate relative to the previous snapshot
 * @returns the usage information mapped to the v1-style CGroup fields, or
 *   undefined if the files are missing or an unexpected error occurred
 */
private async cgroupV2({
  timestamp,
}: {
  timestamp: number;
}): Promise<CGroup | undefined> {
  if (this.cgroupFilesAreMissing) {
    return;
  }
  try {
    const cgroupPath = await this.getCgroupV2Path();

    const raw_files = await Promise.all([
      safeReadFile(`${cgroupPath}/memory.stat`),
      safeReadFile(`${cgroupPath}/cpu.stat`),
      safeReadFile(`${cgroupPath}/memory.events`),
      safeReadFile(`${cgroupPath}/cpu.max`),
      safeReadFile(`${cgroupPath}/memory.max`),
    ]);

    // safeReadFile turns ENOENT into null, so missing files never reach the
    // catch block below. Detect "nothing is there at all" explicitly, so that
    // the collector shuts down instead of reporting made-up zeros forever.
    if (raw_files.every((content) => content === null)) {
      this.cgroupFilesAreMissing = true;
      this.dbg(
        "cgroupV2: files are missing so cgroups info will no longer be updated",
      );
      return undefined;
    }
    const [
      mem_stat_raw,
      cpu_stat_raw,
      mem_events_raw,
      cpu_max_raw,
      mem_max_raw,
    ] = raw_files;

    // Parse memory.stat - one "<key> <bytes>" pair per line; keep the keys
    // that provide the most relevant memory usage information, in MiB.
    const mem_stat_keys = ["anon", "file", "kernel", "slab"];
    const mem_stat: Record<string, number> = {};
    for (const line of (mem_stat_raw ?? "").split("\n")) {
      const [key, val] = line.split(" ");
      if (mem_stat_keys.includes(key)) {
        mem_stat[key] = bytes2MiB(parseInt(val, 10));
      }
    }

    // For compatibility with v1 interface, map v2 stats to v1 equivalents.
    // `||` (not `??`) on purpose: it also replaces NaN from unparsable values.
    // - total_rss: Anonymous memory (private/process memory)
    mem_stat["total_rss"] = mem_stat["anon"] || 0;
    // - total_cache: File cache + kernel slab memory (shared/cached memory)
    mem_stat["total_cache"] =
      (mem_stat["file"] || 0) + (mem_stat["slab"] || 0);

    // - hierarchical_memory_limit: Memory limit from memory.max, with system fallback
    const mem_max_value = mem_max_raw?.trim();
    if (mem_max_value === "max" || !mem_max_value) {
      // Use system total memory as fallback when cgroup limit is unlimited
      mem_stat["hierarchical_memory_limit"] =
        await this.getSystemTotalMemory();
    } else {
      mem_stat["hierarchical_memory_limit"] = bytes2MiB(
        parseInt(mem_max_value, 10),
      );
    }

    // Parse cpu.stat - extract CPU usage in microseconds, convert to seconds
    // v2 provides usage_usec (microseconds) vs v1 which provides nanoseconds
    const cpu_usage_match = cpu_stat_raw?.match(/^usage_usec (\d+)/m);
    const cpu_usage = cpu_usage_match
      ? parseFloat(cpu_usage_match[1]) / 1000000
      : 0;

    // Calculate CPU usage rate: needs a previous sample, and dt > 0 avoids a
    // division by zero.
    const dt = this.dt(timestamp);
    const last_cpu_usage = this.last?.cgroup?.cpu_usage;
    const cpu_usage_rate =
      last_cpu_usage != null && dt > 0 ? (cpu_usage - last_cpu_usage) / dt : 0;

    // Parse memory.events for OOM kills
    const oom_kill_match = mem_events_raw?.match(/^oom_kill (\d+)/m);
    const oom_kills = oom_kill_match ? parseInt(oom_kill_match[1], 10) : 0;

    // Parse cpu.max for CPU limit, with system fallback
    // v2 format: "quota period" (e.g., "50000 100000" = 0.5 cores) or "max" for unlimited
    // v1 uses separate files: cpu.cfs_quota_us and cpu.cfs_period_us
    const cpu_max_parts = cpu_max_raw?.trim().split(" ") ?? [];
    const quota = parseInt(cpu_max_parts[0] ?? "", 10);
    const period = parseInt(cpu_max_parts[1] ?? "", 10);
    let cpu_cores_limit: number;
    if (quota > 0 && period > 0) {
      cpu_cores_limit = quota / period;
    } else {
      // "max", a missing file or unparsable content: use the system CPU core
      // count as fallback (which itself is -1 if it cannot be determined)
      cpu_cores_limit = await this.getSystemCpuCores();
    }

    return {
      mem_stat,
      cpu_usage,
      cpu_usage_rate,
      cpu_cores_limit,
      oom_kills,
    };
  } catch (err) {
    this.dbg("cgroupV2: error", err);
    const code = (err as { code?: unknown } | null | undefined)?.code;
    if (code === "ENOENT") {
      // Mark files as missing to avoid repeated failed attempts
      this.cgroupFilesAreMissing = true;
      this.dbg(
        "cgroupV2: files are missing so cgroups info will no longer be updated",
      );
    }
    return undefined;
  }
}
491
492
/**
 * Disk usage and limits (in MiB) of /tmp and of the user's home directory,
 * which is $HOME or "/home/user" if HOME is not set.
 *
 * For cocalc/kucalc, /tmp is a ram disk, which counts against the overall
 * memory limit. If /tmp was detected as not being tmpfs, its usage is
 * reported as 0, so that it is not added to the memory usage.
 */
private async disk_usage(): Promise<DiskUsage> {
  const toMiB = ({ total, free, available }: DF_DiskUsage) => ({
    total: bytes2MiB(total),
    free: bytes2MiB(free),
    available: bytes2MiB(available),
    usage: bytes2MiB(total - free),
  });

  const home = process.env.HOME ?? "/home/user";
  const [tmpRaw, projectRaw] = await Promise.all([df("/tmp"), df(home)]);

  const tmp = toMiB(tmpRaw);
  // Only an explicit `false` zeroes the usage; "not determined yet"
  // (undefined) leaves the value untouched.
  if (this.tmpIsMemoryBased === false) {
    tmp.usage = 0;
  }

  return { tmp, project: toMiB(projectRaw) };
}
519
520
/**
 * Orchestrates one update: processes, cgroup and disk usage are collected
 * concurrently, then the processes get their CoCalc-specific classification
 * and everything is bundled into a ProjectInfo snapshot.
 *
 * Any error is logged and results in undefined.
 */
private async get_info(): Promise<ProjectInfo | undefined> {
  try {
    const timestamp = Date.now();
    const [{ procs, boottime, uptime }, cgroup, disk_usage] =
      await Promise.all([
        this.processes(timestamp),
        this.cgroup({ timestamp }),
        this.disk_usage(),
      ]);
    await this.lookupCoCalcInfo(procs);
    return {
      timestamp,
      processes: procs,
      uptime,
      boottime,
      cgroup,
      disk_usage,
    };
  } catch (err) {
    this.dbg("get_info: error", err);
    return undefined;
  }
}
544
545
// Ask the update loop to end; the loop notices the cleared flag after its
// next update.
public stop(): void {
  this.running = false;
}

// Alias of stop(); an arrow-function property, so it stays bound to this instance.
close = (): void => this.stop();
552
553
// Start the update loop, unless it is already running (that is only logged).
// The returned promise settles when the loop has ended.
public async start(): Promise<void> {
  if (!this.running) {
    await this._start();
    return;
  }
  this.dbg("project-info/server: already running, cannot be started twice");
}
560
561
/**
 * Initialize the server and run the update loop until stop() is called.
 *
 * Each cycle collects a snapshot, remembers it if collecting succeeded,
 * emits it as "info" (falling back to the previous snapshot) and then waits
 * delay_s seconds. In testing mode the loop ends after one more update.
 *
 * @throws if the server is already running, or if initialization fails
 */
private async _start(): Promise<void> {
  this.dbg("start");
  if (this.running) {
    throw Error("Cannot start ProjectInfoServer twice");
  }
  // Claim the flag synchronously, before the first await: otherwise a second
  // start() issued during initialization would pass the check above as well,
  // and a stop() issued during initialization would be overwritten.
  this.running = true;

  try {
    // Initialize tmpfs detection once at startup
    this.tmpIsMemoryBased = await isTmpMemoryBased();
    this.processStats = ProcessStats.getInstance();
    if (this.testing) {
      this.processStats.setTesting(true);
    }
    await this.processStats.init();
  } catch (err) {
    // not running after a failed initialization, so start() can be retried
    this.running = false;
    throw err;
  }

  while (true) {
    //this.dbg(`listeners on 'info': ${this.listenerCount("info")}`);
    const info = await this.get_info();
    if (info != null) this.last = info;
    this.emit("info", info ?? this.last);
    if (this.running) {
      await delay(1000 * this.delay_s);
    } else {
      this.dbg("start: no longer running → stopping loop");
      this.last = undefined;
      return;
    }
    // in test mode just one more, that's enough
    if (this.last != null && this.testing) {
      const info = await this.get_info();
      this.dbg(JSON.stringify(info, null, 2));
      // the loop ends here, so the server is no longer running
      this.running = false;
      return;
    }
  }
}
595
}
596
597
// testing: $ ts-node server.ts
// When executed directly, run the server in testing mode and exit the process
// as soon as its update loop has ended.
if (require.main === module) {
  const server = new ProjectInfoServer(true);
  server.start().then(() => {
    process.exit();
  });
}
602
603