Path: blob/master/src/packages/project/project-info/server.ts
5536 views
/*
 *  This file is part of CoCalc: Copyright © 2020–2026 Sagemath, Inc.
 *  License: MS-RSL – see LICENSE.md for details
 */

/*
Project information server, doing the heavy lifting of telling the client
about what's going on in a project.

This is an event emitter that emits a ProjectInfo object periodically when running.

One important aspect is that this avoids spawning subprocesses, which could be problematic
if there is a limit on the number of processes that can be spawned, or memory pressure, etc.
*/

import { delay } from "awaiting";
import type { DiskUsage as DF_DiskUsage } from "diskusage";
import { check as df } from "diskusage";
import { EventEmitter } from "node:events";
import { access, readFile } from "node:fs/promises";

import { ProcessStats } from "@cocalc/backend/process-stats";
import { pidToPath as terminalPidToPath } from "@cocalc/project/conat/terminal/manager";
import { getLogger } from "@cocalc/project/logger";
import { get_path_for_pid as x11_pid2path } from "@cocalc/project/x11/server";
import type {
  CGroup,
  CoCalcInfo,
  DiskUsage,
  Process,
  Processes,
  ProjectInfo,
} from "@cocalc/util/types/project-info/types";

const L = getLogger("project-info:server").debug;

/** Convert a byte count to MiB (1 MiB = 1024 * 1024 bytes). */
const bytes2MiB = (bytes: number): number => bytes / (1024 * 1024);

/**
 * Extract the given keys from a cgroup `memory.stat` style file, where each
 * line has the form "<key> <value-in-bytes>". Values are converted to MiB.
 * Lines whose key is not in `keys` are ignored.
 */
function parseMemStat(
  raw: string,
  keys: readonly string[],
): Record<string, number> {
  const stat: Record<string, number> = {};
  for (const line of raw.split("\n")) {
    const [key, val] = line.split(" ");
    if (keys.includes(key)) {
      stat[key] = bytes2MiB(parseInt(val, 10));
    }
  }
  return stat;
}

/**
 * Detect if /tmp is mounted as tmpfs (memory-based filesystem) by reading /proc/mounts.
 * Returns true if /tmp is tmpfs, false otherwise.
 */
async function isTmpMemoryBased(): Promise<boolean> {
  try {
    const mounts = await readFile("/proc/mounts", "utf8");
    // Look for lines like: "tmpfs /tmp tmpfs rw,nosuid,nodev,noexec,relatime,size=1024000k 0 0"
    const tmpfsPattern = /^\S+\s+\/tmp\s+tmpfs\s/m;
    return tmpfsPattern.test(mounts);
  } catch (error) {
    L("Failed to read /proc/mounts, assuming /tmp is disk-based:", error);
    return false; // Default to safer assumption for development environments
  }
}

/**
 * Safely read a file, returning null if the file doesn't exist.
 * Throws for other errors.
 */
async function safeReadFile(path: string): Promise<string | null> {
  try {
    return await readFile(path, "utf8");
  } catch (error: any) {
    if (error.code === "ENOENT") {
      console.warn(`safeReadFile: ${path} not found, skipping`);
      return null;
    }
    throw error;
  }
}

/**
 * Periodically collects process, cgroup and disk usage information and
 * emits it as a ProjectInfo object via the "info" event.
 *
 * Call `start()` to begin the polling loop and `stop()` (or `close()`) to
 * make it end after the current iteration. `latest()` returns the most
 * recently collected info, if any.
 */
export class ProjectInfoServer extends EventEmitter {
  private last?: ProjectInfo = undefined;
  private readonly dbg: Function;
  private running = false;
  private readonly testing: boolean;
  private delay_s: number;
  private tmpIsMemoryBased?: boolean;
  private cgroupFilesAreMissing: boolean = false;
  private processStats: ProcessStats;
  private cgroupVersion: "v1" | "v2" | "unknown" | null;

  /**
   * @param testing when true, process stats are put into testing mode and
   *   the polling loop ends after one extra collection (see `_start`).
   */
  constructor(testing = false) {
    super();
    this.delay_s = 2;
    this.testing = testing;
    this.dbg = L;
    // cgroup version will be detected lazily
    this.cgroupVersion = null;
  }

  /** Collect process information via the shared ProcessStats instance. */
  private async processes(timestamp: number) {
    return await this.processStats.processes(timestamp, "project-info");
  }

  // delta-time (in seconds) for this and the previous process information;
  // without previous information it is measured relative to timestamp 0.
  private dt(timestamp: number): number {
    return (timestamp - (this.last?.timestamp ?? 0)) / 1000;
  }

  /**
   * CPU usage rate (CPU seconds per second of wall time) since the previous
   * sample. Returns 0 if there is no previous cgroup sample or if no time
   * has passed, which avoids dividing by zero.
   */
  private cpuUsageRate(cpu_usage: number, timestamp: number): number {
    const prev = this.last?.cgroup;
    if (prev == null) {
      return 0;
    }
    const dt = this.dt(timestamp);
    return dt > 0 ? (cpu_usage - prev.cpu_usage) / dt : 0;
  }

  /** The most recently collected ProjectInfo, or undefined if there is none. */
  public latest(): ProjectInfo | undefined {
    return this.last;
  }

  // for a process we know (pid, etc.) we try to map to cocalc specific information
  private async cocalc({
    pid,
    cmdline,
  }: Pick<Process, "pid" | "cmdline">): Promise<CoCalcInfo | undefined> {
    //this.dbg("classify", { pid, exe, cmdline });
    if (pid === process.pid) {
      return { type: "project" };
    }
    // SPEED: importing @cocalc/jupyter/kernel is slow, so it MUST NOT BE DONE
    // on the top level, especially not in any code that is loaded during
    // project startup
    const { get_kernel_by_pid } = await import("@cocalc/jupyter/kernel");
    const jupyter_kernel = get_kernel_by_pid(pid);
    if (jupyter_kernel != null) {
      return { type: "jupyter", path: jupyter_kernel.get_path() };
    }
    const termpath = terminalPidToPath(pid);
    if (termpath != null) {
      return { type: "terminal", path: termpath };
    }
    const x11_path = x11_pid2path(pid);
    if (x11_path != null) {
      return { type: "x11", path: x11_path };
    }
    // SSHD: strangely, just one long string in cmdline[0]
    if (
      cmdline.length === 1 &&
      cmdline[0].startsWith("sshd:") &&
      cmdline[0].includes("-p 2222")
    ) {
      return { type: "sshd" };
    }
    // not a process we can classify
    return undefined;
  }

  private async lookupCoCalcInfo(processes: Processes) {
    // iterate over all processes keys (pid) and call this.cocalc({pid, cmdline})
    // to update the processes cocalc field
    for (const pid in processes) {
      processes[pid].cocalc = await this.cocalc({
        pid: parseInt(pid, 10),
        cmdline: processes[pid].cmdline,
      });
    }
  }

  /**
   * Detect cgroup version lazily.
   * Fine to run once, since the cgroup version won't change during the process lifetime.
   */
  private async detectCGroupVersion(): Promise<"v1" | "v2" | "unknown" | null> {
    if (this.cgroupVersion !== null) {
      return this.cgroupVersion;
    }

    try {
      // Check for v2-specific file
      await access("/sys/fs/cgroup/cgroup.controllers");
      this.cgroupVersion = "v2";
    } catch (error: any) {
      if (error.code === "ENOENT") {
        // File doesn't exist, so likely v1
        this.cgroupVersion = "v1";
      } else {
        // Other errors (e.g., permissions): treat as unknown
        console.error("Error detecting cgroup version:", error);
        this.cgroupVersion = "unknown";
      }
    }

    L(`detected cgroup version: ${this.cgroupVersion}`);
    return this.cgroupVersion;
  }

  /**
   * Collect cgroup resource usage information.
   * This is specific to running a project in a CGroup container.
   * Harald: however, even without a container this shouldn't fail … just tells
   * you what the whole system is doing, all your processes.
   * William: it's constantly failing in cocalc-docker every second, so to avoid
   * clogging logs and wasting CPU, if the files are missing once, it stops updating.
   */
  private async cgroup({
    timestamp,
  }: {
    timestamp: number;
  }): Promise<CGroup | undefined> {
    const version = await this.detectCGroupVersion();
    switch (version) {
      case "v1":
        return this.cgroupV1({ timestamp });
      case "v2":
        return this.cgroupV2({ timestamp });
      default:
        this.dbg("cgroup: unknown version, skipping");
        return undefined;
    }
  }

  /**
   * Collect cgroup v1 resource usage information.
   *
   * cgroup v1 uses separate hierarchies for different resource controllers:
   * - /sys/fs/cgroup/memory/memory.stat - memory statistics
   * - /sys/fs/cgroup/cpu,cpuacct/cpuacct.usage - CPU usage in nanoseconds
   * - /sys/fs/cgroup/memory/memory.oom_control - OOM kill information
   * - /sys/fs/cgroup/cpu,cpuacct/cpu.cfs_quota_us - CPU quota
   * - /sys/fs/cgroup/cpu,cpuacct/cpu.cfs_period_us - CPU period
   *
   * Returns undefined on any read error; if a file is missing (ENOENT),
   * all further calls return undefined without touching the filesystem.
   */
  private async cgroupV1({
    timestamp,
  }: {
    timestamp: number;
  }): Promise<CGroup | undefined> {
    if (this.cgroupFilesAreMissing) {
      return;
    }
    try {
      const [mem_stat_raw, cpu_raw, oom_raw, cfs_quota_raw, cfs_period_raw] =
        await Promise.all([
          readFile("/sys/fs/cgroup/memory/memory.stat", "utf8"),
          readFile("/sys/fs/cgroup/cpu,cpuacct/cpuacct.usage", "utf8"),
          readFile("/sys/fs/cgroup/memory/memory.oom_control", "utf8"),
          readFile("/sys/fs/cgroup/cpu,cpuacct/cpu.cfs_quota_us", "utf8"),
          readFile("/sys/fs/cgroup/cpu,cpuacct/cpu.cfs_period_us", "utf8"),
        ]);
      const mem_stat_keys = [
        "total_rss",
        "total_cache",
        "hierarchical_memory_limit",
      ];
      // cpuacct.usage is in nanoseconds → convert to seconds
      const cpu_usage = parseFloat(cpu_raw) / Math.pow(10, 9);
      const cpu_usage_rate = this.cpuUsageRate(cpu_usage, timestamp);
      const cfs_quota = parseInt(cfs_quota_raw, 10);
      const cfs_period = parseInt(cfs_period_raw, 10);
      const mem_stat = parseMemStat(mem_stat_raw, mem_stat_keys);
      // first "oom_kill <n>" line; undefined if there is no such line
      const oom_kills = oom_raw
        .split("\n")
        .filter((val) => val.startsWith("oom_kill "))
        .map((val) => parseInt(val.slice("oom_kill ".length), 10))[0];

      // Handle unlimited CPU quota (-1) correctly
      const cpu_cores_limit = cfs_quota === -1 ? -1 : cfs_quota / cfs_period;

      return {
        mem_stat,
        cpu_usage,
        cpu_usage_rate,
        cpu_cores_limit,
        oom_kills,
      };
    } catch (err: any) {
      this.dbg("cgroup v1: error", err);
      if (err.code == "ENOENT") {
        // TODO: instead of shutting this down, we could maybe do a better job
        // figuring out what the correct cgroups files are on a given system.
        // E.g., in my cocalc-docker, I do NOT have /sys/fs/cgroup/memory/memory.stat
        // but I do have /sys/fs/cgroup/memory.stat
        this.cgroupFilesAreMissing = true;
        this.dbg(
          "cgroup v1: files are missing so cgroups info will no longer be updated",
        );
      }
      return undefined;
    }
  }

  /**
   * Get the current process's cgroup path for v2.
   * Falls back to the root cgroup directory if /proc/self/cgroup can't be
   * read or has no "0::" entry.
   */
  private async getCgroupV2Path(): Promise<string> {
    try {
      const cgroupData = await readFile("/proc/self/cgroup", "utf8");
      // v2 format: "0::/path/to/cgroup"
      const match = cgroupData.match(/^0::(.+)$/m);
      if (match) {
        return `/sys/fs/cgroup${match[1]}`;
      }
    } catch (error) {
      console.warn("Failed to read /proc/self/cgroup, using root cgroup");
    }
    return "/sys/fs/cgroup";
  }

  /**
   * Get system total memory (in MiB) from /proc/meminfo as fallback.
   */
  private async getSystemTotalMemory(): Promise<number> {
    try {
      const meminfo = await safeReadFile("/proc/meminfo");
      if (meminfo) {
        const match = meminfo.match(/^MemTotal:\s+(\d+)\s+kB$/m);
        if (match) {
          return parseInt(match[1], 10) / 1024; // Convert kB to MiB
        }
      }
    } catch (error) {
      console.warn("Failed to read system memory info:", error);
    }
    return -1; // Fallback to unlimited if can't read
  }

  /**
   * Get system CPU core count from /proc/cpuinfo as fallback.
   */
  private async getSystemCpuCores(): Promise<number> {
    try {
      const cpuinfo = await safeReadFile("/proc/cpuinfo");
      if (cpuinfo) {
        const processors = cpuinfo.match(/^processor\s*:/gm);
        return processors ? processors.length : -1;
      }
    } catch (error) {
      console.warn("Failed to read system CPU info:", error);
    }
    return -1; // Fallback to unlimited if can't read
  }

  /**
   * Collect cgroup v2 resource usage information.
   *
   * cgroup v2 uses a unified hierarchy with process-specific paths:
   * - {cgroup_path}/memory.stat - comprehensive memory statistics
   * - {cgroup_path}/cpu.stat - CPU usage statistics in microseconds
   * - {cgroup_path}/memory.events - memory events including OOM kills
   * - {cgroup_path}/cpu.max - CPU limits in "quota period" format
   * - {cgroup_path}/memory.max - memory limit in bytes or "max"
   *
   * Memory stat mapping from v2 to v1 equivalent:
   * - anon: Anonymous memory (private memory, roughly equivalent to v1 total_rss)
   * - file: Page cache memory (file-backed memory)
   * - kernel: Kernel memory usage
   * - slab: Kernel slab memory (reclaimable + unreclaimable)
   * - total_cache equivalent: file + slab (approximates v1 cached memory)
   *
   * ## Testing different cgroup environments
   *
   * ### Container with limits (CoCalc production scenario):
   * ```bash
   * # Test memory and CPU limits
   * docker run --rm --memory=512m --cpus=0.5 ubuntu:24.04 sh -c "
   *   cat /proc/self/cgroup          # Shows: 0::/
   *   cat /sys/fs/cgroup/memory.max  # Shows: 536870912 (512MB in bytes)
   *   cat /sys/fs/cgroup/cpu.max     # Shows: 50000 100000 (0.5 cores)
   *   cat /sys/fs/cgroup/memory.events # Shows: low 0, high 0, max 0, oom 0, oom_kill 0, oom_group_kill 0
   * "
   * ```
   *
   * ### Container without limits:
   * ```bash
   * docker run --rm ubuntu:24.04 sh -c "
   *   cat /proc/self/cgroup          # Shows: 0::/
   *   cat /sys/fs/cgroup/memory.max  # Shows: max
   *   cat /sys/fs/cgroup/cpu.max     # Shows: max 100000
   * "
   * ```
   *
   * ### Host system (development environment):
   * ```bash
   * cat /proc/self/cgroup            # Shows: 0::/user.slice/user-1000.slice/...
   * # Files exist in /sys/fs/cgroup/user.slice/... but typically show unlimited values
   * # System fallback examples:
   * cat /proc/meminfo | head -1      # MemTotal: 32585044 kB
   * grep -c "^processor" /proc/cpuinfo  # 8 (CPU cores)
   * ```
   *
   * Expected file formats:
   * - memory.max: "536870912" (bytes) or "max" (unlimited)
   * - cpu.max: "50000 100000" (quota period) or "max 100000" (unlimited)
   * - memory.events: "low 0\nhigh 0\nmax 0\noom 0\noom_kill 0\noom_group_kill 0"
   * - cpu.stat: "usage_usec 1234567\n..." (usage in microseconds)
   * - memory.stat: "anon 12345\nfile 67890\nkernel 111\nslab 222\n..." (values in bytes)
   *
   * Individual missing files are tolerated (their values fall back to 0 or
   * to system-wide limits). If none of the files exist, collection is
   * disabled for all further calls and undefined is returned.
   */
  private async cgroupV2({
    timestamp,
  }: {
    timestamp: number;
  }): Promise<CGroup | undefined> {
    if (this.cgroupFilesAreMissing) {
      return;
    }
    try {
      const cgroupPath = await this.getCgroupV2Path();

      const [
        mem_stat_raw,
        cpu_stat_raw,
        mem_events_raw,
        cpu_max_raw,
        mem_max_raw,
      ] = await Promise.all([
        safeReadFile(`${cgroupPath}/memory.stat`),
        safeReadFile(`${cgroupPath}/cpu.stat`),
        safeReadFile(`${cgroupPath}/memory.events`),
        safeReadFile(`${cgroupPath}/cpu.max`),
        safeReadFile(`${cgroupPath}/memory.max`),
      ]);

      // safeReadFile converts ENOENT into null, so a completely missing set
      // of files never reaches the catch block below. Detect that case here,
      // otherwise we would warn about every file on every single update.
      const allMissing = [
        mem_stat_raw,
        cpu_stat_raw,
        mem_events_raw,
        cpu_max_raw,
        mem_max_raw,
      ].every((raw) => raw == null);
      if (allMissing) {
        this.cgroupFilesAreMissing = true;
        this.dbg(
          "cgroupV2: files are missing so cgroups info will no longer be updated",
        );
        return undefined;
      }

      // Parse memory.stat - extract key memory statistics
      // These keys provide the most relevant memory usage information
      const mem_stat_keys = ["anon", "file", "kernel", "slab"];
      const mem_stat: Record<string, number> = mem_stat_raw
        ? parseMemStat(mem_stat_raw, mem_stat_keys)
        : {};

      // For compatibility with v1 interface, map v2 stats to v1 equivalents.
      // `||` is intentional: it maps both a missing and a NaN value to 0.
      // - total_rss: Anonymous memory (private/process memory)
      mem_stat["total_rss"] = mem_stat["anon"] || 0;
      // - total_cache: File cache + kernel slab memory (shared/cached memory)
      mem_stat["total_cache"] =
        (mem_stat["file"] || 0) + (mem_stat["slab"] || 0);

      // - hierarchical_memory_limit: Memory limit from memory.max, with system fallback
      const mem_max_value = mem_max_raw?.trim();
      if (mem_max_value === "max" || !mem_max_value) {
        // Use system total memory as fallback when cgroup limit is unlimited
        mem_stat["hierarchical_memory_limit"] =
          await this.getSystemTotalMemory();
      } else {
        mem_stat["hierarchical_memory_limit"] = bytes2MiB(
          parseInt(mem_max_value, 10),
        );
      }

      // Parse cpu.stat - extract CPU usage in microseconds, convert to seconds
      // v2 provides usage_usec (microseconds) vs v1 which provides nanoseconds
      const cpu_usage_match = cpu_stat_raw?.match(/usage_usec (\d+)/);
      const cpu_usage = cpu_usage_match
        ? parseFloat(cpu_usage_match[1]) / 1000000
        : 0;

      // Calculate CPU usage rate
      const cpu_usage_rate = this.cpuUsageRate(cpu_usage, timestamp);

      // Parse memory.events for OOM kills
      const oom_kill_match = mem_events_raw?.match(/oom_kill (\d+)/);
      const oom_kills = oom_kill_match ? parseInt(oom_kill_match[1], 10) : 0;

      // Parse cpu.max for CPU limit, with system fallback
      // v2 format: "quota period" (e.g., "50000 100000" = 0.5 cores) or "max" for unlimited
      // v1 uses separate files: cpu.cfs_quota_us and cpu.cfs_period_us
      const [quota_raw, period_raw] = cpu_max_raw?.trim().split(/\s+/) ?? [];
      // "max", a missing field or garbage all parse to NaN → system fallback
      const quota = parseInt(quota_raw ?? "", 10);
      const period = parseInt(period_raw ?? "", 10);
      const cpu_cores_limit =
        Number.isFinite(quota) && Number.isFinite(period) && period > 0
          ? quota / period
          : // Use system CPU core count as fallback when cgroup limit is unlimited
            await this.getSystemCpuCores();

      return {
        mem_stat,
        cpu_usage,
        cpu_usage_rate,
        cpu_cores_limit,
        oom_kills,
      };
    } catch (err: any) {
      this.dbg("cgroupV2: error", err);
      if (err.code == "ENOENT") {
        // Mark files as missing to avoid repeated failed attempts
        this.cgroupFilesAreMissing = true;
        this.dbg(
          "cgroupV2: files are missing so cgroups info will no longer be updated",
        );
      }
      return undefined;
    }
  }

  // for cocalc/kucalc we want to know the disk usage + limits of the
  // users home dir and /tmp. /tmp is a ram disk, which will count against
  // the overall memory limit!
  private async disk_usage(): Promise<DiskUsage> {
    // all values are converted from bytes to MiB
    const convert = function (val: DF_DiskUsage) {
      return {
        total: bytes2MiB(val.total),
        free: bytes2MiB(val.free),
        available: bytes2MiB(val.available),
        usage: bytes2MiB(val.total - val.free),
      };
    };
    const [tmp, project] = await Promise.all([
      df("/tmp"),
      df(process.env.HOME ?? "/home/user"),
    ]);

    const tmpData = convert(tmp);

    // If /tmp is not tmpfs (memory-based), don't count its disk usage toward memory
    // since cgroup_stats adds disk_usage.tmp.usage to memory calculations
    if (this.tmpIsMemoryBased === false) {
      tmpData.usage = 0;
    }

    return { tmp: tmpData, project: convert(project) };
  }

  // orchestrating where all the information is bundled up for an update;
  // returns undefined (after logging) if collecting any part of it fails
  private async get_info(): Promise<ProjectInfo | undefined> {
    try {
      const timestamp = Date.now();
      const [processes, cgroup, disk_usage] = await Promise.all([
        this.processes(timestamp),
        this.cgroup({ timestamp }),
        this.disk_usage(),
      ]);
      const { procs, boottime, uptime } = processes;
      await this.lookupCoCalcInfo(procs);
      const info: ProjectInfo = {
        timestamp,
        processes: procs,
        uptime,
        boottime,
        cgroup,
        disk_usage,
      };
      return info;
    } catch (err) {
      this.dbg("get_info: error", err);
      return undefined;
    }
  }

  /** Ask the polling loop to end; it exits after its current iteration. */
  public stop() {
    this.running = false;
  }

  /** Alias for `stop()`. */
  close = () => {
    this.stop();
  };

  /**
   * Start the polling loop. The returned promise resolves when the loop
   * ends. If the server is already running this only logs a message.
   */
  public async start(): Promise<void> {
    if (this.running) {
      this.dbg("project-info/server: already running, cannot be started twice");
    } else {
      await this._start();
    }
  }

  private async _start(): Promise<void> {
    this.dbg("start");
    if (this.running) {
      throw Error("Cannot start ProjectInfoServer twice");
    }
    // Claim the "running" state BEFORE the first await. Otherwise a second
    // start() issued while the initialization below is still pending would
    // pass the check above and spawn a second polling loop, and a stop()
    // during initialization would be silently overwritten.
    this.running = true;

    // Initialize tmpfs detection once at startup
    this.tmpIsMemoryBased = await isTmpMemoryBased();
    this.processStats = ProcessStats.getInstance();
    if (this.testing) {
      this.processStats.setTesting(true);
    }
    await this.processStats.init();
    while (true) {
      //this.dbg(`listeners on 'info': ${this.listenerCount("info")}`);
      const info = await this.get_info();
      if (info != null) this.last = info;
      // if this round failed, re-emit the previous info (possibly undefined)
      this.emit("info", info ?? this.last);
      if (this.running) {
        await delay(1000 * this.delay_s);
      } else {
        this.dbg("start: no longer running → stopping loop");
        this.last = undefined;
        return;
      }
      // in test mode just one more, that's enough
      if (this.last != null && this.testing) {
        const again = await this.get_info();
        this.dbg(JSON.stringify(again, null, 2));
        return;
      }
    }
  }
}

// testing: $ ts-node server.ts
if (require.main === module) {
  const pis = new ProjectInfoServer(true);
  pis.start().then(() => process.exit());
}