Path: blob/master/src/packages/backend/process-stats.ts
5767 views
/*
 *  This file is part of CoCalc: Copyright © 2020–2026 Sagemath, Inc.
 *  License: MS-RSL – see LICENSE.md for details
 */

import { exec as cp_exec } from "node:child_process";
import { readFile, readdir, readlink } from "node:fs/promises";
import { join } from "node:path";
import { promisify } from "node:util";

import { mapParallelLimit } from "@cocalc/util/async-utils";
import { reuseInFlight } from "@cocalc/util/reuse-in-flight";
import {
  Cpu,
  Process,
  Processes,
  Stat,
  State,
} from "@cocalc/util/types/project-info/types";
import { getLogger } from "./logger";
import { envToInt } from "./misc/env-to-number";

const dbg = getLogger("process-stats").debug;

const exec = promisify(cp_exec);

/**
 * Return information about all processes (up to a limit or filter) in the environment, where this node.js process runs.
 * This has been refactored out of project/project-info/server.ts.
 * It is also used by the backend itself in "execute-code.ts" – to gather info about a spawned async process.
 */

// this is a hard limit on the number of processes we gather, just to
// be on the safe side to avoid processing too much data.
const LIMIT = envToInt("COCALC_PROJECT_INFO_PROC_LIMIT", 1024);

// matches the purely numeric entries in /proc, i.e. the per-process directories
const PID_DIR = /^[0-9]+$/;

/**
 * Singleton that samples /proc and reports per-process statistics
 * (cpu time, cpu percentage relative to the previous sample, memory, …).
 *
 * Use `ProcessStats.getInstance()` to obtain the instance.
 */
export class ProcessStats {
  private static instance: ProcessStats;

  // maximum number of processes inspected per call to `processes()`
  private readonly procLimit: number;

  // if true, failures to read a single process are logged
  private testing: boolean = false;
  // kernel clock ticks per second (getconf CLK_TCK), set by `init()`
  private ticks: number;
  // memory page size in bytes (getconf PAGESIZE), set by `init()`
  private pagesize: number;
  // previous sample for each sample key, used to derive cpu percentages
  private lastByKey = new Map<
    string,
    { timestamp: number; processes: Processes }
  >();

  private constructor() {
    this.procLimit = LIMIT;
    // Start fetching the kernel constants right away. A failure here must not
    // become an unhandled rejection: it is only logged, and `processes()`
    // awaits `init()` again, so the error surfaces to the actual caller.
    this.init().catch((err) => {
      dbg(`initial lookup of kernel constants failed – ${err}`);
    });
  }

  /** Return the shared instance, creating it on first use. */
  public static getInstance(): ProcessStats {
    if (!ProcessStats.instance) {
      ProcessStats.instance = new ProcessStats();
    }
    return ProcessStats.instance;
  }

  /** Enable or disable logging of per-process read failures. */
  public setTesting(testing: boolean): void {
    this.testing = testing;
  }

  // this grabs some kernel configuration values we need. they won't change
  // Does nothing if `ticks` is already set.
  public init = reuseInFlight(async () => {
    if (this.ticks == null) {
      const [p_ticks, p_pagesize] = await Promise.all([
        exec("getconf CLK_TCK"),
        exec("getconf PAGESIZE"),
      ]);
      // should be 100, usually
      this.ticks = parseInt(p_ticks.stdout.trim(), 10);
      // 4096?
      this.pagesize = parseInt(p_pagesize.stdout.trim(), 10);
    }
  });

  // the "stat" file contains all the information
  // this page explains what is what
  // https://man7.org/linux/man-pages/man5/proc.5.html
  private async stat(path: string): Promise<Stat> {
    // all time-values are in seconds
    const raw = await readFile(path, "utf8");
    // the "comm" field could contain additional spaces or parentheses,
    // hence we cut at the first "(" and the last ")"
    const [i, j] = [raw.indexOf("("), raw.lastIndexOf(")")];
    const start = raw.slice(0, i - 1).trim();
    const end = raw.slice(j + 1).trim();
    const data = `${start} comm ${end}`.split(" ");
    const get = (idx: number): number => parseInt(data[idx], 10);
    // "comm" is now a placeholder to keep indices as they are.
    // don't forget to account for 0 vs. 1 based indexing.
    const ret = {
      ppid: get(3),
      state: data[2] as State,
      utime: get(13) / this.ticks, // CPU time spent in user code, measured in clock ticks (#14)
      stime: get(14) / this.ticks, // CPU time spent in kernel code, measured in clock ticks (#15)
      cutime: get(15) / this.ticks, // Waited-for children's CPU time spent in user code (in clock ticks) (#16)
      cstime: get(16) / this.ticks, // Waited-for children's CPU time spent in kernel code (in clock ticks) (#17)
      starttime: get(21) / this.ticks, // Time when the process started, measured in clock ticks (#22)
      nice: get(18),
      num_threads: get(19),
      mem: { rss: (get(23) * this.pagesize) / (1024 * 1024) }, // MiB
    };
    return ret;
  }

  // delta-time in seconds between this and the previous process information;
  // timestamps are in milliseconds
  private dt(timestamp: number, lastTimestamp?: number): number {
    return (timestamp - (lastTimestamp ?? 0)) / 1000;
  }

  // calculate cpu times
  private cpu({
    pid,
    stat,
    timestamp,
    lastProcesses,
    lastTimestamp,
  }: {
    pid: number;
    stat: Stat;
    timestamp: number;
    lastProcesses?: Processes;
    lastTimestamp?: number;
  }): Cpu {
    // we are interested in that processes total usage: user + system
    const total_cpu = stat.utime + stat.stime;
    // the fallback is chosen in such a way, that it says 0% if we do not have historic data
    const prev_cpu = lastProcesses?.[pid]?.cpu.secs ?? total_cpu;
    const dt = this.dt(timestamp, lastTimestamp);
    // how much cpu time was used since last time we checked this process…
    // If the pid was recycled between two samples, the new process may have
    // accumulated less cpu time than the old one: never report a negative usage.
    const pct = dt > 0 ? Math.max(0, 100 * ((total_cpu - prev_cpu) / dt)) : 0;
    return { pct: pct, secs: total_cpu };
  }

  // read the command line of a process as a list of arguments
  private async cmdline(path: string): Promise<string[]> {
    // we split at the null-delimiter and filter all empty elements
    return (await readFile(path, "utf8"))
      .split("\0")
      .filter((c) => c.length > 0);
  }

  // this gathers all the information for a specific process with the given pid
  // It rejects if any of the reads fails, e.g. because the process vanished.
  private async process({
    pid: pid_str,
    uptime,
    timestamp,
    lastProcesses,
    lastTimestamp,
  }: {
    pid: string;
    uptime: number;
    timestamp: number;
    lastProcesses?: Processes;
    lastTimestamp?: number;
  }): Promise<Process> {
    const base = join("/proc", pid_str);
    const pid = parseInt(pid_str, 10);
    const fn = (name: string): string => join(base, name);
    const [cmdline, exe, stat] = await Promise.all([
      this.cmdline(fn("cmdline")),
      readlink(fn("exe")),
      this.stat(fn("stat")),
    ]);
    return {
      pid,
      ppid: stat.ppid,
      cmdline,
      exe,
      stat,
      cpu: this.cpu({ pid, timestamp, stat, lastProcesses, lastTimestamp }),
      uptime: uptime - stat.starttime,
    };
  }

  // this is how long the underlying machine is running
  // we need this information, because the processes' start time is
  // measured in "ticks" since the machine started
  private async uptime(): Promise<[number, Date]> {
    // return uptime in secs
    const out = await readFile("/proc/uptime", "utf8");
    const uptime = parseFloat(out.split(" ")[0]);
    const boottime = new Date(new Date().getTime() - 1000 * uptime);
    return [uptime, boottime];
  }

  /**
   * Gather information about all running processes (up to the process limit).
   *
   * @param timestamp - time of this sample in milliseconds; defaults to now
   * @param sampleKey - selects which previous sample is used as the baseline
   *   for cpu percentages; the new sample is stored under the same key
   * @returns the processes indexed by pid, the machine's uptime in seconds
   *   and its boot time
   */
  public async processes(
    timestamp?: number,
    sampleKey = "default",
  ): Promise<{ procs: Processes; uptime: number; boottime: Date }> {
    timestamp ??= new Date().getTime();
    // make sure ticks and pagesize are known before reading any stat file,
    // otherwise all derived values would be NaN. This is a no-op once set.
    await this.init();
    const [uptime, boottime] = await this.uptime();
    const last = this.lastByKey.get(sampleKey);

    const procs: Processes = {};
    let pids = (await readdir("/proc")).filter((pid) => PID_DIR.test(pid));

    if (pids.length > this.procLimit) {
      dbg(`too many processes – limit of ${this.procLimit} reached!`);
      // we avoid processing and sending too much data
      pids = pids.slice(0, this.procLimit);
    }

    await mapParallelLimit(
      pids,
      async (pid) => {
        try {
          const proc = await this.process({
            pid,
            uptime,
            timestamp,
            lastProcesses: last?.processes,
            lastTimestamp: last?.timestamp,
          });
          procs[proc.pid] = proc;
        } catch (err) {
          // best effort: a process that can't be read is skipped
          if (this.testing)
            dbg(`process ${pid} likely vanished – could happen – ${err}`);
        }
      },
      20,
    );

    this.lastByKey.set(sampleKey, { timestamp, processes: procs });
    return { procs, uptime, boottime };
  }
}

/** Aggregated statistics of a process tree, see `sumChildren`. */
export interface ProcessTreeStats {
  rss: number;
  cpu_secs: number;
  cpu_pct: number;
}

/**
 * Recursively sum process statistics for a process and all its children.
 * This function aggregates CPU time, memory usage, and CPU percentage
 * for a process tree starting from the given PID.
 *
 * @param procs - the known processes, indexed by pid
 * @param children - maps a pid to the pids of its direct children
 * @param pid - root of the process tree to sum up
 * @returns the summed statistics, or `null` if the process itself or any
 *   process further down its tree is not contained in `procs`
 */
export function sumChildren(
  procs: Processes,
  children: { [pid: number]: number[] },
  pid: number,
): ProcessTreeStats | null {
  const proc = procs[`${pid}`];
  if (proc == null) {
    return null;
  }

  let rss = proc.stat.mem.rss;
  let cpu_secs = proc.cpu.secs;
  let cpu_pct = proc.cpu.pct;

  for (const ch of children[pid] ?? []) {
    const sc = sumChildren(procs, children, ch);
    // an incomplete subtree invalidates the whole sum
    if (sc == null) return null;
    rss += sc.rss;
    cpu_secs += sc.cpu_secs;
    cpu_pct += sc.cpu_pct;
  }

  return { rss, cpu_secs, cpu_pct };
}