Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Path: blob/master/src/packages/hub/metrics-recorder.coffee
Views: 687
#########################################################################
# This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
# License: MS-RSL – see LICENSE.md for details
#########################################################################

# This is a small helper class to record real-time metrics about the hub.
# It is designed for the hub, such that a local process can easily check its health.
# After an initial version, this has been repurposed to use prometheus.
# It wraps its client elements and adds some instrumentation to some hub components.

fs         = require('fs')
path       = require('path')
underscore = require('underscore')
{execSync} = require('child_process')
{defaults} = misc = require('@cocalc/util/misc')

# Prometheus client setup -- https://github.com/siimon/prom-client
prom_client = require('prom-client')

# some constants
FREQ_s  = 5   # update stats every FREQ seconds
DELAY_s = 10  # with an initial delay of DELAY seconds

# collect some recommended default metrics
prom_client.collectDefaultMetrics(timeout: FREQ_s * 1000)

# CLK_TCK (usually 100, but maybe not ...); null means we cannot compute
# CPU seconds from /proc stat ticks, and the CPU collector becomes a no-op.
try
    CLK_TCK = parseInt(execSync('getconf CLK_TCK', {encoding: 'utf8'}))
catch err
    CLK_TCK = null

###
# there is more than just continuous values
# cont: continuous (like number of changefeeds), will be smoothed
# disc: discrete, like blocked, will be recorded with timestamp
#       in a queue of length DISC_LEN
exports.TYPE = TYPE =
    COUNT: 'counter'    # strictly non-decreasing integer
    GAUGE: 'gauge'      # only the most recent value is recorded
    LAST : 'latest'     # only the most recent value is recorded
    DISC : 'discrete'   # timeseries of length DISC_LEN
    CONT : 'continuous' # continuous with exponential decay
    MAX  : 'contmax'    # like CONT, reduces buffer to max value
    SUM  : 'contsum'    # like CONT, reduces buffer to sum of values divided by FREQ_s
###

# every metric name is prefixed with this, so hub metrics are easy to select in prometheus
PREFIX = 'cocalc_hub_'

exports.new_counter = new_counter = (name, help, labels) ->
    # a prometheus counter -- https://github.com/siimon/prom-client#counter
    # use it like counter.labels(labelA, labelB).inc([positive number or default is 1])
    if not name.endsWith('_total')
        # throw a proper Error (not a bare string) so callers get a stack trace
        throw new Error("Counter metric names have to end in [_unit]_total but I got '#{name}' -- https://prometheus.io/docs/practices/naming/")
    return new prom_client.Counter(name: PREFIX + name, help: help, labelNames: labels ? [])

exports.new_gauge = new_gauge = (name, help, labels) ->
    # a prometheus gauge -- https://github.com/siimon/prom-client#gauge
    # basically, use it like gauge.labels(labelA, labelB).set(value)
    return new prom_client.Gauge(name: PREFIX + name, help: help, labelNames: labels ? [])

exports.new_quantile = new_quantile = (name, help, config={}) ->
    # a prometheus summary -- invoked as quantile.observe(value)
    config = defaults config,
        # a few more than the default, in particular including the actual min and max
        percentiles: [0.0, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 1.0]
        labels : []
    return new prom_client.Summary(name: PREFIX + name, help: help, labelNames: config.labels, percentiles: config.percentiles)

exports.new_histogram = new_histogram = (name, help, config={}) ->
    # a prometheus histogram -- invoked as histogram.observe(value)
    config = defaults config,
        buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]
        labels: []
    return new prom_client.Histogram(name: PREFIX + name, help: help, labelNames: config.labels, buckets: config.buckets)


# This is modified by the Client class (in client.coffee) when metrics
# get pushed from browsers.
# It's a map from client_id to
# an array of metrics objects, which are already labeled with extra
# information about the client_id and account_id.
exports.client_metrics = {}

class MetricsRecorder
    constructor: (@dbg, cb) ->
        ###
        * @dbg: reporting via winston, instance with configuration passed in from hub.coffee
        ###
        # stores the current state of the statistics
        @_stats = {}
        @_types = {} # key → TYPE.T mapping

        # the full statistic
        @_data = {}
        @_collectors = []

        # initialization finished
        @setup_monitoring()
        cb?(undefined, @)

    client_metrics: =>
        ###
        exports.client_metrics is a mapping of client id to the json exported metric.
        The AggregatorRegistry is supposed to work with a list of metrics, and by default,
        it sums them up. `aggregate` is a static method and hence it should be ok to use it directly.
        ###
        metrics = (m for _, m of exports.client_metrics)
        registry = prom_client.AggregatorRegistry.aggregate(metrics)
        return await registry.metrics()

    metrics: =>
        ###
        get a serialized representation of the metrics status
        (was a dict that should be JSON, now it is for prometheus)
        it's only called by the HTTP stuff in servers for the /metrics endpoint
        ###
        hub     = await prom_client.register.metrics()
        clients = await @client_metrics()
        return hub + clients

    register_collector: (collector) =>
        # The added collector functions will be evaluated periodically to gather metrics
        @_collectors.push(collector)

    setup_monitoring: =>
        # setup monitoring of some components
        # called by the hub *after* setting up the DB, etc.
        num_clients_gauge = new_gauge('clients_count', 'Number of connected clients')
        {number_of_clients} = require('./hub_register')
        @register_collector ->
            try
                num_clients_gauge.set(number_of_clients())
            catch
                # best effort: if hub_register is not ready, report zero clients
                num_clients_gauge.set(0)

        # our own CPU metrics monitor, separating user and sys!
        # it's actually a counter, since it is non-decreasing, but we'll use .set(...)
        @_cpu_seconds_total = new_gauge('process_cpu_categorized_seconds_total', 'Total number of CPU seconds used', ['type'])

        @_collect_duration      = new_histogram('metrics_collect_duration_s', 'How long it took to gather the metrics', buckets: [0.0001, 0.001, 0.01, 1])
        @_collect_duration_last = new_gauge('metrics_collect_duration_s_last', 'How long it took the last time to gather the metrics')

        # init periodically calling @_collect
        setTimeout((=> setInterval(@_collect, FREQ_s * 1000)), DELAY_s * 1000)

    _collect: =>
        endG = @_collect_duration_last.startTimer()
        endH = @_collect_duration.startTimer()

        # called by @_update to evaluate the collector functions
        #@dbg('_collect called')
        for c in @_collectors
            c()
        # linux specific: collecting this process and all its children sys+user times
        # http://man7.org/linux/man-pages/man5/proc.5.html
        fs.readFile path.join('/proc', ''+process.pid, 'stat'), 'utf8', (err, infos) =>
            if err or not CLK_TCK?
                @dbg("_collect err: #{err}")
                return
            # there might be spaces in the process name, hence split after the closing bracket!
            infos = infos[infos.lastIndexOf(')') + 2...].split(' ')
            @_cpu_seconds_total.labels('user')       .set(parseFloat(infos[11]) / CLK_TCK)
            @_cpu_seconds_total.labels('system')     .set(parseFloat(infos[12]) / CLK_TCK)
            # time spent waiting on child processes
            @_cpu_seconds_total.labels('chld_user')  .set(parseFloat(infos[13]) / CLK_TCK)
            @_cpu_seconds_total.labels('chld_system').set(parseFloat(infos[14]) / CLK_TCK)

        # END: the timings for this run.
        endG()
        endH()

# module-level singleton, created by init() below
metricsRecorder = null

exports.init = (winston, cb) ->
    dbg = (msg) ->
        winston.info("MetricsRecorder: #{msg}")
    metricsRecorder = new MetricsRecorder(dbg, cb)

exports.get = ->
    return metricsRecorder