CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
sagemathinc

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/hub/metrics-recorder.coffee
Views: 687
1
#########################################################################
2
# This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
3
# License: MS-RSL – see LICENSE.md for details
4
#########################################################################
5
6
# This is a small helper class to record real-time metrics about the hub.
7
# It is designed for the hub, such that a local process can easily check its health.
8
# After an initial version, this has been repurposed to use prometheus.
9
# It wraps its client elements and adds some instrumentation to some hub components.
10
11
fs = require('fs')
12
path = require('path')
13
underscore = require('underscore')
14
{execSync} = require('child_process')
15
{defaults} = misc = require('@cocalc/util/misc')
16
17
# Prometheus client setup -- https://github.com/siimon/prom-client
18
prom_client = require('prom-client')
19
20
# some constants
FREQ_s = 5 # update stats every FREQ seconds
DELAY_s = 10 # with an initial delay of DELAY seconds

# collect some recommended default metrics
# NOTE(review): newer prom-client versions removed the `timeout` option of
# collectDefaultMetrics -- confirm against the installed prom-client version.
prom_client.collectDefaultMetrics(timeout: FREQ_s * 1000)
26
27
# CLK_TCK: kernel clock ticks per second (usually 100, but maybe not ...).
# Needed by _collect below to convert the jiffy counts in /proc/<pid>/stat
# into seconds.  null means "unavailable" and disables the CPU metrics.
try
    CLK_TCK = parseInt(execSync('getconf CLK_TCK', {encoding: 'utf8'}), 10)
    # unparsable output would otherwise leave CLK_TCK = NaN, which passes the
    # `not CLK_TCK?` guard in _collect and silently produces NaN metrics
    CLK_TCK = null if isNaN(CLK_TCK)
catch err
    # e.g. `getconf` missing on a non-POSIX platform -- skip CPU metrics
    CLK_TCK = null
32
33
###
34
# there is more than just continuous values
35
# cont: continuous (like number of changefeeds), will be smoothed
36
# disc: discrete, like blocked, will be recorded with timestamp
37
# in a queue of length DISC_LEN
38
exports.TYPE = TYPE =
39
COUNT: 'counter' # strictly non-decreasing integer
40
GAUGE: 'gauge' # only the most recent value is recorded
41
LAST : 'latest' # only the most recent value is recorded
42
DISC : 'discrete' # timeseries of length DISC_LEN
43
CONT : 'continuous' # continuous with exponential decay
44
MAX : 'contmax' # like CONT, reduces buffer to max value
45
SUM : 'contsum' # like CONT, reduces buffer to sum of values divided by FREQ_s
46
###
47
48
# all hub metrics are namespaced under this prefix
PREFIX = 'cocalc_hub_'

exports.new_counter = new_counter = (name, help, labels) ->
    # a prometheus counter -- https://github.com/siimon/prom-client#counter
    # use it like counter.labels(labelA, labelB).inc([positive number or default is 1])
    #
    # name   -- metric name (must end in `_total` per prometheus naming conventions)
    # help   -- human-readable description shown on the /metrics endpoint
    # labels -- optional array of label names
    if not name.endsWith('_total')
        # throw a proper Error (not a bare string) so callers get a stack trace
        throw new Error("Counter metric names have to end in [_unit]_total but I got '#{name}' -- https://prometheus.io/docs/practices/naming/")
    return new prom_client.Counter(name: PREFIX + name, help: help, labelNames: labels ? [])
56
57
exports.new_gauge = new_gauge = (name, help, labels) ->
    # a prometheus gauge -- https://github.com/siimon/prom-client#gauge
    # basically, use it like gauge.labels(labelA, labelB).set(value)
    #
    # name   -- metric name (automatically prefixed with PREFIX)
    # help   -- human-readable description
    # labels -- optional array of label names (defaults to no labels)
    opts =
        name       : PREFIX + name
        help       : help
        labelNames : labels ? []
    return new prom_client.Gauge(opts)
61
62
exports.new_quantile = new_quantile = (name, help, config={}) ->
    # a prometheus summary -- invoked as quantile.observe(value)
    #
    # name   -- metric name (automatically prefixed with PREFIX)
    # help   -- human-readable description
    # config -- optional {percentiles, labels} overrides
    {percentiles, labels} = defaults config,
        # a few more than the default, in particular including the actual min and max
        percentiles: [0.0, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 1.0]
        labels     : []
    return new prom_client.Summary
        name        : PREFIX + name
        help        : help
        labelNames  : labels
        percentiles : percentiles
69
70
exports.new_histogram = new_histogram = (name, help, config={}) ->
    # a prometheus histogram -- invoked as histogram.observe(value)
    #
    # name   -- metric name (automatically prefixed with PREFIX)
    # help   -- human-readable description
    # config -- optional {buckets, labels} overrides
    {buckets, labels} = defaults config,
        buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]
        labels : []
    return new prom_client.Histogram
        name       : PREFIX + name
        help       : help
        labelNames : labels
        buckets    : buckets
76
77
78
# This is modified by the Client class (in client.coffee) when metrics
# get pushed from browsers. It's a map from client_id to
# an array of metrics objects, which are already labeled with extra
# information about the client_id and account_id.
# It is read and aggregated by MetricsRecorder.client_metrics() below,
# which serves the result alongside the hub's own metrics.
exports.client_metrics = {}
83
84
# Wraps the prometheus registry and instruments a few hub components:
# connected-client count, categorized CPU time of this process (read from
# /proc), and the cost of the metrics collection itself.  Collector
# functions registered via register_collector() are re-run every FREQ_s
# seconds (after an initial DELAY_s delay).
class MetricsRecorder
    constructor: (@dbg, cb) ->
        ###
        * @dbg: reporting via winston, instance with configuration passed in from hub.coffee
        ###
        # stores the current state of the statistics
        @_stats = {}
        @_types = {} # key → TYPE.T mapping

        # the full statistic
        @_data = {}
        # NOTE(review): @_stats/@_types/@_data are not referenced anywhere else
        # in this file -- presumably leftovers from the pre-prometheus version.
        @_collectors = []

        # initialization finished
        @setup_monitoring()
        # callback follows node convention: (err, instance); err is always undefined here
        cb?(undefined, @)

    # Aggregate the metrics pushed from browsers into one serialized report.
    client_metrics: =>
        ###
        exports.client_metrics is a mapping of client id to the json exported metric.
        The AggregatorRegistry is supposed to work with a list of metrics, and by default,
        it sums them up. `aggregate` is a static method and hence it should be ok to use it directly.
        ###
        metrics = (m for _, m of exports.client_metrics)

        registry = prom_client.AggregatorRegistry.aggregate(metrics)
        return await registry.metrics()

    metrics: =>
        ###
        get a serialized representation of the metrics status
        (was a dict that should be JSON, now it is for prometheus)
        it's only called by the HTTP stuff in servers for the /metrics endpoint
        ###
        # concatenation of two prometheus text-format strings
        hub = await prom_client.register.metrics()
        clients = await @client_metrics()
        return hub + clients

    register_collector: (collector) =>
        # The added collector functions will be evaluated periodically to gather metrics
        @_collectors.push(collector)

    setup_monitoring: =>
        # setup monitoring of some components
        # called by the hub *after* setting up the DB, etc.
        num_clients_gauge = new_gauge('clients_count', 'Number of connected clients')
        {number_of_clients} = require('./hub_register')
        @register_collector ->
            try
                num_clients_gauge.set(number_of_clients())
            catch
                # number_of_clients() may throw before registration is fully set up
                num_clients_gauge.set(0)

        # our own CPU metrics monitor, separating user and sys!
        # it's actually a counter, since it is non-decreasing, but we'll use .set(...)
        @_cpu_seconds_total = new_gauge('process_cpu_categorized_seconds_total', 'Total number of CPU seconds used', ['type'])

        # track how expensive metrics collection itself is (histogram + last value)
        @_collect_duration = new_histogram('metrics_collect_duration_s', 'How long it took to gather the metrics', buckets:[0.0001, 0.001, 0.01, 1])
        @_collect_duration_last = new_gauge('metrics_collect_duration_s_last', 'How long it took the last time to gather the metrics')

        # init periodically calling @_collect
        setTimeout((=> setInterval(@_collect, FREQ_s * 1000)), DELAY_s * 1000)

    _collect: =>
        # startTimer() returns a function that records the elapsed time when called
        endG = @_collect_duration_last.startTimer()
        endH = @_collect_duration.startTimer()

        # called by @_update to evaluate the collector functions
        #@dbg('_collect called')
        for c in @_collectors
            c()
        # linux specific: collecting this process and all its children sys+user times
        # http://man7.org/linux/man-pages/man5/proc.5.html
        fs.readFile path.join('/proc', ''+process.pid, 'stat'), 'utf8', (err, infos) =>
            if err or not CLK_TCK?
                @dbg("_collect err: #{err}")
                return
            # there might be spaces in the process name, hence split after the closing bracket!
            infos = infos[infos.lastIndexOf(')') + 2...].split(' ')
            # indices are relative to the field after the process name; the
            # jiffy counts are divided by CLK_TCK to get seconds
            @_cpu_seconds_total.labels('user')       .set(parseFloat(infos[11]) / CLK_TCK)
            @_cpu_seconds_total.labels('system')     .set(parseFloat(infos[12]) / CLK_TCK)
            # time spent waiting on child processes
            @_cpu_seconds_total.labels('chld_user')  .set(parseFloat(infos[13]) / CLK_TCK)
            @_cpu_seconds_total.labels('chld_system').set(parseFloat(infos[14]) / CLK_TCK)

        # END: the timings for this run.
        # NOTE(review): these stop before the async readFile callback fires, so
        # the /proc read is not included in the measured duration.
        endG()
        endH()
173
174
# module-level singleton, created once by init()
metricsRecorder = null

# Create the singleton MetricsRecorder.
# winston -- logger; messages are prefixed with "MetricsRecorder: "
# cb      -- optional node-style callback, invoked as cb(undefined, recorder)
exports.init = (winston, cb) ->
    log = (msg) ->
        winston.info("MetricsRecorder: #{msg}")
    metricsRecorder = new MetricsRecorder(log, cb)

# Return the singleton instance (null until init() has been called).
exports.get = ->
    return metricsRecorder
182
183