Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemathinc
GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/hub/analytics.ts
5707 views
1
/*
2
* This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
3
* License: MS-RSL – see LICENSE.md for details
4
*/
5
6
import cors from "cors"; // express-js cors plugin
7
import { json, Router } from "express";
8
import * as fs from "fs";
9
import { isEqual } from "lodash";
10
import ms from "ms";
11
import {
12
fromUrl,
13
parseDomain,
14
ParseResult,
15
ParseResultType,
16
} from "parse-domain";
17
import { join } from "path";
18
const UglifyJS = require("uglify-js");
19
20
import { is_valid_uuid_string, uuid } from "@cocalc/util/misc";
21
22
import { pii_retention_to_future } from "@cocalc/database/postgres/pii";
23
import { get_server_settings } from "@cocalc/database/postgres/server-settings";
24
import type { PostgreSQL } from "@cocalc/database/postgres/types";
25
import { ANALYTICS_COOKIE_NAME } from "@cocalc/util/consts";
26
27
import { getLogger } from "./logger";
28
29
// Rate limiting for analytics data: a simple counter over a fixed time
// window. The state below is module-level, so one limit is shared by
// everything that records analytics data through this module.

/** Maximum number of analytics entries accepted per window. */
const RATE_LIMIT_ENTRIES_PER_SECOND = 10;

/** Length of one rate-limit window in milliseconds. */
const RATE_LIMIT_WINDOW_MS = 1000;

/** Number of entries accepted so far in the current window. */
let rateLimitCounter = 0;

/** Start of the current window, as a `Date.now()` timestamp. */
let rateLimitWindowStart = Date.now();
34
35
// Minify the client-side tracking script once, when this module is loaded.
// Both analytics.ts and analytics-script.ts are compiled into dist/, so the
// compiled analytics-script.js is read from this module's own directory.
const analyticsScriptPath = join(__dirname, "analytics-script.js");
const result = UglifyJS.minify(fs.readFileSync(analyticsScriptPath).toString());
if (result.error) {
  // Refuse to start with a broken script rather than serve it.
  throw Error(`Error minifying analytics-script.js -- ${result.error}`);
}

/**
 * The minified tracking script, prefixed with a guard that declares a
 * global `exports` object when the embedding page does not have one.
 */
export const analytics_js = `if (window.exports === undefined) { var exports={}; } \n${result.code}`;
47
48
/**
 * Return the `debug` function of the logger named `analytics.<name>`.
 */
function create_log(name) {
  const logger = getLogger(`analytics.${name}`);
  return logger.debug;
}
51
52
/**
 * Fixed-window rate limiter for analytics writes.
 *
 * Starts a new window (and zeroes the counter) once at least
 * RATE_LIMIT_WINDOW_MS have elapsed since the current window began, then
 * accepts the call only while fewer than RATE_LIMIT_ENTRIES_PER_SECOND
 * entries have been counted in the window.
 *
 * @returns true if the request should be allowed (and has been counted),
 *          false if it must be dropped.
 */
function checkRateLimit(): boolean {
  const timestamp = Date.now();
  const windowExpired =
    timestamp - rateLimitWindowStart >= RATE_LIMIT_WINDOW_MS;

  if (windowExpired) {
    rateLimitWindowStart = timestamp;
    rateLimitCounter = 0;
  }

  // Budget for this window is used up: reject without counting.
  if (rateLimitCounter >= RATE_LIMIT_ENTRIES_PER_SECOND) {
    return false;
  }

  rateLimitCounter += 1;
  return true;
}
70
71
/*
72
// base64 encoded PNG (white), 1x1 pixels
73
const _PNG_DATA =
74
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+ip1sAAAAASUVORK5CYII=";
75
const PNG_1x1 = Buffer.from(_PNG_DATA, "base64");
76
*/
77
78
/**
 * Build a size-bounded copy of untrusted analytics data.
 *
 * Limits applied:
 *  - at most the first 20 keys of each object are looked at,
 *  - keys are truncated to 50 characters,
 *  - string values are truncated to 2000 characters,
 *  - nested objects are sanitized recursively; once the nesting limit is
 *    reached the nested value is replaced by `{ error: "recursion limit" }`,
 *  - `null`/`undefined` values are dropped,
 *  - any other value (number, boolean, ...) is copied unchanged.
 *
 * Arrays are objects too, so they come back as plain objects keyed by index.
 *
 * @param obj       the untrusted object to copy
 * @param recursive current nesting depth (callers normally omit it)
 * @returns a fresh plain object containing the sanitized entries
 */
function sanitize(obj: object, recursive = 0): any {
  if (recursive >= 2) return { error: "recursion limit" };
  const source = obj as Record<string, unknown>;
  const ret: Record<string, unknown> = {};
  let cnt = 0;
  for (const key of Object.keys(source)) {
    cnt += 1;
    if (cnt > 20) break;
    const key_san = key.slice(0, 50);
    // JSON.parse can produce an own "__proto__" key. Assigning it on a plain
    // object would replace the prototype of `ret` instead of storing a value,
    // letting a client inject inherited fields (e.g. account_id) that bypass
    // the limits above. Such a key never ended up as own data, so skip it.
    if (key_san === "__proto__") continue;
    const value: unknown = source[key];
    if (value == null) continue;
    if (typeof value === "object") {
      ret[key_san] = sanitize(value, recursive + 1);
    } else if (typeof value === "string") {
      ret[key_san] = value.slice(0, 2000);
    } else {
      ret[key_san] = value;
    }
  }
  return ret;
}
99
100
/**
 * Record one analytics payload in the `analytics` table.
 *
 * Three cases:
 *  1. valid token, no `account_id` in the data: insert a row for the token
 *     with the sanitized data (referrer, utm, etc.).
 *  2. valid token and an `account_id` in the data: update the row of that
 *     token with the account id and a second timestamp, but only where no
 *     account id is set yet.
 *  3. no (valid) token: cookieless tracking. The data is stored under a
 *     freshly generated random token and marked with `cookieless: true`,
 *     so it is not associated with a user.
 *
 * Nothing is recorded when there is no payload or when the shared rate
 * limit is exceeded. Queries are issued without waiting for their result.
 *
 * @param db            database object providing `_query`
 * @param token         value of the analytics cookie, or null if absent
 * @param payload       untrusted request body; it is sanitized before use
 * @param pii_retention retention setting, passed to pii_retention_to_future
 *                      to compute the `expire` timestamp of the row
 */
function recordAnalyticsData(
  db: any,
  token: string | null,
  payload: object | undefined,
  pii_retention: number | false,
): void {
  if (payload == null) return;

  const dbg = create_log("record");

  // Rate limiting check - applies to all analytics data recording
  if (!checkRateLimit()) {
    dbg("Rate limit exceeded, dropping analytics data");
    return;
  }

  dbg({ token, payload });

  // sanitize data (limits size and number of characters)
  const rec_data = sanitize(payload);
  dbg("sanitized data", rec_data);
  const expire = pii_retention_to_future(pii_retention);

  // Cookie-based tracking (with user association)
  if (token != null && is_valid_uuid_string(token)) {
    const account_id = rec_data.account_id;
    if (account_id != null) {
      // The account id comes from the untrusted payload and is cast to UUID
      // in the query below, so reject anything that is not a UUID string
      // instead of sending it to the database.
      if (typeof account_id !== "string" || !is_valid_uuid_string(account_id)) {
        dbg("invalid account_id, dropping analytics data");
        return;
      }
      // only update if account id isn't already set!
      db._query({
        query: "UPDATE analytics",
        where: [{ "token = $::UUID": token }, "account_id IS NULL"],
        set: {
          "account_id :: UUID": account_id,
          "account_id_time :: TIMESTAMP": new Date(),
          "expire :: TIMESTAMP": expire,
        },
      });
    } else {
      db._query({
        query: "INSERT INTO analytics",
        values: {
          "token :: UUID": token,
          "data :: JSONB": rec_data,
          "data_time :: TIMESTAMP": new Date(),
          "expire :: TIMESTAMP": expire,
        },
        conflict: "token",
      });
    }
    return;
  }

  // Cookieless tracking (no user association, privacy-focused):
  // generate a random token for this single entry.
  const anonymousToken = uuid();
  db._query({
    query: "INSERT INTO analytics",
    values: {
      "token :: UUID": anonymousToken,
      "data :: JSONB": { ...rec_data, cookieless: true },
      "data_time :: TIMESTAMP": new Date(),
      "expire :: TIMESTAMP": expire,
    },
    conflict: "token",
  });
}
169
170
/**
 * Decide whether a request from the given origin may access the analytics
 * endpoint.
 *
 * Allowed are: requests without an origin, origins that parse as reserved
 * (e.g. https://localhost), origins on the same registered domain as the
 * configured DNS name, and the fixed domains cocalc.com, sagemath.com and
 * sagemath.org. Apart from the first two cases, everything is rejected if
 * the configured DNS name itself is not a listed domain.
 *
 * Could throw an error (the caller wraps this in try/catch).
 *
 * @param origin     value of the request's origin, if any (e.g. https://...)
 * @param dns_parsed parse result of the configured DNS name
 * @param dbg        debug logging function
 * @returns true if the origin is allowed
 */
function checkCORS(
  origin: string | undefined,
  dns_parsed: ParseResult,
  dbg: Function,
): boolean {
  // no origin, e.g. when loaded as usual in a script tag
  if (origin == null) return true;

  const originInfo = parseDomain(fromUrl(origin));

  // This happens, e.g., when origin is https://localhost, which happens
  // with cocalc-docker.
  if (originInfo.type === ParseResultType.Reserved) return true;

  // the configured DNS name is not ok
  if (dns_parsed.type !== ParseResultType.Listed) {
    dbg(`parsed DNS domain invalid: ${JSON.stringify(dns_parsed)}`);
    return false;
  }

  // from here on, both the DNS name and the origin must be listed domains
  if (originInfo.type !== ParseResultType.Listed) return false;

  const { domain, topLevelDomains } = originInfo;

  // most likely case: same domain as settings.DNS
  const sameAsDNS =
    isEqual(topLevelDomains, dns_parsed.topLevelDomains) &&
    domain === dns_parsed.domain;
  if (sameAsDNS) return true;

  // we also allow cocalc.com and sagemath.com
  if (isEqual(topLevelDomains, ["com"])) {
    return domain === "cocalc" || domain === "sagemath";
  }

  // … as well as sagemath.org
  return isEqual(topLevelDomains, ["org"]) && domain === "sagemath";
}
218
219
/*
cocalc analytics setup -- this is used in http_hub_server to set up the /analytics.js endpoint

This collects tracking information about landing pages, which is used to
measure campaign performance, etc.

1. it sends a static js file (which is included in a script tag) to a page
2. a unique ID is generated and stored in a cookie
3. the script (should) send back a POST request, telling us about
   the UTM params, referral, landing page, etc.

The query param "fqd" (fully qualified domain) can be set to true or false (default true).
It controls whether the bounce back URL mentions the domain.
*/
232
233
import base_path from "@cocalc/backend/base-path";
234
235
/**
 * Register the `/analytics.js` routes on the given router.
 *
 *  - `GET /analytics.js` serves the tracking script, preceded by a few
 *    generated variable definitions (NAME, ID, DOMAIN, ANALYTICS_ENABLED,
 *    PREFIX). It serves a cacheable `// NOOP` instead if the configured DNS
 *    name is not a listed domain, or if cookies are enabled and the
 *    analytics cookie already exists.
 *  - `POST /analytics.js` records the JSON body via recordAnalyticsData,
 *    together with the analytics cookie's token if there is one.
 *  - `OPTIONS /analytics.js` answers CORS preflight requests.
 *
 * All three routes use the same CORS policy, decided by checkCORS.
 * The server settings (DNS name, PII retention, cookie switch) are read
 * only once, when this function runs.
 *
 * @param router   express router to attach the routes to
 * @param database database handed to recordAnalyticsData
 */
export async function initAnalytics(
  router: Router,
  database: PostgreSQL,
): Promise<void> {
  const dbg = create_log("analytics_js/cors");

  // we only get the DNS once at startup – i.e. hub restart required upon changing DNS!
  const settings = await get_server_settings();
  const DNS = settings.dns;
  const dns_parsed = parseDomain(DNS);
  const pii_retention = settings.pii_retention;
  const analytics_enabled = settings.analytics_cookie;

  if (
    dns_parsed.type !== ParseResultType.Listed &&
    dns_parsed.type !== ParseResultType.Reserved
  ) {
    dbg(
      `WARNING: the configured domain name ${DNS} cannot be parsed properly. ` +
        `Please fix it in Admin → Site Settings!\n` +
        `dns_parsed="${JSON.stringify(dns_parsed)}"`,
    );
  }

  // CORS-setup: allow access from other trusted (!) domains
  const analytics_cors = {
    credentials: true,
    methods: ["GET", "POST"],
    allowedHeaders: ["Content-Type", "*"],
    origin: function (origin, cb) {
      dbg(`check origin='${origin}'`);
      try {
        if (checkCORS(origin, dns_parsed, dbg)) {
          cb(null, true);
        } else {
          cb(`origin="${origin}" is not allowed`, false);
        }
      } catch (e) {
        // checkCORS could throw; report that as an error to the middleware
        cb(e);
        return;
      }
    },
  };

  // process POST body data
  // https://expressjs.com/en/api.html#express.json
  router.use("/analytics.js", json());

  router.get("/analytics.js", cors(analytics_cors), function (req, res) {
    res.header("Content-Type", "text/javascript");

    // in case user was already here, do not send it again.
    // only the first hit is interesting.
    dbg(
      `/analytics.js GET analytics_cookie='${req.cookies[ANALYTICS_COOKIE_NAME]}'`,
    );

    if (!req.cookies[ANALYTICS_COOKIE_NAME] && analytics_enabled) {
      // No analytics cookie is set and cookies are enabled, so we set one.
      // When analytics_enabled is false, we skip setting cookies to enable
      // cookieless tracking for better privacy.
      setAnalyticsCookie(res /* DNS */);
    }

    // Return NOOP if DNS is invalid, or if cookies are enabled and already exist
    if (
      dns_parsed.type !== ParseResultType.Listed ||
      (analytics_enabled && req.cookies[ANALYTICS_COOKIE_NAME])
    ) {
      // cache for 6 hours -- max-age has unit seconds
      res.header(
        "Cache-Control",
        `private, max-age=${6 * 60 * 60}, must-revalidate`,
      );
      res.write("// NOOP");
      res.end();
      return;
    }

    // write response script
    // this only runs once, hence no caching
    res.header("Cache-Control", "no-cache, no-store");

    // registered domain of the configured DNS name, e.g. "cocalc.com"
    const DOMAIN = `${dns_parsed.domain}.${dns_parsed.topLevelDomains.join(
      ".",
    )}`;
    res.write(`var NAME = '${ANALYTICS_COOKIE_NAME}';\n`);
    res.write(`var ID = '${uuid()}';\n`);
    res.write(`var DOMAIN = '${DOMAIN}';\n`);
    res.write(`var ANALYTICS_ENABLED = ${analytics_enabled};\n`);
    // BASE_PATH: with ?fqd=false the prefix is only the base path,
    // otherwise it is protocol-relative and includes the domain.
    if (req.query.fqd === "false") {
      res.write(`var PREFIX = '${base_path}';\n`);
    } else {
      const prefix = `//${DOMAIN}${base_path}`;
      res.write(`var PREFIX = '${prefix}';\n\n`);
    }
    res.write(analytics_js);
    return res.end();
  });

  /*
  // tracking image: this is a 100% experimental idea and not used
  router.get(
    "/analytics.js/track.png",
    cors(analytics_cors),
    function (req, res) {
      // in case user was already here, do not set a cookie
      if (!req.cookies[analytics_cookie_name]) {
        setAnalyticsCookie(res); // ,DNS);
      }
      res.header("Content-Type", "image/png");
      res.header("Content-Length", `${PNG_1x1.length}`);
      return res.end(PNG_1x1);
    }
  );
  */

  router.post("/analytics.js", cors(analytics_cors), function (req, res): void {
    const token = req.cookies[ANALYTICS_COOKIE_NAME];
    dbg(`/analytics.js POST token='${token}'`);

    // req.body is an object (parsed by the json middleware registered above)
    // e.g. {"utm":{"source":"asdfasdf"},"landing":"https://cocalc.com/..."}
    // ATTN key/values could be malicious

    // Always record analytics data - either with token (cookie-based) or without (cookieless)
    // The recordAnalyticsData function handles both cases
    recordAnalyticsData(database, token || null, req.body, pii_retention);

    res.end();
  });

  // additionally, custom content types require a preflight cors check
  router.options("/analytics.js", cors(analytics_cors));
}
371
372
/**
 * Attach a fresh analytics cookie to the response: a newly generated uuid,
 * valid for the whole site ("/") for 7 days.
 *
 * The cookie domain is deliberately not set, since that makes testing
 * difficult.
 *
 * TODO: sign the cookie? That would be good, so that users cannot fake one.
 */
function setAnalyticsCookie(res /* DNS: string */): void {
  const analytics_token = uuid();
  const cookieOptions = {
    path: "/",
    maxAge: ms("7 days"),
    // httpOnly: true,
    // domain: DNS,
  };
  res.cookie(ANALYTICS_COOKIE_NAME, analytics_token, cookieOptions);
}
384
385