CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
sagemathinc

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/hub/analytics.ts
Views: 687
1
/*
2
* This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
3
* License: MS-RSL – see LICENSE.md for details
4
*/
5
6
import { join } from "path";
7
import ms from "ms";
8
import { isEqual } from "lodash";
9
import { Router, json } from "express";
10
import {
11
analytics_cookie_name,
12
is_valid_uuid_string,
13
uuid,
14
} from "@cocalc/util/misc";
15
import type { PostgreSQL } from "@cocalc/database/postgres/types";
16
import { get_server_settings } from "@cocalc/database/postgres/server-settings";
17
import { pii_retention_to_future } from "@cocalc/database/postgres/pii";
18
import * as fs from "fs";
19
const UglifyJS = require("uglify-js");
20
// express-js cors plugin:
21
import cors from "cors";
22
import {
23
parseDomain,
24
fromUrl,
25
ParseResultType,
26
ParseResult,
27
} from "parse-domain";
28
import { getLogger } from "./logger";
29
30
// Minifying analytics-script.js. Note
31
// that this file analytics.ts gets compiled to
32
// dist/analytics.js and also analytics-script.ts
33
// gets compiled to dist/analytics-script.js.
34
// Location of the compiled analytics script, which sits next to this module.
const analytics_script_path = join(__dirname, "analytics-script.js");

// Read and minify the script once, while this module is being loaded.
const result = UglifyJS.minify(
  fs.readFileSync(analytics_script_path).toString()
);

// A script that cannot be minified aborts loading of this module.
if (result.error) {
  throw Error(`Error minifying analytics-script.js -- ${result.error}`);
}

// Guard that declares `exports` when `window.exports` is undefined;
// it is placed in front of the minified code.
const analytics_exports_guard =
  "if (window.exports === undefined) { var exports={}; } \n";

export const analytics_js = analytics_exports_guard + result.code;
42
43
// Returns the `debug` function of the logger named "analytics.<name>".
function create_log(name) {
  const logger = getLogger(`analytics.${name}`);
  return logger.debug;
}
46
47
/*
48
// base64 encoded PNG (white), 1x1 pixels
49
const _PNG_DATA =
50
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+ip1sAAAAASUVORK5CYII=";
51
const PNG_1x1 = Buffer.from(_PNG_DATA, "base64");
52
*/
53
54
/**
 * Build a size-limited copy of an untrusted object (e.g. a parsed JSON body).
 *
 * Rules applied at every level:
 *  - only the first 20 keys are looked at,
 *  - keys are cut to 50 characters, string values to 2000 characters,
 *  - null/undefined values are dropped,
 *  - values of type "object" (this includes arrays, which end up as objects
 *    keyed by index) are sanitized recursively,
 *  - a key named "__proto__" is dropped, see the comment in the loop.
 *
 * @param obj the object to copy; it is not modified
 * @param recursive current nesting depth; from depth 2 on, the object is
 *   replaced by `{ error: "recursion limit" }`
 * @returns a fresh plain object holding the sanitized data
 */
function sanitize(obj: object, recursive = 0): any {
  if (recursive >= 2) return { error: "recursion limit" };
  const source = obj as Record<string, any>;
  const ret: any = {};
  let cnt = 0;
  for (const key of Object.keys(source)) {
    cnt += 1;
    if (cnt > 20) break;
    const key_san = key.slice(0, 50);
    // JSON.parse creates "__proto__" as an ordinary own key, but assigning
    // ret["__proto__"] = {...} would replace the prototype of `ret` instead
    // of storing data. Properties would then show up on the result via the
    // prototype chain without being own properties. Drop such keys.
    if (key_san === "__proto__") continue;
    let val_san = source[key];
    if (val_san == null) continue;
    if (typeof val_san === "object") {
      val_san = sanitize(val_san, recursive + 1);
    } else if (typeof val_san === "string") {
      val_san = val_san.slice(0, 2000);
    }
    // numbers, booleans, etc. are copied as they are
    ret[key_san] = val_san;
  }
  return ret;
}
75
76
// record analytics data
77
// case 1: store "token" with associated "data", referrer, utm, etc.
78
// case 2: update entry with a known "token" with the account_id + 2nd timestamp
79
/**
 * Store one analytics record for the given token.
 *
 * Nothing happens if there is no payload or if the token is not a valid
 * UUID string. Otherwise the payload is sanitized and, depending on whether
 * the sanitized data has an `account_id`:
 *  - without `account_id`: a row with the token and the data is inserted,
 *  - with `account_id`: the row of that token is updated, but only where
 *    no account_id has been recorded yet.
 *
 * The queries are issued without a callback.
 */
function recordAnalyticsData(
  db: any,
  token: string,
  payload: object | undefined,
  pii_retention: number | false
): void {
  if (payload == null || !is_valid_uuid_string(token)) return;

  const log = create_log("record");
  log({ token, payload });

  // limits the number of keys and the length of keys and strings
  const sanitized = sanitize(payload);
  log("sanitized data", sanitized);

  const expire = pii_retention_to_future(pii_retention);
  const now = new Date();

  if (sanitized.account_id == null) {
    // case 1: first contact -- remember the data under this token
    db._query({
      query: "INSERT INTO analytics",
      values: {
        "token :: UUID": token,
        "data :: JSONB": sanitized,
        "data_time :: TIMESTAMP": now,
        "expire :: TIMESTAMP": expire,
      },
      conflict: "token",
    });
    return;
  }

  // case 2: attach the account to a known token -- the WHERE clause makes
  // sure an already recorded account_id is never overwritten
  db._query({
    query: "UPDATE analytics",
    where: [{ "token = $::UUID": token }, "account_id IS NULL"],
    set: {
      "account_id :: UUID": sanitized.account_id,
      "account_id_time :: TIMESTAMP": now,
      "expire :: TIMESTAMP": expire,
    },
  });
}
119
120
// could throw an error
121
/**
 * Decide whether a request coming from `origin` is acceptable.
 *
 * @param origin value of the Origin header; undefined e.g. when the script
 *   is loaded as usual via a script tag
 * @param dns_parsed the parsed, configured domain name of this server
 * @param dbg log function, called when the configured domain is unusable
 * @returns true if the origin is allowed, false otherwise
 */
function check_cors(
  origin: string | undefined,
  dns_parsed: ParseResult,
  dbg: Function
): boolean {
  // no origin at all is fine
  if (origin == null) return true;

  // the origin may be a full URL like https://...
  const parsedOrigin = parseDomain(fromUrl(origin));

  // reserved names, e.g. https://localhost (happens with cocalc-docker)
  if (parsedOrigin.type === ParseResultType.Reserved) return true;

  // without a properly configured DNS name, nothing else is accepted
  if (dns_parsed.type !== ParseResultType.Listed) {
    dbg(`parsed DNS domain invalid: ${JSON.stringify(dns_parsed)}`);
    return false;
  }

  // from here on, the origin has to be a listed domain as well
  if (parsedOrigin.type !== ParseResultType.Listed) return false;

  const { domain, topLevelDomains } = parsedOrigin;

  // most likely case: the origin is the configured domain itself
  if (
    domain === dns_parsed.domain &&
    isEqual(topLevelDomains, dns_parsed.topLevelDomains)
  ) {
    return true;
  }

  // cocalc.com and sagemath.com are allowed, too
  if (isEqual(topLevelDomains, ["com"])) {
    return domain === "cocalc" || domain === "sagemath";
  }

  // … and so is sagemath.org
  return isEqual(topLevelDomains, ["org"]) && domain === "sagemath";
}
168
169
/*
170
cocalc analytics setup -- this is used in http_hub_server to setup the /analytics.js endpoint
171
172
this extracts tracking information about landing pages, measures campaign performance, etc.
173
174
1. it sends a static js file (which is included in a script tag) to a page
175
2. a unique ID is generated and stored in a cookie
176
3. the script (should) send back a POST request, telling us about
177
the UTM params, referral, landing page, etc.
178
179
The query param "fqd" (fully qualified domain) can be set to true or false (default true)
180
It controls if the bounce back URL mentions the domain.
181
*/
182
183
import base_path from "@cocalc/backend/base-path";
184
185
/**
 * Register the analytics endpoints on the given router.
 *
 * The server settings (`dns` and `pii_retention`) are read once when this
 * function runs. The following is set up for the path "/analytics.js":
 *  - the express `json()` body parser,
 *  - GET: sets the analytics cookie if the request has none and responds
 *    with either a "// NOOP" script or the analytics script,
 *  - POST: records the request body under the token taken from the cookie,
 *  - OPTIONS: CORS preflight.
 *
 * @param router the express router to attach the handlers to
 * @param database database handle, used for the settings and for recording
 */
export async function initAnalytics(
  router: Router,
  database: PostgreSQL
): Promise<void> {
  const dbg = create_log("analytics_js/cors");

  // we only get the DNS once at startup – i.e. hub restart required upon changing DNS!
  const settings = await get_server_settings(database);
  const DNS = settings.dns;
  const dns_parsed = parseDomain(DNS);
  const pii_retention = settings.pii_retention;

  if (
    dns_parsed.type !== ParseResultType.Listed &&
    dns_parsed.type !== ParseResultType.Reserved
  ) {
    dbg(
      `WARNING: the configured domain name ${DNS} cannot be parsed properly. ` +
        `Please fix it in Admin → Site Settings!\n` +
        `dns_parsed="${JSON.stringify(dns_parsed)}"`
    );
  }

  // CORS-setup: allow access from other trusted (!) domains
  const analytics_cors = {
    credentials: true,
    methods: ["GET", "POST"],
    allowedHeaders: ["Content-Type", "*"],
    origin: function (origin, cb) {
      dbg(`check origin='${origin}'`);
      // Only the check itself is guarded: if the callback were invoked
      // inside the try block, an exception thrown further down the chain
      // would end up in the catch and call the callback a second time.
      let allowed: boolean;
      try {
        allowed = check_cors(origin, dns_parsed, dbg);
      } catch (e) {
        cb(e);
        return;
      }
      if (allowed) {
        cb(null, true);
      } else {
        cb(`origin="${origin}" is not allowed`, false);
      }
    },
  };

  // process POST body data
  // https://expressjs.com/en/api.html#express.json
  router.use("/analytics.js", json());

  router.get("/analytics.js", cors(analytics_cors), function (req, res) {
    res.header("Content-Type", "text/javascript");

    // in case user was already here, do not send it again.
    // only the first hit is interesting.
    dbg(
      `/analytics.js GET analytics_cookie='${req.cookies[analytics_cookie_name]}'`
    );

    if (!req.cookies[analytics_cookie_name]) {
      // No analytics cookie is set, so we set one.
      // We always set this despite any issues with parsing,
      // or whether or not we are actually using the analytics.js
      // script, since it's *also* useful to have this cookie set
      // for other purposes, e.g., logging.
      setAnalyticsCookie(res /* DNS */);
    }

    // also, don't write a script if the DNS is not valid
    if (
      req.cookies[analytics_cookie_name] ||
      dns_parsed.type !== ParseResultType.Listed
    ) {
      // cache for 6 hours -- max-age has unit seconds
      res.header(
        "Cache-Control",
        `private, max-age=${6 * 60 * 60}, must-revalidate`
      );
      res.write("// NOOP");
      res.end();
      return;
    }

    // write response script
    // this only runs once, hence no caching
    res.header("Cache-Control", "no-cache, no-store");

    const DOMAIN = `${dns_parsed.domain}.${dns_parsed.topLevelDomains.join(
      "."
    )}`;
    res.write(`var NAME = '${analytics_cookie_name}';\n`);
    res.write(`var ID = '${uuid()}';\n`);
    res.write(`var DOMAIN = '${DOMAIN}';\n`);
    // BASE_PATH: with the query param fqd=false, the prefix is only the
    // base path; otherwise it also mentions the domain.
    if (req.query.fqd === "false") {
      res.write(`var PREFIX = '${base_path}';\n`);
    } else {
      const prefix = `//${DOMAIN}${base_path}`;
      res.write(`var PREFIX = '${prefix}';\n\n`);
    }
    res.write(analytics_js);
    return res.end();
  });

  /*
  // tracking image: this is a 100% experimental idea and not used
  router.get(
    "/analytics.js/track.png",
    cors(analytics_cors),
    function (req, res) {
      // in case user was already here, do not set a cookie
      if (!req.cookies[analytics_cookie_name]) {
        setAnalyticsCookie(res); // ,DNS);
      }
      res.header("Content-Type", "image/png");
      res.header("Content-Length", `${PNG_1x1.length}`);
      return res.end(PNG_1x1);
    }
  );
  */

  router.post("/analytics.js", cors(analytics_cors), function (req, res): void {
    // check if token is in the cookie (see above)
    // if not, ignore it
    const token = req.cookies[analytics_cookie_name];
    dbg(`/analytics.js POST token='${token}'`);
    if (token) {
      // req.body is an object, parsed by the json() middleware registered above
      // e.g. {"utm":{"source":"asdfasdf"},"landing":"https://cocalc.com/..."}
      // ATTN key/values could be malicious
      // record it, there is no need for a callback
      recordAnalyticsData(database, token, req.body, pii_retention);
    }
    res.end();
  });

  // additionally, custom content types require a preflight cors check
  router.options("/analytics.js", cors(analytics_cors));
}
321
322
// I'm not setting the domain, since it's making testing difficult.
323
// Sets a freshly generated analytics token as a cookie on the response.
// The cookie is valid for the path "/" and for 7 days.
function setAnalyticsCookie(res /* DNS: string */): void {
  // TODO: sign the token? That would be good, so that users
  // can't simply fake a cookie.
  const token = uuid();
  const cookieOptions = {
    path: "/",
    maxAge: ms("7 days"),
    // httpOnly: true,
    // domain: DNS,
  };
  res.cookie(analytics_cookie_name, token, cookieOptions);
}
334
335