Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Path: blob/master/src/packages/hub/analytics.ts
Views: 687
/*
 *  This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
 *  License: MS-RSL – see LICENSE.md for details
 */

import { join } from "path";
import ms from "ms";
import { isEqual } from "lodash";
import { Router, json } from "express";
import {
  analytics_cookie_name,
  is_valid_uuid_string,
  uuid,
} from "@cocalc/util/misc";
import type { PostgreSQL } from "@cocalc/database/postgres/types";
import { get_server_settings } from "@cocalc/database/postgres/server-settings";
import { pii_retention_to_future } from "@cocalc/database/postgres/pii";
import * as fs from "fs";
const UglifyJS = require("uglify-js");
// express-js cors plugin:
import cors from "cors";
import {
  parseDomain,
  fromUrl,
  ParseResultType,
  ParseResult,
} from "parse-domain";
import { getLogger } from "./logger";

// Minifying analytics-script.js.  Note
// that this file analytics.ts gets compiled to
// dist/analytics.js and also analytics-script.ts
// gets compiled to dist/analytics-script.js.
// This runs once, at module load time, and throws if minification fails.
const result = UglifyJS.minify(
  fs.readFileSync(join(__dirname, "analytics-script.js")).toString()
);
if (result.error) {
  throw new Error(`Error minifying analytics-script.js -- ${result.error}`);
}

/**
 * The minified client-side analytics script, prefixed with a guard that
 * defines a global `exports` object if the page does not already have one.
 */
export const analytics_js =
  "if (window.exports === undefined) { var exports={}; } \n" + result.code;

/**
 * Returns the `debug` function of the logger named `analytics.<name>`.
 */
function create_log(name: string) {
  return getLogger(`analytics.${name}`).debug;
}

/*
// base64 encoded PNG (white), 1x1 pixels
const _PNG_DATA =
  "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+ip1sAAAAASUVORK5CYII=";
const PNG_1x1 = Buffer.from(_PNG_DATA, "base64");
*/

// Limits applied by sanitize() to data received from the client.
const SANITIZE_MAX_DEPTH = 2;
const SANITIZE_MAX_KEYS = 20;
const SANITIZE_MAX_KEY_LENGTH = 50;
const SANITIZE_MAX_STRING_LENGTH = 2000;

/**
 * Produces a size-limited copy of `obj`.
 *
 * - only the first 20 keys (in `Object.keys` order) are considered;
 * - keys are truncated to 50 characters, string values to 2000 characters;
 * - `null`/`undefined` values are dropped (they still count towards the
 *   key limit);
 * - nested objects are sanitized recursively; anything nested two or more
 *   levels deep is replaced by `{ error: "recursion limit" }`;
 * - all other values (numbers, booleans, ...) are copied unchanged.
 *
 * @param obj the object to sanitize
 * @param recursive current nesting depth (callers normally omit this)
 * @returns a new plain object; `obj` itself is not modified
 */
function sanitize(obj: object, recursive = 0): any {
  if (recursive >= SANITIZE_MAX_DEPTH) return { error: "recursion limit" };
  const source = obj as Record<string, unknown>;
  const ret: Record<string, unknown> = {};
  for (const key of Object.keys(source).slice(0, SANITIZE_MAX_KEYS)) {
    let val_san = source[key];
    if (val_san == null) continue;
    if (typeof val_san === "object") {
      // null was excluded above, so this is a real object (or array)
      val_san = sanitize(val_san as object, recursive + 1);
    } else if (typeof val_san === "string") {
      val_san = val_san.slice(0, SANITIZE_MAX_STRING_LENGTH);
    }
    ret[key.slice(0, SANITIZE_MAX_KEY_LENGTH)] = val_san;
  }
  return ret;
}

/**
 * Records analytics data.
 *
 * case 1: store "token" with associated "data", referrer, utm, etc.
 * case 2: update entry with a known "token" with the account_id + 2nd timestamp
 *
 * Nothing is recorded if `payload` is null/undefined, if `token` is not a
 * valid UUID string, or if the payload carries an `account_id` that is not a
 * valid UUID string.  The queries are issued without a callback, i.e. this
 * function does not wait for them.
 *
 * @param db database object providing `_query`
 * @param token the analytics token (value of the analytics cookie)
 * @param payload data sent by the client; it is untrusted and gets sanitized
 * @param pii_retention passed to `pii_retention_to_future` to compute the
 *   `expire` timestamp of the entry
 */
function recordAnalyticsData(
  db: any,
  token: string,
  payload: object | undefined,
  pii_retention: number | false
): void {
  if (payload == null) return;
  if (!is_valid_uuid_string(token)) return;
  const dbg = create_log("record");
  dbg({ token, payload });
  // sanitize data (limits size and number of characters)
  const rec_data = sanitize(payload);
  dbg("sanitized data", rec_data);
  const expire = pii_retention_to_future(pii_retention);

  if (rec_data.account_id != null) {
    // The account_id comes from the client and ends up in a UUID cast,
    // hence we reject anything that isn't a well-formed UUID string.
    if (!is_valid_uuid_string(rec_data.account_id)) {
      dbg("ignoring payload with invalid account_id");
      return;
    }
    // dbg("update analytics", rec_data.account_id);
    // only update if account id isn't already set!
    db._query({
      query: "UPDATE analytics",
      where: [{ "token = $::UUID": token }, "account_id IS NULL"],
      set: {
        "account_id       :: UUID": rec_data.account_id,
        "account_id_time  :: TIMESTAMP": new Date(),
        "expire           :: TIMESTAMP": expire,
      },
    });
  } else {
    db._query({
      query: "INSERT INTO analytics",
      values: {
        "token     :: UUID": token,
        "data      :: JSONB": rec_data,
        "data_time :: TIMESTAMP": new Date(),
        "expire    :: TIMESTAMP": expire,
      },
      conflict: "token",
    });
  }
}

/**
 * Decides whether a request with the given `Origin` header is allowed.
 *
 * Allowed are: requests without an origin, origins that parse as "reserved"
 * (e.g. localhost), the same registered domain as the configured DNS name,
 * and cocalc.com, sagemath.com and sagemath.org.  If the configured DNS name
 * itself does not parse as a listed domain, every non-reserved origin is
 * rejected.
 *
 * Could throw an error (from parsing the origin).
 *
 * @param origin value of the Origin header, if any
 * @param dns_parsed result of parsing the configured DNS name
 * @param dbg debug logging function
 * @returns true if the origin is allowed
 */
function check_cors(
  origin: string | undefined,
  dns_parsed: ParseResult,
  dbg: Function
): boolean {
  // no origin, e.g. when loaded as usual in a script tag
  if (origin == null) return true;

  // origin could be https://...
  const origin_parsed = parseDomain(fromUrl(origin));
  if (origin_parsed.type === ParseResultType.Reserved) {
    // This happens, e.g., when origin is https://localhost, which happens with cocalc-docker.
    return true;
  }
  // the configured DNS name is not ok
  if (dns_parsed.type !== ParseResultType.Listed) {
    dbg(`parsed DNS domain invalid: ${JSON.stringify(dns_parsed)}`);
    return false;
  }
  // now, we want dns_parsed and origin_parsed to be valid and listed
  if (origin_parsed.type === ParseResultType.Listed) {
    // most likely case: same domain as settings.DNS
    if (
      isEqual(origin_parsed.topLevelDomains, dns_parsed.topLevelDomains) &&
      origin_parsed.domain === dns_parsed.domain
    ) {
      return true;
    }
    // we also allow cocalc.com and sagemath.com
    if (isEqual(origin_parsed.topLevelDomains, ["com"])) {
      if (
        origin_parsed.domain === "cocalc" ||
        origin_parsed.domain === "sagemath"
      ) {
        return true;
      }
    }
    // … as well as sagemath.org
    if (
      isEqual(origin_parsed.topLevelDomains, ["org"]) &&
      origin_parsed.domain === "sagemath"
    ) {
      return true;
    }
  }
  return false;
}

/*
cocalc analytics setup -- this is used in http_hub_server to setup the /analytics.js endpoint

this extracts tracking information about landing pages, measure campaign performance, etc.

1. it sends a static js file (which is included in a script tag) to a page
2. a unique ID is generated and stored in a cookie
3. the script (should) send back a POST request, telling us about
   the UTM params, referral, landing page, etc.

The query param "fqd" (fully qualified domain) can be set to true or false (default true)
It controls if the bounce back URL mentions the domain.
*/

import base_path from "@cocalc/backend/base-path";

/**
 * Registers the `/analytics.js` routes (JSON body parser, GET, POST and
 * OPTIONS) on the given router.
 *
 * The server settings (DNS name, PII retention) are read once when this
 * function runs.
 *
 * @param router express router to attach the routes to
 * @param database database used to read the server settings and to record
 *   the analytics data
 */
export async function initAnalytics(
  router: Router,
  database: PostgreSQL
): Promise<void> {
  const dbg = create_log("analytics_js/cors");

  // we only get the DNS once at startup – i.e. hub restart required upon changing DNS!
  const settings = await get_server_settings(database);
  const DNS = settings.dns;
  const dns_parsed = parseDomain(DNS);
  const pii_retention = settings.pii_retention;

  if (
    dns_parsed.type !== ParseResultType.Listed &&
    dns_parsed.type !== ParseResultType.Reserved
  ) {
    dbg(
      `WARNING: the configured domain name ${DNS} cannot be parsed properly. ` +
        `Please fix it in Admin → Site Settings!\n` +
        `dns_parsed="${JSON.stringify(dns_parsed)}"`
    );
  }

  // CORS-setup: allow access from other trusted (!) domains
  const analytics_cors = {
    credentials: true,
    methods: ["GET", "POST"],
    allowedHeaders: ["Content-Type", "*"],
    origin: function (origin, cb) {
      dbg(`check origin='${origin}'`);
      try {
        if (check_cors(origin, dns_parsed, dbg)) {
          cb(null, true);
        } else {
          // report the rejection as a proper Error object, not a bare string
          cb(new Error(`origin="${origin}" is not allowed`), false);
        }
      } catch (e) {
        cb(e);
        return;
      }
    },
  };

  // process POST body data
  // https://expressjs.com/en/api.html#express.json
  router.use("/analytics.js", json());

  router.get("/analytics.js", cors(analytics_cors), function (req, res) {
    res.header("Content-Type", "text/javascript");

    const existing_token = req.cookies[analytics_cookie_name];

    // in case user was already here, do not send it again.
    // only the first hit is interesting.
    dbg(`/analytics.js GET analytics_cookie='${existing_token}'`);

    if (!existing_token) {
      // No analytics cookie is set, so we set one.
      // We always set this despite any issues with parsing or
      // or whether or not we are actually using the analytics.js
      // script, since it's *also* useful to have this cookie set
      // for other purposes, e.g., logging.
      setAnalyticsCookie(res /* DNS */);
    }

    // also, don't write a script if the DNS is not valid
    if (existing_token || dns_parsed.type !== ParseResultType.Listed) {
      // cache for 6 hours -- max-age has unit seconds
      res.header(
        "Cache-Control",
        `private, max-age=${6 * 60 * 60}, must-revalidate`
      );
      res.write("// NOOP");
      res.end();
      return;
    }

    // write response script
    // this only runs once, hence no caching
    res.header("Cache-Control", "no-cache, no-store");

    const DOMAIN = `${dns_parsed.domain}.${dns_parsed.topLevelDomains.join(
      "."
    )}`;
    res.write(`var NAME = '${analytics_cookie_name}';\n`);
    res.write(`var ID = '${uuid()}';\n`);
    res.write(`var DOMAIN = '${DOMAIN}';\n`);
    //  BASE_PATH
    if (req.query.fqd === "false") {
      res.write(`var PREFIX = '${base_path}';\n`);
    } else {
      const prefix = `//${DOMAIN}${base_path}`;
      res.write(`var PREFIX = '${prefix}';\n\n`);
    }
    res.write(analytics_js);
    return res.end();
  });

  /*
  // tracking image: this is a 100% experimental idea and not used
  router.get(
    "/analytics.js/track.png",
    cors(analytics_cors),
    function (req, res) {
      // in case user was already here, do not set a cookie
      if (!req.cookies[analytics_cookie_name]) {
        setAnalyticsCookie(res); // ,DNS);
      }
      res.header("Content-Type", "image/png");
      res.header("Content-Length", `${PNG_1x1.length}`);
      return res.end(PNG_1x1);
    }
  );
  */

  router.post("/analytics.js", cors(analytics_cors), function (req, res): void {
    // check if token is in the cookie (see above)
    // if not, ignore it
    const token = req.cookies[analytics_cookie_name];
    dbg(`/analytics.js POST token='${token}'`);
    if (token) {
      // req.body is an object (parsed by the json middleware registered above)
      // e.g. {"utm":{"source":"asdfasdf"},"landing":"https://cocalc.com/..."}
      // ATTN key/values could be malicious
      // record it, there is no need for a callback
      recordAnalyticsData(database, token, req.body, pii_retention);
    }
    res.end();
  });

  // additionally, custom content types require a preflight cors check
  router.options("/analytics.js", cors(analytics_cors));
}

/**
 * Sets the analytics cookie on the response: a fresh UUID, valid for
 * 7 days, for path "/".
 *
 * I'm not setting the domain, since it's making testing difficult.
 */
function setAnalyticsCookie(res /* DNS: string */): void {
  // set the cookie (TODO sign it?  that would be good so that
  // users cannot fake a cookie.)
  const analytics_token = uuid();
  res.cookie(analytics_cookie_name, analytics_token, {
    path: "/",
    maxAge: ms("7 days"),
    // httpOnly: true,
    // domain: DNS,
  });
}