Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Path: blob/master/src/packages/hub/hub.ts
Views: 687
//########################################################################
// This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
// License: MS-RSL – see LICENSE.md for details
//########################################################################

// This is the CoCalc Global HUB. It runs as a daemon, sitting in the
// middle of the action, connected to potentially thousands of clients,
// many Sage sessions, and PostgreSQL database.

import TTLCache from "@isaacs/ttlcache";
import { callback } from "awaiting";
import blocked from "blocked";
import { spawn } from "child_process";
import { program as commander, Option } from "commander";
import basePath from "@cocalc/backend/base-path";
import {
  pghost as DEFAULT_DB_HOST,
  pgdatabase as DEFAULT_DB_NAME,
  pguser as DEFAULT_DB_USER,
} from "@cocalc/backend/data";
import { trimLogFileSize } from "@cocalc/backend/logger";
import port from "@cocalc/backend/port";
import { init_start_always_running_projects } from "@cocalc/database/postgres/always-running";
import { load_server_settings_from_env } from "@cocalc/database/settings/server-settings";
import { init_passport } from "@cocalc/server/hub/auth";
import { initialOnPremSetup } from "@cocalc/server/initial-onprem-setup";
import initHandleMentions from "@cocalc/server/mentions/handle";
import initProjectControl, {
  COCALC_MODES,
} from "@cocalc/server/projects/control";
import initIdleTimeout from "@cocalc/server/projects/control/stop-idle-projects";
import initNewProjectPoolMaintenanceLoop from "@cocalc/server/projects/pool/maintain";
import initPurchasesMaintenanceLoop from "@cocalc/server/purchases/maintenance";
import initSalesloftMaintenance from "@cocalc/server/salesloft/init";
import { stripe_sync } from "@cocalc/server/stripe/sync";
import { callback2, retry_until_success } from "@cocalc/util/async-utils";
import { getClients } from "./clients";
import { set_agent_endpoint } from "./health-checks";
import { start as startHubRegister } from "./hub_register";
import { getLogger } from "./logger";
import initDatabase, { database } from "./servers/database";
import initExpressApp from "./servers/express-app";
import initHttpRedirect from "./servers/http-redirect";
import initPrimus from "./servers/primus";
import initVersionServer from "./servers/version";

const MetricsRecorder = require("./metrics-recorder"); // import * as MetricsRecorder from "./metrics-recorder";

// Logger tagged with 'hub' for this file.
const winston = getLogger("hub");

// program gets populated with the command line options in main() below.
// The binding itself is never reassigned; only its properties are filled in.
const program: { [option: string]: any } = {};
export { program };

// How frequently to register with the database that this hub is up and running,
// and also report number of connected clients.
const REGISTER_INTERVAL_S = 20;

// How often the /stats data is recomputed.
const UPDATE_STATS_INTERVAL_MS = 60000;

// How often the site license usage log is updated.
const UPDATE_SITE_LICENSE_USAGE_LOG_INTERVAL_MS = 31000;

// How often trimLogFileSize is invoked.
const TRIM_LOG_FILE_INTERVAL_MS = 1000 * 60 * 3;

// Event loop stalls longer than this are counted and logged.
const BLOCKED_THRESHOLD_MS = 100;

// the jsmap of connected clients
const clients = getClients();

/**
 * Reset the password of the account with the given email address.
 *
 * Failures are logged rather than thrown, so this never rejects.
 *
 * @param email_address - email address identifying the account
 */
async function reset_password(email_address: string): Promise<void> {
  try {
    await callback2(database.reset_password, { email_address });
    winston.info(`Password changed for ${email_address}`);
  } catch (err) {
    winston.info(`Error resetting password -- ${err}`);
  }
}

/**
 * Calculate and update the statistics for the /stats endpoint, once
 * immediately and then once per minute.
 *
 * It's important that this runs periodically, because otherwise the /stats
 * data is outdated.  Only the initial update is awaited.
 */
async function init_update_stats(): Promise<void> {
  winston.info("init updating stats periodically");
  const update = () => callback2(database.get_stats);
  // Do it every minute:
  setInterval(() => update(), UPDATE_STATS_INTERVAL_MS);
  // Also do it once now:
  await update();
}

/**
 * Calculate and update the site_license_usage_log, once immediately and
 * then every 31 seconds.
 *
 * It's important that this runs periodically if we want to be able to
 * monitor site license usage.  Only the initial update is awaited.
 */
async function init_update_site_license_usage_log(): Promise<void> {
  winston.info("init updating site license usage log periodically");
  const update = async () => await database.update_site_license_usage_log();
  setInterval(update, UPDATE_SITE_LICENSE_USAGE_LOG_INTERVAL_MS);
  await update();
}

/**
 * Initialize the metrics recorder and create the counters used by this file.
 *
 * @returns `metric_blocked` (accumulated event-loop blocked time in ms) and
 *   `uncaught_exception_total` (number of reported "BUG"s).
 */
async function initMetrics() {
  winston.info("Initializing Metrics Recorder...");
  await callback(MetricsRecorder.init, winston);
  return {
    metric_blocked: MetricsRecorder.new_counter(
      "blocked_ms_total",
      'accumulates the "blocked" time in the hub [ms]',
    ),
    uncaught_exception_total: MetricsRecorder.new_counter(
      "uncaught_exception_total",
      'counts "BUG"s',
    ),
  };
}

/**
 * Run `pkill -f node_modules/.bin/cocalc-project` and wait for it to finish.
 *
 * `spawn` returns a ChildProcess, not a promise, so awaiting it directly does
 * not wait for anything.  Here we resolve once the child has closed.  The exit
 * status is ignored (as it was before), and a failure to launch pkill is only
 * logged so that it cannot take down the hub via an unhandled 'error' event.
 */
function killAllProjectProcesses(): Promise<void> {
  return new Promise<void>((resolve) => {
    const child = spawn("pkill", ["-f", "node_modules/.bin/cocalc-project"]);
    child.once("error", (err) => {
      winston.info(`unable to run pkill -- ${err}`);
      resolve();
    });
    child.once("close", () => resolve());
  });
}

/**
 * Start the hub: connect to the database, optionally update the schema, and
 * start whichever servers (websocket, proxy, next, mentions) are enabled in
 * `program`.  Error listeners are installed last, so errors during startup
 * still crash the process.
 */
async function startServer(): Promise<void> {
  winston.info("start_server");

  winston.info(`basePath='${basePath}'`);
  winston.info(
    `database: name="${program.databaseName}" nodes="${program.databaseNodes}" user="${program.databaseUser}"`,
  );

  const { metric_blocked, uncaught_exception_total } = await initMetrics();

  // Log anything that blocks the CPU for more than ~100ms -- see https://github.com/tj/node-blocked
  blocked((ms: number) => {
    if (ms > BLOCKED_THRESHOLD_MS) {
      metric_blocked.inc(ms);
      // record that something blocked:
      winston.debug(`BLOCKED for ${ms}ms`);
    }
  });

  // Wait for database connection to work. Everything requires this.
  await retry_until_success({
    f: async () => await callback2(database.connect),
    start_delay: 1000,
    max_delay: 10000,
  });
  winston.info("connected to database.");

  if (program.updateDatabaseSchema) {
    winston.info("Update database schema");
    await callback2(database.update_schema);

    // in those cases where we initialize the database upon startup
    // (essentially only relevant for kucalc's hub-websocket)
    if (program.mode === "kucalc") {
      // and for on-prem setups, also initialize the admin account, set a registration token, etc.
      await initialOnPremSetup(database);
    }
  }

  // set server settings based on environment variables
  await load_server_settings_from_env(database);

  if (program.agentPort) {
    winston.info("Configure agent port");
    set_agent_endpoint(program.agentPort, program.hostname);
  }

  // Mentions
  if (program.mentions) {
    winston.info("enabling handling of mentions...");
    initHandleMentions();
  }

  // Project control
  winston.info("initializing project control...");
  const projectControl = initProjectControl(program.mode);
  // used for nextjs hot module reloading dev server
  process.env["COCALC_MODE"] = program.mode;

  if (program.mode !== "kucalc" && program.websocketServer) {
    // We handle idle timeout of projects.
    // This can be disabled via COCALC_NO_IDLE_TIMEOUT.
    // This only uses the admin-configurable settings field of projects
    // in the database and isn't aware of licenses or upgrades.
    initIdleTimeout(projectControl);
  }

  if (program.websocketServer) {
    // Initialize the version server -- must happen after updating schema
    // (for first ever run).
    await initVersionServer();

    if (program.mode === "single-user" && process.env.USER === "user") {
      // Definitely in dev mode, probably on cocalc.com in a project, so we kill
      // all the running projects when starting the hub:
      // Whenever we start the dev server, we just assume
      // all projects are stopped, since assuming they are
      // running when they are not is bad.  Something similar
      // is done in cocalc-docker.
      winston.info("killing all projects...");
      await callback2(database._query, {
        safety_check: false,
        query: 'update projects set state=\'{"state":"opened"}\'',
      });
      await killAllProjectProcesses();

      // Also, unrelated to killing projects, for purposes of developing
      // custom software images, we inject a couple of random nonsense entries
      // into the table in the DB:
      winston.info("inserting random nonsense compute images in database");
      await callback2(database.insert_random_compute_images);
    }

    if (program.mode !== "kucalc") {
      await init_update_stats();
      await init_update_site_license_usage_log();
      // This is async but runs forever, so don't wait for it.
      winston.info("init starting always running projects");
      init_start_always_running_projects(database);
    }
  }

  const { router, httpServer } = await initExpressApp({
    isPersonal: program.personal,
    projectControl,
    proxyServer: !!program.proxyServer,
    nextServer: !!program.nextServer,
    cert: program.httpsCert,
    key: program.httpsKey,
    listenersHack:
      program.mode === "single-user" &&
      program.proxyServer &&
      program.nextServer &&
      program.websocketServer &&
      process.env["NODE_ENV"] === "development",
  });

  // The express app create via initExpressApp above **assumes** that init_passport is done
  // or complains a lot. This is obviously not really necessary, but we leave it for now.
  await callback2(init_passport, {
    router,
    database,
    host: program.hostname,
  });

  winston.info(`starting webserver listening on ${program.hostname}:${port}`);
  await callback(httpServer.listen.bind(httpServer), port, program.hostname);

  // NOTE(review): loose equality kept on purpose -- the type of `port` is
  // declared in @cocalc/backend/port and is not visible here.
  if (port == 443 && program.httpsCert && program.httpsKey) {
    // also start a redirect from port 80 to port 443.
    await initHttpRedirect(program.hostname);
  }

  if (program.websocketServer) {
    winston.info("initializing primus websocket server");
    initPrimus({
      httpServer,
      router,
      projectControl,
      clients,
      host: program.hostname,
      port,
      isPersonal: program.personal,
    });
  }

  if (program.websocketServer || program.proxyServer || program.nextServer) {
    winston.info(
      "Starting registering periodically with the database and updating a health check...",
    );

    // register the hub with the database periodically, and
    // also confirms that database is working.
    await callback2(startHubRegister, {
      database,
      clients,
      host: program.hostname,
      port,
      interval_s: REGISTER_INTERVAL_S,
    });

    const protocol = program.httpsKey ? "https" : "http";
    const target = `${protocol}://${program.hostname}:${port}${basePath}`;

    const msg = `Started HUB!\n\n-----------\n\n The following URL *might* work: ${target}\n\n\nPORT=${port}\nBASE_PATH=${basePath}\nPROTOCOL=${protocol}\n\n${
      basePath.length <= 1
        ? ""
        : "If you are developing cocalc inside of cocalc, take the URL of the host cocalc\nand append " +
          basePath +
          " to it."
    }\n\n-----------\n\n`;
    winston.info(msg);
    console.log(msg);

    // this is not so robust, so disabled for now.
    // if (
    //   program.websocketServer &&
    //   program.nextServer &&
    //   process.env["NODE_ENV"] != "production"
    // ) {
    //   // This is entirely to deal with conflicts between both nextjs and webpack when doing
    //   // hot module reloading.  They fight with each other, and we -- the developers --
    //   // win only AFTER the fight is done. So we force the fight automatically, rather than
    //   // manually, which is confusing.
    //   console.log(
    //     `launch get of ${target} so that webpack and nextjs websockets can fight things out`,
    //   );
    //   const process = spawn(
    //     "chromium-browser",
    //     ["--no-sandbox", "--headless", target],
    //     { detached: true, stdio: "ignore" },
    //   );
    //   process.unref();
    // }
  }

  if (program.all || program.mentions) {
    // kucalc: for now we just have the hub-mentions servers
    // do the new project pool maintenance, since there is only
    // one hub-stats.
    // On non-cocalc it'll get done by *the* hub because of program.all.
    initNewProjectPoolMaintenanceLoop();
    // Starts periodic maintenance on pay-as-you-go purchases, e.g., quota
    // upgrades of projects.
    initPurchasesMaintenanceLoop();
    initSalesloftMaintenance();
    setInterval(trimLogFileSize, TRIM_LOG_FILE_INTERVAL_MS);
  }

  addErrorListeners(uncaught_exception_total);
}

// addErrorListeners: after successful startup, don't crash on routine errors.
// We don't do this until startup, since we do want to crash on errors on startup.

// Use cache to not save the SAME error to the database (and prometheus)
// more than once per minute.
const errorReportCache = new TTLCache({ ttl: 60 * 1000 });

/**
 * Install process-wide handlers for uncaught exceptions and unhandled
 * promise rejections.
 *
 * Every occurrence is logged.  It is reported to the database and counted
 * in the given counter at most once per minute per distinct error, using
 * the stringified error as the deduplication key.
 *
 * @param uncaught_exception_total - counter incremented by 1 for each reported error
 * @returns whatever `process.on` returns
 */
function addErrorListeners(uncaught_exception_total: {
  inc: (n: number) => unknown;
}) {
  process.addListener("uncaughtException", function (err) {
    winston.error(
      "BUG ****************************************************************************",
    );
    winston.error("Uncaught exception: " + err);
    console.error(err.stack);
    winston.error(err.stack);
    winston.error(
      "BUG ****************************************************************************",
    );
    const key = `${err}`;
    if (errorReportCache.has(key)) {
      return;
    }
    errorReportCache.set(key, true);
    database?.uncaught_exception(err);
    uncaught_exception_total.inc(1);
  });

  return process.on("unhandledRejection", function (reason, p) {
    winston.error(
      "BUG UNHANDLED REJECTION *********************************************************",
    );
    console.error(p, reason); // strangely sometimes winston.error can't actually show the traceback...
    winston.error("Unhandled Rejection at:", p, "reason:", reason);
    winston.error(
      "BUG UNHANDLED REJECTION *********************************************************",
    );
    const key = `${p}${reason}`;
    if (errorReportCache.has(key)) {
      return;
    }
    errorReportCache.set(key, true);
    database?.uncaught_exception(reason);
    uncaught_exception_total.inc(1);
  });
}

//############################################
// Process command line arguments
//############################################

/**
 * Parse the command line into `program`, initialize the database, and then
 * either run a single maintenance action and exit (password reset, stripe
 * sync, delete expired, blob maintenance, update stats) or start the server.
 *
 * @throws Error if neither --mode nor the COCALC_MODE env var is set
 */
async function main(): Promise<void> {
  commander
    .name("cocalc-hub-server")
    .usage("options")
    .addOption(
      new Option(
        "--mode [string]",
        `REQUIRED mode in which to run CoCalc (${COCALC_MODES.join(
          ", ",
        )}) - or set COCALC_MODE env var`,
      ).choices(COCALC_MODES as any as string[]),
    )
    .option(
      "--all",
      "runs all of the servers: websocket, proxy, next (so you don't have to pass all those opts separately), and also mentions updator and updates db schema on startup; use this in situations where there is a single hub that serves everything (instead of a microservice situation like kucalc)",
    )
    .option("--websocket-server", "run the websocket server")
    .option("--proxy-server", "run the proxy server")
    .option(
      "--next-server",
      "run the nextjs server (landing pages, share server, etc.)",
    )
    .option(
      "--https-key [string]",
      "serve over https. argument should be a key filename (both https-key and https-cert must be specified)",
    )
    .option(
      "--https-cert [string]",
      "serve over https. argument should be a cert filename (both https-key and https-cert must be specified)",
    )
    .option(
      "--agent-port <n>",
      "port for HAProxy agent-check (default: 0 -- do not start)",
      (n) => parseInt(n, 10),
      0,
    )
    .option(
      "--hostname [string]",
      'host of interface to bind to (default: "127.0.0.1")',
      "127.0.0.1",
    )
    .option(
      "--database-nodes <string,string,...>",
      `database address (default: '${DEFAULT_DB_HOST}')`,
      DEFAULT_DB_HOST,
    )
    .option(
      "--database-name [string]",
      `Database name to use (default: "${DEFAULT_DB_NAME}")`,
      DEFAULT_DB_NAME,
    )
    .option(
      "--database-user [string]",
      `Database username to use (default: "${DEFAULT_DB_USER}")`,
      DEFAULT_DB_USER,
    )
    .option("--passwd [email_address]", "Reset password of given user", "")
    .option(
      "--update-database-schema",
      "If specified, updates database schema on startup (always happens when mode is not kucalc).",
    )
    .option(
      "--stripe-sync",
      "Sync stripe subscriptions to database for all users with stripe id",
      "yes",
    )
    .option(
      "--update-stats",
      "Calculates the statistics for the /stats endpoint and stores them in the database",
      "yes",
    )
    .option("--delete-expired", "Delete expired data from the database", "yes")
    .option(
      "--blob-maintenance",
      "Do blob-related maintenance (dump to tarballs, offload to gcloud)",
      "yes",
    )
    .option(
      "--mentions",
      "if given, periodically handle mentions; on kucalc there is only one of these. It also managed the new project pool. Maybe this should be renamed --singleton!",
    )
    .option(
      "--test",
      "terminate after setting up the hub -- used to test if it starts up properly",
    )
    .option(
      "--db-concurrent-warn <n>",
      "be very unhappy if number of concurrent db requests exceeds this (default: 300)",
      (n) => parseInt(n, 10),
      300,
    )
    .option(
      "--personal",
      "run VERY UNSAFE: there is only one user and no authentication",
    )
    .parse(process.argv);

  // Everywhere else in our code, we just refer to program.[options] since we
  // wrote this code against an ancient version of commander.
  for (const [name, value] of Object.entries(commander.opts())) {
    program[name] = value;
  }

  if (!program.mode) {
    program.mode = process.env.COCALC_MODE;
    if (!program.mode) {
      // This rejects main(); the handler attached to main() at the bottom of
      // this file reports the error and exits with status 1.
      throw new Error(
        `the --mode option must be specified or the COCALC_MODE env var set to one of ${COCALC_MODES.join(
          ", ",
        )}`,
      );
    }
  }

  if (program.all) {
    program.websocketServer = true;
    program.proxyServer = true;
    program.nextServer = true;
    program.mentions = true;
    program.updateDatabaseSchema = true;
  }

  //console.log("got opts", opts);

  try {
    // Everything we do here requires the database to be initialized. Once
    // this is called, require('@cocalc/database/postgres/database').default() is a valid db
    // instance that can be used.
    initDatabase({
      host: program.databaseNodes,
      database: program.databaseName,
      user: program.databaseUser,
      concurrent_warn: program.dbConcurrentWarn,
    });

    if (program.passwd) {
      winston.debug("Resetting password");
      await reset_password(program.passwd);
      process.exit();
    } else if (program.stripeSync) {
      winston.debug("Stripe sync");
      await stripe_sync({ database, logger: winston });
      process.exit();
    } else if (program.deleteExpired) {
      await callback2(database.delete_expired, {
        count_only: false,
      });
      process.exit();
    } else if (program.blobMaintenance) {
      await callback2(database.blob_maintenance);
      process.exit();
    } else if (program.updateStats) {
      await callback2(database.get_stats);
      process.exit();
    } else {
      await startServer();
    }
  } catch (err) {
    console.log(err);
    winston.error("Error -- ", err);
    process.exit(1);
  }
}

// Anything main() throws outside of its own try/catch (e.g., the missing
// --mode error) ends up here instead of becoming an unhandled rejection.
main().catch((err) => {
  console.error(err);
  winston.error("Error -- ", err);
  process.exit(1);
});