Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Path: blob/master/src/packages/util/db-schema/compute-servers.ts
Views: 923
/*
 *  This file is part of CoCalc: Copyright © 2023 Sagemath, Inc.
 *  License: MS-RSL – see LICENSE.md for details
 */

import type {
  Region as HyperstackRegion,
  VirtualMachine as HyperstackVirtualMachine,
} from "@cocalc/util/compute/cloud/hyperstack/api-types";
import { COLORS } from "@cocalc/util/theme";
import { ID, NOTES } from "./crm";
import { SCHEMA as schema } from "./index";
import { Table } from "./types";
export {
  CLOUDS_BY_NAME,
  GOOGLE_CLOUD_DEFAULTS,
  ON_PREM_DEFAULTS,
} from "@cocalc/util/compute/cloud/clouds";

// These are just fallbacks in case something is wrong with the image configuration.
// They are returned by getMinDiskSizeGb when the selected image does not
// declare its own minDiskSizeGb: CUDA_DISK_SIZE when an accelerator is
// configured, STANDARD_DISK_SIZE otherwise.
export const STANDARD_DISK_SIZE = 20;
export const CUDA_DISK_SIZE = 60;

// NOTE(review): presumably the check-in interval in seconds (per the _S suffix)
// and the path of the check-in file on the compute server -- confirm
// against the code that consumes these.
export const CHECK_IN_PERIOD_S = 20;
export const CHECK_IN_PATH = "/cocalc/conf/check-in";

// Clients are recommended to wait this long after a purchase ends before
// requesting the cost. This should give us about a day of wiggle room.
// There is no SLA on billing data.
const GOOGLE_COST_LAG_DAYS = 2;
// The same lag expressed in milliseconds.
export const GOOGLE_COST_LAG_MS = GOOGLE_COST_LAG_DAYS * 24 * 60 * 60 * 1000;

// Compute Server Images -- typings.
// See packages/server/compute/images.ts for
// how the actual data is populated.

export interface ImageVersion {
  // tag - must be given and distinct for each version -- this typically identifies the image to docker
  tag: string;
  // version -- defaults to tag if not given; usually the upstream version
  version?: string;
  // label -- defaults to the tag; this is to display to the user
  label?: string;
  // tested -- if this is not set to true, then this version should not be shown by default.
  // If not tested, only show to users who explicitly really want this (e.g., admins).
  tested?: boolean;
}

export const IDLE_TIMEOUT_MINUTES_DEFAULT = 30;

// Values that validatedHealthCheck falls back to for any field that is
// missing or out of range.
export const HEALTH_CHECK_DEFAULTS = {
  command: "pwd",
  initialDelaySeconds: 10 * 60,
  timeoutSeconds: 30,
  periodSeconds: 60,
  failureThreshold: 3,
  enabled: false,
  action: "reboot",
};

// The actions a failed health check is allowed to trigger.
export const HEALTH_CHECK_ACTIONS = [
  "reboot",
  "stop",
  "suspend",
  "deprovision",
];
type HealthCheckAction = (typeof HEALTH_CHECK_ACTIONS)[number];

// Normalize an untrusted health check object into a well-formed HealthCheck.
//
// - returns undefined if healthCheck is null or undefined;
// - numeric fields are parsed with parseFloat; anything that is not finite
//   or is below the minimum for that field (0 for periodSeconds and
//   initialDelaySeconds, 1 for failureThreshold, 5 for timeoutSeconds)
//   is replaced by the value in HEALTH_CHECK_DEFAULTS;
// - enabled is coerced to a boolean;
// - an action not listed in HEALTH_CHECK_ACTIONS is replaced by the default;
// - command is coerced to a string; a missing command becomes the default
//   command instead of the literal string "undefined".
export function validatedHealthCheck(
  healthCheck?: any,
): HealthCheck | undefined {
  if (healthCheck == null) {
    return undefined;
  }
  let {
    command,
    periodSeconds,
    failureThreshold,
    enabled,
    action,
    timeoutSeconds,
    initialDelaySeconds,
  } = healthCheck;
  // Fall back to the default before stringifying, so that a missing
  // command is not turned into "undefined" or "null".
  command = `${command ?? HEALTH_CHECK_DEFAULTS.command}`;
  periodSeconds = parseFloat(
    periodSeconds ?? HEALTH_CHECK_DEFAULTS.periodSeconds,
  );
  if (periodSeconds < 0 || !isFinite(periodSeconds)) {
    periodSeconds = HEALTH_CHECK_DEFAULTS.periodSeconds;
  }
  failureThreshold = parseFloat(
    failureThreshold ?? HEALTH_CHECK_DEFAULTS.failureThreshold,
  );
  if (failureThreshold < 1 || !isFinite(failureThreshold)) {
    failureThreshold = HEALTH_CHECK_DEFAULTS.failureThreshold;
  }
  timeoutSeconds = parseFloat(
    timeoutSeconds ?? HEALTH_CHECK_DEFAULTS.timeoutSeconds,
  );
  if (timeoutSeconds < 5 || !isFinite(timeoutSeconds)) {
    timeoutSeconds = HEALTH_CHECK_DEFAULTS.timeoutSeconds;
  }
  initialDelaySeconds = parseFloat(
    initialDelaySeconds ?? HEALTH_CHECK_DEFAULTS.initialDelaySeconds,
  );
  if (initialDelaySeconds < 0 || !isFinite(initialDelaySeconds)) {
    initialDelaySeconds = HEALTH_CHECK_DEFAULTS.initialDelaySeconds;
  }
  enabled = !!enabled;
  if (!HEALTH_CHECK_ACTIONS.includes(action)) {
    action = HEALTH_CHECK_DEFAULTS.action;
  }
  return {
    command,
    initialDelaySeconds,
    timeoutSeconds,
    periodSeconds,
    failureThreshold,
    enabled,
    action,
  };
}

export interface HealthCheck {
  // run the command with given args on the compute server.
  // If the command fails (nonzero exit code) failureThreshold times, then the
  // action happens. If it contains the deprovision
  // string, then it deprovisions.
  command: string;
  // timeout for running the command
  timeoutSeconds: number;
  // initial delay
  initialDelaySeconds: number;
  // period in seconds to wait between running the command
  periodSeconds: number;
  // When a probe fails, CoCalc will try failureThreshold times before doing the action.
  failureThreshold: number;

  action: HealthCheckAction;
  enabled: boolean;
}

interface ProxyRoute {
  path: string;
  target: string;
  ws?: boolean;
}

export interface Image {
  // What we show the user to describe this image, e.g., in the image select menu.
  label: string;
  // The name of the package on npmjs or dockerhub:
  package?: string;
  // In case there is a different package name for ARM64, the name of it.
  package_arm64?: string;
  // Root filesystem image must be at least this big in GB.
  minDiskSizeGb?: number;
  // Rough estimate of compressed size of Docker image; useful
  // to get a sense of how long it will take to download image
  // on clouds without pregenerated images.
  dockerSizeGb?: number;
  // Description in MARKDOWN to show user of this image. Can include links.
  description?: string;
  // Upstream URL for this image, e.g., https://julialang.org/ for the Julia image.
  url?: string;
  // Icon to show next to the label for this image.
  icon?: string;
  // Link to a URL with the source for building this image.
  source: string;
  // optional list of links to videos about this image, ordered from lowest to highest priority.
  videos?: string[];
  // optional list of links to tutorials
  tutorials?: string[];
  // The versions of this image that we claim to have built.
  // The ones with role='prod' (or not specified) are shown
  // to users as options.
  versions: ImageVersion[];
  // If true, then a GPU is required to use this image.
  gpu?: boolean;
  // If true, then the microk8s snap is required to use this image.
  microk8s?: boolean;
  // authToken: if true, image has web interface that supports configurable auth token
  authToken?: boolean;
  // jupyterKernels: if false, no jupyter kernels included. If true or a list of
  // names, there are kernels available – used in frontend/jupyter/select-kernel.tsx
  jupyterKernels?: false | true | string[];
  // If set to true, do not allow creating this compute server without a DNS subdomain.
  // Some images only make sense to use over the web, and the web server just won't
  // work without DNS setup properly (e.g., VS Code with LEAN). Ignored for on prem.
  requireDns?: boolean;
  // system: if true, this is a system container that is not for user compute
  system?: boolean;
  // disabled: if true, this image is completely disabled, so will not be used in any way.
  disabled?: boolean;
  // priority -- optional integer used for sorting options to display to user. The bigger the higher.
  priority?: number;
  // proxy: if false, do NOT run https proxy server on host VM
  //        if nothing given, runs proxy server with no default config (so does nothing)
  //        if given, is array of proxy config.
  proxy?: false | ProxyRoute[];
  apps?: {
    [name: string]: {
      icon: string;
      label: string;
      url: string;
      path: string;
      launch: string;
      requiresDns?: boolean;
    };
  };
}

export type Images = { [name: string]: Image };

export interface GoogleCloudImage {
  labels: { [name: string]: string };
  diskSizeGb: number;
  creationTimestamp: string;
}
export type GoogleCloudImages = { [name: string]: GoogleCloudImage };

// valid for google cloud -- probably not sufficient
// Replaces every "." and "_" with "-", lowercases the result and truncates
// it to at most 63 characters. No other characters are touched.
export function makeValidGoogleName(s: string): string {
  return s.replace(/[._]/g, "-").toLowerCase().slice(0, 63);
}

export type State =
  | "off"
  | "starting"
  | "running"
  | "stopping"
  | "deprovisioned"
  | "suspending"
  | "suspended"
  | "unknown";

// used for sorting by state -- ordered from most alive to least alive.
export const ORDERED_STATES: State[] = [
  "running",
  "starting",
  "stopping",
  "suspending",
  "suspended",
  "off",
  "deprovisioned",
  "unknown",
];
// Maps each state to its index in ORDERED_STATES (0 for "running", etc.).
export const STATE_TO_NUMBER: { [state: string]: number } = {};
let n = 0;
for (const state of ORDERED_STATES) {
  STATE_TO_NUMBER[state] = n;
  n += 1;
}

// Determine the CPU architecture implied by a configuration:
// - onprem: the configured arch, defaulting to "x86_64";
// - any cloud other than google-cloud: always "x86_64";
// - google-cloud: "arm64" exactly when the machine family (the part of
//   machineType before the first "-") ends with the letter "a".
export function getArchitecture(configuration: Configuration): Architecture {
  if (configuration.cloud == "onprem") {
    return configuration.arch ?? "x86_64";
  }
  if (configuration.cloud != "google-cloud") {
    // no ARM outside of GCP right now
    return "x86_64";
  }
  const { machineType } = configuration;
  const v = machineType.split("-");
  if (v[0].endsWith("a")) {
    // The known machine families that are ARM are: t2a-, c4a-
    // Everything else ends with a number or d.
    // Hopefully this pattern persists.
    return "arm64";
  }
  return "x86_64";
}

// True only for google-cloud x86_64 configurations with no accelerator.
function supportsSuspend(configuration: Configuration) {
  if (configuration.cloud != "google-cloud") {
    return false;
  }
  if (getArchitecture(configuration) != "x86_64") {
    // TODO: suspend/resume breaks the clock badly on ARM64, and I haven't
    // figured out a workaround, so don't support it for now.  I guess this
    // is a GCP bug.
    return false;
  }
  // must have no gpu and <= 208GB of RAM -- https://cloud.google.com/compute/docs/instances/suspend-resume-instance
  // NOTE(review): only the gpu condition is checked here; the RAM limit is not.
  if (configuration.acceleratorType) {
    return false;
  }
  return true;
}

export type Action =
  | "start"
  | "resume"
  | "stop"
  | "suspend"
  | "deprovision"
  | "reboot";

// User-facing metadata for each action, keyed by action name.
export const ACTION_INFO: {
  [action: string]: {
    label: string;
    icon: string;
    tip: string;
    description: string;
    confirm?: boolean;
    confirmMessage?: string;
    danger?: boolean;
    target: State; // target stable state after doing this action.
    clouds?: Cloud[];
    isSupported?: (configuration: Configuration) => boolean;
  };
} = {
  start: {
    label: "Start",
    icon: "play",
    tip: "Start",
    description: "Start the compute server running.",
    target: "running",
  },
  resume: {
    label: "Resume",
    icon: "play",
    clouds: ["google-cloud"],
    tip: "Resume",
    description: "Resume the compute server from suspend.",
    target: "running",
    isSupported: supportsSuspend,
  },
  stop: {
    label: "Stop",
    icon: "stop",
    tip: "Turn off",
    description:
      "Turn the compute server off. No data on disk is lost, but any data and state in memory will be lost. This is like turning your laptop off.",
    confirm: true,
    target: "off",
  },
  deprovision: {
    label: "Deprovision",
    icon: "trash",
    tip: "Deprovision the virtual machine",
    description:
      "Deprovisioning DELETES THE VIRTUAL MACHINE BOOT DISK, but keeps the compute server parameters. There are no costs associated with a deprovisioned compute server, and you can move it to a different region or zone. Any files in the home directory of your project are not affected.",
    confirm: true,
    confirmMessage:
      "I understand that my compute server disks will be deleted.",
    danger: true,
    target: "deprovisioned",
  },
  reboot: {
    label: "Hard Reboot",
    icon: "refresh",
    tip: "Hard reboot the virtual machine.",
    description:
      "Perform a HARD reset on the virtual machine, which wipes the memory contents and resets the virtual machine to its initial state. This should not delete data from the disk, but can lead to filesystem corruption.",
    confirm: true,
    confirmMessage:
      "I understand that this can lead to filesystem corruption and is slightly dangerous.",
    danger: true,
    target: "running",
    clouds: ["google-cloud", "hyperstack"],
  },
  suspend: {
    label: "Suspend",
    icon: "pause",
    clouds: ["google-cloud"],
    tip: "Suspend disk and memory state",
    confirm: true,
    description:
      "Suspend the compute server. No data on disk or memory is lost, and you are only charged for storing disk and memory. This is like closing your laptop screen. You can leave a compute server suspended for up to 60 days before it automatically shuts off.",
    target: "suspended",
    isSupported: supportsSuspend,
  },
};

// Display metadata for each state, keyed by state name, including the
// actions listed for that state.
export const STATE_INFO: {
  [state: string]: {
    label: string;
    actions: Action[];
    icon: string;
    color?: string;
    stable?: boolean;
    target?: State; // if not stable, this is the target state it is heading to
  };
} = {
  off: {
    label: "Off",
    color: "#ff4b00",
    actions: ["start", "deprovision"],
    icon: "stop",
    stable: true,
  },
  suspended: {
    label: "Suspended",
    actions: ["resume", "deprovision", "stop"],
    icon: "pause",
    color: "#0097a7",
    stable: true,
  },
  suspending: {
    label: "Suspending",
    actions: ["suspend"],
    icon: "pause",
    color: "#00bcd4",
    stable: false,
    target: "suspended",
  },
  starting: {
    label: "Starting",
    color: "#388e3c",
    actions: ["start"],
    icon: "bolt",
    stable: false,
    target: "running",
  },
  running: {
    label: "Running",
    color: COLORS.RUN,
    actions: ["stop", "deprovision", "reboot", "suspend"],
    icon: "run",
    stable: true,
  },
  stopping: {
    label: "Stopping",
    color: "#ff9800",
    actions: ["stop"],
    icon: "hand",
    stable: false,
    target: "off",
  },
  unknown: {
    label: "Unknown (click to refresh)",
    actions: [],
    icon: "question-circle",
    stable: true,
  },
  deprovisioned: {
    label: "Deprovisioned",
    actions: ["start"],
    color: "#888",
    icon: "minus-square",
    stable: true,
  },
};

// Return the stable state that x leads to:
// - if x is an action, the action's target state;
// - if x is an unstable state, the state it is heading to (or x itself when
//   no target is recorded);
// - if x is a stable state, x itself.
// Throws an Error if x is neither a known action nor a known state.
// Actions are checked first; no name is both an action and a state.
export function getTargetState(x: State | Action): State {
  if (ACTION_INFO[x] != null) {
    return ACTION_INFO[x].target;
  }
  if (STATE_INFO[x] != null) {
    if (!STATE_INFO[x]?.stable) {
      return (STATE_INFO[x].target ?? x) as State;
    }
    return x as State;
  }
  throw Error(`x =${x} must be a state or action`);
}

export type Architecture = "x86_64" | "arm64";

// Convention is used in cocalc-compute-docker for making
// the npm packages @cocalc/compute-server. 
Don't mess with it!461export function getImageField(arch: Architecture) {462return arch == "x86_64" ? "package" : "package_arm64";463}464465export type Cloud =466| "any"467| "onprem"468| "core-weave"469| "hyperstack"470| "lambda-cloud"471| "google-cloud"472| "aws"473| "fluid-stack"474| "test";475476export function getMinDiskSizeGb({477configuration,478IMAGES,479}: {480configuration;481IMAGES: Images;482}) {483if (configuration?.image) {484const { minDiskSizeGb } = IMAGES[configuration.image] ?? {};485if (minDiskSizeGb) {486return minDiskSizeGb;487}488}489// TODO: will have to do something based on actual image size,490// maybe, unless I come up with a clever trick involving491// one PD mounted on many machines (?).492if (configuration?.acceleratorType) {493return CUDA_DISK_SIZE;494} else {495return STANDARD_DISK_SIZE;496}497}498499// This means "you can spend at most dollars every hours on a RUNNING compute server"500export interface SpendLimit {501hours: number;502dollars: number;503enabled: boolean;504}505506export const SPEND_LIMIT_DEFAULTS = {507hours: 24 * 7,508dollars: 25,509enabled: false,510};511512export function validatedSpendLimit(spendLimit?: any): SpendLimit | undefined {513if (spendLimit == null) {514return undefined;515}516let { hours, dollars, enabled } = spendLimit;517hours = parseFloat(hours ?? 0);518dollars = parseFloat(dollars ?? 
0);519enabled = !!enabled;520if (hours < 0 || !isFinite(hours)) {521hours = SPEND_LIMIT_DEFAULTS.hours;522}523if (dollars < 0 || !isFinite(dollars)) {524dollars = SPEND_LIMIT_DEFAULTS.dollars;525}526return { enabled, hours, dollars };527}528529export function spendLimitPeriod(hours) {530if (hours == 24) {531return "day";532}533if (hours == 24 * 7) {534return "week";535}536if (hours == 30.5 * 24 * 7) {537return "month";538}539if (hours == 12 * 30.5 * 24 * 7) {540return "year";541}542return `${hours} hours`;543}544545const tenAM = new Date();546tenAM.setHours(10, 0, 0, 0);547export const DEFAULT_SHUTDOWN_TIME = {548epochMs: tenAM.valueOf(),549enabled: false,550};551552export interface ShutdownTime {553epochMs: number;554enabled?: boolean;555}556557export function validatedShutdownTime(558shutdownTime?: any,559): ShutdownTime | undefined {560if (shutdownTime == null) {561return undefined;562}563let { epochMs, enabled } = shutdownTime;564epochMs = parseFloat(epochMs ?? DEFAULT_SHUTDOWN_TIME.epochMs);565if (epochMs < 0 || !isFinite(epochMs)) {566epochMs = DEFAULT_SHUTDOWN_TIME.epochMs;567}568enabled = !!enabled;569return { enabled, epochMs };570}571572interface BaseConfiguration {573// image: name of the image to use, e.g. 
'python' or 'pytorch'.574// images are managed in src/packages/server/compute/images.ts575image: string;576// tag: tag for the image to use when starting the compute server.577// this references the versions field of the image.578// If the tag is not given or not available, we use the latest579// available tag.580tag?: string;581// tag_filesystem: tag for the file system container582tag_filesystem?: string;583// tag_cocalc: tag for the @cocalc/compute-server package.584tag_cocalc?: string;585// dns - If the string is set and the VM has an external ip address586// and dns is configured, then point https://{dns}....587// with ssl proxying to this compute server when it is running.588dns?: string;589// Array of top level directories to exclude from sync.590// These can't have "|" in them, since we use that as a separator.591// Use "~" to completely disable sync.592excludeFromSync?: readonly string[];593// If true, view data on the compute server as ephemeral.594// Currently this is only meant to impact the user interface.595ephemeral?: boolean;596// Token used for authentication at https://compute-server...597authToken?: string;598// Configuration of the https proxy server.599proxy?: ProxyRoute[];600// If this compute server stops pinging us, e.g., due to being preempted or601// just crashing due to out of memory (etc) should we automatically do a602// forced restart. Note that currently for on prem this isn't possible.603autoRestart?: boolean;604autoRestartDisabled?: boolean; // used to temporarily disable it to avoid accidentally triggering it.605// Allow collaborators to control the state of the compute server.606// They cannot change any other configuration. 
User still pays for everything and owns compute server.607allowCollaboratorControl?: boolean;608609// AUTOMATIC SHUTDOWN configuration:610// turn compute server off if spend more then dollars during the last hours.611// this can only be set by the owner.612// Limit spending613spendLimit?: SpendLimit;614idleTimeoutMinutes?: number;615healthCheck?: HealthCheck;616// number = ms since epoch defines a time; at *that* time each day, the server is turned off.617shutdownTime?: ShutdownTime;618}619620export const AUTOMATIC_SHUTDOWN_FIELDS = [621"spendLimit",622"idleTimeoutMinutes",623"healthCheck",624"shutdownTime",625];626627interface LambdaConfiguration extends BaseConfiguration {628cloud: "lambda-cloud";629instance_type_name: string;630region_name: string;631}632633export interface HyperstackConfiguration extends BaseConfiguration {634cloud: "hyperstack";635flavor_name: string;636region_name: HyperstackRegion;637// diskSizeGb is an integer >= 1. It defaults to 10.638// It's the size of the /data partition. It's implemented639// using 1 or more hyperstack (=ceph) volumes, which are combined640// together as a ZFS pool. 
If the compute server is641// named "foo", the volumes are named "foo-1", "foo-2",642// "foo-3", etc.643// There is also always a separate 50GB root volume, which644// is named "foo-0", and whose size is not configurable.645// NOTE: users install packages "systemwide" inside of646// a docker container and we configure docker to store647// its data in the zpool, so that's in here too.648diskSizeGb: number;649}650651export const COREWEAVE_CPU_TYPES = [652"amd-epyc-rome",653"amd-epyc-milan",654"intel-xeon-v1",655"intel-xeon-v2",656"intel-xeon-v3",657"intel-xeon-v4",658"intel-xeon-scalable",659] as const;660661export const COREWEAVE_GPU_TYPES = [662"Quadro_RTX_4000",663"Quadro_RTX_5000",664"RTX_A4000",665"RTX_A5000",666"RTX_A6000",667"A40",668"Tesla_V100_PCIE",669"Tesla_V100_NVLINK",670"A100_PCIE_40GB",671"A100_PCIE_80GB",672"A100_NVLINK_40GB",673"A100_NVLINK_80GB",674] as const;675676interface CoreWeaveConfiguration extends BaseConfiguration {677cloud: "core-weave";678gpu: {679type:680| "Quadro_RTX_4000"681| "Quadro_RTX_5000"682| "RTX_A4000"683| "RTX_A5000"684| "RTX_A6000"685| "A40"686| "Tesla_V100_PCIE"687| "Tesla_V100_NVLINK"688| "A100_PCIE_40GB"689| "A100_PCIE_80GB"690| "A100_NVLINK_40GB"691| "A100_NVLINK_80GB"; //(typeof COREWEAVE_GPU_TYPES)[number];692count: number;693};694cpu: {695count: number;696type?:697| "amd-epyc-rome"698| "amd-epyc-milan"699| "intel-xeon-v1"700| "intel-xeon-v2"701| "intel-xeon-v3"702| "intel-xeon-v4"703| "intel-xeon-scalable"; //(typeof COREWEAVE_CPU_TYPES)[number];704};705memory: string; // e.g., "12Gi"706storage?: {707root: {708size: string; // e.g., '40Gi'709};710};711}712713interface FluidStackConfiguration extends BaseConfiguration {714cloud: "fluid-stack";715plan: string;716region: string;717os: string;718}719export type GoogleCloudAcceleratorType =720| "nvidia-h200-141gb"721| "nvidia-h100-80gb"722| "nvidia-a100-80gb"723| "nvidia-tesla-a100"724| "nvidia-l4"725| "nvidia-tesla-t4"726| "nvidia-tesla-v100"727| "nvidia-tesla-p4"728| 
"nvidia-tesla-p100";729730export const GOOGLE_CLOUD_ACCELERATOR_TYPES: GoogleCloudAcceleratorType[] = [731"nvidia-h200-141gb",732"nvidia-h100-80gb",733"nvidia-a100-80gb",734"nvidia-tesla-a100",735"nvidia-l4",736"nvidia-tesla-t4",737"nvidia-tesla-v100",738"nvidia-tesla-p4",739"nvidia-tesla-p100",740];741742export type GoogleCloudDiskType =743| "pd-standard"744| "pd-balanced"745| "pd-ssd"746| "hyperdisk-balanced";747748export const GOOGLE_CLOUD_DISK_TYPES: GoogleCloudDiskType[] = [749"pd-standard",750"pd-balanced",751"pd-ssd",752// NOTE: hyperdisks are complicated and multidimensional, but for cocalc753// we just hardcode options for the iops and bandwidth, and allow the754// user to adjust the size. Also, "hyperdisk-balanced" means hyperdisk755// with the defaults for iops and bandwidth defined in756// src/packages/util/compute/cloud/google-cloud/compute-cost.ts757"hyperdisk-balanced",758];759760export interface GoogleCloudConfiguration extends BaseConfiguration {761cloud: "google-cloud";762region: string;763zone: string;764machineType: string;765// Ues a spot instance if spot is true.766spot?: boolean;767// The boot disk:768// diskSizeGb is an integer >= 10. It defaults to 10. It's the size of the boot disk.769diskSizeGb?: number;770hyperdiskBalancedIops?: number;771hyperdiskBalancedThroughput?: number;772diskType?: GoogleCloudDiskType;773acceleratorType?: GoogleCloudAcceleratorType;774// the allowed number depends on the accelerator; it defaults to 1.775acceleratorCount?: number;776// minCpuPlatform777terminationTime?: Date;778maxRunDurationSeconds?: number;779// if true, use newest image, whether or not it is labeled with prod=true.780test?: boolean;781// an image name of the form "2023-09-13-063355-test", i.e., a timestamp in that format782// followed by an optional string. Whether or not to use cuda and and the arch are783// determined by parameters above. 
This is meant to be used for two purposes (1) testing784// before deploying to production, and (2) stability, so a given compute server has the785// exact same base image every time it is started, instead of being updated. Regarding (2),786// this might not be needed, but we'll see. If image is not set, we use the newest787// image that is tagged prod:true, or its an error if no such image exists. This is788// all about Google Cloud images, not the IMAGES object defined elsewhere in this file.789sourceImage?: string;790// If true, then we have an external ip address791externalIp?: boolean;792// If true, can run full VM's inside of the machine, but there is 10% performance penalty.793// This will only work for Intel non-e2 non-a3 instance types. No AMD and no ARM64.794enableNestedVirtualization?: boolean;795}796797export interface OnPremCloudConfiguration extends BaseConfiguration {798cloud: "onprem";799arch?: Architecture;800gpu?: boolean;801}802803export type Configuration =804| LambdaConfiguration805| HyperstackConfiguration806| CoreWeaveConfiguration807| FluidStackConfiguration808| GoogleCloudConfiguration809| OnPremCloudConfiguration;810811interface BaseData {812cloudflareId?: string;813externalIp?: string;814internalIp?: string;815}816817export interface LambdaCloudData extends BaseData {818cloud: "lambda-cloud";819instance_id: string;820}821822export interface HyperstackData extends BaseData {823cloud: "hyperstack";824// name we are using for the vm825name?: string;826// hyperstack description of this vm.827vm?: HyperstackVirtualMachine;828// id's of persistent storage, with first id the boot disk.829// disks are named {name}-0, {name}-1, {name}-2, etc.,830// with {name}-0 being the boot disk.831disks?: number[];832creationTimestamp?: Date;833}834835export interface GoogleCloudData extends BaseData {836cloud: "google-cloud";837name?: string;838state?: State;839cpuPlatform?: string;840creationTimestamp?: Date;841lastStartTimestamp?: Date;842}843844export type 
Data = GoogleCloudData | LambdaCloudData | HyperstackData;845846export interface ComponentState {847state: string;848time: number;849expire?: number;850}851852export interface ComputeServerTemplate {853enabled?: boolean;854priority?: number;855}856857export interface ComputeServerUserInfo {858id: number;859project_specific_id?: number; // the project_specific_id of this compute server -- unique within project, minimal860account_id: string;861project_id: string;862title?: string;863color?: string;864cost_per_hour?: number;865deleted?: boolean;866state_changed?: Date;867started_by?: string;868error?: string;869state?: State;870// google-cloud has a new "Time limit" either by hour or by date, which seems like a great idea!871// time_limit872autorestart?: boolean;873cloud: Cloud;874configuration: Configuration;875provisioned_configuration?: Configuration;876data?: Data;877purchase_id?: number;878last_edited?: Date;879last_edited_user?: Date;880position?: number; // used for UI sorting.881detailed_state?: { [name: string]: ComponentState };882update_purchase?: boolean;883last_purchase_update?: Date;884template?: ComputeServerTemplate;885spend?: number;886}887888export interface ComputeServer extends ComputeServerUserInfo {889api_key?: string; // project level api key for the project890api_key_id?: number; // id of the api key (needed so we can delete it from database).891}892893Table({894name: "compute_servers",895rules: {896primary_key: "id",897// unique vpn ip address *within* a given project only:898pg_unique_indexes: [899"(project_id, vpn_ip)",900"(project_id, project_specific_id)",901],902user_query: {903get: {904pg_where: [{ "project_id = $::UUID": "project_id" }],905throttle_changes: 0, // do not make this bigger; UI really feels off if throttled906fields: {907id: null,908account_id: null,909created: null,910title: null,911color: null,912cost_per_hour: null,913deleted: null,914project_id: null,915state_changed: null,916error: null,917state: null,918autorestart: 
null,919cloud: null,920configuration: null,921data: null,922provisioned_configuration: null,923avatar_image_tiny: null,924last_edited: null,925last_edited_user: null,926purchase_id: null,927position: null,928detailed_state: null,929template: null,930notes: null,931vpn_ip: null,932project_specific_id: null,933course_project_id: null,934course_server_id: null,935spend: null,936},937},938set: {939// ATTN: It's assumed that users can't set the data field. Doing so would be very bad and could allow940// them to maybe abuse the system and not pay for something.941// Most fields, e.g., configuration, get set via api calls, which ensures consistency in terms of valid942// data and what is actively deployed.943fields: {944project_id: "project_write",945id: true,946position: true,947error: true, // easily clear the error948notes: true,949last_edited_user: true,950},951},952},953},954fields: {955id: ID,956account_id: {957type: "uuid",958desc: "User that owns this compute server.",959render: { type: "account" },960},961created: {962type: "timestamp",963desc: "When the compute server was created.",964},965title: {966type: "string",967pg_type: "VARCHAR(254)",968desc: "Title of this computer server. Used purely to make it easier for the user to keep track of it.",969render: { type: "text", maxLength: 254, editable: true },970},971color: {972type: "string",973desc: "A user configurable color, which is used for tags and UI to indicate where a tab is running.",974pg_type: "VARCHAR(30)",975render: { type: "color", editable: true },976},977cost_per_hour: {978title: "Cost per Hour",979desc: "The cost in US dollars per hour that this compute server cost us when it is provisioned. 
Any time the state is changed, this is set by the server to the proper cost.",980type: "number",981pg_type: "real",982},983deleted: {984type: "boolean",985desc: "True if the compute server has been deleted.",986},987project_id: {988type: "uuid",989desc: "The project id that this compute server provides compute for.",990render: { type: "project_link" },991},992api_key: {993type: "string",994pg_type: "VARCHAR(128)",995desc: "api key to connect to the project. This is created by the system right when we are going to create the VM, and gets deleted when we stop it. It's not set by the user and should not be revealed to the user.",996},997api_key_id: {998type: "number",999desc: "id of the api key; needed so we can delete it from database",1000},1001state_changed: {1002type: "timestamp",1003desc: "When the state last changed.",1004},1005error: {1006type: "string",1007desc: "In case something went wrong, e.g., in starting this compute server, this field will get set with a string error message to show the user. It's also cleared right when we try to start server.",1008},1009state: {1010type: "string",1011desc: "One of - 'off', 'starting', 'running', 'stopping', 'deprovisioned' (etc.). This is the underlying VM's state.",1012pg_type: "VARCHAR(16)",1013},1014autorestart: {1015type: "boolean",1016desc: "If true and the compute server stops for any reason, then it will be automatically started again. This is primarily useful for stop instances.",1017},1018cloud: {1019type: "string",1020pg_type: "varchar(30)",1021desc: "The cloud where this compute server runs: 'user', 'coreweave', 'lambda', 'google-cloud', 'aws', 'fluidstack'.",1022},1023configuration: {1024type: "map",1025pg_type: "jsonb",1026desc: "Cloud specific configuration of the computer at the cloud host. 
The format depends on the cloud",1027},1028provisioned_configuration: {1029type: "map",1030pg_type: "jsonb",1031desc: "Same as configuration, but this is the one we actually used last time we provisioned a VM in a cloud.",1032},1033data: {1034type: "map",1035pg_type: "jsonb",1036desc: "Arbitrary data about this server that is cloud provider specific. Store data here to facilitate working with the virtual machine, e.g., the id of the server when it is running, etc. This *MAY BE* returned to the user -- do not put secrets here the user can't see.",1037},1038avatar_image_tiny: {1039title: "Image",1040type: "string",1041desc: "tiny (32x32) visual image associated with the compute server. Suitable to include as part of changefeed, since about 3kb. Derived from avatar_image_full.",1042render: { type: "image" },1043},1044avatar_image_full: {1045title: "Image",1046type: "string",1047desc: "User configurable visual image associated with the compute server. Could be 150kb. NOT include as part of changefeed of projects, since potentially big (e.g., 200kb x 1000 projects = 200MB!).",1048render: { type: "image" },1049},1050purchase_id: {1051type: "number",1052desc: "if there is a current active purchase related to this compute server, this is the id of that purchase in the purchases table",1053},1054update_purchase: {1055type: "boolean",1056desc: "This is set to true if activity with this server is happening that warrants creating/ending a purchase.",1057},1058last_purchase_update: {1059type: "timestamp",1060desc: "Last time we requested an update to the purchase info about this compute server.",1061},1062position: {1063type: "number",1064desc: "Used for sorting a list of compute servers in the UI.",1065},1066last_edited: {1067type: "timestamp",1068desc: "Last time the configuration, state, etc., changed.",1069},1070last_edited_user: {1071type: "timestamp",1072desc: "Last time a user explicitly edited a file or used an application (e.g., terminal) on the compute server via the 
UI. This is like last_edited for projects, and is used to implement configuration.idleTimeoutMinutes.",1073},1074detailed_state: {1075type: "map",1076pg_type: "jsonb",1077desc: "Map from component name to something like {state:'running',time:Date.now()}, e.g., {vm: {state:'running', time:393939938484}}, filesystem: {state:'updating', time:939398484892}, uptime:{state:'22:56:33 up 3 days, 9:28, 0 users, load average: 0.93, 0.73, 0.56', time:?}}. This is used to provide users with insight into what's currently happening on their compute server.",1078},1079notes: NOTES,1080template: {1081type: "map",1082pg_type: "jsonb",1083desc: "Use this compute server configuration as a public template. Only admins can set this field for now. The exact structure of this jsonb is yet to be determined.",1084},1085vpn_ip: {1086type: "string",1087desc: "IP address of the compute server on the private encrypted project-wide VPN.",1088},1089vpn_public_key: {1090type: "string",1091desc: "Wireguard public key for this compute server.",1092},1093vpn_private_key: {1094type: "string",1095desc: "Wireguard private key for this compute server.",1096},1097project_specific_id: {1098type: "integer",1099desc: "A unique project-specific id assigned to this compute server. This is a positive integer that is guaranteed to be unique for compute servers *in a given project* and minimal when assigned (so it is as small as possible). This number is useful for distributed algorithms, since it can be used to ensure distinct sequence without any additional coordination. This is also useful to display to users so that the id number they see everywhere is not huge.",1100},1101course_project_id: {1102type: "uuid",1103desc: "If this is a compute server created for a student in a course, then this is the id of the project that the instructor(s) are using to host the course. 
IMPORTANT: Our security model is that a user can read info about a compute server if they are a collaborator on *either* the compute server's project_id OR on the course_project_id, if set (but then only via the compute_servers_by_course virtual table).",1104},1105course_server_id: {1106type: "integer",1107desc: "If this compute server is a clone of an instructor server in a course, this is the id of that instructor server.",1108},1109spend: {1110type: "number",1111desc: "If configuration.spendLimit is enabled, then the spend during the current period gets recorded here every few minutes. This is useful to efficiently provide a UI element showing the current spend status. It is cleared whenever configuration.spendLimit is changed, to avoid confusion.",1112},1113},1114});

// The compute_servers_by_course table is exactly like the compute_servers
// table (same fields, same primary key, and it is a virtual view of that
// table), but read access is keyed on course_project_id rather than on the
// server's own project_id: a row is readable when the requesting account is
// a collaborator on the project referenced by course_project_id.
// There is no set query here, so this view is read-only for users.
Table({
  name: "compute_servers_by_course",
  fields: schema.compute_servers.fields,
  rules: {
    primary_key: schema.compute_servers.primary_key,
    virtual: "compute_servers",
    user_query: {
      get: {
        // only allow read access when course_project_id is a project
        // that client user is a collaborator on.
        pg_where: [
          {
            "course_project_id = ANY(select project_id from projects where users ? $::TEXT)":
              "account_id",
          },
        ],
        // Expose exactly the fields that the compute_servers get query exposes.
        fields: {
          ...schema.compute_servers.user_query?.get?.fields,
        },
      },
    },
  },
});

// Admin-only virtual view of compute_servers: both the get and set queries
// below require admin. The get query has an empty pg_where and adds the
// `template` field on top of the fields exposed by the compute_servers get
// query; the set query lists the only fields that may be written through
// this table.
Table({
  name: "crm_compute_servers",
  fields: schema.compute_servers.fields,
  rules: {
    primary_key: schema.compute_servers.primary_key,
    virtual: "compute_servers",
    user_query: {
      get: {
        admin: true, // only admins can do get queries on this table
        // (without this, users who have read access could read)
        pg_where: [],
        fields: {
          // NOTE: the spread must come first so that `template` is added to
          // (and not overwritten by) the inherited field map.
          ...schema.compute_servers.user_query?.get?.fields,
          template: null,
        },
      },
      set: {
        admin: true,
        fields: {
          id: true,
          title: true,
          color: true,
          deleted: true,
          notes: true,
          template: true,
          // NOTE(review): listed with null rather than true, unlike the other
          // settable fields -- confirm the intended semantics against the
          // user_query set handling.
          state_control: null,
        },
      },
    },
  },
});

// Key/value cache, keyed by (cloud, key), with an expiration timestamp.
Table({
  name: "compute_servers_cache",
  fields: {
    cloud: {
      type: "string",
      desc: "The cloud that we're caching information about",
    },
    key: {
      type: "string",
      desc: "The key for whatever we're caching.",
    },
    value: {
      type: "string",
      desc: "The cached data.",
    },
    expire: {
      type: "timestamp",
      desc: "When this cached value should be expired.",
    },
  },
  rules: {
    durability: "soft", // it's just a cache
    desc: "Cache data about what's going on in various clouds that are used to implement compute servers.",
    primary_key: ["cloud", "key"],
  },
});