Path: blob/master/src/packages/util/db-schema/compute-servers.ts
5828 views
/*
 *  This file is part of CoCalc: Copyright © 2023 Sagemath, Inc.
 *  License: MS-RSL – see LICENSE.md for details
 */

import type {
  Region as HyperstackRegion,
  VirtualMachine as HyperstackVirtualMachine,
} from "@cocalc/util/compute/cloud/hyperstack/api-types";
import { COLORS } from "@cocalc/util/theme";
import { ID, NOTES } from "./crm";
import { SCHEMA as schema } from "./index";
import { Table } from "./types";
export {
  CLOUDS_BY_NAME,
  GOOGLE_CLOUD_DEFAULTS,
  ON_PREM_DEFAULTS,
} from "@cocalc/util/compute/cloud/clouds";

// These are just fallbacks in case something is wrong with the image configuration.
// (Sizes are in GB -- see how getMinDiskSizeGb uses them.)
export const STANDARD_DISK_SIZE = 20;
export const CUDA_DISK_SIZE = 60;

export const CHECK_IN_PERIOD_S = 20;
export const CHECK_IN_PATH = "/cocalc/conf/check-in";

// Clients are recommended to wait this long after a purchase ends before
// requesting the cost.  This should give us about a day of wiggle room.
// There is no SLA on billing data.
const GOOGLE_COST_LAG_DAYS = 2;
// The same lag expressed in milliseconds.
export const GOOGLE_COST_LAG_MS = GOOGLE_COST_LAG_DAYS * 24 * 60 * 60 * 1000;

// Compute Server Images -- typings.
// See packages/server/compute/images.ts for
// how the actual data is populated.

export interface ImageVersion {
  // tag - must be given and distinct for each version -- this typically identifies the image to docker
  tag: string;
  // version -- defaults to tag if not given; usually the upstream version
  version?: string;
  // label -- defaults to the tag; this is to display to the user
  label?: string;
  // tested -- if this is not set to true, then this version should not be shown by default.
  // If not tested, only show to users who explicitly really want this (e.g., admins).
  tested?: boolean;
}

export const IDLE_TIMEOUT_MINUTES_DEFAULT = 30;

// Values used by validatedHealthCheck whenever a field is missing or invalid.
export const HEALTH_CHECK_DEFAULTS = {
  command: "pwd",
  initialDelaySeconds: 10 * 60,
  timeoutSeconds: 30,
  periodSeconds: 60,
  failureThreshold: 3,
  enabled: false,
  action: "reboot",
};

// The actions that validatedHealthCheck accepts; anything else is replaced
// by HEALTH_CHECK_DEFAULTS.action.
export const HEALTH_CHECK_ACTIONS = [
  "reboot",
  "stop",
  "suspend",
  "deprovision",
];
type HealthCheckAction = (typeof HEALTH_CHECK_ACTIONS)[number];

// Parse one numeric health check setting.  Returns fallback when the value
// is missing (null/undefined), does not parse to a finite number, or is
// smaller than min.
function healthCheckNumber(value: any, fallback: number, min: number): number {
  const parsed = parseFloat(value ?? fallback);
  return parsed < min || !isFinite(parsed) ? fallback : parsed;
}

/**
 * Normalize an untrusted health check configuration.
 *
 * @param healthCheck - arbitrary user supplied object (or null/undefined).
 * @returns undefined if healthCheck is null/undefined; otherwise a complete
 *   HealthCheck in which every missing or invalid field has been replaced by
 *   the corresponding value from HEALTH_CHECK_DEFAULTS.
 */
export function validatedHealthCheck(
  healthCheck?: any,
): HealthCheck | undefined {
  if (healthCheck == null) {
    return undefined;
  }
  const {
    command,
    periodSeconds,
    failureThreshold,
    enabled,
    action,
    timeoutSeconds,
    initialDelaySeconds,
  } = healthCheck;
  return {
    // A missing command must fall back to the default rather than being
    // stringified to the literal text "undefined".
    command: command == null ? HEALTH_CHECK_DEFAULTS.command : `${command}`,
    initialDelaySeconds: healthCheckNumber(
      initialDelaySeconds,
      HEALTH_CHECK_DEFAULTS.initialDelaySeconds,
      0,
    ),
    // timeouts below 5 seconds are rejected
    timeoutSeconds: healthCheckNumber(
      timeoutSeconds,
      HEALTH_CHECK_DEFAULTS.timeoutSeconds,
      5,
    ),
    periodSeconds: healthCheckNumber(
      periodSeconds,
      HEALTH_CHECK_DEFAULTS.periodSeconds,
      0,
    ),
    // at least one failure is needed before the action can trigger
    failureThreshold: healthCheckNumber(
      failureThreshold,
      HEALTH_CHECK_DEFAULTS.failureThreshold,
      1,
    ),
    enabled: !!enabled,
    action: HEALTH_CHECK_ACTIONS.includes(action)
      ? action
      : HEALTH_CHECK_DEFAULTS.action,
  };
}

export interface HealthCheck {
  // run the command with given args on the compute server.
  // If the command fails (nonzero exit code) failureThreshold times, then the
  // action happens.  If it contains the deprovision
  // string, then it deprovisions.
  command: string;
  // timeout for running the command
  timeoutSeconds: number;
  // initial delay
  initialDelaySeconds: number;
  // period in seconds to wait between running the command
  periodSeconds: number;
  // When a probe fails, CoCalc will try failureThreshold times before doing the action.
  failureThreshold: number;

  action: HealthCheckAction;
  enabled: boolean;
}

interface ProxyRoute {
  path: string;
  target: string;
  ws?: boolean;
}

export interface Image {
  // What we show the user to describe this image, e.g., in the image select menu.
  label: string;
  // The name of the package on npmjs or dockerhub:
  package?: string;
  // In case there is a different package name for ARM64, the name of it.
  package_arm64?: string;
  // Root filesystem image must be at least this big in GB.
  minDiskSizeGb?: number;
  // Rough estimate of compressed size of Docker image; useful
  // to get a sense of how long it will take to download image
  // on clouds without pregenerated images.
  dockerSizeGb?: number;
  // Description in MARKDOWN to show user of this image.  Can include links.
  description?: string;
  // Upstream URL for this image, e.g., https://julialang.org/ for the Julia image.
  url?: string;
  // Icon to show next to the label for this image.
  icon?: string;
  // Link to a URL with the source for building this image.
  source: string;
  // optional list of links to videos about this image, ordered from lowest to highest priority.
  videos?: string[];
  // optional list of links to tutorials
  tutorials?: string[];
  // The versions of this image that we claim to have built.
  // The ones with role='prod' (or not specified) are shown
  // to users as options.
  versions: ImageVersion[];
  // If true, then a GPU is required to use this image.
  gpu?: boolean;
  // If true, then the microk8s snap is required to use this image.
  microk8s?: boolean;
  // authToken: if true, image has web interface that supports configurable auth token
  authToken?: boolean;
  // jupyterKernels: if false, no jupyter kernels included. If true or a list of
  // names, there are kernels available – used in frontend/jupyter/select-kernel.tsx
  jupyterKernels?: false | true | string[];
  // If set to true, do not allow creating this compute server without a DNS subdomain.
  // Some images only make sense to use over the web, and the web server just won't
  // work without DNS setup properly (e.g., VS Code with LEAN).  Ignored for on prem.
  requireDns?: boolean;
  // system: if true, this is a system container that is not for user compute
  system?: boolean;
  // disabled: if true, this image is completely disabled, so will not be used in any way.
  disabled?: boolean;
  // priority -- optional integer used for sorting options to display to user. The bigger the higher.
  priority?: number;
  // proxy: if false, do NOT run https proxy server on host VM
  //        if nothing given, runs proxy server with no default config (so does nothing)
  //        if given, is array of proxy config.
  proxy?: false | ProxyRoute[];
  apps?: {
    [name: string]: {
      icon: string;
      label: string;
      url: string;
      path: string;
      launch: string;
      requiresDns?: boolean;
    };
  };
}

export type Images = { [name: string]: Image };

export interface GoogleCloudImage {
  labels: { [name: string]: string };
  diskSizeGb: number;
  creationTimestamp: string;
}
export type GoogleCloudImages = { [name: string]: GoogleCloudImage };

// valid for google cloud -- probably not sufficient
// Replaces every "." and "_" by "-", lowercases, and truncates to 63 characters.
export function makeValidGoogleName(s: string): string {
  return s.replace(/[._]/g, "-").toLowerCase().slice(0, 63);
}

export type State =
  | "off"
  | "starting"
  | "running"
  | "stopping"
  | "deprovisioned"
  | "suspending"
  | "suspended"
  | "unknown";

// used for sorting by state -- ordered from most alive to least alive.
export const ORDERED_STATES: State[] = [
  "running",
  "starting",
  "stopping",
  "suspending",
  "suspended",
  "off",
  "deprovisioned",
  "unknown",
];
// Maps each state to its position in ORDERED_STATES (0 = most alive).
export const STATE_TO_NUMBER: { [state: string]: number } = {};
ORDERED_STATES.forEach((state, index) => {
  STATE_TO_NUMBER[state] = index;
});

// Helper function to determine the architecture of a machine type
export function getMachineTypeArchitecture(machineType: string): Architecture {
  const v = machineType.split("-");
  if (v[0].endsWith("a")) {
    // The known machines with ARM are: t2a-, c4a-
    // Everything else ends with a number or d.
    // Hopefully this pattern persists.
    return "arm64";
  }
  return "x86_64";
}

// Architecture of the machine described by the given configuration:
// on prem uses the explicitly configured arch (default x86_64), Google Cloud
// derives it from the machine type, and every other cloud is x86_64.
export function getArchitecture(configuration: Configuration): Architecture {
  if (configuration.cloud == "onprem") {
    return configuration.arch ?? "x86_64";
  }
  if (configuration.cloud != "google-cloud") {
    // no ARM outside of GCP right now
    return "x86_64";
  }
  const { machineType } = configuration;
  return getMachineTypeArchitecture(machineType);
}

// True only for x86_64 Google Cloud configurations without an accelerator.
function supportsSuspend(configuration: Configuration) {
  if (configuration.cloud != "google-cloud") {
    return false;
  }
  if (getArchitecture(configuration) != "x86_64") {
    // TODO: suspend/resume breaks the clock badly on ARM64, and I haven't
    // figured out a workaround, so don't support it for now.  I guess this
    // is a GCP bug.
    return false;
  }
  // must have no gpu and <= 208GB of RAM -- https://cloud.google.com/compute/docs/instances/suspend-resume-instance
  // (NOTE: only the gpu condition is checked here.)
  if (configuration.acceleratorType) {
    return false;
  }
  return true;
}

export type Action =
  | "start"
  | "resume"
  | "stop"
  | "suspend"
  | "deprovision"
  | "reboot";

export const ACTION_INFO: {
  [action: string]: {
    label: string;
    icon: string;
    tip: string;
    description: string;
    confirm?: boolean;
    confirmMessage?: string;
    danger?: boolean;
    target: State; // target stable state after doing this action.
    clouds?: Cloud[];
    isSupported?: (configuration: Configuration) => boolean;
  };
} = {
  start: {
    label: "Start",
    icon: "play",
    tip: "Start",
    description: "Start the compute server running.",
    target: "running",
  },
  resume: {
    label: "Resume",
    icon: "play",
    clouds: ["google-cloud"],
    tip: "Resume",
    description: "Resume the compute server from suspend.",
    target: "running",
    isSupported: supportsSuspend,
  },
  stop: {
    label: "Stop",
    icon: "stop",
    tip: "Turn off",
    description:
      "Turn the compute server off. No data on disk is lost, but any data and state in memory will be lost. This is like turning your laptop off.",
    confirm: true,
    target: "off",
  },
  deprovision: {
    label: "Deprovision",
    icon: "trash",
    tip: "Deprovision the virtual machine",
    description:
      "Deprovisioning DELETES THE VIRTUAL MACHINE BOOT DISK, but keeps the compute server parameters. There are no costs associated with a deprovisioned compute server, and you can move it to a different region or zone. Any files in the home directory of your project are not affected.",
    confirm: true,
    confirmMessage:
      "I understand that my compute server disks will be deleted.",
    danger: true,
    target: "deprovisioned",
  },
  reboot: {
    label: "Hard Reboot",
    icon: "refresh",
    tip: "Hard reboot the virtual machine.",
    description:
      "Perform a HARD reset on the virtual machine, which wipes the memory contents and resets the virtual machine to its initial state. This should not delete data from the disk, but can lead to filesystem corruption.",
    confirm: true,
    confirmMessage:
      "I understand that this can lead to filesystem corruption and is slightly dangerous.",
    danger: true,
    target: "running",
    clouds: ["google-cloud", "hyperstack"],
  },
  suspend: {
    label: "Suspend",
    icon: "pause",
    clouds: ["google-cloud"],
    tip: "Suspend disk and memory state",
    confirm: true,
    description:
      "Suspend the compute server. No data on disk or memory is lost, and you are only charged for storing disk and memory. This is like closing your laptop screen. You can leave a compute server suspended for up to 60 days before it automatically shuts off.",
    target: "suspended",
    isSupported: supportsSuspend,
  },
};

export const STATE_INFO: {
  [state: string]: {
    label: string;
    actions: Action[];
    icon: string;
    color?: string;
    stable?: boolean;
    target?: State; // if not stable, this is the target state it is heading to
  };
} = {
  off: {
    label: "Off",
    color: "#ff4b00",
    actions: ["start", "deprovision"],
    icon: "stop",
    stable: true,
  },
  suspended: {
    label: "Suspended",
    actions: ["resume", "deprovision", "stop"],
    icon: "pause",
    color: "#0097a7",
    stable: true,
  },
  suspending: {
    label: "Suspending",
    actions: ["suspend"],
    icon: "pause",
    color: "#00bcd4",
    stable: false,
    target: "suspended",
  },
  starting: {
    label: "Starting",
    color: "#388e3c",
    actions: ["start"],
    icon: "bolt",
    stable: false,
    target: "running",
  },
  running: {
    label: "Running",
    color: COLORS.RUN,
    actions: ["stop", "deprovision", "reboot", "suspend"],
    icon: "run",
    stable: true,
  },
  stopping: {
    label: "Stopping",
    color: "#ff9800",
    actions: ["stop"],
    icon: "hand",
    stable: false,
    target: "off",
  },
  unknown: {
    label: "Unknown (click to refresh)",
    actions: [],
    icon: "question-circle",
    stable: true,
  },
  deprovisioned: {
    label: "Deprovisioned",
    actions: ["start"],
    color: "#888",
    icon: "minus-square",
    stable: true,
  },
};

/**
 * The stable state that a state or an action is heading to.
 *
 * @param x - an Action (looked up first, in ACTION_INFO) or a State.
 * @returns the action's target; for an unstable state its target (or the
 *   state itself if no target is recorded); for a stable state, x itself.
 * @throws Error if x is neither a known action nor a known state.
 */
export function getTargetState(x: State | Action): State {
  if (ACTION_INFO[x] != null) {
    return ACTION_INFO[x].target;
  }
  if (STATE_INFO[x] != null) {
    if (!STATE_INFO[x]?.stable) {
      return (STATE_INFO[x].target ?? x) as State;
    }
    return x as State;
  }
  throw Error(`x =${x} must be a state or action`);
}

export type Architecture = "x86_64" | "arm64";

// Convention is used in cocalc-compute-docker for making
// the npm packages @cocalc/compute-server.
Don't mess with it!466export function getImageField(arch: Architecture) {467return arch == "x86_64" ? "package" : "package_arm64";468}469470export type Cloud =471| "any"472| "onprem"473| "core-weave"474| "hyperstack"475| "lambda-cloud"476| "google-cloud"477| "aws"478| "fluid-stack"479| "test";480481export function getMinDiskSizeGb({482configuration,483IMAGES,484}: {485configuration;486IMAGES: Images;487}) {488if (configuration?.image) {489const { minDiskSizeGb } = IMAGES[configuration.image] ?? {};490if (minDiskSizeGb) {491return minDiskSizeGb;492}493}494// TODO: will have to do something based on actual image size,495// maybe, unless I come up with a clever trick involving496// one PD mounted on many machines (?).497if (configuration?.acceleratorType) {498return CUDA_DISK_SIZE;499} else {500return STANDARD_DISK_SIZE;501}502}503504// This means "you can spend at most dollars every hours on a RUNNING compute server"505export interface SpendLimit {506hours: number;507dollars: number;508enabled: boolean;509}510511export const SPEND_LIMIT_DEFAULTS = {512hours: 24 * 7,513dollars: 25,514enabled: false,515};516517export function validatedSpendLimit(spendLimit?: any): SpendLimit | undefined {518if (spendLimit == null) {519return undefined;520}521let { hours, dollars, enabled } = spendLimit;522hours = parseFloat(hours ?? 0);523dollars = parseFloat(dollars ?? 
0);524enabled = !!enabled;525if (hours < 0 || !isFinite(hours)) {526hours = SPEND_LIMIT_DEFAULTS.hours;527}528if (dollars < 0 || !isFinite(dollars)) {529dollars = SPEND_LIMIT_DEFAULTS.dollars;530}531return { enabled, hours, dollars };532}533534export function spendLimitPeriod(hours) {535if (hours == 24) {536return "day";537}538if (hours == 24 * 7) {539return "week";540}541if (hours == 30.5 * 24 * 7) {542return "month";543}544if (hours == 12 * 30.5 * 24 * 7) {545return "year";546}547return `${hours} hours`;548}549550const tenAM = new Date();551tenAM.setHours(10, 0, 0, 0);552export const DEFAULT_SHUTDOWN_TIME = {553epochMs: tenAM.valueOf(),554enabled: false,555};556557export interface ShutdownTime {558epochMs: number;559enabled?: boolean;560}561562export function validatedShutdownTime(563shutdownTime?: any,564): ShutdownTime | undefined {565if (shutdownTime == null) {566return undefined;567}568let { epochMs, enabled } = shutdownTime;569epochMs = parseFloat(epochMs ?? DEFAULT_SHUTDOWN_TIME.epochMs);570if (epochMs < 0 || !isFinite(epochMs)) {571epochMs = DEFAULT_SHUTDOWN_TIME.epochMs;572}573enabled = !!enabled;574return { enabled, epochMs };575}576577interface BaseConfiguration {578// image: name of the image to use, e.g. 
'python' or 'pytorch'.579// images are managed in src/packages/server/compute/images.ts580image: string;581// tag: tag for the image to use when starting the compute server.582// this references the versions field of the image.583// If the tag is not given or not available, we use the latest584// available tag.585tag?: string;586// tag_filesystem: tag for the file system container587tag_filesystem?: string;588// tag_cocalc: tag for the @cocalc/compute-server package.589tag_cocalc?: string;590// dns - If the string is set and the VM has an external ip address591// and dns is configured, then point https://{dns}....592// with ssl proxying to this compute server when it is running.593dns?: string;594// Array of top level directories to exclude from sync.595// These can't have "|" in them, since we use that as a separator.596// Use "~" to completely disable sync.597excludeFromSync?: readonly string[];598// If true, view data on the compute server as ephemeral.599// Currently this is only meant to impact the user interface.600ephemeral?: boolean;601// Token used for authentication at https://compute-server...602authToken?: string;603// Configuration of the https proxy server.604proxy?: ProxyRoute[];605// If this compute server stops pinging us, e.g., due to being preempted or606// just crashing due to out of memory (etc) should we automatically do a607// forced restart. Note that currently for on prem this isn't possible.608autoRestart?: boolean;609autoRestartDisabled?: boolean; // used to temporarily disable it to avoid accidentally triggering it.610// Allow collaborators to control the state of the compute server.611// They cannot change any other configuration. 
User still pays for everything and owns compute server.612allowCollaboratorControl?: boolean;613614// AUTOMATIC SHUTDOWN configuration:615// turn compute server off if spend more then dollars during the last hours.616// this can only be set by the owner.617// Limit spending618spendLimit?: SpendLimit;619idleTimeoutMinutes?: number;620healthCheck?: HealthCheck;621// number = ms since epoch defines a time; at *that* time each day, the server is turned off.622shutdownTime?: ShutdownTime;623}624625export const AUTOMATIC_SHUTDOWN_FIELDS = [626"spendLimit",627"idleTimeoutMinutes",628"healthCheck",629"shutdownTime",630];631632interface LambdaConfiguration extends BaseConfiguration {633cloud: "lambda-cloud";634instance_type_name: string;635region_name: string;636}637638export interface HyperstackConfiguration extends BaseConfiguration {639cloud: "hyperstack";640flavor_name: string;641region_name: HyperstackRegion;642// diskSizeGb is an integer >= 1. It defaults to 10.643// It's the size of the /data partition. It's implemented644// using 1 or more hyperstack (=ceph) volumes, which are combined645// together as a ZFS pool. 
If the compute server is646// named "foo", the volumes are named "foo-1", "foo-2",647// "foo-3", etc.648// There is also always a separate 50GB root volume, which649// is named "foo-0", and whose size is not configurable.650// NOTE: users install packages "systemwide" inside of651// a docker container and we configure docker to store652// its data in the zpool, so that's in here too.653diskSizeGb: number;654}655656export const COREWEAVE_CPU_TYPES = [657"amd-epyc-rome",658"amd-epyc-milan",659"intel-xeon-v1",660"intel-xeon-v2",661"intel-xeon-v3",662"intel-xeon-v4",663"intel-xeon-scalable",664] as const;665666export const COREWEAVE_GPU_TYPES = [667"Quadro_RTX_4000",668"Quadro_RTX_5000",669"RTX_A4000",670"RTX_A5000",671"RTX_A6000",672"A40",673"Tesla_V100_PCIE",674"Tesla_V100_NVLINK",675"A100_PCIE_40GB",676"A100_PCIE_80GB",677"A100_NVLINK_40GB",678"A100_NVLINK_80GB",679] as const;680681interface CoreWeaveConfiguration extends BaseConfiguration {682cloud: "core-weave";683gpu: {684type:685| "Quadro_RTX_4000"686| "Quadro_RTX_5000"687| "RTX_A4000"688| "RTX_A5000"689| "RTX_A6000"690| "A40"691| "Tesla_V100_PCIE"692| "Tesla_V100_NVLINK"693| "A100_PCIE_40GB"694| "A100_PCIE_80GB"695| "A100_NVLINK_40GB"696| "A100_NVLINK_80GB"; //(typeof COREWEAVE_GPU_TYPES)[number];697count: number;698};699cpu: {700count: number;701type?:702| "amd-epyc-rome"703| "amd-epyc-milan"704| "intel-xeon-v1"705| "intel-xeon-v2"706| "intel-xeon-v3"707| "intel-xeon-v4"708| "intel-xeon-scalable"; //(typeof COREWEAVE_CPU_TYPES)[number];709};710memory: string; // e.g., "12Gi"711storage?: {712root: {713size: string; // e.g., '40Gi'714};715};716}717718interface FluidStackConfiguration extends BaseConfiguration {719cloud: "fluid-stack";720plan: string;721region: string;722os: string;723}724export type GoogleCloudAcceleratorType =725| "nvidia-h200-141gb"726| "nvidia-h100-80gb"727| "nvidia-a100-80gb"728| "nvidia-tesla-a100"729| "nvidia-l4"730| "nvidia-tesla-t4"731| "nvidia-tesla-v100"732| "nvidia-tesla-p4"733| 
"nvidia-tesla-p100";734735export const GOOGLE_CLOUD_ACCELERATOR_TYPES: GoogleCloudAcceleratorType[] = [736"nvidia-h200-141gb",737"nvidia-h100-80gb",738"nvidia-a100-80gb",739"nvidia-tesla-a100",740"nvidia-l4",741"nvidia-tesla-t4",742"nvidia-tesla-v100",743"nvidia-tesla-p4",744"nvidia-tesla-p100",745];746747export type GoogleCloudDiskType =748| "pd-standard"749| "pd-balanced"750| "pd-ssd"751| "hyperdisk-balanced";752753export const GOOGLE_CLOUD_DISK_TYPES: GoogleCloudDiskType[] = [754"pd-standard",755"pd-balanced",756"pd-ssd",757// NOTE: hyperdisks are complicated and multidimensional, but for cocalc758// we just hardcode options for the iops and bandwidth, and allow the759// user to adjust the size. Also, "hyperdisk-balanced" means hyperdisk760// with the defaults for iops and bandwidth defined in761// src/packages/util/compute/cloud/google-cloud/compute-cost.ts762"hyperdisk-balanced",763];764765export interface GoogleCloudConfiguration extends BaseConfiguration {766cloud: "google-cloud";767region: string;768zone: string;769machineType: string;770// Ues a spot instance if spot is true.771spot?: boolean;772// The boot disk:773// diskSizeGb is an integer >= 10. It defaults to 10. It's the size of the boot disk.774diskSizeGb?: number;775hyperdiskBalancedIops?: number;776hyperdiskBalancedThroughput?: number;777diskType?: GoogleCloudDiskType;778acceleratorType?: GoogleCloudAcceleratorType;779// the allowed number depends on the accelerator; it defaults to 1.780acceleratorCount?: number;781// minCpuPlatform782terminationTime?: Date;783maxRunDurationSeconds?: number;784// if true, use newest image, whether or not it is labeled with prod=true.785test?: boolean;786// an image name of the form "2023-09-13-063355-test", i.e., a timestamp in that format787// followed by an optional string. Whether or not to use cuda and and the arch are788// determined by parameters above. 
This is meant to be used for two purposes (1) testing789// before deploying to production, and (2) stability, so a given compute server has the790// exact same base image every time it is started, instead of being updated. Regarding (2),791// this might not be needed, but we'll see. If image is not set, we use the newest792// image that is tagged prod:true, or its an error if no such image exists. This is793// all about Google Cloud images, not the IMAGES object defined elsewhere in this file.794sourceImage?: string;795// If true, then we have an external ip address796externalIp?: boolean;797// If true, can run full VM's inside of the machine, but there is 10% performance penalty.798// This will only work for Intel non-e2 non-a3 instance types. No AMD and no ARM64.799enableNestedVirtualization?: boolean;800}801802export interface OnPremCloudConfiguration extends BaseConfiguration {803cloud: "onprem";804arch?: Architecture;805gpu?: boolean;806}807808export type Configuration =809| LambdaConfiguration810| HyperstackConfiguration811| CoreWeaveConfiguration812| FluidStackConfiguration813| GoogleCloudConfiguration814| OnPremCloudConfiguration;815816interface BaseData {817cloudflareId?: string;818externalIp?: string;819internalIp?: string;820}821822export interface LambdaCloudData extends BaseData {823cloud: "lambda-cloud";824instance_id: string;825}826827export interface HyperstackData extends BaseData {828cloud: "hyperstack";829// name we are using for the vm830name?: string;831// hyperstack description of this vm.832vm?: HyperstackVirtualMachine;833// id's of persistent storage, with first id the boot disk.834// disks are named {name}-0, {name}-1, {name}-2, etc.,835// with {name}-0 being the boot disk.836disks?: number[];837creationTimestamp?: Date;838}839840export interface GoogleCloudData extends BaseData {841cloud: "google-cloud";842name?: string;843state?: State;844cpuPlatform?: string;845creationTimestamp?: Date;846lastStartTimestamp?: Date;847}848849export type 
Data = GoogleCloudData | LambdaCloudData | HyperstackData;850851export interface ComponentState {852state: string;853time: number;854expire?: number;855}856857export interface ComputeServerTemplate {858enabled?: boolean;859priority?: number;860}861862export interface ComputeServerUserInfo {863id: number;864project_specific_id?: number; // the project_specific_id of this compute server -- unique within project, minimal865account_id: string;866project_id: string;867title?: string;868color?: string;869cost_per_hour?: number;870deleted?: boolean;871state_changed?: Date;872started_by?: string;873error?: string;874state?: State;875// google-cloud has a new "Time limit" either by hour or by date, which seems like a great idea!876// time_limit877autorestart?: boolean;878cloud: Cloud;879configuration: Configuration;880provisioned_configuration?: Configuration;881data?: Data;882purchase_id?: number;883last_edited?: Date;884last_edited_user?: Date;885position?: number; // used for UI sorting.886detailed_state?: { [name: string]: ComponentState };887update_purchase?: boolean;888last_purchase_update?: Date;889template?: ComputeServerTemplate;890spend?: number;891}892893export interface ComputeServer extends ComputeServerUserInfo {894api_key?: string; // project level api key for the project895api_key_id?: number; // id of the api key (needed so we can delete it from database).896}897898Table({899name: "compute_servers",900rules: {901primary_key: "id",902// unique vpn ip address *within* a given project only:903pg_unique_indexes: [904"(project_id, vpn_ip)",905"(project_id, project_specific_id)",906],907user_query: {908get: {909pg_where: [{ "project_id = $::UUID": "project_id" }],910throttle_changes: 0, // do not make this bigger; UI really feels off if throttled911fields: {912id: null,913account_id: null,914created: null,915title: null,916color: null,917cost_per_hour: null,918deleted: null,919project_id: null,920state_changed: null,921error: null,922state: null,923autorestart: 
null,924cloud: null,925configuration: null,926data: null,927provisioned_configuration: null,928avatar_image_tiny: null,929last_edited: null,930last_edited_user: null,931purchase_id: null,932position: null,933detailed_state: null,934template: null,935notes: null,936vpn_ip: null,937project_specific_id: null,938course_project_id: null,939course_server_id: null,940spend: null,941},942},943set: {944// ATTN: It's assumed that users can't set the data field. Doing so would be very bad and could allow945// them to maybe abuse the system and not pay for something.946// Most fields, e.g., configuration, get set via api calls, which ensures consistency in terms of valid947// data and what is actively deployed.948fields: {949project_id: "project_write",950id: true,951position: true,952error: true, // easily clear the error953notes: true,954last_edited_user: true,955},956},957},958},959fields: {960id: ID,961account_id: {962type: "uuid",963desc: "User that owns this compute server.",964render: { type: "account" },965},966created: {967type: "timestamp",968desc: "When the compute server was created.",969},970title: {971type: "string",972pg_type: "VARCHAR(254)",973desc: "Title of this computer server. Used purely to make it easier for the user to keep track of it.",974render: { type: "text", maxLength: 254, editable: true },975},976color: {977type: "string",978desc: "A user configurable color, which is used for tags and UI to indicate where a tab is running.",979pg_type: "VARCHAR(30)",980render: { type: "color", editable: true },981},982cost_per_hour: {983title: "Cost per Hour",984desc: "The cost in US dollars per hour that this compute server cost us when it is provisioned. 
Any time the state is changed, this is set by the server to the proper cost.",985type: "number",986pg_type: "real",987},988deleted: {989type: "boolean",990desc: "True if the compute server has been deleted.",991},992project_id: {993type: "uuid",994desc: "The project id that this compute server provides compute for.",995render: { type: "project_link" },996},997api_key: {998type: "string",999pg_type: "VARCHAR(128)",1000desc: "api key to connect to the project. This is created by the system right when we are going to create the VM, and gets deleted when we stop it. It's not set by the user and should not be revealed to the user.",1001},1002api_key_id: {1003type: "number",1004desc: "id of the api key; needed so we can delete it from database",1005},1006state_changed: {1007type: "timestamp",1008desc: "When the state last changed.",1009},1010error: {1011type: "string",1012desc: "In case something went wrong, e.g., in starting this compute server, this field will get set with a string error message to show the user. It's also cleared right when we try to start server.",1013},1014state: {1015type: "string",1016desc: "One of - 'off', 'starting', 'running', 'stopping', 'deprovisioned' (etc.). This is the underlying VM's state.",1017pg_type: "VARCHAR(16)",1018},1019autorestart: {1020type: "boolean",1021desc: "If true and the compute server stops for any reason, then it will be automatically started again. This is primarily useful for stop instances.",1022},1023cloud: {1024type: "string",1025pg_type: "varchar(30)",1026desc: "The cloud where this compute server runs: 'user', 'coreweave', 'lambda', 'google-cloud', 'aws', 'fluidstack'.",1027},1028configuration: {1029type: "map",1030pg_type: "jsonb",1031desc: "Cloud specific configuration of the computer at the cloud host. 
The format depends on the cloud",1032},1033provisioned_configuration: {1034type: "map",1035pg_type: "jsonb",1036desc: "Same as configuration, but this is the one we actually used last time we provisioned a VM in a cloud.",1037},1038data: {1039type: "map",1040pg_type: "jsonb",1041desc: "Arbitrary data about this server that is cloud provider specific. Store data here to facilitate working with the virtual machine, e.g., the id of the server when it is running, etc. This *MAY BE* returned to the user -- do not put secrets here the user can't see.",1042},1043avatar_image_tiny: {1044title: "Image",1045type: "string",1046desc: "tiny (32x32) visual image associated with the compute server. Suitable to include as part of changefeed, since about 3kb. Derived from avatar_image_full.",1047render: { type: "image" },1048},1049avatar_image_full: {1050title: "Image",1051type: "string",1052desc: "User configurable visual image associated with the compute server. Could be 150kb. NOT include as part of changefeed of projects, since potentially big (e.g., 200kb x 1000 projects = 200MB!).",1053render: { type: "image" },1054},1055purchase_id: {1056type: "number",1057desc: "if there is a current active purchase related to this compute server, this is the id of that purchase in the purchases table",1058},1059update_purchase: {1060type: "boolean",1061desc: "This is set to true if activity with this server is happening that warrants creating/ending a purchase.",1062},1063last_purchase_update: {1064type: "timestamp",1065desc: "Last time we requested an update to the purchase info about this compute server.",1066},1067position: {1068type: "number",1069desc: "Used for sorting a list of compute servers in the UI.",1070},1071last_edited: {1072type: "timestamp",1073desc: "Last time the configuration, state, etc., changed.",1074},1075last_edited_user: {1076type: "timestamp",1077desc: "Last time a user explicitly edited a file or used an application (e.g., terminal) on the compute server via the 
UI. This is like last_edited for projects, and is used to implement configuration.idleTimeoutMinutes.",1078},1079detailed_state: {1080type: "map",1081pg_type: "jsonb",1082desc: "Map from component name to something like {state:'running',time:Date.now()}, e.g., {vm: {state:'running', time:393939938484}}, filesystem: {state:'updating', time:939398484892}, uptime:{state:'22:56:33 up 3 days, 9:28, 0 users, load average: 0.93, 0.73, 0.56', time:?}}. This is used to provide users with insight into what's currently happening on their compute server.",1083},1084notes: NOTES,1085template: {1086type: "map",1087pg_type: "jsonb",1088desc: "Use this compute server configuration as a public template. Only admins can set this field for now. The exact structure of this jsonb is yet to be determined.",1089},1090vpn_ip: {1091type: "string",1092desc: "IP address of the compute server on the private encrypted project-wide VPN.",1093},1094vpn_public_key: {1095type: "string",1096desc: "Wireguard public key for this compute server.",1097},1098vpn_private_key: {1099type: "string",1100desc: "Wireguard private key for this compute server.",1101},1102project_specific_id: {1103type: "integer",1104desc: "A unique project-specific id assigned to this compute server. This is a positive integer that is guaranteed to be unique for compute servers *in a given project* and minimal when assigned (so it is as small as possible). This number is useful for distributed algorithms, since it can be used to ensure distinct sequence without any additional coordination. This is also useful to display to users so that the id number they see everywhere is not huge.",1105},1106course_project_id: {1107type: "uuid",1108desc: "If this is a compute server created for a student in a course, then this is the id of the project that the instructor(s) are using to host the course. 
IMPORTANT: Our security model is that a user can read info about a compute server if they are a collaborator on *either* the compute server's project_id OR on the course_project_id, if set (but then only via the compute_servers_by_course virtual table).",
    },
    course_server_id: {
      type: "integer",
      desc: "If this compute server is a clone of an instructor server in a course, this is the id of that instructor server.",
    },
    spend: {
      type: "number",
      desc: "If configuration.spendLimit is enabled, then the spend during the current period gets recorded here every few minutes. This is useful to efficiently provide a UI element showing the current spend status. It is cleared whenever configuration.spendLimit is changed, to avoid confusion.",
    },
  },
});

// compute_servers_by_course
//
// Declares a table with the same fields and primary key as compute_servers
// (marked `virtual: "compute_servers"`), but whose read access is decided by
// course_project_id rather than by the compute server's own project_id.
// Only a `get` query is defined here; there is no `set`.
Table({
  name: "compute_servers_by_course",
  fields: schema.compute_servers.fields,
  rules: {
    primary_key: schema.compute_servers.primary_key,
    virtual: "compute_servers",
    user_query: {
      get: {
        // only allow read access when course_project_id is a project
        // that client user is a collaborator on.
        // NOTE(review): presumably $::TEXT is bound to the requesting
        // user's account_id by the query layer -- confirm.
        pg_where: [
          {
            "course_project_id = ANY(select project_id from projects where users ? $::TEXT)":
              "account_id",
          },
        ],
        // Same readable fields as the get query on compute_servers.
        fields: {
          ...schema.compute_servers.user_query?.get?.fields,
        },
      },
    },
  },
});

// crm_compute_servers
//
// Admin-only table over compute_servers (same fields and primary key,
// `virtual: "compute_servers"`).  Both get and set require admin.
Table({
  name: "crm_compute_servers",
  fields: schema.compute_servers.fields,
  rules: {
    primary_key: schema.compute_servers.primary_key,
    virtual: "compute_servers",
    user_query: {
      get: {
        admin: true, // only admins can do get queries on this table
        // (without this, users who have read access could read)
        pg_where: [],
        // Everything readable via compute_servers, plus the template field.
        fields: {
          ...schema.compute_servers.user_query?.get?.fields,
          template: null,
        },
      },
      set: {
        admin: true,
        // Fields that may appear in an admin set query.
        fields: {
          id: true,
          title: true,
          color: true,
          deleted: true,
          notes: true,
          template: true,
          state_control: null,
        },
      },
    },
  },
});

// compute_servers_cache
//
// Cache table keyed by the pair (cloud, key), with a string value and an
// expire timestamp.  No user_query is defined for it here.
Table({
  name: "compute_servers_cache",
  fields: {
    cloud: {
      type: "string",
      desc: "The cloud that we're caching information about",
    },
    key: {
      type: "string",
      desc: "The key for whatever we're caching.",
    },
    value: {
      type: "string",
      desc: "The cached data.",
    },
    expire: {
      type: "timestamp",
      // NOTE(review): the wording says "action", but this row is a cache
      // entry -- presumably this is when the cached value expires; confirm.
      desc: "When this action should be expired.",
    },
  },
  rules: {
    durability: "soft", // it's just a cache
    desc: "Cache data about what's going on in various clouds that are used to implement compute servers.",
    primary_key: ["cloud", "key"],
  },
});