Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemathinc
GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/util/db-schema/compute-servers.ts
5828 views
1
/*
2
* This file is part of CoCalc: Copyright © 2023 Sagemath, Inc.
3
* License: MS-RSL – see LICENSE.md for details
4
*/
5
6
import type {
7
Region as HyperstackRegion,
8
VirtualMachine as HyperstackVirtualMachine,
9
} from "@cocalc/util/compute/cloud/hyperstack/api-types";
10
import { COLORS } from "@cocalc/util/theme";
11
import { ID, NOTES } from "./crm";
12
import { SCHEMA as schema } from "./index";
13
import { Table } from "./types";
14
export {
15
CLOUDS_BY_NAME,
16
GOOGLE_CLOUD_DEFAULTS,
17
ON_PREM_DEFAULTS,
18
} from "@cocalc/util/compute/cloud/clouds";
19
20
// Fallback disk sizes, used only when the image configuration does not
// provide a usable minimum disk size.
export const STANDARD_DISK_SIZE = 20;
export const CUDA_DISK_SIZE = 60;

// Check-in settings (the period is presumably in seconds, per the _S suffix).
export const CHECK_IN_PERIOD_S = 20;
export const CHECK_IN_PATH = "/cocalc/conf/check-in";

// How long clients should wait after a purchase ends before requesting its
// cost from Google. There is no SLA on billing data, so this is chosen to
// leave about a day of wiggle room.
const GOOGLE_COST_LAG_DAYS = 2;
// The same lag expressed in milliseconds.
export const GOOGLE_COST_LAG_MS = 1000 * 60 * 60 * 24 * GOOGLE_COST_LAG_DAYS;
32
33
// Compute Server Images -- typings. See packages/server/compute/images.ts for
34
// how the actual data is populated.
35
36
// Describes one version of a compute server image.
export interface ImageVersion {
  // tag - required and distinct for each version; it typically identifies
  // the image to docker.
  tag: string;
  // version - defaults to tag if not given; usually the upstream version.
  version?: string;
  // label - defaults to the tag; this is what is displayed to the user.
  label?: string;
  // tested - unless this is true, this version should not be shown by default.
  // Untested versions should only be shown to users who explicitly ask for
  // them (e.g., admins).
  tested?: boolean;
}
47
48
export const IDLE_TIMEOUT_MINUTES_DEFAULT = 30;
49
50
// Values used by validatedHealthCheck for any field that is missing or invalid.
export const HEALTH_CHECK_DEFAULTS = {
  command: "pwd",
  initialDelaySeconds: 10 * 60,
  timeoutSeconds: 30,
  periodSeconds: 60,
  failureThreshold: 3,
  enabled: false,
  action: "reboot",
};

// The actions a health check may trigger.
export const HEALTH_CHECK_ACTIONS = [
  "reboot",
  "stop",
  "suspend",
  "deprovision",
];
// NOTE: HEALTH_CHECK_ACTIONS is a plain string[], so this type is just `string`.
type HealthCheckAction = (typeof HEALTH_CHECK_ACTIONS)[number];

// Parse value as a float; return fallback if the value is missing, is not a
// finite number, or is below min.
function healthCheckNumber(
  value: unknown,
  min: number,
  fallback: number,
): number {
  const parsed = parseFloat(`${value ?? fallback}`);
  return parsed < min || !isFinite(parsed) ? fallback : parsed;
}

// Normalize an untrusted health check object into a well-formed HealthCheck.
//
// - Returns undefined if healthCheck is null or undefined.
// - command is converted to a string; if it is missing, the default command
//   is used (previously a missing command became the literal "undefined").
// - Numeric fields are parsed with parseFloat and replaced by their default
//   when missing, non-finite or below their minimum (periodSeconds >= 0,
//   failureThreshold >= 1, timeoutSeconds >= 5, initialDelaySeconds >= 0).
// - enabled is coerced to a boolean.
// - action falls back to the default unless it is in HEALTH_CHECK_ACTIONS.
export function validatedHealthCheck(
  healthCheck?: any,
): HealthCheck | undefined {
  if (healthCheck == null) {
    return undefined;
  }
  const { action } = healthCheck;
  return {
    command: `${healthCheck.command ?? HEALTH_CHECK_DEFAULTS.command}`,
    initialDelaySeconds: healthCheckNumber(
      healthCheck.initialDelaySeconds,
      0,
      HEALTH_CHECK_DEFAULTS.initialDelaySeconds,
    ),
    timeoutSeconds: healthCheckNumber(
      healthCheck.timeoutSeconds,
      5,
      HEALTH_CHECK_DEFAULTS.timeoutSeconds,
    ),
    periodSeconds: healthCheckNumber(
      healthCheck.periodSeconds,
      0,
      HEALTH_CHECK_DEFAULTS.periodSeconds,
    ),
    failureThreshold: healthCheckNumber(
      healthCheck.failureThreshold,
      1,
      HEALTH_CHECK_DEFAULTS.failureThreshold,
    ),
    enabled: !!healthCheck.enabled,
    action: HEALTH_CHECK_ACTIONS.includes(action)
      ? action
      : HEALTH_CHECK_DEFAULTS.action,
  };
}

export interface HealthCheck {
  // Command (with its args) to run on the compute server. If the command
  // fails (nonzero exit code) failureThreshold times, then `action` happens.
  command: string;
  // timeout for running the command
  timeoutSeconds: number;
  // initial delay
  initialDelaySeconds: number;
  // period in seconds to wait between running the command
  periodSeconds: number;
  // When a probe fails, CoCalc will try failureThreshold times before doing the action.
  failureThreshold: number;

  // What to do once the failure threshold is reached; see HEALTH_CHECK_ACTIONS.
  action: HealthCheckAction;
  // Coerced to a boolean by validatedHealthCheck; false in HEALTH_CHECK_DEFAULTS.
  enabled: boolean;
}
141
142
interface ProxyRoute {
143
path: string;
144
target: string;
145
ws?: boolean;
146
}
147
148
// Describes one compute server image.
export interface Image {
  // What we show the user to describe this image, e.g., in the image select menu.
  label: string;
  // The name of the package on npmjs or dockerhub:
  package?: string;
  // In case there is a different package name for ARM64, the name of it.
  package_arm64?: string;
  // Root filesystem image must be at least this big in GB.
  minDiskSizeGb?: number;
  // Rough estimate of compressed size of Docker image; useful
  // to get a sense of how long it will take to download image
  // on clouds without pregenerated images.
  dockerSizeGb?: number;
  // Description in MARKDOWN to show user of this image.  Can include links.
  description?: string;
  // Upstream URL for this image, e.g., https://julialang.org/ for the Julia image.
  url?: string;
  // Icon to show next to the label for this image.
  icon?: string;
  // Link to a URL with the source for building this image.
  source: string;
  // optional list of links to videos about this image, ordered from lowest to highest priority.
  videos?: string[];
  // optional list of links to tutorials
  tutorials?: string[];
  // The versions of this image that we claim to have built.
  // The ones with role='prod' (or not specified) are shown
  // to users as options.
  // NOTE(review): ImageVersion as declared in this file has no `role` field
  // (it has `tested`) -- confirm that the sentence above is still accurate.
  versions: ImageVersion[];
  // If true, then a GPU is required to use this image.
  gpu?: boolean;
  // If true, then the microk8s snap is required to use this image.
  microk8s?: boolean;
  // authToken: if true, image has web interface that supports configurable auth token
  authToken?: boolean;
  // jupyterKernels: if false, no jupyter kernels included. If true or a list of
  // names, there are kernels available – used in frontend/jupyter/select-kernel.tsx
  jupyterKernels?: false | true | string[];
  // If set to true, do not allow creating this compute server without a DNS subdomain.
  // Some images only make sense to use over the web, and the web server just won't
  // work without DNS setup properly (e.g., VS Code with LEAN).  Ignored for on prem.
  requireDns?: boolean;
  // system: if true, this is a system container that is not for user compute
  system?: boolean;
  // disabled: if true, this image is completely disabled, so will not be used in any way.
  disabled?: boolean;
  // priority -- optional integer used for sorting options to display to user. The bigger the higher.
  priority?: number;
  // proxy: if false, do NOT run https proxy server on host VM
  //        if nothing given, runs proxy server with no default config (so does nothing)
  //        if given, is array of proxy config.
  proxy?: false | ProxyRoute[];
  // apps: optional map from an app name to its settings.
  apps?: {
    [name: string]: {
      icon: string;
      label: string;
      url: string;
      path: string;
      launch: string;
      requiresDns?: boolean;
    };
  };
}
211
212
export type Images = { [name: string]: Image };
213
214
export interface GoogleCloudImage {
215
labels: { [name: string]: string };
216
diskSizeGb: number;
217
creationTimestamp: string;
218
}
219
export type GoogleCloudImages = { [name: string]: GoogleCloudImage };
220
221
// Turn s into a name usable on Google Cloud: every "." and "_" becomes "-",
// the result is lowercased, and it is cut off after 63 characters.
// This is probably not sufficient to guarantee validity in all cases.
export function makeValidGoogleName(s: string): string {
  const maxLength = 63;
  const dashed = s.split(/[._]/).join("-");
  const lowered = dashed.toLowerCase();
  return lowered.substring(0, maxLength);
}
225
226
// All states that a compute server can be in.
export type State =
  | "off"
  | "starting"
  | "running"
  | "stopping"
  | "deprovisioned"
  | "suspending"
  | "suspended"
  | "unknown";

// Used for sorting by state -- ordered from most alive to least alive.
export const ORDERED_STATES: State[] = [
  "running",
  "starting",
  "stopping",
  "suspending",
  "suspended",
  "off",
  "deprovisioned",
  "unknown",
];

// Maps each state to its position in ORDERED_STATES (0 = most alive).
export const STATE_TO_NUMBER: { [state: string]: number } = {};
ORDERED_STATES.forEach((state, rank) => {
  STATE_TO_NUMBER[state] = rank;
});
253
254
// Determine the CPU architecture from a machine type name.
// Only the part before the first "-" (the machine family) is examined:
// the known ARM families are t2a and c4a, which end in "a", while everything
// else ends with a number or "d".  Hopefully this pattern persists.
export function getMachineTypeArchitecture(machineType: string): Architecture {
  const firstDash = machineType.indexOf("-");
  const family =
    firstDash === -1 ? machineType : machineType.slice(0, firstDash);
  return family.endsWith("a") ? "arm64" : "x86_64";
}
265
266
// Architecture of the machine described by the given configuration:
//  - onprem: the configured arch, or x86_64 if none is set;
//  - google-cloud: derived from the machine type;
//  - any other cloud: x86_64, since there is no ARM outside of GCP right now.
export function getArchitecture(configuration: Configuration): Architecture {
  switch (configuration.cloud) {
    case "onprem":
      return configuration.arch ?? "x86_64";
    case "google-cloud":
      return getMachineTypeArchitecture(configuration.machineType);
    default:
      return "x86_64";
  }
}
277
278
// Whether suspend/resume is supported for the given configuration.
// All of the following must hold:
//  - the cloud is google-cloud;
//  - the architecture is x86_64.  TODO: suspend/resume breaks the clock badly
//    on ARM64, and no workaround is known, so it is not supported for now
//    (possibly a GCP bug);
//  - there is no GPU.  Google also requires <= 208GB of RAM, see
//    https://cloud.google.com/compute/docs/instances/suspend-resume-instance
//    but the RAM limit is not checked in this function.
function supportsSuspend(configuration: Configuration) {
  return (
    configuration.cloud === "google-cloud" &&
    getArchitecture(configuration) === "x86_64" &&
    !configuration.acceleratorType
  );
}
294
295
export type Action =
296
| "start"
297
| "resume"
298
| "stop"
299
| "suspend"
300
| "deprovision"
301
| "reboot";
302
303
export const ACTION_INFO: {
304
[action: string]: {
305
label: string;
306
icon: string;
307
tip: string;
308
description: string;
309
confirm?: boolean;
310
confirmMessage?: string;
311
danger?: boolean;
312
target: State; // target stable state after doing this action.
313
clouds?: Cloud[];
314
isSupported?: (configuration: Configuration) => boolean;
315
};
316
} = {
317
start: {
318
label: "Start",
319
icon: "play",
320
tip: "Start",
321
description: "Start the compute server running.",
322
target: "running",
323
},
324
resume: {
325
label: "Resume",
326
icon: "play",
327
clouds: ["google-cloud"],
328
tip: "Resume",
329
description: "Resume the compute server from suspend.",
330
target: "running",
331
isSupported: supportsSuspend,
332
},
333
stop: {
334
label: "Stop",
335
icon: "stop",
336
tip: "Turn off",
337
description:
338
"Turn the compute server off. No data on disk is lost, but any data and state in memory will be lost. This is like turning your laptop off.",
339
confirm: true,
340
target: "off",
341
},
342
deprovision: {
343
label: "Deprovision",
344
icon: "trash",
345
tip: "Deprovision the virtual machine",
346
description:
347
"Deprovisioning DELETES THE VIRTUAL MACHINE BOOT DISK, but keeps the compute server parameters. There are no costs associated with a deprovisioned compute server, and you can move it to a different region or zone. Any files in the home directory of your project are not affected.",
348
confirm: true,
349
confirmMessage:
350
"I understand that my compute server disks will be deleted.",
351
danger: true,
352
target: "deprovisioned",
353
},
354
reboot: {
355
label: "Hard Reboot",
356
icon: "refresh",
357
tip: "Hard reboot the virtual machine.",
358
description:
359
"Perform a HARD reset on the virtual machine, which wipes the memory contents and resets the virtual machine to its initial state. This should not delete data from the disk, but can lead to filesystem corruption.",
360
confirm: true,
361
confirmMessage:
362
"I understand that this can lead to filesystem corruption and is slightly dangerous.",
363
danger: true,
364
target: "running",
365
clouds: ["google-cloud", "hyperstack"],
366
},
367
suspend: {
368
label: "Suspend",
369
icon: "pause",
370
clouds: ["google-cloud"],
371
tip: "Suspend disk and memory state",
372
confirm: true,
373
description:
374
"Suspend the compute server. No data on disk or memory is lost, and you are only charged for storing disk and memory. This is like closing your laptop screen. You can leave a compute server suspended for up to 60 days before it automatically shuts off.",
375
target: "suspended",
376
isSupported: supportsSuspend,
377
},
378
};
379
380
export const STATE_INFO: {
381
[state: string]: {
382
label: string;
383
actions: Action[];
384
icon: string;
385
color?: string;
386
stable?: boolean;
387
target?: State; // if not stable, this is the target state it is heading to
388
};
389
} = {
390
off: {
391
label: "Off",
392
color: "#ff4b00",
393
actions: ["start", "deprovision"],
394
icon: "stop",
395
stable: true,
396
},
397
suspended: {
398
label: "Suspended",
399
actions: ["resume", "deprovision", "stop"],
400
icon: "pause",
401
color: "#0097a7",
402
stable: true,
403
},
404
suspending: {
405
label: "Suspending",
406
actions: ["suspend"],
407
icon: "pause",
408
color: "#00bcd4",
409
stable: false,
410
target: "suspended",
411
},
412
starting: {
413
label: "Starting",
414
color: "#388e3c",
415
actions: ["start"],
416
icon: "bolt",
417
stable: false,
418
target: "running",
419
},
420
running: {
421
label: "Running",
422
color: COLORS.RUN,
423
actions: ["stop", "deprovision", "reboot", "suspend"],
424
icon: "run",
425
stable: true,
426
},
427
stopping: {
428
label: "Stopping",
429
color: "#ff9800",
430
actions: ["stop"],
431
icon: "hand",
432
stable: false,
433
target: "off",
434
},
435
unknown: {
436
label: "Unknown (click to refresh)",
437
actions: [],
438
icon: "question-circle",
439
stable: true,
440
},
441
deprovisioned: {
442
label: "Deprovisioned",
443
actions: ["start"],
444
color: "#888",
445
icon: "minus-square",
446
stable: true,
447
},
448
};
449
450
// Return the stable state that an action or a state leads to:
//  - for an action, the target of that action;
//  - for a stable state, the state itself;
//  - for an unstable state, the state it is heading to (or itself if no
//    target is recorded).
// Throws if x is neither a known action nor a known state.
export function getTargetState(x: State | Action): State {
  const actionInfo = ACTION_INFO[x];
  if (actionInfo != null) {
    return actionInfo.target;
  }
  const stateInfo = STATE_INFO[x];
  if (stateInfo == null) {
    throw Error(`x =${x} must be a state or action`);
  }
  if (stateInfo.stable) {
    return x as State;
  }
  return (stateInfo.target ?? x) as State;
}
462
463
// The CPU architectures that are distinguished.
export type Architecture = "x86_64" | "arm64";

// Name of the Image field holding the package name for the given
// architecture.  This convention is used in cocalc-compute-docker for making
// the npm packages @cocalc/compute-server.  Don't mess with it!
export function getImageField(
  arch: Architecture,
): "package" | "package_arm64" {
  if (arch === "x86_64") {
    return "package";
  }
  return "package_arm64";
}
470
471
export type Cloud =
472
| "any"
473
| "onprem"
474
| "core-weave"
475
| "hyperstack"
476
| "lambda-cloud"
477
| "google-cloud"
478
| "aws"
479
| "fluid-stack"
480
| "test";
481
482
// Minimum disk size in GB for the given configuration.
// If the configured image declares a (truthy) minDiskSizeGb in IMAGES, that
// value wins.  Otherwise fall back to CUDA_DISK_SIZE when an accelerator is
// configured and to STANDARD_DISK_SIZE when not.
export function getMinDiskSizeGb({
  configuration,
  IMAGES,
}: {
  configuration;
  IMAGES: Images;
}) {
  const imageName = configuration?.image;
  const imageMinimum = imageName
    ? IMAGES[imageName]?.minDiskSizeGb
    : undefined;
  if (imageMinimum) {
    return imageMinimum;
  }
  // TODO: will have to do something based on actual image size,
  // maybe, unless I come up with a clever trick involving
  // one PD mounted on many machines (?).
  return configuration?.acceleratorType ? CUDA_DISK_SIZE : STANDARD_DISK_SIZE;
}
504
505
// A spend limit means: "you can spend at most `dollars` during any period of
// `hours` hours on a RUNNING compute server".
export interface SpendLimit {
  hours: number;
  dollars: number;
  enabled: boolean;
}

// Values that validatedSpendLimit substitutes for invalid entries.
export const SPEND_LIMIT_DEFAULTS = {
  hours: 24 * 7,
  dollars: 25,
  enabled: false,
};

// Normalize an untrusted spend limit object.
//  - Returns undefined if spendLimit is null or undefined.
//  - hours and dollars are parsed with parseFloat; a missing value counts
//    as 0, while a negative or non-finite value is replaced by the
//    corresponding entry of SPEND_LIMIT_DEFAULTS.
//  - enabled is coerced to a boolean.
export function validatedSpendLimit(spendLimit?: any): SpendLimit | undefined {
  if (spendLimit == null) {
    return undefined;
  }
  const parsedHours = parseFloat(spendLimit.hours ?? 0);
  const parsedDollars = parseFloat(spendLimit.dollars ?? 0);
  const hoursValid = parsedHours >= 0 && isFinite(parsedHours);
  const dollarsValid = parsedDollars >= 0 && isFinite(parsedDollars);
  return {
    enabled: !!spendLimit.enabled,
    hours: hoursValid ? parsedHours : SPEND_LIMIT_DEFAULTS.hours,
    dollars: dollarsValid ? parsedDollars : SPEND_LIMIT_DEFAULTS.dollars,
  };
}
534
535
// Human readable name for a spend limit period given in hours:
// "day", "week", "month" (30.5 days) or "year" (12 such months) when hours
// matches one of those exactly, and "<hours> hours" otherwise.
//
// The month and year cases used to be compared against 30.5 * 24 * 7 and
// 12 * 30.5 * 24 * 7 hours, i.e. about 30.5 weeks and about 7 years; the stray
// factor of 7 has been removed.
// NOTE(review): if any caller creates "month"/"year" presets using the old
// constants, update it to use 30.5 * 24 and 12 * 30.5 * 24 hours as well.
export function spendLimitPeriod(hours): string {
  const HOURS_PER_DAY = 24;
  const HOURS_PER_WEEK = 7 * HOURS_PER_DAY;
  const HOURS_PER_MONTH = 30.5 * HOURS_PER_DAY;
  const HOURS_PER_YEAR = 12 * HOURS_PER_MONTH;
  // Loose equality is kept on purpose, so numeric strings still match.
  if (hours == HOURS_PER_DAY) {
    return "day";
  }
  if (hours == HOURS_PER_WEEK) {
    return "week";
  }
  if (hours == HOURS_PER_MONTH) {
    return "month";
  }
  if (hours == HOURS_PER_YEAR) {
    return "year";
  }
  return `${hours} hours`;
}
550
551
// 10:00:00.000 local time on the day this module is loaded.
const tenAM = new Date();
tenAM.setHours(10, 0, 0, 0);

// Shutdown time used when none (or an invalid one) is given; disabled by default.
export const DEFAULT_SHUTDOWN_TIME = {
  epochMs: tenAM.getTime(),
  enabled: false,
};

// A point in time given as milliseconds since the epoch, plus an on/off flag.
export interface ShutdownTime {
  epochMs: number;
  enabled?: boolean;
}

// Normalize an untrusted shutdown time object.
//  - Returns undefined if shutdownTime is null or undefined.
//  - epochMs is parsed with parseFloat; a missing, negative or non-finite
//    value is replaced by DEFAULT_SHUTDOWN_TIME.epochMs.
//  - enabled is coerced to a boolean.
export function validatedShutdownTime(
  shutdownTime?: any,
): ShutdownTime | undefined {
  if (shutdownTime == null) {
    return undefined;
  }
  const parsed = parseFloat(
    shutdownTime.epochMs ?? DEFAULT_SHUTDOWN_TIME.epochMs,
  );
  const isValid = parsed >= 0 && isFinite(parsed);
  return {
    enabled: !!shutdownTime.enabled,
    epochMs: isValid ? parsed : DEFAULT_SHUTDOWN_TIME.epochMs,
  };
}
577
578
interface BaseConfiguration {
579
// image: name of the image to use, e.g. 'python' or 'pytorch'.
580
// images are managed in src/packages/server/compute/images.ts
581
image: string;
582
// tag: tag for the image to use when starting the compute server.
583
// this references the versions field of the image.
584
// If the tag is not given or not available, we use the latest
585
// available tag.
586
tag?: string;
587
// tag_filesystem: tag for the file system container
588
tag_filesystem?: string;
589
// tag_cocalc: tag for the @cocalc/compute-server package.
590
tag_cocalc?: string;
591
// dns - If the string is set and the VM has an external ip address
592
// and dns is configured, then point https://{dns}....
593
// with ssl proxying to this compute server when it is running.
594
dns?: string;
595
// Array of top level directories to exclude from sync.
596
// These can't have "|" in them, since we use that as a separator.
597
// Use "~" to completely disable sync.
598
excludeFromSync?: readonly string[];
599
// If true, view data on the compute server as ephemeral.
600
// Currently this is only meant to impact the user interface.
601
ephemeral?: boolean;
602
// Token used for authentication at https://compute-server...
603
authToken?: string;
604
// Configuration of the https proxy server.
605
proxy?: ProxyRoute[];
606
// If this compute server stops pinging us, e.g., due to being preempted or
607
// just crashing due to out of memory (etc) should we automatically do a
608
// forced restart. Note that currently for on prem this isn't possible.
609
autoRestart?: boolean;
610
autoRestartDisabled?: boolean; // used to temporarily disable it to avoid accidentally triggering it.
611
// Allow collaborators to control the state of the compute server.
612
// They cannot change any other configuration. User still pays for everything and owns compute server.
613
allowCollaboratorControl?: boolean;
614
615
// AUTOMATIC SHUTDOWN configuration:
616
// turn compute server off if it spends more than `dollars` during the last `hours` hours.
617
// this can only be set by the owner.
618
// Limit spending
619
spendLimit?: SpendLimit;
620
idleTimeoutMinutes?: number;
621
healthCheck?: HealthCheck;
622
// number = ms since epoch defines a time; at *that* time each day, the server is turned off.
623
shutdownTime?: ShutdownTime;
624
}
625
626
export const AUTOMATIC_SHUTDOWN_FIELDS = [
627
"spendLimit",
628
"idleTimeoutMinutes",
629
"healthCheck",
630
"shutdownTime",
631
];
632
633
interface LambdaConfiguration extends BaseConfiguration {
634
cloud: "lambda-cloud";
635
instance_type_name: string;
636
region_name: string;
637
}
638
639
export interface HyperstackConfiguration extends BaseConfiguration {
640
cloud: "hyperstack";
641
flavor_name: string;
642
region_name: HyperstackRegion;
643
// diskSizeGb is an integer >= 1. It defaults to 10.
644
// It's the size of the /data partition. It's implemented
645
// using 1 or more hyperstack (=ceph) volumes, which are combined
646
// together as a ZFS pool. If the compute server is
647
// named "foo", the volumes are named "foo-1", "foo-2",
648
// "foo-3", etc.
649
// There is also always a separate 50GB root volume, which
650
// is named "foo-0", and whose size is not configurable.
651
// NOTE: users install packages "systemwide" inside of
652
// a docker container and we configure docker to store
653
// its data in the zpool, so that's in here too.
654
diskSizeGb: number;
655
}
656
657
export const COREWEAVE_CPU_TYPES = [
658
"amd-epyc-rome",
659
"amd-epyc-milan",
660
"intel-xeon-v1",
661
"intel-xeon-v2",
662
"intel-xeon-v3",
663
"intel-xeon-v4",
664
"intel-xeon-scalable",
665
] as const;
666
667
export const COREWEAVE_GPU_TYPES = [
668
"Quadro_RTX_4000",
669
"Quadro_RTX_5000",
670
"RTX_A4000",
671
"RTX_A5000",
672
"RTX_A6000",
673
"A40",
674
"Tesla_V100_PCIE",
675
"Tesla_V100_NVLINK",
676
"A100_PCIE_40GB",
677
"A100_PCIE_80GB",
678
"A100_NVLINK_40GB",
679
"A100_NVLINK_80GB",
680
] as const;
681
682
interface CoreWeaveConfiguration extends BaseConfiguration {
683
cloud: "core-weave";
684
gpu: {
685
type:
686
| "Quadro_RTX_4000"
687
| "Quadro_RTX_5000"
688
| "RTX_A4000"
689
| "RTX_A5000"
690
| "RTX_A6000"
691
| "A40"
692
| "Tesla_V100_PCIE"
693
| "Tesla_V100_NVLINK"
694
| "A100_PCIE_40GB"
695
| "A100_PCIE_80GB"
696
| "A100_NVLINK_40GB"
697
| "A100_NVLINK_80GB"; //(typeof COREWEAVE_GPU_TYPES)[number];
698
count: number;
699
};
700
cpu: {
701
count: number;
702
type?:
703
| "amd-epyc-rome"
704
| "amd-epyc-milan"
705
| "intel-xeon-v1"
706
| "intel-xeon-v2"
707
| "intel-xeon-v3"
708
| "intel-xeon-v4"
709
| "intel-xeon-scalable"; //(typeof COREWEAVE_CPU_TYPES)[number];
710
};
711
memory: string; // e.g., "12Gi"
712
storage?: {
713
root: {
714
size: string; // e.g., '40Gi'
715
};
716
};
717
}
718
719
interface FluidStackConfiguration extends BaseConfiguration {
720
cloud: "fluid-stack";
721
plan: string;
722
region: string;
723
os: string;
724
}
725
export type GoogleCloudAcceleratorType =
726
| "nvidia-h200-141gb"
727
| "nvidia-h100-80gb"
728
| "nvidia-a100-80gb"
729
| "nvidia-tesla-a100"
730
| "nvidia-l4"
731
| "nvidia-tesla-t4"
732
| "nvidia-tesla-v100"
733
| "nvidia-tesla-p4"
734
| "nvidia-tesla-p100";
735
736
export const GOOGLE_CLOUD_ACCELERATOR_TYPES: GoogleCloudAcceleratorType[] = [
737
"nvidia-h200-141gb",
738
"nvidia-h100-80gb",
739
"nvidia-a100-80gb",
740
"nvidia-tesla-a100",
741
"nvidia-l4",
742
"nvidia-tesla-t4",
743
"nvidia-tesla-v100",
744
"nvidia-tesla-p4",
745
"nvidia-tesla-p100",
746
];
747
748
export type GoogleCloudDiskType =
749
| "pd-standard"
750
| "pd-balanced"
751
| "pd-ssd"
752
| "hyperdisk-balanced";
753
754
export const GOOGLE_CLOUD_DISK_TYPES: GoogleCloudDiskType[] = [
755
"pd-standard",
756
"pd-balanced",
757
"pd-ssd",
758
// NOTE: hyperdisks are complicated and multidimensional, but for cocalc
759
// we just hardcode options for the iops and bandwidth, and allow the
760
// user to adjust the size. Also, "hyperdisk-balanced" means hyperdisk
761
// with the defaults for iops and bandwidth defined in
762
// src/packages/util/compute/cloud/google-cloud/compute-cost.ts
763
"hyperdisk-balanced",
764
];
765
766
export interface GoogleCloudConfiguration extends BaseConfiguration {
767
cloud: "google-cloud";
768
region: string;
769
zone: string;
770
machineType: string;
771
// Use a spot instance if spot is true.
772
spot?: boolean;
773
// The boot disk:
774
// diskSizeGb is an integer >= 10. It defaults to 10. It's the size of the boot disk.
775
diskSizeGb?: number;
776
hyperdiskBalancedIops?: number;
777
hyperdiskBalancedThroughput?: number;
778
diskType?: GoogleCloudDiskType;
779
acceleratorType?: GoogleCloudAcceleratorType;
780
// the allowed number depends on the accelerator; it defaults to 1.
781
acceleratorCount?: number;
782
// minCpuPlatform
783
terminationTime?: Date;
784
maxRunDurationSeconds?: number;
785
// if true, use newest image, whether or not it is labeled with prod=true.
786
test?: boolean;
787
// an image name of the form "2023-09-13-063355-test", i.e., a timestamp in that format
788
// followed by an optional string.  Whether or not to use cuda and the arch are
789
// determined by parameters above. This is meant to be used for two purposes (1) testing
790
// before deploying to production, and (2) stability, so a given compute server has the
791
// exact same base image every time it is started, instead of being updated. Regarding (2),
792
// this might not be needed, but we'll see. If image is not set, we use the newest
793
// image that is tagged prod:true, or it's an error if no such image exists.  This is
794
// all about Google Cloud images, not the IMAGES object defined elsewhere in this file.
795
sourceImage?: string;
796
// If true, then we have an external ip address
797
externalIp?: boolean;
798
// If true, can run full VM's inside of the machine, but there is 10% performance penalty.
799
// This will only work for Intel non-e2 non-a3 instance types. No AMD and no ARM64.
800
enableNestedVirtualization?: boolean;
801
}
802
803
export interface OnPremCloudConfiguration extends BaseConfiguration {
804
cloud: "onprem";
805
arch?: Architecture;
806
gpu?: boolean;
807
}
808
809
export type Configuration =
810
| LambdaConfiguration
811
| HyperstackConfiguration
812
| CoreWeaveConfiguration
813
| FluidStackConfiguration
814
| GoogleCloudConfiguration
815
| OnPremCloudConfiguration;
816
817
interface BaseData {
818
cloudflareId?: string;
819
externalIp?: string;
820
internalIp?: string;
821
}
822
823
export interface LambdaCloudData extends BaseData {
824
cloud: "lambda-cloud";
825
instance_id: string;
826
}
827
828
export interface HyperstackData extends BaseData {
829
cloud: "hyperstack";
830
// name we are using for the vm
831
name?: string;
832
// hyperstack description of this vm.
833
vm?: HyperstackVirtualMachine;
834
// id's of persistent storage, with first id the boot disk.
835
// disks are named {name}-0, {name}-1, {name}-2, etc.,
836
// with {name}-0 being the boot disk.
837
disks?: number[];
838
creationTimestamp?: Date;
839
}
840
841
export interface GoogleCloudData extends BaseData {
842
cloud: "google-cloud";
843
name?: string;
844
state?: State;
845
cpuPlatform?: string;
846
creationTimestamp?: Date;
847
lastStartTimestamp?: Date;
848
}
849
850
export type Data = GoogleCloudData | LambdaCloudData | HyperstackData;
851
852
export interface ComponentState {
853
state: string;
854
time: number;
855
expire?: number;
856
}
857
858
export interface ComputeServerTemplate {
859
enabled?: boolean;
860
priority?: number;
861
}
862
863
export interface ComputeServerUserInfo {
864
id: number;
865
project_specific_id?: number; // the project_specific_id of this compute server -- unique within project, minimal
866
account_id: string;
867
project_id: string;
868
title?: string;
869
color?: string;
870
cost_per_hour?: number;
871
deleted?: boolean;
872
state_changed?: Date;
873
started_by?: string;
874
error?: string;
875
state?: State;
876
// google-cloud has a new "Time limit" either by hour or by date, which seems like a great idea!
877
// time_limit
878
autorestart?: boolean;
879
cloud: Cloud;
880
configuration: Configuration;
881
provisioned_configuration?: Configuration;
882
data?: Data;
883
purchase_id?: number;
884
last_edited?: Date;
885
last_edited_user?: Date;
886
position?: number; // used for UI sorting.
887
detailed_state?: { [name: string]: ComponentState };
888
update_purchase?: boolean;
889
last_purchase_update?: Date;
890
template?: ComputeServerTemplate;
891
spend?: number;
892
}
893
894
export interface ComputeServer extends ComputeServerUserInfo {
895
api_key?: string; // project level api key for the project
896
api_key_id?: number; // id of the api key (needed so we can delete it from database).
897
}
898
899
// Schema of the compute_servers table: its columns, unique indexes, and the
// fields that clients may read (user_query.get) or write (user_query.set).
Table({
  name: "compute_servers",
  rules: {
    primary_key: "id",
    // Both the vpn ip address and the project_specific_id are unique
    // *within* a given project only (not globally):
    pg_unique_indexes: [
      "(project_id, vpn_ip)",
      "(project_id, project_specific_id)",
    ],
    user_query: {
      get: {
        // Get queries are restricted to the compute servers of one project.
        pg_where: [{ "project_id = $::UUID": "project_id" }],
        throttle_changes: 0, // do not make this bigger; UI really feels off if throttled
        // Note that api_key, api_key_id, vpn_public_key, vpn_private_key,
        // avatar_image_full, update_purchase and last_purchase_update are
        // NOT listed below.
        fields: {
          id: null,
          account_id: null,
          created: null,
          title: null,
          color: null,
          cost_per_hour: null,
          deleted: null,
          project_id: null,
          state_changed: null,
          error: null,
          state: null,
          autorestart: null,
          cloud: null,
          configuration: null,
          data: null,
          provisioned_configuration: null,
          avatar_image_tiny: null,
          last_edited: null,
          last_edited_user: null,
          purchase_id: null,
          position: null,
          detailed_state: null,
          template: null,
          notes: null,
          vpn_ip: null,
          project_specific_id: null,
          course_project_id: null,
          course_server_id: null,
          spend: null,
        },
      },
      set: {
        // ATTN: It's assumed that users can't set the data field. Doing so would be very bad and could allow
        // them to maybe abuse the system and not pay for something.
        // Most fields, e.g., configuration, get set via api calls, which ensures consistency in terms of valid
        // data and what is actively deployed.
        fields: {
          project_id: "project_write",
          id: true,
          position: true,
          error: true, // easily clear the error
          notes: true,
          last_edited_user: true,
        },
      },
    },
  },
  fields: {
    id: ID,
    account_id: {
      type: "uuid",
      desc: "User that owns this compute server.",
      render: { type: "account" },
    },
    created: {
      type: "timestamp",
      desc: "When the compute server was created.",
    },
    title: {
      type: "string",
      pg_type: "VARCHAR(254)",
      desc: "Title of this compute server. Used purely to make it easier for the user to keep track of it.",
      render: { type: "text", maxLength: 254, editable: true },
    },
    color: {
      type: "string",
      desc: "A user configurable color, which is used for tags and UI to indicate where a tab is running.",
      pg_type: "VARCHAR(30)",
      render: { type: "color", editable: true },
    },
    cost_per_hour: {
      title: "Cost per Hour",
      desc: "The cost in US dollars per hour that this compute server cost us when it is provisioned. Any time the state is changed, this is set by the server to the proper cost.",
      type: "number",
      pg_type: "real",
    },
    deleted: {
      type: "boolean",
      desc: "True if the compute server has been deleted.",
    },
    project_id: {
      type: "uuid",
      desc: "The project id that this compute server provides compute for.",
      render: { type: "project_link" },
    },
    api_key: {
      type: "string",
      pg_type: "VARCHAR(128)",
      desc: "api key to connect to the project. This is created by the system right when we are going to create the VM, and gets deleted when we stop it. It's not set by the user and should not be revealed to the user.",
    },
    api_key_id: {
      type: "number",
      desc: "id of the api key; needed so we can delete it from database",
    },
    state_changed: {
      type: "timestamp",
      desc: "When the state last changed.",
    },
    error: {
      type: "string",
      desc: "In case something went wrong, e.g., in starting this compute server, this field will get set with a string error message to show the user. It's also cleared right when we try to start server.",
    },
    state: {
      type: "string",
      desc: "One of - 'off', 'starting', 'running', 'stopping', 'deprovisioned' (etc.). This is the underlying VM's state.",
      pg_type: "VARCHAR(16)",
    },
    autorestart: {
      type: "boolean",
      desc: "If true and the compute server stops for any reason, then it will be automatically started again. This is primarily useful for spot instances.",
    },
    cloud: {
      type: "string",
      pg_type: "varchar(30)",
      // NOTE(review): this list may be out of date (this file also imports
      // hyperstack types) -- confirm against the clouds definition.
      desc: "The cloud where this compute server runs: 'user', 'coreweave', 'lambda', 'google-cloud', 'aws', 'fluidstack'.",
    },
    configuration: {
      type: "map",
      pg_type: "jsonb",
      desc: "Cloud specific configuration of the computer at the cloud host. The format depends on the cloud",
    },
    provisioned_configuration: {
      type: "map",
      pg_type: "jsonb",
      desc: "Same as configuration, but this is the one we actually used last time we provisioned a VM in a cloud.",
    },
    data: {
      type: "map",
      pg_type: "jsonb",
      desc: "Arbitrary data about this server that is cloud provider specific. Store data here to facilitate working with the virtual machine, e.g., the id of the server when it is running, etc. This *MAY BE* returned to the user -- do not put secrets here the user can't see.",
    },
    avatar_image_tiny: {
      title: "Image",
      type: "string",
      desc: "tiny (32x32) visual image associated with the compute server. Suitable to include as part of changefeed, since about 3kb. Derived from avatar_image_full.",
      render: { type: "image" },
    },
    avatar_image_full: {
      title: "Image",
      type: "string",
      desc: "User configurable visual image associated with the compute server. Could be 150kb. NOT include as part of changefeed of projects, since potentially big (e.g., 200kb x 1000 projects = 200MB!).",
      render: { type: "image" },
    },
    purchase_id: {
      type: "number",
      desc: "if there is a current active purchase related to this compute server, this is the id of that purchase in the purchases table",
    },
    update_purchase: {
      type: "boolean",
      desc: "This is set to true if activity with this server is happening that warrants creating/ending a purchase.",
    },
    last_purchase_update: {
      type: "timestamp",
      desc: "Last time we requested an update to the purchase info about this compute server.",
    },
    position: {
      type: "number",
      desc: "Used for sorting a list of compute servers in the UI.",
    },
    last_edited: {
      type: "timestamp",
      desc: "Last time the configuration, state, etc., changed.",
    },
    last_edited_user: {
      type: "timestamp",
      desc: "Last time a user explicitly edited a file or used an application (e.g., terminal) on the compute server via the UI. This is like last_edited for projects, and is used to implement configuration.idleTimeoutMinutes.",
    },
    detailed_state: {
      type: "map",
      pg_type: "jsonb",
      desc: "Map from component name to something like {state:'running',time:Date.now()}, e.g., {vm: {state:'running', time:393939938484}}, filesystem: {state:'updating', time:939398484892}, uptime:{state:'22:56:33 up 3 days, 9:28, 0 users, load average: 0.93, 0.73, 0.56', time:?}}. This is used to provide users with insight into what's currently happening on their compute server.",
    },
    notes: NOTES,
    template: {
      type: "map",
      pg_type: "jsonb",
      desc: "Use this compute server configuration as a public template. Only admins can set this field for now. The exact structure of this jsonb is yet to be determined.",
    },
    vpn_ip: {
      type: "string",
      desc: "IP address of the compute server on the private encrypted project-wide VPN.",
    },
    vpn_public_key: {
      type: "string",
      desc: "Wireguard public key for this compute server.",
    },
    vpn_private_key: {
      type: "string",
      desc: "Wireguard private key for this compute server.",
    },
    project_specific_id: {
      type: "integer",
      desc: "A unique project-specific id assigned to this compute server. This is a positive integer that is guaranteed to be unique for compute servers *in a given project* and minimal when assigned (so it is as small as possible). This number is useful for distributed algorithms, since it can be used to ensure distinct sequence without any additional coordination. This is also useful to display to users so that the id number they see everywhere is not huge.",
    },
    course_project_id: {
      type: "uuid",
      desc: "If this is a compute server created for a student in a course, then this is the id of the project that the instructor(s) are using to host the course. IMPORTANT: Our security model is that a user can read info about a compute server if they are a collaborator on *either* the compute server's project_id OR on the course_project_id, if set (but then only via the compute_servers_by_course virtual table).",
    },
    course_server_id: {
      type: "integer",
      desc: "If this compute server is a clone of an instructor server in a course, this is the id of that instructor server.",
    },
    spend: {
      type: "number",
      desc: "If configuration.spendLimit is enabled, then the spend during the current period gets recorded here every few minutes. This is useful to efficiently provide a UI element showing the current spend status. It is cleared whenever configuration.spendLimit is changed, to avoid confusion.",
    },
  },
});
1121
1122
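// The compute_servers_by_course table is exactly like the compute_servers
// table, but instead of having to specify the project_id, a get query returns
// the compute servers whose course_project_id is a project that the
// requesting user is a collaborator on.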
1124
Table({
  name: "compute_servers_by_course",
  // Same columns as compute_servers; this is a virtual table on top of it.
  fields: schema.compute_servers.fields,
  rules: {
    primary_key: schema.compute_servers.primary_key,
    virtual: "compute_servers",
    user_query: {
      // Read-only: only a get query is defined here.
      get: {
        // only allow read access when course_project_id is a project
        // that client user is a collaborator on.
        pg_where: [
          {
            "course_project_id = ANY(select project_id from projects where users ? $::TEXT)":
              "account_id",
          },
        ],
        // Exactly the fields that are readable via compute_servers.
        fields: {
          ...schema.compute_servers.user_query?.get?.fields,
        },
      },
    },
  },
});
1147
1148
// Admin-only (CRM) view of the compute_servers table: a virtual table with
// the same columns, where both get and set queries require admin.
Table({
  name: "crm_compute_servers",
  fields: schema.compute_servers.fields,
  rules: {
    primary_key: schema.compute_servers.primary_key,
    virtual: "compute_servers",
    user_query: {
      get: {
        admin: true, // only admins can do get queries on this table
        // (without this, users who have read access could read)
        pg_where: [],
        fields: {
          ...schema.compute_servers.user_query?.get?.fields,
          // template is already among the fields spread in above; it is
          // listed again explicitly here.
          template: null,
        },
      },
      set: {
        admin: true,
        fields: {
          id: true,
          title: true,
          color: true,
          deleted: true,
          notes: true,
          template: true,
          // NOTE(review): state_control is not among the compute_servers
          // fields declared in this file -- confirm whether this entry is
          // still needed.
          state_control: null,
        },
      },
    },
  },
});
1179
1180
// Cache table keyed by (cloud, key), holding string values with an
// expiration time. No user_query rules are defined for it here.
Table({
  name: "compute_servers_cache",
  fields: {
    cloud: {
      type: "string",
      desc: "The cloud that we're caching information about",
    },
    key: {
      type: "string",
      desc: "The key for whatever we're caching.",
    },
    value: {
      type: "string",
      desc: "The cached data.",
    },
    expire: {
      type: "timestamp",
      desc: "When this cached value should be expired.",
    },
  },
  rules: {
    durability: "soft", // it's just a cache
    desc: "Cache data about what's going on in various clouds that are used to implement compute servers.",
    primary_key: ["cloud", "key"],
  },
});
1206
1207