CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
sagemathinc

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/util/db-schema/compute-servers.ts
Views: 923
1
/*
2
* This file is part of CoCalc: Copyright © 2023 Sagemath, Inc.
3
* License: MS-RSL – see LICENSE.md for details
4
*/
5
6
import type {
7
Region as HyperstackRegion,
8
VirtualMachine as HyperstackVirtualMachine,
9
} from "@cocalc/util/compute/cloud/hyperstack/api-types";
10
import { COLORS } from "@cocalc/util/theme";
11
import { ID, NOTES } from "./crm";
12
import { SCHEMA as schema } from "./index";
13
import { Table } from "./types";
14
export {
15
CLOUDS_BY_NAME,
16
GOOGLE_CLOUD_DEFAULTS,
17
ON_PREM_DEFAULTS,
18
} from "@cocalc/util/compute/cloud/clouds";
19
20
/**
 * Fallback disk sizes, used only in case something is wrong with the image
 * configuration. getMinDiskSizeGb returns one of these when the image does
 * not provide its own minDiskSizeGb.
 */
export const STANDARD_DISK_SIZE = 20;
export const CUDA_DISK_SIZE = 60;

// NOTE(review): presumably the period (in seconds, per the _S suffix) and the
// path used for compute server check-ins -- confirm against the code that
// consumes these constants.
export const CHECK_IN_PERIOD_S = 20;
export const CHECK_IN_PATH = "/cocalc/conf/check-in";
26
27
/**
 * Clients are recommended to wait this long after a purchase ends before
 * requesting the cost. This should give us about a day of wiggle room.
 * There is no SLA on billing data.
 */
const GOOGLE_COST_LAG_DAYS = 2;

/** The same lag as GOOGLE_COST_LAG_DAYS, expressed in milliseconds. */
export const GOOGLE_COST_LAG_MS = GOOGLE_COST_LAG_DAYS * (24 * 60 * 60 * 1000);
32
33
// Compute Server Images -- typings. See packages/server/compute/images.ts for
34
// how the actual data is populated.
35
36
/** One built version of a compute server image. */
export interface ImageVersion {
  /**
   * Must be given and distinct for each version; this typically identifies
   * the image to docker.
   */
  tag: string;
  /** Usually the upstream version; defaults to `tag` if not given. */
  version?: string;
  /** What is displayed to the user; defaults to `tag`. */
  label?: string;
  /**
   * If this is not set to true, then this version should not be shown by
   * default. If not tested, only show to users who explicitly really want
   * this (e.g., admins).
   */
  tested?: boolean;
}
47
48
// Default idle timeout, in minutes (per the name).
// NOTE(review): presumably the value used for configuration.idleTimeoutMinutes
// when the user has not chosen one -- confirm at the use site.
export const IDLE_TIMEOUT_MINUTES_DEFAULT = 30;
49
50
/**
 * Values that validatedHealthCheck substitutes for any health check field
 * that is missing or invalid.
 */
export const HEALTH_CHECK_DEFAULTS = {
  command: "pwd",
  initialDelaySeconds: 10 * 60,
  timeoutSeconds: 30,
  periodSeconds: 60,
  failureThreshold: 3,
  enabled: false,
  action: "reboot",
};

/** The actions that can be configured to happen when the health check fails. */
export const HEALTH_CHECK_ACTIONS = [
  "reboot",
  "stop",
  "suspend",
  "deprovision",
];
type HealthCheckAction = (typeof HEALTH_CHECK_ACTIONS)[number];

/**
 * Sanitize an untrusted health check configuration.
 *
 * Every field is coerced to its expected type, and any field that is missing
 * or out of range is replaced by the corresponding HEALTH_CHECK_DEFAULTS value:
 *   - command: converted to a string; the default is used if it is null/undefined
 *     (so a missing command no longer becomes the literal string "undefined")
 *   - periodSeconds, initialDelaySeconds: finite numbers >= 0
 *   - failureThreshold: finite number >= 1
 *   - timeoutSeconds: finite number >= 5
 *   - enabled: coerced to boolean
 *   - action: must be one of HEALTH_CHECK_ACTIONS
 *
 * @param healthCheck - arbitrary user supplied object
 * @returns the validated health check, or undefined if the input is null/undefined
 */
export function validatedHealthCheck(
  healthCheck?: any,
): HealthCheck | undefined {
  if (healthCheck == null) {
    return undefined;
  }

  // Parse value as a float, using fallback when it is missing, not a finite
  // number, or smaller than min.
  const numberOrDefault = (
    value: unknown,
    min: number,
    fallback: number,
  ): number => {
    const parsed = parseFloat(String(value ?? fallback));
    return parsed < min || !isFinite(parsed) ? fallback : parsed;
  };

  const { command, action } = healthCheck;
  return {
    command: `${command ?? HEALTH_CHECK_DEFAULTS.command}`,
    initialDelaySeconds: numberOrDefault(
      healthCheck.initialDelaySeconds,
      0,
      HEALTH_CHECK_DEFAULTS.initialDelaySeconds,
    ),
    timeoutSeconds: numberOrDefault(
      healthCheck.timeoutSeconds,
      5,
      HEALTH_CHECK_DEFAULTS.timeoutSeconds,
    ),
    periodSeconds: numberOrDefault(
      healthCheck.periodSeconds,
      0,
      HEALTH_CHECK_DEFAULTS.periodSeconds,
    ),
    failureThreshold: numberOrDefault(
      healthCheck.failureThreshold,
      1,
      HEALTH_CHECK_DEFAULTS.failureThreshold,
    ),
    enabled: !!healthCheck.enabled,
    action: HEALTH_CHECK_ACTIONS.includes(action)
      ? action
      : HEALTH_CHECK_DEFAULTS.action,
  };
}

/** Configuration of the periodic health check of a compute server. */
export interface HealthCheck {
  /**
   * Run the command with given args on the compute server.
   * If the command fails (nonzero exit code) failureThreshold times, then the
   * action happens.
   * NOTE(review): the previous comment also said "If it contains the
   * deprovision string, then it deprovisions" -- unclear what that refers to;
   * confirm against the code that runs the health check.
   */
  command: string;
  /** timeout for running the command */
  timeoutSeconds: number;
  /** initial delay */
  initialDelaySeconds: number;
  /** period in seconds to wait between running the command */
  periodSeconds: number;
  /** When a probe fails, CoCalc will try failureThreshold times before doing the action. */
  failureThreshold: number;

  action: HealthCheckAction;
  enabled: boolean;
}
141
142
/**
 * One entry of the https proxy server configuration (used by Image.proxy and
 * BaseConfiguration.proxy).
 * NOTE(review): presumably requests matching `path` are forwarded to `target`,
 * and `ws` enables websocket proxying -- confirm against the proxy server.
 */
interface ProxyRoute {
  path: string;
  target: string;
  ws?: boolean;
}
147
148
/** Description of one compute server image. */
export interface Image {
  /** What we show the user to describe this image, e.g., in the image select menu. */
  label: string;
  /** The name of the package on npmjs or dockerhub. */
  package?: string;
  /** In case there is a different package name for ARM64, the name of it. */
  package_arm64?: string;
  /** Root filesystem image must be at least this big in GB. */
  minDiskSizeGb?: number;
  /**
   * Rough estimate of compressed size of Docker image; useful to get a sense
   * of how long it will take to download image on clouds without
   * pregenerated images.
   */
  dockerSizeGb?: number;
  /** Description in MARKDOWN to show user of this image. Can include links. */
  description?: string;
  /** Upstream URL for this image, e.g., https://julialang.org/ for the Julia image. */
  url?: string;
  /** Icon to show next to the label for this image. */
  icon?: string;
  /** Link to a URL with the source for building this image. */
  source: string;
  /** Optional list of links to videos about this image, ordered from lowest to highest priority. */
  videos?: string[];
  /** Optional list of links to tutorials. */
  tutorials?: string[];
  /**
   * The versions of this image that we claim to have built.
   * The ones with role='prod' (or not specified) are shown to users as options.
   * NOTE(review): ImageVersion as declared in this file has a `tested` flag
   * and no `role` field -- this sentence may be out of date.
   */
  versions: ImageVersion[];
  /** If true, then a GPU is required to use this image. */
  gpu?: boolean;
  /** If true, then the microk8s snap is required to use this image. */
  microk8s?: boolean;
  /** If true, image has web interface that supports configurable auth token. */
  authToken?: boolean;
  /**
   * If false, no jupyter kernels included. If true or a list of names, there
   * are kernels available – used in frontend/jupyter/select-kernel.tsx
   */
  jupyterKernels?: boolean | string[];
  /**
   * Some images only make sense to use over the web, and the web server just
   * won't work without DNS setup properly (e.g., VS Code with LEAN).
   * Ignored for on prem.
   * NOTE(review): the previous comment read "If set to true, do not allow
   * creating this compute server with a DNS subdomain", which contradicts the
   * field name and the rationale above; presumably "without" was intended.
   */
  requireDns?: boolean;
  /** If true, this is a system container that is not for user compute. */
  system?: boolean;
  /** If true, this image is completely disabled, so will not be used in any way. */
  disabled?: boolean;
  /** Optional integer used for sorting options to display to user. The bigger the higher. */
  priority?: number;
  /**
   * - false: do NOT run https proxy server on host VM
   * - nothing given: runs proxy server with no default config (so does nothing)
   * - otherwise: array of proxy config.
   */
  proxy?: false | ProxyRoute[];
  apps?: {
    [name: string]: {
      icon: string;
      label: string;
      url: string;
      path: string;
      launch: string;
      requiresDns?: boolean;
    };
  };
}
211
212
// Map from image name to its description; getMinDiskSizeGb looks entries up
// by configuration.image.
export type Images = { [name: string]: Image };
213
214
/**
 * Information about one Google Cloud image.
 * NOTE(review): field names look like they mirror the Google Compute Engine
 * image resource -- confirm where these objects are populated.
 */
export interface GoogleCloudImage {
  labels: Record<string, string>;
  diskSizeGb: number;
  creationTimestamp: string;
}

/** Map from name to GoogleCloudImage. */
export type GoogleCloudImages = Record<string, GoogleCloudImage>;
220
221
/**
 * Massage a string toward something valid as a name on Google Cloud:
 * every "." and "_" becomes "-", the result is lowercased, and it is
 * truncated to at most 63 characters.
 *
 * This is probably not sufficient: any other character is passed through
 * unchanged.
 */
export function makeValidGoogleName(s: string): string {
  const maxLength = 63;
  const dashed = s.split(/[._]/).join("-");
  const lowered = dashed.toLowerCase();
  return lowered.substring(0, maxLength);
}
225
226
/**
 * Possible states of a compute server. STATE_INFO describes each of them,
 * including which are stable and which are transitional.
 */
export type State =
  | "off"
  | "starting"
  | "running"
  | "stopping"
  | "deprovisioned"
  | "suspending"
  | "suspended"
  | "unknown";
235
236
/** Used for sorting by state -- ordered from most alive to least alive. */
export const ORDERED_STATES: State[] = [
  "running",
  "starting",
  "stopping",
  "suspending",
  "suspended",
  "off",
  "deprovisioned",
  "unknown",
];

/**
 * Map from each state to its position in ORDERED_STATES (0 = most alive).
 * Filled in with the array index, so no module-level counter is needed.
 */
export const STATE_TO_NUMBER: { [state: string]: number } = {};
ORDERED_STATES.forEach((state, position) => {
  STATE_TO_NUMBER[state] = position;
});
253
254
/**
 * Determine the CPU architecture of a compute server from its configuration.
 *
 * - onprem: the explicitly configured `arch`, defaulting to "x86_64"
 * - google-cloud: "arm64" if the first dash-separated part of machineType
 *   ends with "a", otherwise "x86_64"
 * - any other cloud: "x86_64"
 */
export function getArchitecture(configuration: Configuration): Architecture {
  switch (configuration.cloud) {
    case "onprem":
      return configuration.arch ?? "x86_64";

    case "google-cloud": {
      const family = configuration.machineType.split("-")[0];
      // The known ARM machines are: t2a-, c4a-
      // Everything else ends with a number or d.
      // Hopefully this pattern persists.
      return family.endsWith("a") ? "arm64" : "x86_64";
    }

    default:
      // no ARM outside of GCP right now
      return "x86_64";
  }
}
272
273
/**
 * Whether suspend/resume is supported for the given configuration.
 * All of the following must hold:
 *   - the cloud is google-cloud
 *   - the architecture is x86_64 (TODO: suspend/resume breaks the clock badly
 *     on ARM64, and I haven't figured out a workaround, so don't support it
 *     for now. I guess this is a GCP bug.)
 *   - there is no gpu
 *
 * GCP also requires <= 208GB of RAM, see
 * https://cloud.google.com/compute/docs/instances/suspend-resume-instance
 * NOTE(review): the RAM limit is not checked here.
 */
function supportsSuspend(configuration: Configuration): boolean {
  return (
    configuration.cloud == "google-cloud" &&
    getArchitecture(configuration) == "x86_64" &&
    !configuration.acceleratorType
  );
}
289
290
/**
 * Actions that can be requested on a compute server. ACTION_INFO holds the
 * description of each one.
 */
export type Action =
  | "start"
  | "resume"
  | "stop"
  | "suspend"
  | "deprovision"
  | "reboot";
297
298
/**
 * Metadata about each action: how it is presented (label, icon, tip,
 * description), whether it needs confirmation or is dangerous, and the stable
 * state it leads to.
 *
 * NOTE(review): `clouds` presumably restricts the action to the listed clouds
 * and `isSupported` to matching configurations -- both are only declared
 * here; confirm how the consumers interpret them.
 */
export const ACTION_INFO: {
  [action: string]: {
    label: string;
    icon: string;
    tip: string;
    description: string;
    confirm?: boolean;
    confirmMessage?: string;
    danger?: boolean;
    /** target stable state after doing this action. */
    target: State;
    clouds?: Cloud[];
    isSupported?: (configuration: Configuration) => boolean;
  };
} = {
  start: {
    label: "Start",
    icon: "play",
    tip: "Start",
    description: "Start the compute server running.",
    target: "running",
  },

  resume: {
    label: "Resume",
    icon: "play",
    clouds: ["google-cloud"],
    tip: "Resume",
    description: "Resume the compute server from suspend.",
    target: "running",
    isSupported: supportsSuspend,
  },

  stop: {
    label: "Stop",
    icon: "stop",
    tip: "Turn off",
    description:
      "Turn the compute server off. No data on disk is lost, but any data and state in memory will be lost. This is like turning your laptop off.",
    confirm: true,
    target: "off",
  },

  deprovision: {
    label: "Deprovision",
    icon: "trash",
    tip: "Deprovision the virtual machine",
    description:
      "Deprovisioning DELETES THE VIRTUAL MACHINE BOOT DISK, but keeps the compute server parameters. There are no costs associated with a deprovisioned compute server, and you can move it to a different region or zone. Any files in the home directory of your project are not affected.",
    confirm: true,
    confirmMessage:
      "I understand that my compute server disks will be deleted.",
    danger: true,
    target: "deprovisioned",
  },

  reboot: {
    label: "Hard Reboot",
    icon: "refresh",
    tip: "Hard reboot the virtual machine.",
    description:
      "Perform a HARD reset on the virtual machine, which wipes the memory contents and resets the virtual machine to its initial state. This should not delete data from the disk, but can lead to filesystem corruption.",
    confirm: true,
    confirmMessage:
      "I understand that this can lead to filesystem corruption and is slightly dangerous.",
    danger: true,
    target: "running",
    clouds: ["google-cloud", "hyperstack"],
  },

  suspend: {
    label: "Suspend",
    icon: "pause",
    clouds: ["google-cloud"],
    tip: "Suspend disk and memory state",
    confirm: true,
    description:
      "Suspend the compute server. No data on disk or memory is lost, and you are only charged for storing disk and memory. This is like closing your laptop screen. You can leave a compute server suspended for up to 60 days before it automatically shuts off.",
    target: "suspended",
    isSupported: supportsSuspend,
  },
};
374
375
/**
 * Metadata about each state: its label, icon and color, the actions listed
 * for it, whether it is stable, and -- for transitional states -- the state
 * it is heading to.
 */
export const STATE_INFO: {
  [state: string]: {
    label: string;
    actions: Action[];
    icon: string;
    color?: string;
    stable?: boolean;
    /** if not stable, this is the target state it is heading to */
    target?: State;
  };
} = {
  off: {
    label: "Off",
    color: "#ff4b00",
    actions: ["start", "deprovision"],
    icon: "stop",
    stable: true,
  },

  suspended: {
    label: "Suspended",
    actions: ["resume", "deprovision", "stop"],
    icon: "pause",
    color: "#0097a7",
    stable: true,
  },

  suspending: {
    label: "Suspending",
    actions: ["suspend"],
    icon: "pause",
    color: "#00bcd4",
    stable: false,
    target: "suspended",
  },

  starting: {
    label: "Starting",
    color: "#388e3c",
    actions: ["start"],
    icon: "bolt",
    stable: false,
    target: "running",
  },

  running: {
    label: "Running",
    color: COLORS.RUN,
    actions: ["stop", "deprovision", "reboot", "suspend"],
    icon: "run",
    stable: true,
  },

  stopping: {
    label: "Stopping",
    color: "#ff9800",
    actions: ["stop"],
    icon: "hand",
    stable: false,
    target: "off",
  },

  unknown: {
    label: "Unknown (click to refresh)",
    actions: [],
    icon: "question-circle",
    stable: true,
  },

  deprovisioned: {
    label: "Deprovisioned",
    actions: ["start"],
    color: "#888",
    icon: "minus-square",
    stable: true,
  },
};
444
445
/**
 * The stable state that a state or an action leads to.
 *
 * - for an action: the `target` from ACTION_INFO
 * - for a state that is not stable: its `target` from STATE_INFO (or the
 *   state itself if no target is given)
 * - for a stable state: the state itself
 *
 * @throws Error if x is neither a known action nor a known state
 */
export function getTargetState(x: State | Action): State {
  const actionInfo = ACTION_INFO[x];
  if (actionInfo != null) {
    return actionInfo.target;
  }

  const stateInfo = STATE_INFO[x];
  if (stateInfo == null) {
    throw Error(`x =${x} must be a state or action`);
  }

  return stateInfo.stable ? (x as State) : ((stateInfo.target ?? x) as State);
}
457
458
// CPU architecture of a compute server; computed by getArchitecture and used
// by getImageField to choose the package field of an image.
export type Architecture = "x86_64" | "arm64";
459
460
/**
 * Name of the Image field that holds the package name for the given
 * architecture: "package" for x86_64 and "package_arm64" otherwise.
 *
 * Convention is used in cocalc-compute-docker for making
 * the npm packages @cocalc/compute-server. Don't mess with it!
 */
export function getImageField(arch: Architecture): "package" | "package_arm64" {
  if (arch == "x86_64") {
    return "package";
  }
  return "package_arm64";
}
465
466
/**
 * Identifier of the cloud that a compute server runs in.
 * NOTE(review): only some of these have a configuration interface in this
 * file ("any", "aws" and "test" do not) -- presumably placeholders or
 * internal values; confirm.
 */
export type Cloud =
  | "any"
  | "onprem"
  | "core-weave"
  | "hyperstack"
  | "lambda-cloud"
  | "google-cloud"
  | "aws"
  | "fluid-stack"
  | "test";
476
477
/**
 * Minimum disk size in GB for a compute server with the given configuration.
 *
 * If the configured image has a (truthy) minDiskSizeGb in IMAGES, that is
 * returned. Otherwise fall back to CUDA_DISK_SIZE when an acceleratorType is
 * configured, and to STANDARD_DISK_SIZE when not.
 */
export function getMinDiskSizeGb({
  configuration,
  IMAGES,
}: {
  configuration;
  IMAGES: Images;
}) {
  const imageName = configuration?.image;
  const imageMinimum = imageName
    ? IMAGES[imageName]?.minDiskSizeGb
    : undefined;
  if (imageMinimum) {
    return imageMinimum;
  }
  // TODO: will have to do something based on actual image size,
  // maybe, unless I come up with a clever trick involving
  // one PD mounted on many machines (?).
  return configuration?.acceleratorType ? CUDA_DISK_SIZE : STANDARD_DISK_SIZE;
}
499
500
/**
 * This means "you can spend at most `dollars` during every `hours` hours on a
 * RUNNING compute server".
 */
export interface SpendLimit {
  hours: number;
  dollars: number;
  enabled: boolean;
}

/** Values used by validatedSpendLimit for fields that are missing or invalid. */
export const SPEND_LIMIT_DEFAULTS = {
  hours: 24 * 7,
  dollars: 25,
  enabled: false,
};

/**
 * Sanitize an untrusted spend limit.
 *
 * `hours` and `dollars` are parsed as floats; a value that is missing,
 * negative, or not a finite number is replaced by the corresponding
 * SPEND_LIMIT_DEFAULTS value. (Previously a *missing* value silently became 0
 * rather than the default, unlike the other validators in this file.) An
 * explicitly given 0 is kept as is. `enabled` is coerced to a boolean.
 *
 * @param spendLimit - arbitrary user supplied object
 * @returns the validated spend limit, or undefined if the input is null/undefined
 */
export function validatedSpendLimit(spendLimit?: any): SpendLimit | undefined {
  if (spendLimit == null) {
    return undefined;
  }

  // Parse value as a float, using fallback when it is missing, negative or
  // not a finite number.
  const nonNegativeOrDefault = (value: unknown, fallback: number): number => {
    const parsed = parseFloat(String(value ?? fallback));
    return parsed < 0 || !isFinite(parsed) ? fallback : parsed;
  };

  return {
    enabled: !!spendLimit.enabled,
    hours: nonNegativeOrDefault(spendLimit.hours, SPEND_LIMIT_DEFAULTS.hours),
    dollars: nonNegativeOrDefault(
      spendLimit.dollars,
      SPEND_LIMIT_DEFAULTS.dollars,
    ),
  };
}
529
530
/**
 * Human readable name for a spend limit period given in hours: "day", "week",
 * "month" or "year" for the specific values below, and "<hours> hours" for
 * anything else. The comparison is loose (==), so numeric strings match too.
 *
 * NOTE(review): the "month" and "year" values include a factor of 7
 * (30.5 * 24 * 7 hours is about 7 months) -- this looks like a copy/paste
 * slip from the "week" case, but whatever sets these periods may store
 * exactly these values, so confirm both sides before changing them.
 */
export function spendLimitPeriod(hours) {
  const namedPeriods: [number, string][] = [
    [24, "day"],
    [24 * 7, "week"],
    [30.5 * 24 * 7, "month"],
    [12 * 30.5 * 24 * 7, "year"],
  ];
  for (const [periodHours, name] of namedPeriods) {
    if (hours == periodHours) {
      return name;
    }
  }
  return `${hours} hours`;
}
545
546
// 10:00:00.000 local time on the day this module is loaded.
const tenAM = new Date();
tenAM.setHours(10, 0, 0, 0);

/** Value used by validatedShutdownTime when epochMs is missing or invalid. */
export const DEFAULT_SHUTDOWN_TIME = {
  epochMs: tenAM.getTime(),
  enabled: false,
};

/** A time (given as ms since the epoch) plus a flag that turns it on or off. */
export interface ShutdownTime {
  epochMs: number;
  enabled?: boolean;
}

/**
 * Sanitize an untrusted shutdown time.
 *
 * `epochMs` is parsed as a float; if it is missing, negative, or not a finite
 * number, DEFAULT_SHUTDOWN_TIME.epochMs is used instead. `enabled` is coerced
 * to a boolean.
 *
 * @param shutdownTime - arbitrary user supplied object
 * @returns the validated shutdown time, or undefined if the input is null/undefined
 */
export function validatedShutdownTime(
  shutdownTime?: any,
): ShutdownTime | undefined {
  if (shutdownTime == null) {
    return undefined;
  }
  const parsed = parseFloat(
    shutdownTime.epochMs ?? DEFAULT_SHUTDOWN_TIME.epochMs,
  );
  const isValid = isFinite(parsed) && parsed >= 0;
  return {
    enabled: !!shutdownTime.enabled,
    epochMs: isValid ? parsed : DEFAULT_SHUTDOWN_TIME.epochMs,
  };
}
572
573
/** Configuration fields shared by the cloud-specific configurations that extend this interface. */
interface BaseConfiguration {
  /**
   * Name of the image to use, e.g. 'python' or 'pytorch'.
   * Images are managed in src/packages/server/compute/images.ts
   */
  image: string;
  /**
   * Tag for the image to use when starting the compute server.
   * This references the versions field of the image.
   * If the tag is not given or not available, we use the latest
   * available tag.
   */
  tag?: string;
  /** Tag for the file system container. */
  tag_filesystem?: string;
  /** Tag for the @cocalc/compute-server package. */
  tag_cocalc?: string;
  /**
   * If the string is set and the VM has an external ip address
   * and dns is configured, then point https://{dns}....
   * with ssl proxying to this compute server when it is running.
   */
  dns?: string;
  /**
   * Array of top level directories to exclude from sync.
   * These can't have "|" in them, since we use that as a separator.
   * Use "~" to completely disable sync.
   */
  excludeFromSync?: readonly string[];
  /**
   * If true, view data on the compute server as ephemeral.
   * Currently this is only meant to impact the user interface.
   */
  ephemeral?: boolean;
  /** Token used for authentication at https://compute-server... */
  authToken?: string;
  /** Configuration of the https proxy server. */
  proxy?: ProxyRoute[];
  /**
   * If this compute server stops pinging us, e.g., due to being preempted or
   * just crashing due to out of memory (etc) should we automatically do a
   * forced restart. Note that currently for on prem this isn't possible.
   */
  autoRestart?: boolean;
  /** Used to temporarily disable autoRestart to avoid accidentally triggering it. */
  autoRestartDisabled?: boolean;
  /**
   * Allow collaborators to control the state of the compute server.
   * They cannot change any other configuration. User still pays for
   * everything and owns compute server.
   */
  allowCollaboratorControl?: boolean;

  // AUTOMATIC SHUTDOWN configuration:

  /**
   * Limit spending: turn compute server off if spend more than dollars during
   * the last hours. This can only be set by the owner.
   */
  spendLimit?: SpendLimit;
  idleTimeoutMinutes?: number;
  healthCheck?: HealthCheck;
  /**
   * number = ms since epoch defines a time; at *that* time each day, the
   * server is turned off.
   */
  shutdownTime?: ShutdownTime;
}
620
621
/**
 * Names of the BaseConfiguration fields that make up the automatic shutdown
 * configuration.
 */
export const AUTOMATIC_SHUTDOWN_FIELDS = [
  "spendLimit",
  "idleTimeoutMinutes",
  "healthCheck",
  "shutdownTime",
];
627
628
/** Configuration of a compute server with cloud "lambda-cloud". */
interface LambdaConfiguration extends BaseConfiguration {
  cloud: "lambda-cloud";
  instance_type_name: string;
  region_name: string;
}
633
634
/** Configuration of a compute server with cloud "hyperstack". */
export interface HyperstackConfiguration extends BaseConfiguration {
  cloud: "hyperstack";
  flavor_name: string;
  region_name: HyperstackRegion;
  /**
   * An integer >= 1. It defaults to 10.
   * It's the size of the /data partition. It's implemented
   * using 1 or more hyperstack (=ceph) volumes, which are combined
   * together as a ZFS pool. If the compute server is
   * named "foo", the volumes are named "foo-1", "foo-2",
   * "foo-3", etc.
   * There is also always a separate 50GB root volume, which
   * is named "foo-0", and whose size is not configurable.
   * NOTE: users install packages "systemwide" inside of
   * a docker container and we configure docker to store
   * its data in the zpool, so that's in here too.
   */
  diskSizeGb: number;
}
651
652
/**
 * CPU type names for CoreWeave; the same names are spelled out in
 * CoreWeaveConfiguration.cpu.type.
 */
export const COREWEAVE_CPU_TYPES = [
  "amd-epyc-rome",
  "amd-epyc-milan",
  "intel-xeon-v1",
  "intel-xeon-v2",
  "intel-xeon-v3",
  "intel-xeon-v4",
  "intel-xeon-scalable",
] as const;
661
662
/**
 * GPU type names for CoreWeave; the same names are spelled out in
 * CoreWeaveConfiguration.gpu.type.
 */
export const COREWEAVE_GPU_TYPES = [
  "Quadro_RTX_4000",
  "Quadro_RTX_5000",
  "RTX_A4000",
  "RTX_A5000",
  "RTX_A6000",
  "A40",
  "Tesla_V100_PCIE",
  "Tesla_V100_NVLINK",
  "A100_PCIE_40GB",
  "A100_PCIE_80GB",
  "A100_NVLINK_40GB",
  "A100_NVLINK_80GB",
] as const;
676
677
/**
 * Configuration of a compute server with cloud "core-weave".
 *
 * The gpu and cpu type unions are written out explicitly; they list the same
 * names as COREWEAVE_GPU_TYPES and COREWEAVE_CPU_TYPES and must be kept in
 * sync with them by hand.
 */
interface CoreWeaveConfiguration extends BaseConfiguration {
  cloud: "core-weave";
  gpu: {
    // same as (typeof COREWEAVE_GPU_TYPES)[number]
    type:
      | "Quadro_RTX_4000"
      | "Quadro_RTX_5000"
      | "RTX_A4000"
      | "RTX_A5000"
      | "RTX_A6000"
      | "A40"
      | "Tesla_V100_PCIE"
      | "Tesla_V100_NVLINK"
      | "A100_PCIE_40GB"
      | "A100_PCIE_80GB"
      | "A100_NVLINK_40GB"
      | "A100_NVLINK_80GB";
    count: number;
  };
  cpu: {
    count: number;
    // same as (typeof COREWEAVE_CPU_TYPES)[number]
    type?:
      | "amd-epyc-rome"
      | "amd-epyc-milan"
      | "intel-xeon-v1"
      | "intel-xeon-v2"
      | "intel-xeon-v3"
      | "intel-xeon-v4"
      | "intel-xeon-scalable";
  };
  /** e.g., "12Gi" */
  memory: string;
  storage?: {
    root: {
      /** e.g., '40Gi' */
      size: string;
    };
  };
}
713
714
/** Configuration of a compute server with cloud "fluid-stack". */
interface FluidStackConfiguration extends BaseConfiguration {
  cloud: "fluid-stack";
  plan: string;
  region: string;
  os: string;
}
720
/** Accelerator type names that can be configured for a Google Cloud compute server. */
export type GoogleCloudAcceleratorType =
  | "nvidia-h200-141gb"
  | "nvidia-h100-80gb"
  | "nvidia-a100-80gb"
  | "nvidia-tesla-a100"
  | "nvidia-l4"
  | "nvidia-tesla-t4"
  | "nvidia-tesla-v100"
  | "nvidia-tesla-p4"
  | "nvidia-tesla-p100";

/** Every GoogleCloudAcceleratorType, as a runtime list in the same order as the type. */
export const GOOGLE_CLOUD_ACCELERATOR_TYPES: GoogleCloudAcceleratorType[] = [
  "nvidia-h200-141gb",
  "nvidia-h100-80gb",
  "nvidia-a100-80gb",
  "nvidia-tesla-a100",
  "nvidia-l4",
  "nvidia-tesla-t4",
  "nvidia-tesla-v100",
  "nvidia-tesla-p4",
  "nvidia-tesla-p100",
];
742
743
/** Disk type names that can be configured for a Google Cloud compute server. */
export type GoogleCloudDiskType =
  | "pd-standard"
  | "pd-balanced"
  | "pd-ssd"
  | "hyperdisk-balanced";

/**
 * Every GoogleCloudDiskType, as a runtime list.
 *
 * NOTE: hyperdisks are complicated and multidimensional, but for cocalc
 * we just hardcode options for the iops and bandwidth, and allow the
 * user to adjust the size. Also, "hyperdisk-balanced" means hyperdisk
 * with the defaults for iops and bandwidth defined in
 * src/packages/util/compute/cloud/google-cloud/compute-cost.ts
 */
export const GOOGLE_CLOUD_DISK_TYPES: GoogleCloudDiskType[] = [
  "pd-standard",
  "pd-balanced",
  "pd-ssd",
  "hyperdisk-balanced",
];
760
761
/** Configuration of a compute server with cloud "google-cloud". */
export interface GoogleCloudConfiguration extends BaseConfiguration {
  cloud: "google-cloud";
  region: string;
  zone: string;
  machineType: string;
  /** Use a spot instance if spot is true. */
  spot?: boolean;
  /**
   * The boot disk: diskSizeGb is an integer >= 10. It defaults to 10.
   * It's the size of the boot disk.
   */
  diskSizeGb?: number;
  hyperdiskBalancedIops?: number;
  hyperdiskBalancedThroughput?: number;
  diskType?: GoogleCloudDiskType;
  acceleratorType?: GoogleCloudAcceleratorType;
  /** The allowed number depends on the accelerator; it defaults to 1. */
  acceleratorCount?: number;
  // minCpuPlatform
  terminationTime?: Date;
  maxRunDurationSeconds?: number;
  /** If true, use newest image, whether or not it is labeled with prod=true. */
  test?: boolean;
  /**
   * An image name of the form "2023-09-13-063355-test", i.e., a timestamp in
   * that format followed by an optional string. Whether or not to use cuda
   * and the arch are determined by parameters above. This is meant to be used
   * for two purposes (1) testing before deploying to production, and
   * (2) stability, so a given compute server has the exact same base image
   * every time it is started, instead of being updated. Regarding (2), this
   * might not be needed, but we'll see. If image is not set, we use the
   * newest image that is tagged prod:true, or it's an error if no such image
   * exists. This is all about Google Cloud images, not the IMAGES object
   * defined elsewhere.
   */
  sourceImage?: string;
  /** If true, then we have an external ip address. */
  externalIp?: boolean;
  /**
   * If true, can run full VM's inside of the machine, but there is 10%
   * performance penalty. This will only work for Intel non-e2 non-a3 instance
   * types. No AMD and no ARM64.
   */
  enableNestedVirtualization?: boolean;
}
797
798
/**
 * Configuration of a compute server with cloud "onprem".
 * getArchitecture treats a missing `arch` as "x86_64".
 */
export interface OnPremCloudConfiguration extends BaseConfiguration {
  cloud: "onprem";
  arch?: Architecture;
  gpu?: boolean;
}
803
804
/**
 * Configuration of a compute server: one of the cloud-specific
 * configurations, told apart by the literal `cloud` field.
 */
export type Configuration =
  | LambdaConfiguration
  | HyperstackConfiguration
  | CoreWeaveConfiguration
  | FluidStackConfiguration
  | GoogleCloudConfiguration
  | OnPremCloudConfiguration;
811
812
/** Fields shared by the cloud-specific data objects that extend this interface. */
interface BaseData {
  cloudflareId?: string;
  externalIp?: string;
  internalIp?: string;
}

/** Data stored about a compute server with cloud "lambda-cloud". */
export interface LambdaCloudData extends BaseData {
  cloud: "lambda-cloud";
  instance_id: string;
}

/** Data stored about a compute server with cloud "hyperstack". */
export interface HyperstackData extends BaseData {
  cloud: "hyperstack";
  /** name we are using for the vm */
  name?: string;
  /** hyperstack description of this vm. */
  vm?: HyperstackVirtualMachine;
  /**
   * id's of persistent storage, with first id the boot disk.
   * disks are named {name}-0, {name}-1, {name}-2, etc.,
   * with {name}-0 being the boot disk.
   */
  disks?: number[];
  creationTimestamp?: Date;
}

/** Data stored about a compute server with cloud "google-cloud". */
export interface GoogleCloudData extends BaseData {
  cloud: "google-cloud";
  name?: string;
  state?: State;
  cpuPlatform?: string;
  creationTimestamp?: Date;
  lastStartTimestamp?: Date;
}
844
845
// Cloud-specific data about a compute server; the members are told apart by
// their literal `cloud` field.
export type Data = GoogleCloudData | LambdaCloudData | HyperstackData;
846
847
/**
 * State of one component, as stored in ComputeServerUserInfo.detailed_state.
 * NOTE(review): `time` and `expire` are presumably ms since the epoch --
 * confirm against the code that writes detailed_state.
 */
export interface ComponentState {
  state: string;
  time: number;
  expire?: number;
}
852
853
/** Shape of the `template` field of a compute server (see ComputeServerUserInfo.template). */
export interface ComputeServerTemplate {
  enabled?: boolean;
  priority?: number;
}
857
858
/**
 * The fields of a compute server, excluding the api key fields that
 * ComputeServer adds on top of this interface.
 */
export interface ComputeServerUserInfo {
  id: number;
  /** the project_specific_id of this compute server -- unique within project, minimal */
  project_specific_id?: number;
  account_id: string;
  project_id: string;
  title?: string;
  color?: string;
  cost_per_hour?: number;
  deleted?: boolean;
  state_changed?: Date;
  started_by?: string;
  error?: string;
  state?: State;
  // google-cloud has a new "Time limit" either by hour or by date, which seems like a great idea!
  // time_limit
  autorestart?: boolean;
  cloud: Cloud;
  configuration: Configuration;
  provisioned_configuration?: Configuration;
  data?: Data;
  purchase_id?: number;
  last_edited?: Date;
  last_edited_user?: Date;
  /** used for UI sorting. */
  position?: number;
  detailed_state?: { [name: string]: ComponentState };
  update_purchase?: boolean;
  last_purchase_update?: Date;
  template?: ComputeServerTemplate;
  spend?: number;
}
888
889
/** A compute server including its api key fields, in addition to everything in ComputeServerUserInfo. */
export interface ComputeServer extends ComputeServerUserInfo {
  /** project level api key for the project */
  api_key?: string;
  /** id of the api key (needed so we can delete it from database). */
  api_key_id?: number;
}
893
894
Table({
895
name: "compute_servers",
896
rules: {
897
primary_key: "id",
898
// unique vpn ip address *within* a given project only:
899
pg_unique_indexes: [
900
"(project_id, vpn_ip)",
901
"(project_id, project_specific_id)",
902
],
903
user_query: {
904
get: {
905
pg_where: [{ "project_id = $::UUID": "project_id" }],
906
throttle_changes: 0, // do not make this bigger; UI really feels off if throttled
907
fields: {
908
id: null,
909
account_id: null,
910
created: null,
911
title: null,
912
color: null,
913
cost_per_hour: null,
914
deleted: null,
915
project_id: null,
916
state_changed: null,
917
error: null,
918
state: null,
919
autorestart: null,
920
cloud: null,
921
configuration: null,
922
data: null,
923
provisioned_configuration: null,
924
avatar_image_tiny: null,
925
last_edited: null,
926
last_edited_user: null,
927
purchase_id: null,
928
position: null,
929
detailed_state: null,
930
template: null,
931
notes: null,
932
vpn_ip: null,
933
project_specific_id: null,
934
course_project_id: null,
935
course_server_id: null,
936
spend: null,
937
},
938
},
939
set: {
940
// ATTN: It's assumed that users can't set the data field. Doing so would be very bad and could allow
941
// them to maybe abuse the system and not pay for something.
942
// Most fields, e.g., configuration, get set via api calls, which ensures consistency in terms of valid
943
// data and what is actively deployed.
944
fields: {
945
project_id: "project_write",
946
id: true,
947
position: true,
948
error: true, // easily clear the error
949
notes: true,
950
last_edited_user: true,
951
},
952
},
953
},
954
},
955
fields: {
956
id: ID,
957
account_id: {
958
type: "uuid",
959
desc: "User that owns this compute server.",
960
render: { type: "account" },
961
},
962
created: {
963
type: "timestamp",
964
desc: "When the compute server was created.",
965
},
966
title: {
967
type: "string",
968
pg_type: "VARCHAR(254)",
969
desc: "Title of this computer server. Used purely to make it easier for the user to keep track of it.",
970
render: { type: "text", maxLength: 254, editable: true },
971
},
972
color: {
973
type: "string",
974
desc: "A user configurable color, which is used for tags and UI to indicate where a tab is running.",
975
pg_type: "VARCHAR(30)",
976
render: { type: "color", editable: true },
977
},
978
cost_per_hour: {
979
title: "Cost per Hour",
980
desc: "The cost in US dollars per hour that this compute server cost us when it is provisioned. Any time the state is changed, this is set by the server to the proper cost.",
981
type: "number",
982
pg_type: "real",
983
},
984
deleted: {
985
type: "boolean",
986
desc: "True if the compute server has been deleted.",
987
},
988
project_id: {
989
type: "uuid",
990
desc: "The project id that this compute server provides compute for.",
991
render: { type: "project_link" },
992
},
993
api_key: {
994
type: "string",
995
pg_type: "VARCHAR(128)",
996
desc: "api key to connect to the project. This is created by the system right when we are going to create the VM, and gets deleted when we stop it. It's not set by the user and should not be revealed to the user.",
997
},
998
api_key_id: {
999
type: "number",
1000
desc: "id of the api key; needed so we can delete it from database",
1001
},
1002
state_changed: {
1003
type: "timestamp",
1004
desc: "When the state last changed.",
1005
},
1006
error: {
1007
type: "string",
1008
desc: "In case something went wrong, e.g., in starting this compute server, this field will get set with a string error message to show the user. It's also cleared right when we try to start server.",
1009
},
1010
state: {
1011
type: "string",
1012
desc: "One of - 'off', 'starting', 'running', 'stopping', 'deprovisioned' (etc.). This is the underlying VM's state.",
1013
pg_type: "VARCHAR(16)",
1014
},
1015
autorestart: {
1016
type: "boolean",
1017
desc: "If true and the compute server stops for any reason, then it will be automatically started again. This is primarily useful for stop instances.",
1018
},
1019
cloud: {
1020
type: "string",
1021
pg_type: "varchar(30)",
1022
desc: "The cloud where this compute server runs: 'user', 'coreweave', 'lambda', 'google-cloud', 'aws', 'fluidstack'.",
1023
},
1024
configuration: {
1025
type: "map",
1026
pg_type: "jsonb",
1027
desc: "Cloud specific configuration of the computer at the cloud host. The format depends on the cloud",
1028
},
1029
provisioned_configuration: {
1030
type: "map",
1031
pg_type: "jsonb",
1032
desc: "Same as configuration, but this is the one we actually used last time we provisioned a VM in a cloud.",
1033
},
1034
data: {
1035
type: "map",
1036
pg_type: "jsonb",
1037
desc: "Arbitrary data about this server that is cloud provider specific. Store data here to facilitate working with the virtual machine, e.g., the id of the server when it is running, etc. This *MAY BE* returned to the user -- do not put secrets here the user can't see.",
1038
},
1039
avatar_image_tiny: {
1040
title: "Image",
1041
type: "string",
1042
desc: "tiny (32x32) visual image associated with the compute server. Suitable to include as part of changefeed, since about 3kb. Derived from avatar_image_full.",
1043
render: { type: "image" },
1044
},
1045
avatar_image_full: {
1046
title: "Image",
1047
type: "string",
1048
desc: "User configurable visual image associated with the compute server. Could be 150kb. NOT include as part of changefeed of projects, since potentially big (e.g., 200kb x 1000 projects = 200MB!).",
1049
render: { type: "image" },
1050
},
1051
purchase_id: {
1052
type: "number",
1053
desc: "if there is a current active purchase related to this compute server, this is the id of that purchase in the purchases table",
1054
},
1055
update_purchase: {
1056
type: "boolean",
1057
desc: "This is set to true if activity with this server is happening that warrants creating/ending a purchase.",
1058
},
1059
last_purchase_update: {
1060
type: "timestamp",
1061
desc: "Last time we requested an update to the purchase info about this compute server.",
1062
},
1063
position: {
1064
type: "number",
1065
desc: "Used for sorting a list of compute servers in the UI.",
1066
},
1067
last_edited: {
1068
type: "timestamp",
1069
desc: "Last time the configuration, state, etc., changed.",
1070
},
1071
last_edited_user: {
1072
type: "timestamp",
1073
desc: "Last time a user explicitly edited a file or used an application (e.g., terminal) on the compute server via the UI. This is like last_edited for projects, and is used to implement configuration.idleTimeoutMinutes.",
1074
},
1075
detailed_state: {
1076
type: "map",
1077
pg_type: "jsonb",
1078
desc: "Map from component name to something like {state:'running',time:Date.now()}, e.g., {vm: {state:'running', time:393939938484}}, filesystem: {state:'updating', time:939398484892}, uptime:{state:'22:56:33 up 3 days, 9:28, 0 users, load average: 0.93, 0.73, 0.56', time:?}}. This is used to provide users with insight into what's currently happening on their compute server.",
1079
},
1080
notes: NOTES,
1081
template: {
1082
type: "map",
1083
pg_type: "jsonb",
1084
desc: "Use this compute server configuration as a public template. Only admins can set this field for now. The exact structure of this jsonb is yet to be determined.",
1085
},
1086
vpn_ip: {
1087
type: "string",
1088
desc: "IP address of the compute server on the private encrypted project-wide VPN.",
1089
},
1090
vpn_public_key: {
1091
type: "string",
1092
desc: "Wireguard public key for this compute server.",
1093
},
1094
vpn_private_key: {
1095
type: "string",
1096
desc: "Wireguard private key for this compute server.",
1097
},
1098
project_specific_id: {
1099
type: "integer",
1100
desc: "A unique project-specific id assigned to this compute server. This is a positive integer that is guaranteed to be unique for compute servers *in a given project* and minimal when assigned (so it is as small as possible). This number is useful for distributed algorithms, since it can be used to ensure distinct sequence without any additional coordination. This is also useful to display to users so that the id number they see everywhere is not huge.",
1101
},
1102
course_project_id: {
1103
type: "uuid",
1104
desc: "If this is a compute server created for a student in a course, then this is the id of the project that the instructor(s) are using to host the course. IMPORTANT: Our security model is that a user can read info about a compute server if they are a collaborator on *either* the compute server's project_id OR on the course_project_id, if set (but then only via the compute_servers_by_course virtual table).",
1105
},
1106
course_server_id: {
1107
type: "integer",
1108
desc: "If this compute server is a clone of an instructor server in a course, this is the id of that instructor server.",
1109
},
1110
spend: {
1111
type: "number",
1112
desc: "If configuration.spendLimit is enabled, then the spend during the current period gets recorded here every few minutes. This is useful to efficiently provide a UI element showing the current spend status. It is cleared whenever configuration.spendLimit is changed, to avoid confusion.",
1113
},
1114
},
1115
});
1116
1117
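// The compute_servers_by_course table is exactly like the compute_servers
// table, but instead of having to specify the project_id of the compute
// server, rows are selected via course_project_id: a user may read a compute
// server when they are a collaborator on the project that is set as that
// server's course_project_id.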
1119
/*
 * Virtual table backed by "compute_servers".
 *
 * It reuses the field definitions and primary key of compute_servers and
 * exposes the same readable fields, but the rows a user can read are selected
 * by course_project_id instead of by the compute server's own project.
 * Only a `get` query is declared here.
 */
Table({
  name: "compute_servers_by_course",
  fields: schema.compute_servers.fields,
  rules: {
    primary_key: schema.compute_servers.primary_key,
    virtual: "compute_servers",
    user_query: {
      get: {
        // only allow read access when course_project_id is a project
        // that client user is a collaborator on.
        pg_where: [
          {
            // The key is the SQL condition; the value names what gets
            // substituted for the $::TEXT placeholder (the client's account_id).
            "course_project_id = ANY(select project_id from projects where users ? $::TEXT)":
              "account_id",
          },
        ],
        fields: {
          // Same readable fields as the regular compute_servers get query.
          ...schema.compute_servers.user_query?.get?.fields,
        },
      },
    },
  },
});
1142
1143
/*
 * Admin-only (CRM) virtual table backed by "compute_servers".
 *
 * It reuses the field definitions and primary key of compute_servers.
 * Both the get and the set query are flagged `admin: true`.
 */
Table({
  name: "crm_compute_servers",
  fields: schema.compute_servers.fields,
  rules: {
    primary_key: schema.compute_servers.primary_key,
    virtual: "compute_servers",
    user_query: {
      get: {
        admin: true, // only admins can do get queries on this table
        // (without this, users who have read access could read)
        pg_where: [],
        fields: {
          // Everything readable through compute_servers, plus `template`.
          ...schema.compute_servers.user_query?.get?.fields,
          template: null,
        },
      },
      set: {
        admin: true,
        fields: {
          id: true,
          title: true,
          color: true,
          deleted: true,
          notes: true,
          template: true,
          // NOTE(review): listed with null rather than true like the other
          // settable fields -- confirm how the query layer treats null here.
          state_control: null,
        },
      },
    },
  },
});
1174
1175
/*
 * Cache of per-cloud data used by the compute server implementation.
 *
 * Each row is identified by the composite primary key (cloud, key) and stores
 * a string value together with an expiration timestamp. Durability is "soft"
 * because the contents are only a cache.
 */
Table({
  name: "compute_servers_cache",
  fields: {
    cloud: {
      type: "string",
      desc: "The cloud that we're caching information about",
    },
    key: {
      type: "string",
      desc: "The key for whatever we're caching.",
    },
    value: {
      type: "string",
      desc: "The cached data.",
    },
    expire: {
      type: "timestamp",
      // NOTE(review): the description says "action", but this row is a cache
      // entry; the wording looks copied from elsewhere -- consider rewording.
      desc: "When this action should be expired.",
    },
  },
  rules: {
    durability: "soft", // it's just a cache
    desc: "Cache data about what's going on in various clouds that are used to implement compute servers.",
    primary_key: ["cloud", "key"],
  },
});
1201
1202