CoCalc -- compute-servers.ts

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/util/db-schema/compute-servers.ts
Views: ¹¹⁴⁰
1
/*
2
 *  This file is part of CoCalc: Copyright © 2023 Sagemath, Inc.
3
 *  License: MS-RSL – see LICENSE.md for details
4
 */
5

6
import type {
7
  Region as HyperstackRegion,
8
  VirtualMachine as HyperstackVirtualMachine,
9
} from "@cocalc/util/compute/cloud/hyperstack/api-types";
10
import { COLORS } from "@cocalc/util/theme";
11
import { ID, NOTES } from "./crm";
12
import { SCHEMA as schema } from "./index";
13
import { Table } from "./types";
14
export {
15
  CLOUDS_BY_NAME,
16
  GOOGLE_CLOUD_DEFAULTS,
17
  ON_PREM_DEFAULTS,
18
} from "@cocalc/util/compute/cloud/clouds";
19

20
// Fallback disk sizes returned by getMinDiskSizeGb when the image
// configuration does not provide a usable minDiskSizeGb.
export const STANDARD_DISK_SIZE = 20;
export const CUDA_DISK_SIZE = 60;

export const CHECK_IN_PERIOD_S = 20;
export const CHECK_IN_PATH = "/cocalc/conf/check-in";

// There is no SLA on billing data, so clients are recommended to wait this
// long after a purchase ends before requesting the cost.  This should give
// us about a day of wiggle room.
const GOOGLE_COST_LAG_DAYS = 2;
export const GOOGLE_COST_LAG_MS = 1000 * 60 * 60 * 24 * GOOGLE_COST_LAG_DAYS;
32

33
// Compute Server Images -- typings.  See packages/server/compute/images.ts for
34
// how the actual data is populated.
35

36
// One version of a compute server image.
export interface ImageVersion {
  // tag - must be given and distinct for each version -- this typically
  // identifies the image to docker
  tag: string;
  // version -- defaults to tag if not given; usually the upstream version
  version?: string;
  // label -- defaults to the tag; this is to display to the user
  label?: string;
  // tested -- if this is not set to true, then this version should not be
  // shown by default.  If not tested, only show to users who explicitly
  // really want this (e.g., admins).
  tested?: boolean;
}
47

48
// Default idle timeout, in minutes.  Presumably the fallback for
// BaseConfiguration.idleTimeoutMinutes -- TODO confirm against callers.
export const IDLE_TIMEOUT_MINUTES_DEFAULT = 30;
49

50
// Values used by validatedHealthCheck for any field that is missing or invalid.
export const HEALTH_CHECK_DEFAULTS = {
  command: "pwd",
  initialDelaySeconds: 10 * 60,
  timeoutSeconds: 30,
  periodSeconds: 60,
  failureThreshold: 3,
  enabled: false,
  action: "reboot",
};

// The actions that a health check is allowed to trigger.
export const HEALTH_CHECK_ACTIONS = [
  "reboot",
  "stop",
  "suspend",
  "deprovision",
];
type HealthCheckAction = (typeof HEALTH_CHECK_ACTIONS)[number];

// Parse one numeric health check field.  Returns fallback when the value is
// missing, is not a finite number, or is smaller than min.
function healthCheckNumber(value: any, min: number, fallback: number): number {
  const parsed = parseFloat(value ?? fallback);
  return parsed < min || !isFinite(parsed) ? fallback : parsed;
}

// Turn arbitrary (e.g., user supplied) input into a well formed HealthCheck.
//
// - null/undefined input yields undefined.
// - numeric fields are parsed with parseFloat and replaced by the value in
//   HEALTH_CHECK_DEFAULTS when missing, non-finite, or below their minimum
//   (0 for periodSeconds and initialDelaySeconds, 1 for failureThreshold,
//   5 for timeoutSeconds).
// - command is converted to a string; a missing command becomes the default
//   command rather than the literal string "undefined".
// - enabled is coerced to a boolean.
// - an action not listed in HEALTH_CHECK_ACTIONS becomes the default action.
export function validatedHealthCheck(
  healthCheck?: any,
): HealthCheck | undefined {
  if (healthCheck == null) {
    return undefined;
  }
  const { command, enabled, action } = healthCheck;
  return {
    command: command == null ? HEALTH_CHECK_DEFAULTS.command : `${command}`,
    initialDelaySeconds: healthCheckNumber(
      healthCheck.initialDelaySeconds,
      0,
      HEALTH_CHECK_DEFAULTS.initialDelaySeconds,
    ),
    timeoutSeconds: healthCheckNumber(
      healthCheck.timeoutSeconds,
      5,
      HEALTH_CHECK_DEFAULTS.timeoutSeconds,
    ),
    periodSeconds: healthCheckNumber(
      healthCheck.periodSeconds,
      0,
      HEALTH_CHECK_DEFAULTS.periodSeconds,
    ),
    failureThreshold: healthCheckNumber(
      healthCheck.failureThreshold,
      1,
      HEALTH_CHECK_DEFAULTS.failureThreshold,
    ),
    enabled: !!enabled,
    action: HEALTH_CHECK_ACTIONS.includes(action)
      ? action
      : HEALTH_CHECK_DEFAULTS.action,
  };
}

export interface HealthCheck {
  // Run the command with given args on the compute server.
  // If the command fails (nonzero exit code) failureThreshold times, then the
  // action happens.
  // NOTE(review): the previous comment also said "If it contains the
  // deprovision string, then it deprovisions" -- unclear what this refers
  // to; confirm against the code that runs the health check.
  command: string;
  // timeout for running the command
  timeoutSeconds: number;
  // initial delay
  initialDelaySeconds: number;
  // period in seconds to wait between running the command
  periodSeconds: number;
  // When a probe fails, CoCalc will try failureThreshold times before doing the action.
  failureThreshold: number;

  action: HealthCheckAction;
  enabled: boolean;
}
141

142
// One entry of an https proxy configuration (see the proxy fields of Image
// and BaseConfiguration).
interface ProxyRoute {
  path: string;
  target: string;
  // presumably whether websockets are proxied for this route -- TODO confirm
  ws?: boolean;
}
147

148
// Description of one compute server image.
export interface Image {
  // What we show the user to describe this image, e.g., in the image select menu.
  label: string;
  // The name of the package on npmjs or dockerhub:
  package?: string;
  // In case there is a different package name for ARM64, the name of it.
  package_arm64?: string;
  // Root filesystem image must be at least this big in GB.
  minDiskSizeGb?: number;
  // Rough estimate of compressed size of Docker image; useful
  // to get a sense of how long it will take to download image
  // on clouds without pregenerated images.
  dockerSizeGb?: number;
  // Description in MARKDOWN to show user of this image.  Can include links.
  description?: string;
  // Upstream URL for this image, e.g., https://julialang.org/ for the Julia image.
  url?: string;
  // Icon to show next to the label for this image.
  icon?: string;
  // Link to a URL with the source for building this image.
  source: string;
  // optional list of links to videos about this image, ordered from lowest to highest priority.
  videos?: string[];
  // optional list of links to tutorials
  tutorials?: string[];
  // The versions of this image that we claim to have built.
  // The ones with role='prod' (or not specified) are shown
  // to users as options.
  // NOTE(review): ImageVersion has a `tested` flag but no `role` field, so
  // the sentence above may be out of date -- confirm.
  versions: ImageVersion[];
  // If true, then a GPU is required to use this image.
  gpu?: boolean;
  // If true, then the microk8s snap is required to use this image.
  microk8s?: boolean;
  // authToken: if true, image has web interface that supports configurable auth token
  authToken?: boolean;
  // jupyterKernels: if false, no jupyter kernels included. If true or a list of
  // names, there are kernels available – used in frontend/jupyter/select-kernel.tsx
  jupyterKernels?: boolean | string[];
  // If set to true, do not allow creating this compute server without a DNS subdomain.
  // Some images only make sense to use over the web, and the web server just won't
  // work without DNS setup properly (e.g., VS Code with LEAN).  Ignored for on prem.
  requireDns?: boolean;
  // system: if true, this is a system container that is not for user compute
  system?: boolean;
  // disabled: if true, this image is completely disabled, so will not be used in any way.
  disabled?: boolean;
  // priority -- optional integer used for sorting options to display to user. The bigger the higher.
  priority?: number;
  // proxy: if false, do NOT run https proxy server on host VM
  //        if nothing given, runs proxy server with no default config (so does nothing)
  //        if given, is array of proxy config.
  proxy?: false | ProxyRoute[];
  apps?: {
    [name: string]: {
      icon: string;
      label: string;
      url: string;
      path: string;
      launch: string;
      requiresDns?: boolean;
    };
  };
}

// All images, indexed by image name.
export type Images = { [name: string]: Image };
213

214
// Metadata describing one Google Cloud image.
export interface GoogleCloudImage {
  labels: { [name: string]: string };
  diskSizeGb: number;
  creationTimestamp: string;
}

// Google Cloud images, indexed by name.
export type GoogleCloudImages = { [name: string]: GoogleCloudImage };
220

221
// Make s closer to a valid Google Cloud name: every "." and "_" becomes "-",
// the result is lowercased, and it is cut off after 63 characters.
// This is probably not sufficient to guarantee validity.
export function makeValidGoogleName(s: string): string {
  const dashed = s.replace(/[._]/g, "-");
  const lowered = dashed.toLowerCase();
  return lowered.slice(0, 63);
}
225

226
// The possible states of a compute server.
export type State =
  | "off"
  | "starting"
  | "running"
  | "stopping"
  | "deprovisioned"
  | "suspending"
  | "suspended"
  | "unknown";
235

236
// Used for sorting by state -- ordered from most alive to least alive.
export const ORDERED_STATES: State[] = [
  "running",
  "starting",
  "stopping",
  "suspending",
  "suspended",
  "off",
  "deprovisioned",
  "unknown",
];

// Maps each state to its position in ORDERED_STATES (0 = most alive).
export const STATE_TO_NUMBER: { [state: string]: number } = {};
ORDERED_STATES.forEach((state, position) => {
  STATE_TO_NUMBER[state] = position;
});
253

254
// Determine the CPU architecture implied by a compute server configuration.
//
// - onprem: the configured arch, defaulting to "x86_64".
// - any cloud other than google-cloud: always "x86_64" (no ARM outside of
//   GCP right now).
// - google-cloud: "arm64" exactly when the machine family (the part of
//   machineType before the first "-") ends with "a".  The known ARM families
//   are t2a- and c4a-; everything else ends with a number or d.  Hopefully
//   this pattern persists.
//
// A google-cloud configuration with no machineType is treated as "x86_64"
// instead of throwing.
export function getArchitecture(configuration: Configuration): Architecture {
  if (configuration.cloud == "onprem") {
    return configuration.arch ?? "x86_64";
  }
  if (configuration.cloud != "google-cloud") {
    return "x86_64";
  }
  const family = configuration.machineType?.split("-")[0] ?? "";
  return family.endsWith("a") ? "arm64" : "x86_64";
}
272

273
// Whether suspend/resume is available for the given configuration.  All of
// the following must hold:
//
// - the cloud is google-cloud;
// - the architecture is x86_64.  TODO: suspend/resume breaks the clock badly
//   on ARM64, and I haven't figured out a workaround, so don't support it
//   for now.  I guess this is a GCP bug.
// - there is no GPU.  Google also requires <= 208GB of RAM, which is not
//   checked here -- see
//   https://cloud.google.com/compute/docs/instances/suspend-resume-instance
function supportsSuspend(configuration: Configuration) {
  return (
    configuration.cloud == "google-cloud" &&
    getArchitecture(configuration) == "x86_64" &&
    !configuration.acceleratorType
  );
}
289

290
// The actions that can be performed on a compute server (see ACTION_INFO).
export type Action =
  | "start"
  | "resume"
  | "stop"
  | "suspend"
  | "deprovision"
  | "reboot";
297

298
// User interface metadata and semantics for each action, indexed by action.
//
// - target: the stable state the server ends up in after the action.
// - clouds: if given, lists clouds for the action (presumably the only
//   clouds on which the action is offered -- confirm against callers).
// - isSupported: if given, a predicate on the configuration.
// - confirm / confirmMessage / danger: presumably control the confirmation
//   dialog shown to the user -- confirm against the frontend.
export const ACTION_INFO: {
  [action: string]: {
    label: string;
    icon: string;
    tip: string;
    description: string;
    confirm?: boolean;
    confirmMessage?: string;
    danger?: boolean;
    target: State; // target stable state after doing this action.
    clouds?: Cloud[];
    isSupported?: (configuration: Configuration) => boolean;
  };
} = {
  start: {
    label: "Start",
    icon: "play",
    tip: "Start",
    description: "Start the compute server running.",
    target: "running",
  },
  resume: {
    label: "Resume",
    icon: "play",
    clouds: ["google-cloud"],
    tip: "Resume",
    description: "Resume the compute server from suspend.",
    target: "running",
    isSupported: supportsSuspend,
  },
  stop: {
    label: "Stop",
    icon: "stop",
    tip: "Turn off",
    description:
      "Turn the compute server off. No data on disk is lost, but any data and state in memory will be lost. This is like turning your laptop off.",
    confirm: true,
    target: "off",
  },
  deprovision: {
    label: "Deprovision",
    icon: "trash",
    tip: "Deprovision the virtual machine",
    description:
      "Deprovisioning DELETES THE VIRTUAL MACHINE BOOT DISK, but keeps the compute server parameters.   There are no costs associated with a deprovisioned compute server, and you can move it to a different region or zone.  Any files in the home directory of your project are not affected.",
    confirm: true,
    confirmMessage:
      "I understand that my compute server disks will be deleted.",
    danger: true,
    target: "deprovisioned",
  },
  reboot: {
    label: "Hard Reboot",
    icon: "refresh",
    tip: "Hard reboot the virtual machine.",
    description:
      "Perform a HARD reset on the virtual machine, which wipes the memory contents and resets the virtual machine to its initial state. This should not delete data from the disk, but can lead to filesystem corruption.",
    confirm: true,
    confirmMessage:
      "I understand that this can lead to filesystem corruption and is slightly dangerous.",
    danger: true,
    target: "running",
    clouds: ["google-cloud", "hyperstack"],
  },
  suspend: {
    label: "Suspend",
    icon: "pause",
    clouds: ["google-cloud"],
    tip: "Suspend disk and memory state",
    confirm: true,
    description:
      "Suspend the compute server.  No data on disk or memory is lost, and you are only charged for storing disk and memory. This is like closing your laptop screen.  You can leave a compute server suspended for up to 60 days before it automatically shuts off.",
    target: "suspended",
    isSupported: supportsSuspend,
  },
};
374

375
// User interface metadata and semantics for each state, indexed by state.
//
// - actions: the actions listed for a server in this state.
// - stable: false for the transitional states (suspending, starting,
//   stopping), which also name the state they are heading to in target.
export const STATE_INFO: {
  [state: string]: {
    label: string;
    actions: Action[];
    icon: string;
    color?: string;
    stable?: boolean;
    target?: State; // if not stable, this is the target state it is heading to
  };
} = {
  off: {
    label: "Off",
    color: "#ff4b00",
    actions: ["start", "deprovision"],
    icon: "stop",
    stable: true,
  },
  suspended: {
    label: "Suspended",
    actions: ["resume", "deprovision", "stop"],
    icon: "pause",
    color: "#0097a7",
    stable: true,
  },
  suspending: {
    label: "Suspending",
    actions: ["suspend"],
    icon: "pause",
    color: "#00bcd4",
    stable: false,
    target: "suspended",
  },
  starting: {
    label: "Starting",
    color: "#388e3c",
    actions: ["start"],
    icon: "bolt",
    stable: false,
    target: "running",
  },
  running: {
    label: "Running",
    color: COLORS.RUN,
    actions: ["stop", "deprovision", "reboot", "suspend"],
    icon: "run",
    stable: true,
  },
  stopping: {
    label: "Stopping",
    color: "#ff9800",
    actions: ["stop"],
    icon: "hand",
    stable: false,
    target: "off",
  },
  unknown: {
    label: "Unknown (click to refresh)",
    actions: [],
    icon: "question-circle",
    stable: true,
  },
  deprovisioned: {
    label: "Deprovisioned",
    actions: ["start"],
    color: "#888",
    icon: "minus-square",
    stable: true,
  },
};
444

445
// Return the stable state that a state or action leads to:
//
// - for an action, the target recorded in ACTION_INFO;
// - for a stable state, the state itself;
// - for an unstable state, its target in STATE_INFO (or the state itself
//   if no target is recorded).
//
// Throws an Error if x is neither a known action nor a known state.
export function getTargetState(x: State | Action): State {
  const actionInfo = ACTION_INFO[x];
  if (actionInfo != null) {
    return actionInfo.target;
  }
  const stateInfo = STATE_INFO[x];
  if (stateInfo == null) {
    throw Error(`x =${x} must be a state or action`);
  }
  if (stateInfo.stable) {
    return x as State;
  }
  return (stateInfo.target ?? x) as State;
}
457

458
export type Architecture = "x86_64" | "arm64";

// Name of the Image field that holds the package name for the given
// architecture.  This convention is used in cocalc-compute-docker for making
// the npm packages @cocalc/compute-server.  Don't mess with it!
export function getImageField(arch: Architecture) {
  if (arch == "x86_64") {
    return "package";
  }
  return "package_arm64";
}
465

466
// Identifiers of the clouds a compute server can be associated with.
export type Cloud =
  | "any"
  | "onprem"
  | "core-weave"
  | "hyperstack"
  | "lambda-cloud"
  | "google-cloud"
  | "aws"
  | "fluid-stack"
  | "test";
476

477
// Minimum disk size for a compute server with the given configuration.
//
// If the configuration names an image and IMAGES has a nonzero minDiskSizeGb
// for it, that value is returned.  Otherwise the fallback is CUDA_DISK_SIZE
// when an acceleratorType is configured and STANDARD_DISK_SIZE when not.
// A missing configuration is allowed and yields STANDARD_DISK_SIZE.
export function getMinDiskSizeGb({
  configuration,
  IMAGES,
}: {
  configuration;
  IMAGES: Images;
}) {
  const imageName = configuration?.image;
  if (imageName) {
    const minDiskSizeGb = IMAGES[imageName]?.minDiskSizeGb;
    if (minDiskSizeGb) {
      return minDiskSizeGb;
    }
  }
  // TODO: will have to do something based on actual image size,
  // maybe, unless I come up with a clever trick involving
  // one PD mounted on many machines (?).
  return configuration?.acceleratorType ? CUDA_DISK_SIZE : STANDARD_DISK_SIZE;
}
499

500
// This means "you can spend at most dollars every hours on a RUNNING compute server"
export interface SpendLimit {
  hours: number;
  dollars: number;
  enabled: boolean;
}

export const SPEND_LIMIT_DEFAULTS = {
  hours: 24 * 7,
  dollars: 25,
  enabled: false,
};

// Turn arbitrary (e.g., user supplied) input into a well formed SpendLimit.
//
// - null/undefined input yields undefined.
// - hours and dollars are parsed with parseFloat.  A missing value is
//   treated as 0, while a negative or non-finite value is replaced by the
//   corresponding entry of SPEND_LIMIT_DEFAULTS.
// - enabled is coerced to a boolean.
export function validatedSpendLimit(spendLimit?: any): SpendLimit | undefined {
  if (spendLimit == null) {
    return undefined;
  }
  let hours = parseFloat(spendLimit.hours ?? 0);
  if (hours < 0 || !isFinite(hours)) {
    hours = SPEND_LIMIT_DEFAULTS.hours;
  }
  let dollars = parseFloat(spendLimit.dollars ?? 0);
  if (dollars < 0 || !isFinite(dollars)) {
    dollars = SPEND_LIMIT_DEFAULTS.dollars;
  }
  return { enabled: !!spendLimit.enabled, hours, dollars };
}
529

530
// Human readable name for a spend limit period given in hours: "day",
// "week", "month" (30.5 days) or "year" (12 such months); any other value is
// rendered as "<hours> hours".
//
// The month and year comparisons used to carry a stray "* 7" (copied from
// the week case), so the real month/year hour counts were never recognized.
// The old values are still accepted, in case they have been stored by
// callers that used the same constants.
export function spendLimitPeriod(hours) {
  const day = 24;
  const week = 7 * day;
  const month = 30.5 * day;
  const year = 12 * month;
  if (hours == day) {
    return "day";
  }
  if (hours == week) {
    return "week";
  }
  if (hours == month || hours == month * 7) {
    return "month";
  }
  if (hours == year || hours == year * 7) {
    return "year";
  }
  return `${hours} hours`;
}
545

546
// 10:00:00.000 local time on the day this module is loaded.
const tenAM = new Date();
tenAM.setHours(10, 0, 0, 0);
export const DEFAULT_SHUTDOWN_TIME = {
  epochMs: tenAM.valueOf(),
  enabled: false,
};

export interface ShutdownTime {
  // ms since epoch; see BaseConfiguration.shutdownTime for how it is
  // interpreted.
  epochMs: number;
  enabled?: boolean;
}

// Turn arbitrary (e.g., user supplied) input into a well formed ShutdownTime.
//
// - null/undefined input yields undefined.
// - epochMs is parsed with parseFloat; a missing, negative or non-finite
//   value is replaced by DEFAULT_SHUTDOWN_TIME.epochMs.
// - enabled is coerced to a boolean.
export function validatedShutdownTime(
  shutdownTime?: any,
): ShutdownTime | undefined {
  if (shutdownTime == null) {
    return undefined;
  }
  let epochMs = parseFloat(
    shutdownTime.epochMs ?? DEFAULT_SHUTDOWN_TIME.epochMs,
  );
  if (epochMs < 0 || !isFinite(epochMs)) {
    epochMs = DEFAULT_SHUTDOWN_TIME.epochMs;
  }
  return { enabled: !!shutdownTime.enabled, epochMs };
}
572

573
// Configuration fields shared by the compute server configurations of all clouds.
interface BaseConfiguration {
  // image: name of the image to use, e.g. 'python' or 'pytorch'.
  // images are managed in src/packages/server/compute/images.ts
  image: string;
  // tag: tag for the image to use when starting the compute server.
  // this references the versions field of the image.
  // If the tag is not given or not available, we use the latest
  // available tag.
  tag?: string;
  // tag_filesystem: tag for the file system container
  tag_filesystem?: string;
  // tag_cocalc: tag for the @cocalc/compute-server package.
  tag_cocalc?: string;
  // dns - If the string is set and the VM has an external ip address
  // and dns is configured, then point https://{dns}....
  // with ssl proxying to this compute server when it is running.
  dns?: string;
  // Array of top level directories to exclude from sync.
  // These can't have "|" in them, since we use that as a separator.
  // Use "~" to completely disable sync.
  excludeFromSync?: readonly string[];
  // If true, view data on the compute server as ephemeral.
  // Currently this is only meant to impact the user interface.
  ephemeral?: boolean;
  // Token used for authentication at https://compute-server...
  authToken?: string;
  // Configuration of the https proxy server.
  proxy?: ProxyRoute[];
  // If this compute server stops pinging us, e.g., due to being preempted or
  // just crashing due to out of memory (etc) should we automatically do a
  // forced restart.  Note that currently for on prem this isn't possible.
  autoRestart?: boolean;
  // used to temporarily disable it to avoid accidentally triggering it.
  autoRestartDisabled?: boolean;
  // Allow collaborators to control the state of the compute server.
  // They cannot change any other configuration.  User still pays for
  // everything and owns compute server.
  allowCollaboratorControl?: boolean;

  // AUTOMATIC SHUTDOWN configuration:

  // Limit spending: turn compute server off if spend more than dollars
  // during the last hours.  This can only be set by the owner.
  spendLimit?: SpendLimit;
  idleTimeoutMinutes?: number;
  healthCheck?: HealthCheck;
  // number = ms since epoch defines a time; at *that* time each day, the server is turned off.
  shutdownTime?: ShutdownTime;
}
620

621
// Names of the BaseConfiguration fields that configure automatic shutdown.
export const AUTOMATIC_SHUTDOWN_FIELDS = [
  "spendLimit",
  "idleTimeoutMinutes",
  "healthCheck",
  "shutdownTime",
];
627

628
// Configuration of a compute server on the "lambda-cloud" cloud.
interface LambdaConfiguration extends BaseConfiguration {
  cloud: "lambda-cloud";
  instance_type_name: string;
  region_name: string;
}
633

634
// Configuration of a compute server on the "hyperstack" cloud.
export interface HyperstackConfiguration extends BaseConfiguration {
  cloud: "hyperstack";
  flavor_name: string;
  region_name: HyperstackRegion;
  // diskSizeGb is an integer >= 1.  It defaults to 10.
  // It's the size of the /data partition.  It's implemented
  // using 1 or more hyperstack (=ceph) volumes, which are combined
  // together as a ZFS pool.  If the compute server is
  // named "foo", the volumes are named "foo-1", "foo-2",
  // "foo-3", etc.
  // There is also always a separate 50GB root volume, which
  // is named "foo-0", and whose size is not configurable.
  // NOTE: users install packages "systemwide" inside of
  // a docker container and we configure docker to store
  // its data in the zpool, so that's in here too.
  diskSizeGb: number;
}
651

652
// The CPU types accepted by CoreWeaveConfiguration.cpu.type.
export const COREWEAVE_CPU_TYPES = [
  "amd-epyc-rome",
  "amd-epyc-milan",
  "intel-xeon-v1",
  "intel-xeon-v2",
  "intel-xeon-v3",
  "intel-xeon-v4",
  "intel-xeon-scalable",
] as const;
661

662
// The GPU types accepted by CoreWeaveConfiguration.gpu.type.
export const COREWEAVE_GPU_TYPES = [
  "Quadro_RTX_4000",
  "Quadro_RTX_5000",
  "RTX_A4000",
  "RTX_A5000",
  "RTX_A6000",
  "A40",
  "Tesla_V100_PCIE",
  "Tesla_V100_NVLINK",
  "A100_PCIE_40GB",
  "A100_PCIE_80GB",
  "A100_NVLINK_40GB",
  "A100_NVLINK_80GB",
] as const;
676

677
// Configuration of a compute server on the "core-weave" cloud.
// The gpu and cpu type unions spell out the entries of COREWEAVE_GPU_TYPES
// and COREWEAVE_CPU_TYPES; keep them in sync with those lists.
interface CoreWeaveConfiguration extends BaseConfiguration {
  cloud: "core-weave";
  gpu: {
    type:
      | "Quadro_RTX_4000"
      | "Quadro_RTX_5000"
      | "RTX_A4000"
      | "RTX_A5000"
      | "RTX_A6000"
      | "A40"
      | "Tesla_V100_PCIE"
      | "Tesla_V100_NVLINK"
      | "A100_PCIE_40GB"
      | "A100_PCIE_80GB"
      | "A100_NVLINK_40GB"
      | "A100_NVLINK_80GB"; //(typeof COREWEAVE_GPU_TYPES)[number];
    count: number;
  };
  cpu: {
    count: number;
    type?:
      | "amd-epyc-rome"
      | "amd-epyc-milan"
      | "intel-xeon-v1"
      | "intel-xeon-v2"
      | "intel-xeon-v3"
      | "intel-xeon-v4"
      | "intel-xeon-scalable"; //(typeof COREWEAVE_CPU_TYPES)[number];
  };
  memory: string; // e.g., "12Gi"
  storage?: {
    root: {
      size: string; // e.g., '40Gi'
    };
  };
}
713

714
// Configuration of a compute server on the "fluid-stack" cloud.
interface FluidStackConfiguration extends BaseConfiguration {
  cloud: "fluid-stack";
  plan: string;
  region: string;
  os: string;
}
720
// The accelerator (GPU) types that can be configured on Google Cloud.
export type GoogleCloudAcceleratorType =
  | "nvidia-h200-141gb"
  | "nvidia-h100-80gb"
  | "nvidia-a100-80gb"
  | "nvidia-tesla-a100"
  | "nvidia-l4"
  | "nvidia-tesla-t4"
  | "nvidia-tesla-v100"
  | "nvidia-tesla-p4"
  | "nvidia-tesla-p100";

// Runtime list of every GoogleCloudAcceleratorType, in the same order.
export const GOOGLE_CLOUD_ACCELERATOR_TYPES: GoogleCloudAcceleratorType[] = [
  "nvidia-h200-141gb",
  "nvidia-h100-80gb",
  "nvidia-a100-80gb",
  "nvidia-tesla-a100",
  "nvidia-l4",
  "nvidia-tesla-t4",
  "nvidia-tesla-v100",
  "nvidia-tesla-p4",
  "nvidia-tesla-p100",
];
742

743
// The disk types that can be configured on Google Cloud.
export type GoogleCloudDiskType =
  | "pd-standard"
  | "pd-balanced"
  | "pd-ssd"
  | "hyperdisk-balanced";

// Runtime list of every GoogleCloudDiskType, in the same order.
export const GOOGLE_CLOUD_DISK_TYPES: GoogleCloudDiskType[] = [
  "pd-standard",
  "pd-balanced",
  "pd-ssd",
  // NOTE: hyperdisks are complicated and multidimensional, but for cocalc
  // we just hardcode options for the iops and bandwidth, and allow the
  // user to adjust the size.  Also, "hyperdisk-balanced" means hyperdisk
  // with the defaults for iops and bandwidth defined in
  // src/packages/util/compute/cloud/google-cloud/compute-cost.ts
  "hyperdisk-balanced",
];
760

761
// Configuration of a compute server on the "google-cloud" cloud.
export interface GoogleCloudConfiguration extends BaseConfiguration {
  cloud: "google-cloud";
  region: string;
  zone: string;
  machineType: string;
  // Use a spot instance if spot is true.
  spot?: boolean;
  // The boot disk:
  // diskSizeGb is an integer >= 10.  It defaults to 10. It's the size of the boot disk.
  diskSizeGb?: number;
  hyperdiskBalancedIops?: number;
  hyperdiskBalancedThroughput?: number;
  diskType?: GoogleCloudDiskType;
  acceleratorType?: GoogleCloudAcceleratorType;
  // the allowed number depends on the accelerator; it defaults to 1.
  acceleratorCount?: number;
  // minCpuPlatform
  terminationTime?: Date;
  maxRunDurationSeconds?: number;
  // if true, use newest image, whether or not it is labeled with prod=true.
  test?: boolean;
  // an image name of the form "2023-09-13-063355-test", i.e., a timestamp in that format
  // followed by an optional string.  Whether or not to use cuda and the arch are
  // determined by parameters above.  This is meant to be used for two purposes (1) testing
  // before deploying to production, and (2) stability, so a given compute server has the
  // exact same base image every time it is started, instead of being updated. Regarding (2),
  // this might not be needed, but we'll see.  If image is not set, we use the newest
  // image that is tagged prod:true, or it's an error if no such image exists.  This is
  // all about Google Cloud images, not the IMAGES object defined elsewhere in this file.
  sourceImage?: string;
  // If true, then we have an external ip address
  externalIp?: boolean;
  // If true, can run full VM's inside of the machine, but there is 10% performance penalty.
  // This will only work for Intel non-e2 non-a3 instance types. No AMD and no ARM64.
  enableNestedVirtualization?: boolean;
}
797

798
// Configuration of an "onprem" compute server.
export interface OnPremCloudConfiguration extends BaseConfiguration {
  cloud: "onprem";
  // getArchitecture treats a missing arch as "x86_64".
  arch?: Architecture;
  gpu?: boolean;
}
803

804
// Configuration of a compute server; the cloud field discriminates between
// the per-cloud variants.
export type Configuration =
  | LambdaConfiguration
  | HyperstackConfiguration
  | CoreWeaveConfiguration
  | FluidStackConfiguration
  | GoogleCloudConfiguration
  | OnPremCloudConfiguration;
811

812
// Data fields shared by the per-cloud data records of a compute server.
interface BaseData {
  cloudflareId?: string;
  externalIp?: string;
  internalIp?: string;
}
817

818
// Data recorded for a compute server on the "lambda-cloud" cloud.
export interface LambdaCloudData extends BaseData {
  cloud: "lambda-cloud";
  instance_id: string;
}
822

823
// Data recorded for a compute server on the "hyperstack" cloud.
export interface HyperstackData extends BaseData {
  cloud: "hyperstack";
  // name we are using for the vm
  name?: string;
  // hyperstack description of this vm.
  vm?: HyperstackVirtualMachine;
  // id's of persistent storage, with first id the boot disk.
  // disks are named {name}-0, {name}-1, {name}-2, etc.,
  // with {name}-0 being the boot disk.
  disks?: number[];
  creationTimestamp?: Date;
}
835

836
// Data recorded for a compute server on the "google-cloud" cloud.
export interface GoogleCloudData extends BaseData {
  cloud: "google-cloud";
  name?: string;
  state?: State;
  cpuPlatform?: string;
  creationTimestamp?: Date;
  lastStartTimestamp?: Date;
}
844

845
export type Data = GoogleCloudData | LambdaCloudData | HyperstackData;
846

847
// One entry of ComputeServerUserInfo.detailed_state.
export interface ComponentState {
  state: string;
  // NOTE(review): time and expire are presumably timestamps -- units are
  // not visible here; confirm against the code that writes detailed_state.
  time: number;
  expire?: number;
}
852

853
// Value of the template field of a compute server.
export interface ComputeServerTemplate {
  enabled?: boolean;
  priority?: number;
}
857

858
// The fields of a compute server record, excluding the api key fields that
// ComputeServer adds.
export interface ComputeServerUserInfo {
  id: number;
  // the project_specific_id of this compute server -- unique within project, minimal
  project_specific_id?: number;
  account_id: string;
  project_id: string;
  title?: string;
  color?: string;
  cost_per_hour?: number;
  deleted?: boolean;
  state_changed?: Date;
  started_by?: string;
  error?: string;
  state?: State;
  // google-cloud has a new "Time limit" either by hour or by date, which seems like a great idea!
  // time_limit
  autorestart?: boolean;
  cloud: Cloud;
  configuration: Configuration;
  provisioned_configuration?: Configuration;
  data?: Data;
  purchase_id?: number;
  last_edited?: Date;
  last_edited_user?: Date;
  position?: number; // used for UI sorting.
  detailed_state?: { [name: string]: ComponentState };
  update_purchase?: boolean;
  last_purchase_update?: Date;
  template?: ComputeServerTemplate;
  spend?: number;
}
888

889
// Full compute server record: everything in ComputeServerUserInfo plus the
// api key fields.  The api_key column description in the compute_servers
// table states that it should not be revealed to the user.
export interface ComputeServer extends ComputeServerUserInfo {
  api_key?: string; // project level api key for the project
  api_key_id?: number; // id of the api key (needed so we can delete it from database).
}

// Schema of the compute_servers table: one row per compute server.
//  - user get queries are restricted by project_id and list the fields below;
//    api_key (whose description says it must not be revealed to the user)
//    is not in that list.
//  - user set queries are limited to a handful of fields; everything else is
//    changed via api calls (see the ATTN comment in `set`).
Table({
  name: "compute_servers",
  rules: {
    primary_key: "id",
    // unique vpn ip address *within* a given project only:
    pg_unique_indexes: [
      "(project_id, vpn_ip)",
      "(project_id, project_specific_id)",
    ],
    user_query: {
      get: {
        pg_where: [{ "project_id = $::UUID": "project_id" }],
        throttle_changes: 0, // do not make this bigger; UI really feels off if throttled
        fields: {
          id: null,
          account_id: null,
          created: null,
          title: null,
          color: null,
          cost_per_hour: null,
          deleted: null,
          project_id: null,
          state_changed: null,
          error: null,
          state: null,
          autorestart: null,
          cloud: null,
          configuration: null,
          data: null,
          provisioned_configuration: null,
          avatar_image_tiny: null,
          last_edited: null,
          last_edited_user: null,
          purchase_id: null,
          position: null,
          detailed_state: null,
          template: null,
          notes: null,
          vpn_ip: null,
          project_specific_id: null,
          course_project_id: null,
          course_server_id: null,
          spend: null,
        },
      },
      set: {
        // ATTN: It's assumed that users can't set the data field.  Doing so would be very bad and could allow
        // them to maybe abuse the system and not pay for something.
        // Most fields, e.g., configuration, get set via api calls, which ensures consistency in terms of valid
        // data and what is actively deployed.
        fields: {
          project_id: "project_write",
          id: true,
          position: true,
          error: true, // easily clear the error
          notes: true,
          last_edited_user: true,
        },
      },
    },
  },
  fields: {
    id: ID,
    account_id: {
      type: "uuid",
      desc: "User that owns this compute server.",
      render: { type: "account" },
    },
    created: {
      type: "timestamp",
      desc: "When the compute server was created.",
    },
    title: {
      type: "string",
      pg_type: "VARCHAR(254)",
      desc: "Title of this computer server.  Used purely to make it easier for the user to keep track of it.",
      render: { type: "text", maxLength: 254, editable: true },
    },
    color: {
      type: "string",
      desc: "A user configurable color, which is used for tags and UI to indicate where a tab is running.",
      pg_type: "VARCHAR(30)",
      render: { type: "color", editable: true },
    },
    cost_per_hour: {
      title: "Cost per Hour",
      desc: "The cost in US dollars per hour that this compute server cost us when it is provisioned. Any time the state is changed, this is set by the server to the proper cost.",
      type: "number",
      pg_type: "real",
    },
    deleted: {
      type: "boolean",
      desc: "True if the compute server has been deleted.",
    },
    project_id: {
      type: "uuid",
      desc: "The project id that this compute server provides compute for.",
      render: { type: "project_link" },
    },
    api_key: {
      type: "string",
      pg_type: "VARCHAR(128)",
      desc: "api key to connect to the project.  This is created by the system right when we are going to create the VM, and gets deleted when we stop it.  It's not set by the user and should not be revealed to the user.",
    },
    api_key_id: {
      type: "number",
      desc: "id of the api key; needed so we can delete it from database",
    },
    state_changed: {
      type: "timestamp",
      desc: "When the state last changed.",
    },
    error: {
      type: "string",
      desc: "In case something went wrong, e.g., in starting this compute server, this field will get set with a string error message to show the user. It's also cleared right when we try to start server.",
    },
    state: {
      type: "string",
      desc: "One of - 'off', 'starting', 'running', 'stopping', 'deprovisioned' (etc.).  This is the underlying VM's state.",
      pg_type: "VARCHAR(16)",
    },
    // NOTE(review): "stop instances" in the desc below is presumably a typo
    // for "spot instances" -- confirm before changing the string.
    autorestart: {
      type: "boolean",
      desc: "If true and the compute server stops for any reason, then it will be automatically started again.  This is primarily useful for stop instances.",
    },
    // NOTE(review): the desc below does not list "hyperstack", which
    // HyperstackData in this file uses as its cloud value -- the list of
    // names may be stale; confirm.
    cloud: {
      type: "string",
      pg_type: "varchar(30)",
      desc: "The cloud where this compute server runs: 'user', 'coreweave', 'lambda', 'google-cloud', 'aws', 'fluidstack'.",
    },
    configuration: {
      type: "map",
      pg_type: "jsonb",
      desc: "Cloud specific configuration of the computer at the cloud host. The format depends on the cloud",
    },
    provisioned_configuration: {
      type: "map",
      pg_type: "jsonb",
      desc: "Same as configuration, but this is the one we actually used last time we provisioned a VM in a cloud.",
    },
    data: {
      type: "map",
      pg_type: "jsonb",
      desc: "Arbitrary data about this server that is cloud provider specific.  Store data here to facilitate working with the virtual machine, e.g., the id of the server when it is running, etc.  This *MAY BE* returned to the user -- do not put secrets here the user can't see.",
    },
    avatar_image_tiny: {
      title: "Image",
      type: "string",
      desc: "tiny (32x32) visual image associated with the compute server. Suitable to include as part of changefeed, since about 3kb. Derived from avatar_image_full.",
      render: { type: "image" },
    },
    avatar_image_full: {
      title: "Image",
      type: "string",
      desc: "User configurable visual image associated with the compute server.  Could be 150kb.  NOT include as part of changefeed of projects, since potentially big (e.g., 200kb x 1000 projects = 200MB!).",
      render: { type: "image" },
    },
    purchase_id: {
      type: "number",
      desc: "if there is a current active purchase related to this compute server, this is the id of that purchase in the purchases table",
    },
    update_purchase: {
      type: "boolean",
      desc: "This is set to true if activity with this server is happening that warrants creating/ending a purchase.",
    },
    last_purchase_update: {
      type: "timestamp",
      desc: "Last time we requested an update to the purchase info about this compute server.",
    },
    position: {
      type: "number",
      desc: "Used for sorting a list of compute servers in the UI.",
    },
    last_edited: {
      type: "timestamp",
      desc: "Last time the configuration, state, etc., changed.",
    },
    last_edited_user: {
      type: "timestamp",
      desc: "Last time a user explicitly edited a file or used an application (e.g., terminal) on the compute server via the UI. This is like last_edited for projects, and is used to implement configuration.idleTimeoutMinutes.",
    },
    detailed_state: {
      type: "map",
      pg_type: "jsonb",
      desc: "Map from component name to something like {state:'running',time:Date.now()}, e.g., {vm: {state:'running', time:393939938484}}, filesystem: {state:'updating', time:939398484892}, uptime:{state:'22:56:33 up 3 days,  9:28,  0 users,  load average: 0.93, 0.73, 0.56', time:?}}.  This is used to provide users with insight into what's currently happening on their compute server.",
    },
    notes: NOTES,
    template: {
      type: "map",
      pg_type: "jsonb",
      desc: "Use this compute server configuration as a public template.  Only admins can set this field for now. The exact structure of this jsonb is yet to be determined.",
    },
    vpn_ip: {
      type: "string",
      desc: "IP address of the compute server on the private encrypted project-wide VPN.",
    },
    vpn_public_key: {
      type: "string",
      desc: "Wireguard public key for this compute server.",
    },
    vpn_private_key: {
      type: "string",
      desc: "Wireguard private key for this compute server.",
    },
    project_specific_id: {
      type: "integer",
      desc: "A unique project-specific id assigned to this compute server.  This is a positive integer that is guaranteed to be unique for compute servers *in a given project* and minimal when assigned (so it is as small as possible).   This number is useful for distributed algorithms, since it can be used to ensure distinct sequence without any additional coordination.   This is also useful to display to users so that the id number they see everywhere is not huge.",
    },
    course_project_id: {
      type: "uuid",
      desc: "If this is a compute server created for a student in a course, then this is the id of the project that the instructor(s) are using to host the course.  IMPORTANT: Our security model is that a user can read info about a compute server if they are a collaborator on *either* the compute server's project_id OR on the course_project_id, if set (but then only via the compute_servers_by_course virtual table).",
    },
    course_server_id: {
      type: "integer",
      desc: "If this compute server is a clone of an instructor server in a course, this is the id of that instructor server.",
    },
    spend: {
      type: "number",
      desc: "If configuration.spendLimit is enabled, then the spend during the current period gets recorded here every few minutes.  This is useful to efficiently provide a UI element showing the current spend status.  It is cleared whenever configuration.spendLimit is changed, to avoid confusion.",
    },
  },
});

// The compute_servers_by_course table is exactly like the compute_servers
// table, but instead of having to specify the project_id of the compute
// server, get queries are matched on course_project_id: a row is readable
// when its course_project_id is a project that has the requesting account
// in its users.  It is a virtual table over compute_servers, reuses that
// table's fields and get field list, and defines no set query.
Table({
  name: "compute_servers_by_course",
  fields: schema.compute_servers.fields,
  rules: {
    primary_key: schema.compute_servers.primary_key,
    virtual: "compute_servers",
    user_query: {
      get: {
        // only allow read access when course_project_id is a project
        // that client user is a collaborator on.
        pg_where: [
          {
            "course_project_id = ANY(select project_id from projects where users ? $::TEXT)":
              "account_id",
          },
        ],
        fields: {
          ...schema.compute_servers.user_query?.get?.fields,
        },
      },
    },
  },
});

// Admin-only virtual table over compute_servers.  Both get and set require
// admin, and get has no pg_where restriction.
Table({
  name: "crm_compute_servers",
  fields: schema.compute_servers.fields,
  rules: {
    primary_key: schema.compute_servers.primary_key,
    virtual: "compute_servers",
    user_query: {
      get: {
        admin: true, // only admins can do get queries on this table
        // (without this, users who have read access could read)
        pg_where: [],
        fields: {
          ...schema.compute_servers.user_query?.get?.fields,
          // template is already part of the compute_servers get fields that
          // are spread in above; it is listed again explicitly here.
          template: null,
        },
      },
      set: {
        admin: true,
        fields: {
          id: true,
          title: true,
          color: true,
          deleted: true,
          notes: true,
          template: true,
          // NOTE(review): state_control is not one of the fields declared on
          // compute_servers in this file -- TODO confirm this entry is intended.
          state_control: null,
        },
      },
    },
  },
});

// Cache of per-cloud information, keyed by (cloud, key).  No user_query is
// defined for this table.
Table({
  name: "compute_servers_cache",
  fields: {
    cloud: {
      type: "string",
      desc: "The cloud that we're caching information about",
    },
    key: {
      type: "string",
      desc: "The key for whatever we're caching.",
    },
    value: {
      type: "string",
      desc: "The cached data.",
    },
    // NOTE(review): the desc says "action", but this row holds a cached
    // value; presumably it means when the cache entry expires -- confirm.
    expire: {
      type: "timestamp",
      desc: "When this action should be expired.",
    },
  },
  rules: {
    durability: "soft", // it's just a cache
    desc: "Cache data about what's going on in various clouds that are used to implement compute servers.",
    primary_key: ["cloud", "key"],
  },
});

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

Product

Resources

Company

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more, all in one place. Commercial Alternative to JupyterHub.

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.