CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
sagemathinc

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/util/db-schema/cloud-filesystems.ts
Views: 687
1
/*
2
Configuration of network mounted shared POSIX filesystems associated
3
to projects for use initially by the compute servers.
4
5
Initially these will get mounted by all compute servers uniformly (mostly),
6
and later the project will also mount these via a sidecar.
7
8
This is 100% built on juicefs/keydb instead of gcs/s3, etc., since:
9
10
- there are so many gotchas with directly using fuse mounted gcs/s3,
11
- people can just use those directly or mount them directly easily
12
anyways (since they are root)
13
*/
14
15
import { Table } from "./types";
16
import { ID, NOTES } from "./crm";
17
import { SCHEMA as schema } from "./index";
18
// We do NOT charge to make a cloud file system. However, we require that
// the user have enough money to make a CREATE_CLOUD_FILESYSTEM_AMOUNT purchase.
// One reason to require credit is because billing is delayed by several days,
// and a user could spend substantially during that time (e.g., over $1000
// seems possible, e.g., bandwidth egress to China is $0.23/GB, and you can
// probably download 100MB/s or over 300GB/hour, or over $3000 in 2 days).
export const CREATE_CLOUD_FILESYSTEM_AMOUNT = 10;

// Default value of the "lock" field: the string a user must supply with a
// delete request (see the lock field of the cloud_filesystems table below).
export const DEFAULT_LOCK = "DELETE";

// Since all storage gets mounted on all compute servers, and basically
// you only need one shared storage volume in most cases, we do put a global
// limit to avoid abuse and efficiency issues for now.
export const MAX_CLOUD_FILESYSTEMS_PER_PROJECT = 100;

// We use a random port on the VPN between MIN_PORT and MAX_PORT for the
// local per-filesystem service (e.g., keydb -- see the port field below).
export const MIN_PORT = 40000;
export const MAX_PORT = 48000;

// Allowed block size of the file system, in MB.
export const MIN_BLOCK_SIZE = 1;
// requires my fork of juicefs to get above 16 (supports 64)!
// do not use non-fork on a file system with a block size bigger
// than 16, as it may corrupt it...
// Just in case -- for now we will restrict to 16 anyways.
export const MAX_BLOCK_SIZE = 16;
export const RECOMMENDED_BLOCK_SIZE = 16;
// Shape of a Google Cloud service account JSON key (as produced by GCP when
// creating a service-account key).  This is what gets stored -- as jsonb --
// in the secret_key field of the cloud_filesystems table below, and is what
// grants access to the backing storage bucket.
export interface GoogleCloudServiceAccountKey {
  type: "service_account";
  project_id: string;
  private_key_id: string;
  private_key: string;
  client_email: string;
  client_id: string;
  auth_uri: string;
  token_uri: string;
  auth_provider_x509_cert_url: string;
  client_x509_cert_url: string;
  universe_domain: "googleapis.com";
}
export type Compression = "lz4" | "zstd" | "none";
58
export const GOOGLE_CLOUD_BUCKET_STORAGE_CLASSES = [
59
"standard",
60
"nearline",
61
"coldline",
62
"archive",
63
"autoclass-nearline",
64
"autoclass-archive",
65
];
66
export const GOOGLE_CLOUD_BUCKET_STORAGE_CLASSES_DESC = {
67
"autoclass-nearline": {
68
desc: "Autoclass - transitions objects between Standard or Nearline based on activity",
69
},
70
"autoclass-archive": {
71
desc: "Autoclass - transitions objects between Standard, Nearline, Coldline, and Archive based on activity",
72
},
73
standard: {
74
desc: "Standard - short-term storage and frequently accessed data",
75
minStorageDays: 0,
76
},
77
nearline: {
78
desc: "Nearline - backups and data accessed less than once a month",
79
minStorageDays: 30,
80
},
81
coldline: {
82
desc: "Coldline - disaster recovery and data accessed less than once a quarter",
83
minStorageDays: 90,
84
},
85
archive: {
86
desc: "Archive - long-term digital preservation of data accessed less than once a year",
87
minStorageDays: 365,
88
},
89
};
90
export type GoogleCloudBucketStorageClass =
91
(typeof GOOGLE_CLOUD_BUCKET_STORAGE_CLASSES)[number];
92
93
// We implement the three multiregions: asia, eu, and us.
94
// We also support *all* single regions. Dual regions are
95
// complicated to specify and have subtle restrictions and
96
// probably aren't that critical for our users, so we don't
97
// support them.
98
export const GOOGLE_CLOUD_MULTIREGIONS = ["us", "eu", "asia"];
99
// We will have to update the zone list when google adds more zones, since I didn't
100
// want to have a dependency on my package @cocalc/gcloud-pricing-calculator.
101
// However it's easy using that package:
102
// a =require('@cocalc/gcloud-pricing-calculator')
103
// z = new Set(Object.keys((await a.getData()).zones).map((x)=>{i=x.lastIndexOf('-');return x.slice(0,i)}))
104
export const GOOGLE_CLOUD_REGIONS = [
105
"us-central1",
106
"us-east1",
107
"us-east4",
108
"us-east5",
109
"us-west1",
110
"us-west2",
111
"us-west3",
112
"us-west4",
113
"us-south1",
114
"northamerica-northeast1",
115
"northamerica-northeast2",
116
"europe-north1",
117
"europe-central2",
118
"europe-southwest1",
119
"europe-west1",
120
"europe-west2",
121
"europe-west3",
122
"europe-west4",
123
"europe-west6",
124
"europe-west8",
125
"europe-west9",
126
"europe-west10",
127
"europe-west12",
128
"southamerica-east1",
129
"southamerica-west1",
130
"africa-south1",
131
"asia-east1",
132
"asia-east2",
133
"asia-northeast1",
134
"asia-northeast2",
135
"asia-northeast3",
136
"asia-south1",
137
"asia-south2",
138
"asia-southeast1",
139
"asia-southeast2",
140
"australia-southeast1",
141
"australia-southeast2",
142
"me-central1",
143
"me-central2",
144
"me-west1",
145
];
146
147
export const GOOGLE_REGION_PREFIX_TO_LOCATION = {
148
us: "North America",
149
northamerica: "North America",
150
europe: "Europe",
151
southamerica: "South America",
152
africa: "South Africa",
153
asia: "APAC",
154
australia: "APAC",
155
me: "Middle East",
156
eu: "Europe",
157
};
158
159
export type GoogleCloudBucketLocation =
160
| (typeof GOOGLE_CLOUD_MULTIREGIONS)[number]
161
| (typeof GOOGLE_CLOUD_REGIONS)[number];
162
163
// A cloud file system as stored in the database.  See the
// cloud_filesystems Table schema below for detailed per-field docs;
// optional fields may be absent/null in existing rows.
export interface CloudFilesystem {
  id: number; // global primary key
  project_specific_id: number; // unique within the project, minimal when assigned
  project_id: string;
  account_id: string; // user that owns this cloud file system (they pay)
  created: Date;
  bucket?: string; // Google cloud storage bucket backing this filesystem
  mountpoint: string; // relative paths are relative to the home directory
  mount?: boolean; // whether it should get mounted right now
  secret_key?: GoogleCloudServiceAccountKey; // not set only if initialization went wrong
  port: number;
  compression: Compression;
  block_size: number; // in MB; cannot be changed after creation
  trash_days: number; // days deleted files are kept; 0 disables trash
  bucket_location: GoogleCloudBucketLocation;
  bucket_storage_class: GoogleCloudBucketStorageClass;
  // Options literally appended to the 'juicefs mount' command line -- see
  // https://juicefs.com/docs/community/command_reference#mount
  mount_options?: string;
  keydb_options?: string; // appended to keydb.conf on each client
  title?: string;
  color?: string;
  deleting?: boolean;
  error?: string;
  notes?: string;
  lock?: string; // string that must be provided to delete this file system
  position?: number; // for sorting in UI lists
  last_edited?: Date;
  purchase_id?: number;
  bytes_used?: number;
}
// The subset of CloudFilesystem fields a user may specify when creating a
// new cloud file system.  Everything else (bucket, port, secret_key, ids,
// timestamps, ...) is assigned by the backend.
export type CreateCloudFilesystem = Pick<
  CloudFilesystem,
  | "project_id"
  | "mountpoint"
  | "mount"
  | "compression"
  | "block_size"
  | "trash_days"
  | "title"
  | "color"
  | "notes"
  | "position"
  | "mount_options"
  | "keydb_options"
  | "bucket_location"
  | "bucket_storage_class"
>;
export const DEFAULT_CONFIGURATION = {
215
mountpoint: "cloud",
216
mount: true,
217
compression: "lz4",
218
block_size: RECOMMENDED_BLOCK_SIZE,
219
trash_days: 0,
220
title: "Untitled",
221
lock: "DELETE",
222
//
223
// Without writeback things are quite slow (with GCS), so it's enabled.
224
// "-o allow_other" is because:
225
// - makes 'juicefs rmr /home/user/cloudfs/.trash' to empty the trash *possible*;
226
// as non-root there is no way to empty trash!
227
// - makes it possible to use ZFS on top of this, which may be interesting later.
228
// - --open-cache=(something) is needed since otherwise juicefs tries to use redis for network
229
// locks, which just don't work with async replication.
230
mount_options:
231
"--writeback -o allow_other --open-cache=1 --backup-meta=7200 --backup-skip-trash",
232
keydb_options: "",
233
bucket_location: "us-east1", // where cocalc.com is
234
bucket_storage_class: "autoclass-archive",
235
} as const;
236
237
// The fields of an existing cloud file system that can be edited via the
// API: the id (required, to identify the filesystem) plus the mutable
// fields.  The fields re-declared in the body are additionally allowed but
// optional in an edit request.
export interface EditCloudFilesystem
  extends Pick<
    CloudFilesystem,
    | "id"
    | "mount"
    | "title"
    | "color"
    | "notes"
    | "position"
    | "mount_options"
    | "keydb_options"
    | "lock"
  > {
  // making these optional
  project_id?: string;
  mountpoint?: string;
  trash_days?: number;
  bucket_storage_class?: GoogleCloudBucketStorageClass;
}
// Field names that may be changed while the file system is mounted
// (presumably enforced by the API layer -- confirm in the caller).
export const CHANGE_MOUNTED = new Set([
  "title",
  "color",
  "notes",
  "lock",
  "mount",
  "position",
  "bucket_storage_class",
  "trash_days",
]);
// Field names that may only be changed while the file system is NOT
// mounted, since they affect how mounting works.
export const CHANGE_UNMOUNTED = new Set([
  "project_id",
  "mountpoint",
  "mount_options",
  "keydb_options",
  "port",
]);
// Database schema for the cloud_filesystems table.
Table({
  name: "cloud_filesystems",
  rules: {
    primary_key: "id",
    // unique mountpoint *within* a given project; also unique port in case the
    // storage service requires a port to sync (e.g., keydb).
    pg_unique_indexes: [
      "(project_id, mountpoint)",
      "(project_id, port)",
      "(project_id, project_specific_id)",
      "bucket",
    ],
    user_query: {
      // Users can read all rows for projects they are in.  Note that
      // secret_key is deliberately NOT included in the get fields.
      get: {
        pg_where: [{ "project_id = $::UUID": "project_id" }],
        throttle_changes: 0,
        fields: {
          id: null,
          project_specific_id: null,
          project_id: null,
          account_id: null,
          bucket: null,
          mountpoint: null,
          mount: null,
          port: null,
          compression: null,
          block_size: null,
          trash_days: null,
          bucket_location: null,
          bucket_storage_class: null,
          title: null,
          color: null,
          error: null,
          notes: null,
          lock: null,
          position: null,
          last_edited: null,
          purchase_id: null,
          deleting: null,
          mount_options: null,
          keydb_options: null,
          bytes_used: null,
        },
      },
      // Only cosmetic/state fields are directly settable by users here;
      // structural changes (mountpoint, port, etc.) go through the API.
      set: {
        fields: {
          project_id: "project_write",
          id: true,
          mount: true,
          error: true,
          notes: true,
          title: true,
          color: true,
          position: true,
          lock: true,
        },
      },
    },
  },
  fields: {
    id: ID,
    project_specific_id: {
      not_null: true,
      type: "integer",
      desc: "A unique project-specific id assigned to this cloud file system. This is a positive integer that is guaranteed to be unique for cloud filesystems *in a given project* and minimal when assigned (so it is as small as possible). For now at least, I'm not using this in any way except as something to display to users. Internally we always use the global id.",
    },
    project_id: {
      not_null: true,
      type: "uuid",
      // NOTE(review): desc below looks copy-pasted from the compute-servers
      // schema ("compute server provides compute for") -- confirm wording.
      desc: "The project id that this compute server provides compute for.",
      render: { type: "project_link" },
    },
    account_id: {
      not_null: true,
      type: "uuid",
      desc: "User that owns this cloud file system (they pay)",
      render: { type: "account" },
    },
    created: {
      not_null: true,
      type: "timestamp",
      // NOTE(review): says "compute server" but this is the cloud file
      // system creation time -- likely copy-paste; confirm before changing.
      desc: "When the compute server was created.",
    },
    bucket: {
      type: "string",
      // 63 = maximum length of a GCS bucket name
      pg_type: "VARCHAR(63)",
      desc: "Google cloud storage bucket backing this filesystem",
      render: { type: "text", maxLength: 63, editable: false },
    },
    bucket_storage_class: {
      not_null: true,
      type: "string",
      pg_type: "VARCHAR(64)",
      desc: "Default storage class of the google cloud storage bucket",
      render: { type: "text", maxLength: 64, editable: false },
    },
    bucket_location: {
      not_null: true,
      type: "string",
      pg_type: "VARCHAR(64)",
      desc: "Where the google cloud storage bucket is stored.",
      render: { type: "text", maxLength: 64, editable: false },
    },
    mountpoint: {
      not_null: true,
      type: "string",
      pg_type: "VARCHAR(4096)",
      desc: "Where compute server is mounted in the file system.  If a relative path, then relative to home directory.  Target path does not have to be empty.  For sanity we restrict this string more than an arbitrary linux path.",
      render: { type: "text", maxLength: 4096, editable: true },
    },
    mount: {
      type: "boolean",
      desc: "If true, then this cloud file system will be mounted on all compute servers associated to the project.",
    },
    secret_key: {
      type: "map",
      pg_type: "jsonb",
      desc: "Secret key needed to use the bucket. It's a structured jsonb object.  For google cloud storage, it's exactly the service account.  This will only be not set if something went wrong initializing this storage.",
    },
    port: {
      type: "integer",
      desc: "Numerical port where local service runs on each client for the file system.  E.g., this is keydb for juicefs.",
    },
    compression: {
      not_null: true,
      type: "string",
      pg_type: "VARCHAR(64)",
      desc: "Compression for the file system: lz4, zstd or none.  Cannot be changed.",
      render: { type: "text", maxLength: 64, editable: false },
    },
    block_size: {
      type: "integer",
      not_null: true,
      desc: "Block size of file system in MB: between 1 and 64, inclusive.  Cannot be changed.",
    },
    trash_days: {
      type: "integer",
      not_null: true,
      desc: "Number of days to store deleted files.  Use 0 to disable.",
    },
    mount_options: {
      type: "string",
      pg_type: "VARCHAR(4096)",
      desc: "Options passed to the command line when running juicefs mount.  See https://juicefs.com/docs/community/command_reference#mount   This exact string is literally put on the command line after 'juicefs mount', and obviously getting it mangled can break mounting the file system.",
      render: { type: "text", maxLength: 4096, editable: true },
    },
    keydb_options: {
      type: "string",
      pg_type: "VARCHAR(16384)",
      desc: "Keydb (/Redis) configuration.  This is placed at the end of keydb.conf and can be used to override or add to the keydb configuration used on each client.",
      render: { type: "text", maxLength: 16384, editable: true },
    },
    title: {
      type: "string",
      pg_type: "VARCHAR(254)",
      // NOTE(review): "computer server" is probably a typo for "compute
      // server" / "cloud file system" -- confirm before changing the string.
      desc: "Title of this computer server.  Used purely to make it easier for the user to keep track of it.",
      render: { type: "text", maxLength: 254, editable: true },
    },
    color: {
      type: "string",
      desc: "A user configurable color, which is used for tags and UI to indicate where a tab is running.",
      pg_type: "VARCHAR(30)",
      render: { type: "color", editable: true },
    },
    deleting: {
      type: "boolean",
      desc: "True if this filesystem is in the process of being deleted.",
    },
    error: {
      type: "string",
      desc: "In case something went wrong, e.g., in starting this compute server, this field will get set with a string error message to show the user. It's also cleared right when we try to start server.",
    },
    notes: NOTES,
    lock: {
      type: "string",
      pg_type: "VARCHAR(128)",
      desc: "String that you must provide as part of any API call to delete this object.  Use this as a personal reminder of conditions under which it is OK to delete this.",
      render: { type: "text", maxLength: 128, editable: true },
    },
    position: {
      type: "number",
      desc: "Used for sorting a list of cloud file systems in the UI.",
    },
    last_edited: {
      type: "timestamp",
      desc: "Last time some field was changed.  Also, this gets updated when the volume is actively mounted by some compute server, since the files are likely edited.",
    },
    purchase_id: {
      type: "number",
      desc: "if there is a current active purchase related to this compute server, this is the id of that purchase in the purchases table",
    },
    bytes_used: {
      not_null: true,
      type: "integer",
      pg_type: "bigint",
      desc: "The total number of bytes of data stored in the file system -- it's the output of df.  It is not impacted by compression, i.e., it's not the bucket size itself.",
    },
  },
});
// Admin (CRM) view of the cloud_filesystems table: same fields, but only
// admins can query, and with no project restriction in pg_where.
// NOTE(review): this exact Table({name:"crm_cloud_filesystems",...}) call
// appears twice in this file (see the identical definition further down) --
// registering it twice is redundant; one copy should be removed.
Table({
  name: "crm_cloud_filesystems",
  fields: schema.cloud_filesystems.fields,
  rules: {
    primary_key: schema.cloud_filesystems.primary_key,
    virtual: "cloud_filesystems",
    user_query: {
      get: {
        admin: true,
        pg_where: [],
        fields: {
          ...schema.cloud_filesystems.user_query?.get?.fields,
          template: null,
        },
      },
      set: {
        admin: true,
        fields: {
          id: true,
          title: true,
          color: true,
          notes: true,
          mount_options: true,
          keydb_options: true,
        },
      },
    },
  },
});
// some sanity checks
506
export function assertValidCompression(compression: Compression) {
507
if (
508
typeof compression == "string" &&
509
["lz4", "zstd", "none"].includes(compression)
510
) {
511
return;
512
}
513
throw Error(`compression must be 'lz4', 'zstd', or 'none'`);
514
}
515
516
export function assertValidPath(path: string) {
517
if (typeof path != "string") {
518
throw Error("path must be a string");
519
}
520
if (
521
path.includes("\0") ||
522
path.includes("\n") ||
523
path.includes("~") ||
524
path.includes("\\")
525
) {
526
throw Error(
527
`invalid path '${path}' -- must not include newlines or null characters or ~ or \\`,
528
);
529
}
530
if (path.length > 4096) {
531
throw Error(`invalid path '${path}' -- must be at most 4096 characters`);
532
}
533
for (let i = 0; i < path.length; i++) {
534
const charCode = path.charCodeAt(i);
535
if ((charCode >= 0x00 && charCode <= 0x1f) || charCode === 0x7f) {
536
throw Error(`invalid path '${path}' -- must not include control codes`);
537
}
538
}
539
}
540
541
// NOTE(review): this is an exact duplicate of the crm_cloud_filesystems
// Table definition earlier in this file -- almost certainly a merge/paste
// artifact.  Registering the same virtual table twice is redundant at best;
// one of the two copies should be deleted.
Table({
  name: "crm_cloud_filesystems",
  fields: schema.cloud_filesystems.fields,
  rules: {
    primary_key: schema.cloud_filesystems.primary_key,
    virtual: "cloud_filesystems",
    user_query: {
      get: {
        admin: true,
        pg_where: [],
        fields: {
          ...schema.cloud_filesystems.user_query?.get?.fields,
          template: null,
        },
      },
      set: {
        admin: true,
        fields: {
          id: true,
          title: true,
          color: true,
          notes: true,
          mount_options: true,
          keydb_options: true,
        },
      },
    },
  },
});
// One usage metric sample for a cloud file system, as returned over the
// HTTP API (hence timestamp is a number, not a Date).  See the
// cloud_filesystem_metrics table below for detailed field docs.
export interface CloudFilesystemMetric {
  timestamp: number; // what we get back from api since it's json -- ms since epoch
  compute_server_id: number; // compute server that submitted this metric
  bytes_used: number; // df output -- total bytes stored in the file system
  process_uptime: number; // seconds since the metrics process started
  bytes_put?: number | null; // bytes written to cloud storage
  bytes_get?: number | null; // bytes read from cloud storage
  objects_put?: number | null; // Class A operations
  objects_get?: number | null; // Class B operations
  objects_delete?: number | null; // free operations
  bucket_location: string;
  bucket_storage_class: GoogleCloudBucketStorageClass;
  compute_server_location: GoogleCloudBucketLocation;
  cost?: number | null; // estimated accumulated total cost so far
}
Table({
588
name: "cloud_filesystem_metrics",
589
rules: {
590
primary_key: ["timestamp", "cloud_filesystem_id", "compute_server_id"],
591
},
592
fields: {
593
timestamp: {
594
type: "timestamp",
595
desc: "When the metric was submitted. This is assigned by the database when data is inserted, so should be assumed correct and non-decreasing.",
596
},
597
cloud_filesystem_id: {
598
type: "integer",
599
desc: "The id of the cloud file system that this is a metric for.",
600
},
601
compute_server_id: {
602
type: "integer",
603
desc: "The id of the compute server that is submitting this metric.",
604
},
605
bytes_used: {
606
not_null: true,
607
type: "integer",
608
pg_type: "bigint",
609
desc: "The total number of bytes of data stored in the file system -- it's the output of df. It is not impacted by compression, i.e., it's not the bucket size itself.",
610
},
611
process_uptime: {
612
not_null: true,
613
type: "number",
614
desc: "Seconds since the process started collecting these metrics.",
615
},
616
bytes_put: {
617
type: "integer",
618
pg_type: "bigint",
619
desc: "The number of bytes of data that was written to cloud storage: juicefs_object_request_data_bytes_PUT in .stats",
620
},
621
bytes_get: {
622
type: "integer",
623
pg_type: "bigint",
624
desc: "The number of bytes of data that were written to cloud storage: juicefs_object_request_data_bytes_GET in .stats",
625
},
626
objects_put: {
627
type: "integer",
628
pg_type: "bigint",
629
desc: "Class A Operation: The number of distinct objects that were written to cloud storage: juicefs_object_request_durations_histogram_seconds_PUT_total in .stats",
630
},
631
objects_get: {
632
type: "integer",
633
pg_type: "bigint",
634
desc: "Class B Operation: The number of distinct objects that were read from cloud storage: juicefs_object_request_durations_histogram_seconds_GET_total in .stats",
635
},
636
objects_delete: {
637
type: "integer",
638
pg_type: "bigint",
639
desc: "Free Operation: The number of distinct objects that were deleted from cloud storage: juicefs_object_request_durations_histogram_seconds_DELETE_total in .stats",
640
},
641
bucket_location: {
642
not_null: true,
643
type: "string",
644
pg_type: "VARCHAR(64)",
645
desc: "Where the google cloud storage bucket is stored. A GCP region or 'us','eu','asia' for multiregion buckets.",
646
render: { type: "text", maxLength: 64, editable: false },
647
},
648
bucket_storage_class: {
649
not_null: true,
650
type: "string",
651
pg_type: "VARCHAR(64)",
652
desc: "Default storage class of the google cloud storage bucket at this point in time: 'standard', 'nearline', 'coldline', 'archive', 'autoclass-nearline' or 'autoclass-archive'",
653
render: { type: "text", maxLength: 64, editable: false },
654
},
655
compute_server_location: {
656
not_null: true,
657
type: "string",
658
pg_type: "VARCHAR(64)",
659
desc: "A GCP region or 'world', 'china', 'australia', 'unknown'. Here 'world' means something oether than 'china' or 'australia'. Also HK doesn't count as 'china'.",
660
render: { type: "text", maxLength: 64, editable: false },
661
},
662
cost: {
663
type: "number",
664
pg_type: "double precision",
665
desc: "The estimated accumulated total cost from when the bucket was created until this point in time. This could be recomputed, but is nice to have easily available, and means we can delete old data.",
666
},
667
// cost_state: {
668
// type: "object",
669
// desc: "Extra data at this point in time that can be used somehow in our cost estimation heuristic. E.g., {'bytes_used_standard':20000} would mean that we should assume going forward that 20000 bytes of data is of the standard storage class, irregardless of the current storage class because of a change of class. Obviously, some of this data could be deleted, but we don't know.",
670
// },
671
},
672
});
673
674