Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemath
GitHub Repository: sagemath/sagecell
Path: blob/master/contrib/vm/container_manager.py
804 views
1
#!/usr/bin/env python3
2
3
import argparse
4
import datetime
5
import grp
6
import logging
7
import logging.config
8
import os
9
import pwd
10
import random
11
import shlex
12
import shutil
13
import stat
14
import subprocess
15
import sys
16
import time
17
18
import lxc
19
import psutil
20
import yaml
21
22
# Number of compute nodes in each deployment set. The node names built at the
# end of this script are "<prefix><index>A" and "<prefix><index>B".
number_of_compute_nodes = 3

# This will be visible on root and help pages. Suggested template:
# Resources for your computation are provided by <a href="...">...</a>.
# install_sagecell() writes this text to templates/provider.html.
provider_html = r"""
"""

# Container names
lxcn_base = "base" # OS and packages
lxcn_sage = "sage" # Sage without extra packages
lxcn_precell = "precell" # Everything but SageCell and system configuration
lxcn_sagecell = "sagecell" # Sage and SageCell
lxcn_backup = "sagecell-backup" # Saved master for restoration if necessary
lxcn_tester = "sctest" # Accessible via special port, for testing
lxcn_prefix = "sc-" # Prefix for main compute nodes

# Timeout in seconds to wait for a container to shutdown, network to start etc.
timeout = 120
# Time after which SageCell should be up and running.
# Also used as the LXC autostart delay of deployed clones.
start_delay = 66
# How long to wait after starting new containers before destroying old ones.
deploy_delay = 2*60*60 # Two hours to allow all interacts finish "naturally".

# User names and IDs
# The keys double as placeholders: command strings passed to call(),
# check_call() etc. are expanded with str.format_map(users).
users = {"group": "sagecell", "GID": 8888,
         "server": "sc_serv", "server_ID": 8888,
         "worker": "sc_work", "worker_ID": 9999}

# Github repositories as (user, repository, branch)
# update_repositories() clones/updates each of them under ./github.
repositories = [
    ("sagemath", "sage", "master"),
    ("sagemath", "sagecell", "master"),
]
55
56
# Packages to be installed in the base container
# The whole list is handed to a single "apt install -y" call when the base
# container is created, so every name should appear exactly once.
system_packages = [
    # SageMath prerequisites as of Sage 9.7
    'bc',
    'binutils',
    'bzip2',
    'ca-certificates',
    'cliquer',
    'cmake',
    'curl',
    'ecl',
    'eclib-tools',
    'fflas-ffpack',
    'flintqs',
    'g++',
    'gcc',
    'gengetopt',
    'gfan',
    'gfortran',
    'glpk-utils',
    'gmp-ecm',
    'lcalc',
    'libatomic-ops-dev',
    'libboost-dev',
    'libbraiding-dev',
    'libbz2-dev',
    'libcdd-dev',
    'libcdd-tools',
    'libcliquer-dev',
    'libcurl4-openssl-dev',
    'libec-dev',
    'libecm-dev',
    'libffi-dev',
    'libflint-dev',
    'libfplll-dev',
    'libfreetype6-dev',
    'libgc-dev',
    'libgd-dev',
    'libgf2x-dev',
    'libgiac-dev',
    'libgivaro-dev',
    'libglpk-dev',
    'libgmp-dev',
    'libgsl-dev',
    'libhomfly-dev',
    'libiml-dev',
    'liblfunction-dev',
    'liblinbox-dev',
    'liblrcalc-dev',
    'liblzma-dev',
    'libm4ri-dev',
    'libm4rie-dev',
    'libmpc-dev',
    'libmpfi-dev',
    'libmpfr-dev',
    'libncurses5-dev',
    'libntl-dev',
    'libopenblas-dev',
    'libpari-dev',
    'libpcre3-dev',
    'libplanarity-dev',
    'libppl-dev',
    'libprimesieve-dev',
    'libpython3-dev',
    'libqhull-dev',
    'libreadline-dev',
    'librw-dev',
    'libsingular4-dev',
    'libsqlite3-dev',
    'libssl-dev',
    'libsuitesparse-dev',
    'libsymmetrica2-dev',
    'libz-dev',
    'libzmq3-dev',
    'libzn-poly-dev',
    'm4',
    'make',
    'nauty',
    'ninja-build',
    'openssl',
    'palp',
    'pari-doc',
    'pari-elldata',
    'pari-galdata',
    'pari-galpol',
    'pari-gp2c',
    'pari-seadata',
    'patch',
    'perl',
    'pkg-config',
    'planarity',
    'ppl-dev',
    'python3',
    'python3-venv',
    'r-base-dev',
    'r-cran-lattice',
    'singular',
    'singular-doc',
    'sqlite3',
    'sympow',
    'tachyon',
    'tar',
    'tox',
    'xcas',
    'xz-utils',
    # SageMath development
    # ('pkg-config' is already listed among the prerequisites above)
    'autoconf',
    'automake',
    'git',
    'gpgconf',
    'libtool',
    # 'openssh', not available on Ubuntu 22.04
    'openssh-client',
    # SageMath recommendations
    'default-jdk',
    'dvipng',
    'ffmpeg',
    'imagemagick',
    'latexmk',
    'libavdevice-dev',
    'pandoc',
    'tex-gyre',
    'texlive-fonts-recommended',
    'texlive-lang-cyrillic',
    'texlive-lang-english',
    'texlive-lang-european',
    'texlive-lang-french',
    'texlive-lang-german',
    'texlive-lang-italian',
    'texlive-lang-japanese',
    'texlive-lang-polish',
    'texlive-lang-portuguese',
    'texlive-lang-spanish',
    'texlive-latex-extra',
    'texlive-xetex',
    # SageMath optional
    # ('pari-gp2c' is already listed among the prerequisites above)
    '4ti2',
    'clang',
    'coinor-cbc',
    'coinor-libcbc-dev',
    'graphviz',
    'libfile-slurp-perl',
    'libgraphviz-dev',
    'libigraph-dev',
    'libisl-dev',
    'libjson-perl',
    'libmongodb-perl',
    'libnauty-dev',
    'libperl-dev',
    'libpolymake-dev',
    'libsvg-perl',
    'libterm-readkey-perl',
    'libterm-readline-gnu-perl',
    'libxml-libxslt-perl',
    'libxml-writer-perl',
    'libxml2-dev',
    'lrslib',
    'pdf2svg',
    # 'polymake', triggers firefox snap that does not work in containers
    'texinfo',
    # SageMathCell
    'bison',
    'build-essential',
    'epstool',
    'fig2dev',
    'gettext',
    'gnuplot',
    'ipset',
    'iptables',
    'libcairo2-dev',
    'libgeos-dev',
    'libhdf5-dev',
    'libnetcdf-dev',
    'libopenmpi-dev',
    'libopenmpi3',
    'libproj-dev',
    'libsnappy-dev',
    'libsystemd-dev',
    'libxslt1-dev',
    'macaulay2',
    'nginx',
    'npm',
    'octave',
    'octave-econometrics',
    'octave-statistics',
    'octave-symbolic',
    'php8.3-fpm',
    'proj-bin',
    'python3-requests',
    'rsyslog-relp',
    'ssh',
    'texlive',
    'tk-dev',
    'tmpreaper',
    'unattended-upgrades',
    'unzip',
    'wget',
    # R packages
    'r-cran-desolve',
    'r-cran-ggally',
    'r-cran-ggeffects',
    'r-cran-ggplot2',
    'r-cran-lazyeval',
    'r-cran-pracma',
    'r-cran-reticulate',
    'r-cran-rhandsontable',
    'r-cran-rms',
    'r-cran-survey',
    'r-cran-tidyverse',
]
268
269
# R packages that are not available as system ones
# Each one is installed in the base container with install.packages() and
# then loaded once with library() as a check.
R_packages = [
    "flextable",
    "formattable",
    "ggformula",
    "glmmTMB",
    "gt",
    "huxtable",
    "mosaic",
    "reactable",
]
280
281
# Optional Sage packages to be installed
# install_packages() installs them one at a time with "./sage -i -y <name>".
sage_optional_packages = [
    "4ti2",
    "biopython",
    "bliss",
    "cbc",
    "database_cremona_ellcurve",
    "database_jones_numfield",
    "database_odlyzko_zeta",
    "database_symbolic_data",
    "dot2tex", # needs graphviz
    "fricas",
    "gap_packages",
    "gap3",
    "jmol",
    "jupyter_jsmol",
    "latte_int",
    "lie", # needs bison
    "lrslib",
    "mcqd",
    "normaliz",
    "pari_elldata",
    "pari_galpol",
    "pari_nftables",
    "pari_seadata",
    "pybtex", # needs unzip
    "pynormaliz",
    "qepcad",
    "saclib",
    "sagemath_giac",
    "tides",
    #"topcom", Does not work as of November 2022 with relying on system packages
]
314
315
# Python packages to be installed into Sage (via pip)
# install_packages() installs them one at a time, each together with a pin of
# numpy to the version that Sage already has.
python_packages = [
    # Dependencies of SageMathCell
    "comm",
    "lockfile",
    "paramiko",
    "psutil",
    "sockjs-tornado",
    "git+https://github.com/systemd/python-systemd.git",
    # Optional
    "future", # fipy does not work without it installed first
    #"--no-build-isolation git+https://github.com/abelfunctions/abelfunctions", downgrades numpy
    "admcycles",
    "altair",
    "APMonitor",
    "astropy",
    "astroquery",
    #"autoviz", downgrades numpy
    "bioinfokit",
    "bitarray",
    "bokeh",
    "calplot",
    "cartopy",
    "chart_studio",
    "colorlog",
    "covid-daily",
    "cramjam",
    "cufflinks",
    "dash",
    "dask[array]",
    "drawdata",
    "dropbox",
    "duckdb",
    "emoji",
    "galgebra",
    "geopandas",
    "geoplot",
    "getdist",
    "ggplot",
    "gif",
    #"giotto-tda", wants sudo
    "google-api-python-client",
    "google-genai",
    "google-generativeai",
    "graphviz",
    "gspread",
    "fipy",
    "folium",
    "healpy",
    "h5py",
    "husl",
    "itikz",
    "july",
    "keras",
    "keyring",
    "koboextractor",
    "langchain",
    "langchain-openai",
    "langserve",
    "langserve[all]",
    #"lenstools", complaints there is no numpy
    "lhsmdu",
    "litellm",
    "lxml",
    "manimlib",
    "mapclassify",
    "mathchem",
    "mistralai",
    "mpi4py",
    "msedge-selenium-tools",
    "munkres",
    "nest_asyncio",
    "netcdf4",
    "nltk",
    "numexpr",
    "oauth2client",
    "oct2py",
    "openai",
    "openpyxl",
    "pandas",
    "pandas-profiling",
    "patsy",
    "plotnine",
    "plotly",
    "polars",
    "pretty_html_table",
    "pydot",
    "pyforest",
    "pygnuplot",
    "PyPDF4",
    "pyproj",
    "pyswarms",
    "python-snappy",
    "python-ternary",
    "pyvo",
    "qiskit",
    "qiskit[nature]",
    "requests",
    "scikit-image",
    "scikit-learn",
    "scikit-tda",
    #"scimath", does not build
    "scrapy",
    "seaborn",
    "selenium",
    "Shapely",
    "SimPy",
    "snappy",
    "spacy",
    "SpeechRecognition",
    "spiceypy",
    "statsmodels",
    "streamlit",
    #"surface_dynamics", does not build
    "sweetviz",
    "tables",
    "tbcontrol",
    #"theano", does not build
    "tikzplotlib",
    "torch",
    "transformers",
    "tweepy",
    "twint",
    "vega_datasets",
    "WeasyPrint",
    "wordcloud",
    "xarray",
    "xlrd",
    "moss", # This one only complains about missing dependencies
]
445
446
447
# limits configuration for the host - will not be overwritten later
448
limits_conf = """\
449
* - nofile 32768
450
root - nofile 32768
451
"""
452
453
454
# rsyslog configuration for the host - will not be overwritten later
455
rsyslog_conf = r"""global(maxMessageSize="64k")
456
457
module(load="imrelp")
458
input(type="imrelp" port="12514")
459
460
template(name="sagecell" type="list") {
461
property(name="hostname")
462
constant(value=" ")
463
property(name="syslogtag")
464
property(name="msg" spifno1stsp="on")
465
property(name="msg" droplastlf="on")
466
constant(value="\n")
467
}
468
469
if $syslogfacility-text == "local3" then
470
{
471
action(type="omfile"
472
file="/var/log/sagecell.stats.log"
473
template="sagecell")
474
stop
475
}
476
"""
477
478
479
# HA-Proxy configuration is regenerated every time the script is run.
480
HAProxy_header = """\
481
# Default from Ubuntu 22.04 LTS
482
global
483
log /dev/log local0
484
log /dev/log local1 notice
485
chroot /var/lib/haproxy
486
stats socket /run/haproxy/admin.sock mode 660 level admin expose-fd listeners
487
stats timeout 30s
488
user haproxy
489
group haproxy
490
daemon
491
492
# Default SSL material locations
493
ca-base /etc/ssl/certs
494
crt-base /etc/ssl/private
495
496
# See: https://ssl-config.mozilla.org/#server=haproxy&server-version=2.0.3&config=intermediate
497
ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384
498
ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256
499
ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets
500
501
defaults
502
log global
503
mode http
504
option httplog
505
option dontlognull
506
timeout connect 5000
507
timeout client 50000
508
timeout server 50000
509
errorfile 400 /etc/haproxy/errors/400.http
510
errorfile 403 /etc/haproxy/errors/403.http
511
errorfile 408 /etc/haproxy/errors/408.http
512
errorfile 500 /etc/haproxy/errors/500.http
513
errorfile 502 /etc/haproxy/errors/502.http
514
errorfile 503 /etc/haproxy/errors/503.http
515
errorfile 504 /etc/haproxy/errors/504.http
516
517
# SageMathCell additions
518
option http-server-close
519
option redispatch
520
timeout client-fin 50s
521
timeout tunnel 30m
522
"""
523
524
# {suffix} {port} {hostname} {peer_port} have to be set once
525
# lines with {node} and {id} should be repeated for each server
526
HAProxy_section = r"""
527
frontend http{suffix}
528
bind *:{port}
529
rate-limit sessions 10
530
http-request replace-path (/embedded_sagecell\.js.*) /static\1 if { url_beg /embedded_sagecell }
531
use_backend static{suffix} if { path_beg /static }
532
use_backend compute{suffix}
533
monitor-uri /?healthcheck
534
monitor fail if { nbsrv(compute{suffix}) lt 1 }
535
536
peers local{suffix}
537
peer {hostname} localhost:{peer_port}
538
539
backend static{suffix}
540
server {node} {ip}:8889 id {id} check
541
542
backend compute{suffix}
543
balance leastconn
544
stick-table type string len 36 size 1m expire 30m peers local{suffix}
545
stick on urlp(CellSessionID)
546
stick match req.hdr(Jupyter-Kernel-ID)
547
stick store-response res.hdr(Jupyter-Kernel-ID)
548
stick match path bytes(8,36) if { path_reg ^/kernel/.{36}/ }
549
option httpchk
550
551
server {node} {ip}:8888 id {id} check
552
"""
553
554
HAProxy_stats = """
555
listen stats
556
bind *:9999
557
stats enable
558
stats refresh 5s
559
stats uri /
560
stats show-legends
561
"""
562
563
564
def call(command):
    r"""
    Run ``command`` and return its exit status.

    Placeholders such as ``{server}`` are filled in from ``users`` first, then
    the command line is split into arguments and executed without a shell.
    """
    expanded = command.format_map(users)
    log.debug("executing %s", expanded)
    argv = shlex.split(expanded)
    return subprocess.call(argv)
568
569
570
def check_call(command):
    r"""
    Run ``command`` and raise ``subprocess.CalledProcessError`` if it fails.

    Placeholders such as ``{server}`` are filled in from ``users`` first, then
    the command line is split into arguments and executed without a shell.
    """
    expanded = command.format_map(users)
    log.debug("executing %s", expanded)
    argv = shlex.split(expanded)
    subprocess.check_call(argv)
574
575
576
def check_output(command):
    r"""
    Run ``command`` and return its standard output as text.

    Placeholders such as ``{server}`` are filled in from ``users`` first.
    ``subprocess.CalledProcessError`` is raised if the command fails.
    """
    expanded = command.format_map(users)
    log.debug("executing %s", expanded)
    argv = shlex.split(expanded)
    return subprocess.check_output(argv, universal_newlines=True)
581
582
583
def communicate(command, message):
    r"""
    Run ``command`` and feed ``message`` to its standard input.

    Placeholders in ``command`` are filled in from ``users`` first.
    ``RuntimeError`` is raised (and logged) if the exit status is not zero.
    """
    expanded = command.format_map(users)
    log.debug("sending %s to %s", message, expanded)
    process = subprocess.Popen(shlex.split(expanded),
                               stdin=subprocess.PIPE,
                               universal_newlines=True)
    with process:
        process.communicate(message)
    if process.returncode == 0:
        return
    msg = "{} failed".format(expanded)
    log.error(msg)
    raise RuntimeError(msg)
594
595
596
def timer_delay(delay, test=None):
    r"""
    Wait with a countdown timer.

    ``delay`` is either a timedelta or the number of seconds.

    ``test`` is either ``None`` (default) or callable, in which case the timer
    stops as soon as a false value is returned. It is called about once per
    second, before each update of the countdown.

    The remaining time is written to standard output, ending with a carriage
    return so that each update overwrites the previous one.
    """
    if isinstance(delay, datetime.timedelta):
        delay = delay.total_seconds()
    # A monotonic clock is not affected by adjustments of the system time
    now = time.monotonic()
    end = now + delay
    while now < end and (test is None or test()):
        remaining = datetime.timedelta(seconds=int(end - now))
        sys.stdout.write(" Please wait {} ...\r".format(remaining))
        sys.stdout.flush()
        # Never sleep past the deadline: short delays must stay short
        time.sleep(min(1, end - now))
        now = time.monotonic()
615
616
617
def update_repositories():
    r"""
    Clone/update repositories and checkout appropriate branches.

    Everything lives in the ``github`` subdirectory of the current directory,
    which is created if necessary. The working directory is changed while
    the repositories are processed and restored at the end.
    """
    def git(arguments):
        check_call("git " + arguments)

    if not os.path.exists("github"):
        os.mkdir("github")
    os.chdir("github")
    for owner, name, branch in repositories:
        log.info("updating repository %s", name)
        if not os.path.exists(name):
            git("clone https://github.com/{}/{}.git".format(owner, name))
        os.chdir(name)
        git("fetch")
        git("checkout " + branch)
        # Pull only if HEAD is a symbolic reference after the checkout
        head_is_symbolic = call("git symbolic-ref -q HEAD") == 0
        if head_is_symbolic:
            git("pull")
        os.chdir(os.pardir)
    os.chdir(os.pardir)
636
637
638
def create_host_users():
    r"""
    Create host users if necessary.

    If the creation commands fail (e.g. because the accounts exist from
    previous runs), check that the group and both users are present with the
    expected IDs and raise ``RuntimeError`` otherwise.
    """
    log.info("creating users on the host")
    commands = (
        "addgroup --gid {GID} {group}",
        "adduser --uid {server_ID} --ingroup {group} --gecos '' "
        "--disabled-password --no-create-home {server}",
        "adduser --uid {worker_ID} --ingroup {group} --gecos '' "
        "--disabled-password --no-create-home {worker}",
    )
    try:
        for command in commands:
            check_call(command)
    except subprocess.CalledProcessError:
        try:
            # Lookups raise KeyError for accounts that do not exist
            group_entry = grp.getgrnam(users["group"])
            server_entry = pwd.getpwnam(users["server"])
            worker_entry = pwd.getpwnam(users["worker"])
            gid = users["GID"]
            as_expected = (
                group_entry.gr_gid == gid
                and (server_entry.pw_uid, server_entry.pw_gid)
                    == (users["server_ID"], gid)
                and (worker_entry.pw_uid, worker_entry.pw_gid)
                    == (users["worker_ID"], gid))
            if not as_expected:
                raise KeyError
        except KeyError:
            raise RuntimeError("failed to create accounts on host")
662
663
664
def setup_container_users():
    r"""
    Create container users and setup SSH access.

    The server user gets an SSH key pair whose public key is authorized for
    the worker user. A number of files and directories in the worker's home
    are then created and made immutable with ``chattr +i``.

    The effective user ID is switched several times so that files are created
    with the intended owner; the order of the statements matters.
    """
    log.info("setting up users in the containter")
    check_call("addgroup --gid {GID} {group}")
    check_call("adduser --uid {server_ID} --ingroup {group} --gecos '' "
               "--disabled-password {server}")
    check_call("adduser --uid {worker_ID} --ingroup {group} --gecos '' "
               "--disabled-password {worker}")

    shome = os.path.join("/home", users["server"])
    # rwxr-xr-x on the server's home directory
    os.chmod(shome, stat.S_IRWXU |
                    stat.S_IRGRP | stat.S_IXGRP |
                    stat.S_IROTH | stat.S_IXOTH)
    os.chdir(shome)
    # Act as the server user while creating its .ssh directory and key pair
    os.setegid(users["GID"])
    os.seteuid(users["server_ID"])
    os.mkdir(".ssh", 0o700)
    check_call("ssh-keygen -t ed25519 -q -N '' -f .ssh/id_ed25519")

    whome = os.path.join("/home", users["worker"])
    os.chdir(whome)
    # Go back to UID 0 before switching the effective UID to the worker
    os.setuid(0)
    os.seteuid(users["worker_ID"])
    os.mkdir(".cache", 0o700)
    os.mkdir(".sage")
    os.mkdir(".ssh", 0o700)
    files_to_lock = [
        ".cache/pip",
        ".sage/local",
        ".ssh",
        ".bash_logout",
        ".bash_profile",
        ".bashrc",
        ".profile",
        ]
    # Make sure that everything to be locked exists (created as the worker)
    check_call(" ".join(["touch"] + files_to_lock))
    os.setuid(0)
    shutil.copy2(os.path.join(shome, ".ssh/id_ed25519.pub"),
                 ".ssh/authorized_keys")
    os.chown(".ssh/authorized_keys", users["worker_ID"], users["GID"])
    # Get the localhost in the known_hosts file.
    check_call("su -l {server} -c "
               "'ssh -q -oStrictHostKeyChecking=no {worker}@localhost whoami'")
    # Set the immutable attribute, recursively for directories
    for f in files_to_lock:
        check_call("chattr -R +i " + f)
711
712
713
def become_server():
    r"""
    Adjust UID etc. to have files created as the server user.

    Also points ``HOME`` to the server's home directory, changes into it and
    provides a parallel ``MAKE`` command unless one is already set.
    """
    os.setgid(users["GID"])
    os.setuid(users["server_ID"])
    home = os.path.join("/home", users["server"])
    os.environ["HOME"] = home
    os.chdir(home)
    parallel_make = "make -j{}".format(os.cpu_count())
    if "MAKE" not in os.environ:
        os.environ["MAKE"] = parallel_make
722
723
724
def install_sage():
    r"""
    Install Sage.

    Runs as the server user: the uploaded repository is moved from
    ``github/sage`` to ``sage`` in the home directory and built there.
    """
    become_server()
    shutil.move("github/sage", ".")
    os.chdir("sage")
    log.info("compiling Sage")
    for build_step in ("./bootstrap", "./configure", "make"):
        check_call(build_step)
    first_session = r"""
# make appropriate octave directory
octave.eval('1+2')
quit
"""
    communicate("./sage", first_session)
    log.info("successfully compiled Sage")
741
742
743
def install_packages():
    r"""
    Assuming Sage is already installed, install optional packages.

    Optional Sage packages come first, followed by pip packages.
    """
    become_server()
    os.chdir("sage")
    log.info("installing optional Sage packages")
    for name in sage_optional_packages:
        check_call("./sage -i -y {}".format(name))
    log.info("installing pip packages")
    check_call("./sage -pip install --upgrade pip")
    version_query = "./sage -c 'import numpy; print(numpy.__version__)'"
    numpy_ver = check_output(version_query).strip()
    log.info(f"numpy version is expected to stay at {numpy_ver}")
    # Many packages may downgrade numpy, so every single installation
    # requests the Sage version of numpy as well
    numpy_pin = f"numpy=={numpy_ver}"
    for name in python_packages:
        check_call(f"./sage -pip install {numpy_pin} {name}")
    os.chdir("..")
760
761
762
def install_sagecell():
    r"""
    Install SageCell, assuming Sage and other packages are already installed.

    Runs as the server user. The uploaded repository is moved to ``sagecell``
    in the home directory and the rest of ``github`` is removed.
    """
    become_server()
    log.info("compiling SageCell")
    shutil.move("github/sagecell", ".")
    shutil.rmtree("github")
    os.chdir("sagecell")
    # The provider note has to be in place before the build
    with open("templates/provider.html", "w", encoding="utf-8") as template:
        template.write(provider_html)
    build_command = "../sage/sage -sh -c 'make -B'"
    check_call(build_command)
    log.info("successfully compiled SageCell")
775
776
777
def install_config_files():
    r"""
    Install container's config files, adjusting names inside.

    ``config.py`` goes to the top of the SageCell tree; files in the
    subdirectories of ``contrib/vm/compute_node`` go to the same paths
    relative to the root of the file system. In every installed copy the
    ``{key}`` placeholders are replaced with the values from ``users``.
    """
    log.info("copying configuration files")
    os.chdir(os.path.join("/home", users["server"],
                          "sagecell/contrib/vm/compute_node"))

    def substitute_users(path):
        with open(path) as source:
            text = source.read()
        for placeholder, value in users.items():
            text = text.replace("{" + placeholder + "}", str(value))
        with open(path, "w") as target:
            target.write(text)

    substitute_users(shutil.copy("config.py", "../../.."))
    for directory, _, filenames in os.walk("."):
        if directory != ".":
            for filename in filenames:
                relative = os.path.join(directory, filename)
                # Without the leading dot "./etc/x" becomes "/etc/x"
                substitute_users(shutil.copy(relative, relative[1:]))
    check_call("systemctl enable sagecell")
801
802
803
class SCLXC:
    r"""
    Wrapper for lxc.Container automatically performing prerequisite operations.

    ``name`` is the container name and ``c`` the underlying ``lxc.Container``.
    """

    def __init__(self, name):
        self.name = name
        self.c = lxc.Container(self.name)

    def clone(self, clone_name, autostart=False, update=True):
        r"""
        Clone self, create a base container and destroy old clone if necessary.

        ``autostart`` marks the clone for automatic start with the delay
        ``start_delay``; ``update`` updates OS packages in ``self`` before
        cloning. Returns the clone as a new ``SCLXC``.
        """
        if not self.is_defined():
            self.create()
        if update:
            self.update()
        self.shutdown()
        SCLXC(clone_name).destroy()
        log.info("cloning %s to %s", self.name, clone_name)
        if not self.c.clone(clone_name, flags=lxc.LXC_CLONE_SNAPSHOT):
            raise RuntimeError("failed to clone " + self.name)
        clone = SCLXC(clone_name)
        if autostart:
            clone.c.set_config_item("lxc.start.auto", "1")
            clone.c.set_config_item("lxc.start.delay", str(start_delay))
        # NOTE(review): the original nesting of the next two statements could
        # not be recovered; a fresh MAC address is assigned to every clone
        # here - confirm that it was not meant for autostart clones only.
        clone.c.set_config_item("lxc.net.0.hwaddr",
            "02:00:" + ":".join(["%02x" % random.randint(0, 255) for _ in range(4)]))
        clone.c.save_config()
        # Do not let the clone inherit log files of its origin
        logdir = clone.c.get_config_item("lxc.rootfs.path") + "/var/log/"
        for logfile in ["sagecell.log", "sagecell-console.log"]:
            if os.path.exists(logdir + logfile):
                os.remove(logdir + logfile)
        return clone

    def create(self):
        r"""
        Create this container based on the previous one, destroy old one if necessary.

        It is the logical sequence of creating a fully configured SageMathCell container
        from scratch, but broken into several steps. Previous steps for the current container
        are performed if necessary, based on names. Random name is assumed to be a copy
        of "the end result".
        """
        self.destroy()
        log.info("creating %s", self.name)
        if self.name == lxcn_base:
            # From scratch
            # Try to automatically pick up proxy from host
            os.environ["HTTP_PROXY"] = "apt"
            try:
                created = self.c.create(
                    "download", 0,
                    {"dist": "ubuntu", "release": "noble", "arch": "amd64"},
                    "btrfs")
            finally:
                # The setting is only for the download, never leave it behind
                os.environ.pop("HTTP_PROXY", None)
            if not created:
                raise RuntimeError("failed to create " + self.name)

            self.update()
            # Need to preseed or there will be a dialog
            self.inside(communicate, "/usr/bin/debconf-set-selections",
                        "tmpreaper tmpreaper/readsecurity note")
            log.info("installing packages")
            self.inside("apt install -y " + " ".join(system_packages))
            # Relies on perl, so has to be after package installation
            self.inside("/usr/sbin/deluser ubuntu --remove-home")
            log.info("installing R packages")
            for package in R_packages:
                self.inside(f"""Rscript -e 'install.packages("{package}")'""")
                # Loading fails if the installation did not succeed
                self.inside(f"""Rscript -e 'library("{package}")'""")
        elif self.name == lxcn_sage:
            self.c = SCLXC(lxcn_base).clone(lxcn_sage).c
            create_host_users()
            self.inside(setup_container_users)
            # FIXME: work with temp folders properly
            self.inside(os.mkdir, "/tmp/sagecell", 0o730)
            self.inside(os.chown, "/tmp/sagecell",
                        users["server_ID"], users["GID"])
            # NOTE(review): this replaces the whole mode with the setgid bit
            # alone; confirm that dropping the 0o730 bits is intended.
            self.inside(os.chmod, "/tmp/sagecell", stat.S_ISGID)
            # Copy repositories into container
            update_repositories()
            log.info("uploading repositories to %s", self.name)
            root = self.c.get_config_item("lxc.rootfs.path")
            home = os.path.join(root, "home", users["server"])
            dot_cache = os.path.join(home, ".cache")
            shutil.copytree("github", os.path.join(home, "github"), symlinks=True)
            self.inside("chown -R {server}:{group} /home/{server}/github")
            try:
                # Reuse the cache saved by a previous precell build, if any
                shutil.copytree("dot_cache", dot_cache, symlinks=True)
                self.inside("chown -R {server}:{group} /home/{server}/.cache")
            except FileNotFoundError:
                pass
            self.inside(install_sage)
        elif self.name == lxcn_precell:
            self.c = SCLXC(lxcn_sage).clone(lxcn_precell).c
            self.inside(install_packages)
            # Remove old versions of packages
            root = self.c.get_config_item("lxc.rootfs.path")
            home = os.path.join(root, "home", users["server"])
            dot_cache = os.path.join(home, ".cache")
            upstream = os.path.join(home, "sage/upstream")
            packages = dict()
            for f in os.listdir(upstream):
                filename = os.path.join(upstream, f)
                # Files are grouped by the part of the name before the first "-"
                name = f.split("-", 1)[0]
                if name not in packages:
                    packages[name] = []
                packages[name].append((os.stat(filename).st_mtime, filename))
            for package in packages.values():
                # Keep only the most recently modified file of each group
                package.sort()
                package.pop()
                for _, filename in package:
                    os.remove(filename)
            # Save downloaded sources and the cache on the host for future builds
            try:
                shutil.rmtree("github/sage/upstream")
            except FileNotFoundError:
                pass
            shutil.move(upstream, "github/sage/upstream")
            try:
                shutil.rmtree("dot_cache")
            except FileNotFoundError:
                pass
            shutil.copytree(dot_cache, "dot_cache", symlinks=True)
        elif self.name == lxcn_sagecell:
            self.c = SCLXC(lxcn_precell).clone(lxcn_sagecell).c
            self.inside("su -c 'git -C /home/{server}/github/sagecell pull' {server}")
            self.inside(install_sagecell)
            self.inside(install_config_files)
            self.c.set_config_item("lxc.cgroup.memory.limit_in_bytes", "8G")
            self.c.save_config()
            self.shutdown()
            # Let first-time tasks to run and complete.
            self.start()
            timer_delay(start_delay + 300)
        else:
            # If the name is not recognized as some intermediate step, we assume
            # that a copy of the fully built SageMathCell is desired
            self.c = SCLXC(lxcn_sagecell).clone(self.name).c

    def destroy(self):
        r"""
        Stop and destroy self if it exists.

        Raises ``RuntimeError`` if the container cannot be stopped or
        destroyed.
        """
        if self.c.defined:
            log.info("destroying %s", self.name)
            if self.c.running and not self.c.stop():
                raise RuntimeError("failed to stop " + self.name)
            if not self.c.destroy():
                raise RuntimeError("failed to destroy " + self.name)
            # Start over with a fresh handle for the same name
            self.c = lxc.Container(self.name)
        else:
            log.debug("not destroying %s since it is not defined", self.name)

    def inside(self, command, *args):
        r"""
        Run a function or a system command inside the container.

        A string ``command`` is executed as a command line after filling in
        placeholders from ``users``. Otherwise ``command`` is called with
        ``args`` (string arguments get placeholders filled in as well).
        The container is started first if necessary and ``RuntimeError`` is
        raised on failure.
        """
        self.start()
        if isinstance(command, str):
            command = command.format_map(users)
            log.debug("executing '%s' in %s", command, self.name)
            if self.c.attach_wait(lxc.attach_run_command,
                                  shlex.split(command)):
                raise RuntimeError("failed to execute '{}'".format(command))
        else:
            args = [arg.format_map(users) if isinstance(arg, str) else arg
                    for arg in args]

            def wrapper():
                command(*args)
                sys.exit()  # Otherwise attach_wait returns -1

            log.debug("executing %s with arguments %s in %s",
                      command, args, self.name)
            if self.c.attach_wait(wrapper):
                raise RuntimeError("failed to execute {} with arguments {}"
                                   .format(command, args))

    def ip(self):
        r"""
        Return the first IP address of the container, starting it if necessary.
        """
        self.start()
        return self.c.get_ips()[0]

    def is_defined(self):
        r"""
        Return whether the container is defined.
        """
        return self.c.defined

    def save_logs(self):
        r"""
        Archive ``/var/log/sagecell.log`` of the container on the host.

        The archive is put into ``container_logs`` in the current directory
        and named after the first and the last time stamps in the log and the
        name of the container. Nothing is done for a missing or empty log.
        """
        stamp_length = len("2014-12-28 15:00:02,315")
        root = self.c.get_config_item("lxc.rootfs.path")
        logdir = os.path.join(root, "var", "log")
        logname = "sagecell.log"
        fullname = os.path.join(logdir, logname)
        if not os.path.exists(fullname):
            return
        with open(fullname, "rb") as f:
            start = f.read(stamp_length).decode(errors="replace")
            # Only the tail of the file is needed to find the last line
            f.seek(0, os.SEEK_END)
            f.seek(max(f.tell() - 2**16, 0))
            tail = f.readlines()
        if not tail:
            # An empty log has no time stamps and nothing worth saving
            return
        end = tail[-1][:stamp_length].decode(errors="replace")
        archname = "container_logs/%s to %s on %s" % (start, end, self.name)
        if not os.path.exists("container_logs"):
            os.mkdir("container_logs")
        log.info("saving %s", archname)
        shutil.make_archive(archname, "bztar", logdir, logname)

    def shutdown(self):
        r"""
        Shut down the container if it is running, waiting up to ``timeout``.
        """
        if self.c.running and not self.c.shutdown(timeout):
            raise RuntimeError("failed to shutdown " + self.name)

    def start(self):
        r"""
        Make sure that ``self`` is running and network works.
        """
        if not self.c.running and not self.c.start():
            raise RuntimeError("failed to start " + self.name)
        if not self.c.get_ips(timeout=timeout):
            raise RuntimeError("failed to start network in " + self.name)

    def update(self):
        r"""
        Update OS packages in ``self``.

        If the container is not defined yet, it is created instead.
        """
        if self.is_defined():
            log.info("updating packages in %s", self.name)
            try:
                self.inside("apt update")
            except RuntimeError:
                # We get here if /var/lib/dpkg/lock is locked:
                # let's wait a bit and try again once
                timer_delay(timeout)
                self.inside("apt update")
            self.inside("apt full-upgrade -y --auto-remove")
        else:
            self.create()
1035
1036
1037
def restart_haproxy(names, backup_names=()):
    r"""
    Regenerate HA-Proxy configuration file and restart it.

    ``names`` are the containers serving requests; ``backup_names`` are added
    as ``backup`` servers. If the tester container is defined, a separate
    section is generated for it on port 8888.

    The configuration is written to ``/etc/haproxy/haproxy.cfg``. HA-Proxy is
    then reloaded, or started if reloading fails.
    """
    log.debug("generating HAProxy configuration file")
    lines = [HAProxy_header]
    if names:
        def shift(node):
            # "A" nodes get server IDs from 1, other nodes the range above them
            return 1 if node.endswith("A") else number_of_compute_nodes + 1

        section = HAProxy_section
        for k, v in {"port" : 80,
                     "suffix": "",
                     "peer_port": 1080,
                     "hostname": check_output("hostname").strip()}.items():
            section = section.replace("{" + k + "}", str(v))
        for line in section.splitlines():
            if "{node}" not in line:
                lines.append(line)
                continue
            # Server lines are repeated for every node
            for i, n in enumerate(names):
                lines.append(line.format(
                    node=n, ip=SCLXC(n).ip(), id=i + shift(n)))
            backup_line = line + " backup"
            for i, n in enumerate(backup_names):
                lines.append(backup_line.format(
                    node=n, ip=SCLXC(n).ip(), id=i + shift(n)))
    # NOTE(review): the original nesting could not be recovered; the tester
    # section is generated here whether or not ``names`` is empty - confirm.
    tester = SCLXC(lxcn_tester)
    if tester.is_defined():
        section = HAProxy_section
        for k, v in {"port" : 8888,
                     "suffix": "_test",
                     "node": lxcn_tester,
                     "ip": tester.ip(),
                     "id": 1,
                     "peer_port": 1088,
                     "hostname": check_output("hostname").strip()}.items():
            section = section.replace("{" + k + "}", str(v))
        lines.append(section)
    lines.append(HAProxy_stats)
    with open("/etc/haproxy/haproxy.cfg", "w") as f:
        f.write("\n".join(lines))
    try:
        check_call("systemctl reload haproxy")
    except subprocess.CalledProcessError:
        # Reloading fails if HA-Proxy is not running
        check_call("systemctl start haproxy")
1081
1082
1083
# Everything from DEBUG up goes to container_manager.log in the current
# directory, everything from INFO up is also shown on standard output.
# The nesting of the YAML document below is significant.
logging.config.dictConfig(yaml.safe_load("""
version: 1
formatters:
  file:
    format: '%(asctime)s %(levelname)s: %(message)s'
  console:
    format: '########## %(asctime)s %(levelname)s: %(message)s ##########'
handlers:
  file:
    class: logging.FileHandler
    formatter: file
    filename: container_manager.log
    level: DEBUG
  console:
    class: logging.StreamHandler
    formatter: console
    stream: ext://sys.stdout
    level: INFO
root:
  level: DEBUG
  handlers: [file, console]
"""))
log = logging.getLogger(__name__)
1106
1107
# Command line interface: all options are plain switches.
parser = argparse.ArgumentParser(
    description="manage SageCell LXC containers",
    epilog="""
Missing necessary containers are always created automatically.

This script always overwrites system-wide HA-proxy configuration file and
restarts HA-Proxy to resolve container names to new IP addresses.""")
for option_strings, description in (
        (("--savemaster",), "save existing master container"),
        (("-b", "--base"), "rebuild 'OS and standard packages' container"),
        (("-s", "--sage"), "rebuild Sage container"),
        (("-p", "--precell"), "rebuild container with extra packages"),
        ):
    parser.add_argument(*option_strings, action="store_true",
                        help=description)
# The master container is either rebuilt or restored, never both
group = parser.add_mutually_exclusive_group()
for option_strings, description in (
        (("-m", "--master"), "rebuild 'Sage and SageCell' container"),
        (("--restoremaster",), "restore previously saved master container"),
        ):
    group.add_argument(*option_strings, action="store_true",
                       help=description)
for option_strings, description in (
        (("-t", "--tester"), "rebuild 'testing' container"),
        (("--deploy",),
         "rotate deployed containers based on current master"),
        (("--nodelay",),
         "don't wait for old containers to be out of use"),
        ):
    parser.add_argument(*option_strings, action="store_true",
                        help=description)
args = parser.parse_args()
1133
1134
# Do it only once and let users change it later.
if not os.path.exists("/etc/security/limits.d/sagecell.conf"):
    log.info("setting up security limits configuration file")
    with open("/etc/security/limits.d/sagecell.conf", "w") as f:
        f.write(limits_conf)
    log.info("Finish this session and start a new one for system configuration"
             " changes to take effect.")
    # Stop here: nothing else is done until the script is started again.
    # sys.exit is used since the exit() builtin is provided by the site
    # module for interactive use.
    sys.exit()
if not os.path.exists("/etc/rsyslog.d/sagecell.conf"):
    log.info("setting up rsyslog configuration file")
    with open("/etc/rsyslog.d/sagecell.conf", "w") as f:
        f.write(rsyslog_conf)
    check_call("systemctl restart rsyslog")
1147
1148
# Main chain: base -- sage -- precell -- (sagecell, backup) -- sc-NA/sc-NB
# Requested containers are (re)built in this order, the backup of the current
# master being taken before anything else is touched.
for requested, container_name in (
        (args.savemaster, lxcn_backup),
        (args.base, lxcn_base),
        (args.sage, lxcn_sage),
        (args.precell, lxcn_precell),
        (args.master, lxcn_sagecell),
        ):
    if requested:
        SCLXC(container_name).create()
if args.restoremaster:
    SCLXC(lxcn_backup).clone(lxcn_sagecell)

# Autostart containers: tester and deployed nodes.
if args.tester:
    tester_node = SCLXC(lxcn_sagecell).clone(lxcn_tester, autostart=True)
    tester_node.start()
1165
1166
# Two alternating sets of node names: the first completely defined set (A is
# checked before B) is considered to be up, the other one is old. If neither
# set is complete, nothing is up and the A names count as old.
A_names = [f"{lxcn_prefix}{index}A"
           for index in range(number_of_compute_nodes)]
B_names = [f"{lxcn_prefix}{index}B"
           for index in range(number_of_compute_nodes)]
up_names, old_names = [], A_names
for candidate, alternative in ((A_names, B_names), (B_names, A_names)):
    if all(SCLXC(n).is_defined() for n in candidate):
        up_names, old_names = candidate, alternative
        break
1176
1177
if args.deploy:
    sagecell = SCLXC(lxcn_sagecell)
    sagecell.update()
    # The sets swap roles: new nodes are cloned under the old names
    up_names, old_names = old_names, up_names
    for n in up_names:
        new_node = sagecell.clone(n, autostart=True, update=False)
        new_node.start()
    log.info("waiting for new containers to fully initialize...")
    timer_delay(start_delay)
    old_nodes = [SCLXC(name) for name in old_names]
    if old_nodes and not args.nodelay:
        # HA-Proxy processes that are running before the restart below
        old_haproxy = []
        for p in psutil.process_iter():
            try:
                owner = p.username()
            except psutil.NoSuchProcess:
                continue
            if owner == 'haproxy':
                old_haproxy.append(p)

        def test():
            # Processes from the list above that are still alive, if any
            _, alive = psutil.wait_procs(old_haproxy, timeout=1)
            return alive

        restart_haproxy(up_names, old_names)
        log.info("waiting for users to stop working with old containers...")
        timer_delay(deploy_delay, test)

restart_haproxy(up_names)

if args.deploy:
    for n in old_nodes:
        n.save_logs()
        n.destroy()
1208
1209