Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemath
GitHub Repository: sagemath/sagecell
Path: blob/master/contrib/vm/container_manager.py
447 views
1
#!/usr/bin/env python3
2
3
import argparse
4
import datetime
5
import grp
6
import logging
7
import logging.config
8
import os
9
import pwd
10
import random
11
import shlex
12
import shutil
13
import stat
14
import subprocess
15
import sys
16
import time
17
18
import lxc
19
import psutil
20
import yaml
21
22
# Number of compute node containers deployed in each (A/B) rotation set.
number_of_compute_nodes = 3

# This will be visible on root and help pages. Suggested template:
# Resources for your computation are provided by <a href="...">...</a>.
provider_html = r"""
"""

# Container names
lxcn_base = "base"  # OS and packages
lxcn_sage = "sage"  # Sage without extra packages
lxcn_precell = "precell"  # Everything but SageCell and system configuration
lxcn_sagecell = "sagecell"  # Sage and SageCell
lxcn_backup = "sagecell-backup"  # Saved master for restoration if necessary
lxcn_tester = "sctest"  # Accessible via special port, for testing
lxcn_prefix = "sc-"  # Prefix for main compute nodes

# Timeout in seconds to wait for a container to shutdown, network to start etc.
timeout = 120
# Time after which SageCell should be up and running.
start_delay = 126
# How long to wait after starting new containers before destroying old ones.
deploy_delay = 2*60*60  # Two hours to allow all interacts finish "naturally".

# User names and IDs
# These keys are also the placeholders substituted into command strings
# (via ``str.format_map``) and into copied configuration files.
users = {"group": "sagecell", "GID": 8888,
         "server": "sc_serv", "server_ID": 8888,
         "worker": "sc_work", "worker_ID": 9999}

# Github repositories as (user, repository, branch)
repositories = [
    ("sagemath", "sage", "master"),
    ("sagemath", "sagecell", "master"),
]
55
56
# Packages to be installed in the base container
# (passed as one space-separated list to ``apt install -y``).
system_packages = [
    # SageMath prerequisites as of Sage 9.7
    'bc',
    'binutils',
    'bzip2',
    'ca-certificates',
    'cliquer',
    'cmake',
    'curl',
    'ecl',
    'eclib-tools',
    'fflas-ffpack',
    'flintqs',
    'g++',
    'gcc',
    'gengetopt',
    'gfan',
    'gfortran',
    'glpk-utils',
    'gmp-ecm',
    'lcalc',
    'libatomic-ops-dev',
    'libboost-dev',
    'libbraiding-dev',
    'libbz2-dev',
    'libcdd-dev',
    'libcdd-tools',
    'libcliquer-dev',
    'libcurl4-openssl-dev',
    'libec-dev',
    'libecm-dev',
    'libffi-dev',
    'libflint-dev',
    'libfplll-dev',
    'libfreetype6-dev',
    'libgc-dev',
    'libgd-dev',
    'libgf2x-dev',
    'libgiac-dev',
    'libgivaro-dev',
    'libglpk-dev',
    'libgmp-dev',
    'libgsl-dev',
    'libhomfly-dev',
    'libiml-dev',
    'liblfunction-dev',
    'liblinbox-dev',
    'liblrcalc-dev',
    'liblzma-dev',
    'libm4ri-dev',
    'libm4rie-dev',
    'libmpc-dev',
    'libmpfi-dev',
    'libmpfr-dev',
    'libncurses5-dev',
    'libntl-dev',
    'libopenblas-dev',
    'libpari-dev',
    'libpcre3-dev',
    'libplanarity-dev',
    'libppl-dev',
    'libprimesieve-dev',
    'libpython3-dev',
    'libqhull-dev',
    'libreadline-dev',
    'librw-dev',
    'libsingular4-dev',
    'libsqlite3-dev',
    'libssl-dev',
    'libsuitesparse-dev',
    'libsymmetrica2-dev',
    'libz-dev',
    'libzmq3-dev',
    'libzn-poly-dev',
    'm4',
    'make',
    'nauty',
    'ninja-build',
    'openssl',
    'palp',
    'pari-doc',
    'pari-elldata',
    'pari-galdata',
    'pari-galpol',
    'pari-gp2c',
    'pari-seadata',
    'patch',
    'perl',
    'pkg-config',
    'planarity',
    'ppl-dev',
    'python3',
    'python3-venv',
    'r-base-dev',
    'r-cran-lattice',
    'singular',
    'singular-doc',
    'sqlite3',
    'sympow',
    'tachyon',
    'tar',
    'tox',
    'xcas',
    'xz-utils',
    # SageMath development
    'autoconf',
    'automake',
    'git',
    'gpgconf',
    'libtool',
    # 'openssh', not available on Ubuntu 22.04
    'openssh-client',
    'pkg-config',
    # SageMath recommendations
    'default-jdk',
    'dvipng',
    'ffmpeg',
    'imagemagick',
    'latexmk',
    'libavdevice-dev',
    'pandoc',
    'tex-gyre',
    'texlive-fonts-recommended',
    'texlive-lang-cyrillic',
    'texlive-lang-english',
    'texlive-lang-european',
    'texlive-lang-french',
    'texlive-lang-german',
    'texlive-lang-italian',
    'texlive-lang-japanese',
    'texlive-lang-polish',
    'texlive-lang-portuguese',
    'texlive-lang-spanish',
    'texlive-latex-extra',
    'texlive-xetex',
    # SageMath optional
    '4ti2',
    'clang',
    'coinor-cbc',
    'coinor-libcbc-dev',
    'graphviz',
    'libfile-slurp-perl',
    'libgraphviz-dev',
    'libigraph-dev',
    'libisl-dev',
    'libjson-perl',
    'libmongodb-perl',
    'libnauty-dev',
    'libperl-dev',
    'libpolymake-dev',
    'libsvg-perl',
    'libterm-readkey-perl',
    'libterm-readline-gnu-perl',
    'libxml-libxslt-perl',
    'libxml-writer-perl',
    'libxml2-dev',
    'lrslib',
    'pari-gp2c',
    'pdf2svg',
    # 'polymake', triggers firefox snap that does not work in containers
    'texinfo',
    # SageMathCell
    'bison',
    'build-essential',
    'epstool',
    'fig2dev',
    'gettext',
    'gnuplot',
    'ipset',
    'iptables',
    'libcairo2-dev',
    'libgeos-dev',
    'libhdf5-dev',
    'libnetcdf-dev',
    'libopenmpi-dev',
    'libopenmpi3',
    'libproj-dev',
    'libsnappy-dev',
    'libsystemd-dev',
    'libxslt1-dev',
    'macaulay2',
    'nginx',
    'npm',
    'octave',
    'octave-econometrics',
    'octave-statistics',
    'octave-symbolic',
    'php8.3-fpm',
    'proj-bin',
    'python3-requests',
    'rsyslog-relp',
    'ssh',
    'texlive',
    'tk-dev',
    'tmpreaper',
    'unattended-upgrades',
    'unzip',
    'wget',
    # R packages
    'r-cran-desolve',
    'r-cran-ggally',
    'r-cran-ggeffects',
    'r-cran-ggplot2',
    'r-cran-lazyeval',
    'r-cran-pracma',
    'r-cran-reticulate',
    'r-cran-rhandsontable',
    'r-cran-rms',
    'r-cran-survey',
    'r-cran-tidyverse',
]
268
269
# R packages that are not available as system ones
# (each is installed with ``install.packages`` and then loaded once with
# ``library`` while the base container is built).
R_packages = [
    "flextable",
    "formattable",
    "ggformula",
    "glmmTMB",
    "gt",
    "gtExtras",
    "huxtable",
    "kableExtra",
    "mosaic",
    "pixiedust",
    "reactable",
    "reactablefmtr",
    "swirl",
]
285
286
# Optional Sage packages to be installed
# (one ``./sage -i -y <package>`` call per entry, in this order).
sage_optional_packages = [
    "4ti2",
    "biopython",
    "bliss",
    "cbc",
    "database_cremona_ellcurve",
    "database_jones_numfield",
    "database_odlyzko_zeta",
    "database_symbolic_data",
    "dot2tex",  # needs graphviz
    "fricas",
    "gap_packages",
    "gap3",
    "jmol",
    "jupyter_jsmol",
    "latte_int",
    "lie",  # needs bison
    "lrslib",
    "mcqd",
    "normaliz",
    "pari_elldata",
    "pari_galpol",
    "pari_nftables",
    "pari_seadata",
    "pybtex",  # needs unzip
    "pynormaliz",
    "qepcad",
    "saclib",
    "tides",
    #"topcom", Does not work as of November 2022 with relying on system packages
]
318
319
# Python packages to be installed into Sage (via pip)
# Each entry is installed with its own ``./sage -pip install`` call, together
# with a pin of numpy to the version shipped with Sage.
python_packages = [
    # Dependencies of SageMathCell
    "comm",
    "lockfile",
    "paramiko",
    "psutil",
    "sockjs-tornado",
    "git+https://github.com/systemd/python-systemd.git",
    # Optional
    "future",  # fipy does not work without it installed first
    #"--no-build-isolation git+https://github.com/abelfunctions/abelfunctions", downgrades numpy
    "admcycles",
    "altair",
    "APMonitor",
    "astropy",
    "astroquery",
    #"autoviz", downgrades numpy
    "bioinfokit",
    "bitarray",
    "bokeh",
    "calplot",
    "cartopy",
    "chart_studio",
    "colorlog",
    "covid-daily",
    "cramjam",
    "cufflinks",
    "dash",
    "dask[array]",
    "drawdata",
    "dropbox",
    "duckdb",
    "emoji",
    "galgebra",
    "geopandas",
    "geoplot",
    "getdist",
    "ggplot",
    "gif",
    #"giotto-tda", wants sudo
    "google-api-python-client",
    "google-generativeai",
    "graphviz",
    "gspread",
    "fipy",
    "folium",
    "healpy",
    "h5py",
    "husl",
    "itikz",
    "july",
    "keras",
    "keyring",
    "koboextractor",
    "langchain",
    "langchain-openai",
    "langserve",
    "langserve[all]",
    #"lenstools", complaints there is no numpy
    "lhsmdu",
    "litellm",
    "lxml",
    "manimlib",
    "mapclassify",
    "mathchem",
    "mistralai",
    "mpi4py",
    "msedge-selenium-tools",
    "munkres",
    "nest_asyncio",
    "netcdf4",
    "nltk",
    "numexpr",
    "oauth2client",
    "oct2py",
    "openai",
    "openpyxl",
    "pandas",
    "pandas-profiling",
    "patsy",
    "plotnine",
    "plotly",
    "polars",
    "pretty_html_table",
    "pydot",
    "pyforest",
    "pygnuplot",
    "PyPDF4",
    "pyproj",
    "pyswarms",
    "python-snappy",
    "python-ternary",
    "pyvo",
    "qiskit",
    "qiskit[nature]",
    "requests",
    "scikit-image",
    "scikit-learn",
    "scikit-tda",
    #"scimath", does not build
    "scrapy",
    "seaborn",
    "selenium",
    "Shapely",
    "SimPy",
    "snappy",
    "spacy",
    "SpeechRecognition",
    "spiceypy",
    "statsmodels",
    #"surface_dynamics", does not build
    "sweetviz",
    "tables",
    "tbcontrol",
    #"theano", does not build
    "tikzplotlib",
    "torch",
    "transformers",
    "tweepy",
    "twint",
    "vega_datasets",
    "WeasyPrint",
    "wordcloud",
    "xarray",
    "xlrd",
    "moss",  # This one only complains about missing dependencies
]
447
448
449
# limits configuration for the host - will not be overwritten later
# (written to /etc/security/limits.d/sagecell.conf only if that file is absent).
limits_conf = """\
* - nofile 32768
root - nofile 32768
"""
454
455
456
# rsyslog configuration for the host - will not be overwritten later
# (written to /etc/rsyslog.d/sagecell.conf only if that file is absent).
# Raw string: the ``\n`` below must reach rsyslog as a backslash escape.
rsyslog_conf = r"""global(maxMessageSize="64k")

module(load="imrelp")
input(type="imrelp" port="12514")

template(name="sagecell" type="list") {
    property(name="hostname")
    constant(value=" ")
    property(name="syslogtag")
    property(name="msg" spifno1stsp="on")
    property(name="msg" droplastlf="on")
    constant(value="\n")
}

if $syslogfacility-text == "local3" then
{
    action(type="omfile"
           file="/var/log/sagecell.stats.log"
           template="sagecell")
    stop
}
"""
479
480
481
# HA-Proxy configuration is regenerated every time the script is run.
# This header is always the first chunk of the generated haproxy.cfg.
HAProxy_header = """\
# Default from Ubuntu 22.04 LTS
global
    log /dev/log local0
    log /dev/log local1 notice
    chroot /var/lib/haproxy
    stats socket /run/haproxy/admin.sock mode 660 level admin expose-fd listeners
    stats timeout 30s
    user haproxy
    group haproxy
    daemon

    # Default SSL material locations
    ca-base /etc/ssl/certs
    crt-base /etc/ssl/private

    # See: https://ssl-config.mozilla.org/#server=haproxy&server-version=2.0.3&config=intermediate
    ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384
    ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256
    ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets

defaults
    log global
    mode http
    option httplog
    option dontlognull
    timeout connect 5000
    timeout client 50000
    timeout server 50000
    errorfile 400 /etc/haproxy/errors/400.http
    errorfile 403 /etc/haproxy/errors/403.http
    errorfile 408 /etc/haproxy/errors/408.http
    errorfile 500 /etc/haproxy/errors/500.http
    errorfile 502 /etc/haproxy/errors/502.http
    errorfile 503 /etc/haproxy/errors/503.http
    errorfile 504 /etc/haproxy/errors/504.http

    # SageMathCell additions
    option http-server-close
    option redispatch
    timeout client-fin 50s
    timeout tunnel 30m
"""
525
526
# {suffix} {port} {hostname} {peer_port} have to be set once
# lines with {node} and {id} should be repeated for each server
# Placeholders are filled with ``str.replace`` (not ``str.format``) for the
# whole section because the template also contains literal HAProxy braces.
HAProxy_section = r"""
frontend http{suffix}
    bind *:{port}
    rate-limit sessions 10
    http-request replace-path (/embedded_sagecell\.js.*) /static\1 if { url_beg /embedded_sagecell }
    use_backend static{suffix} if { path_beg /static }
    use_backend compute{suffix}
    monitor-uri /?healthcheck
    monitor fail if { nbsrv(compute{suffix}) lt 1 }

peers local{suffix}
    peer {hostname} localhost:{peer_port}

backend static{suffix}
    server {node} {ip}:8889 id {id} check

backend compute{suffix}
    balance leastconn
    stick-table type string len 36 size 1m expire 30m peers local{suffix}
    stick on urlp(CellSessionID)
    stick match req.hdr(Jupyter-Kernel-ID)
    stick store-response res.hdr(Jupyter-Kernel-ID)
    stick match path bytes(8,36) if { path_reg ^/kernel/.{36}/ }
    option httpchk

    server {node} {ip}:8888 id {id} check
"""
555
556
# Statistics page section; always appended last to the generated haproxy.cfg.
HAProxy_stats = """
listen stats
    bind *:9999
    stats enable
    stats refresh 5s
    stats uri /
    stats show-legends
"""
564
565
566
def call(command):
    r"""
    Run ``command`` and return its exit status.

    ``command`` is a single string; ``{...}`` placeholders are filled from
    ``users`` and the result is split into arguments with ``shlex.split``
    (no shell is involved). A non-zero status does not raise.
    """
    expanded = command.format_map(users)
    log.debug("executing %s", expanded)
    argv = shlex.split(expanded)
    return subprocess.call(argv)
570
571
572
def check_call(command):
    r"""
    Run ``command`` and raise if it fails.

    ``command`` is a single string; ``{...}`` placeholders are filled from
    ``users`` and the result is split into arguments with ``shlex.split``.
    ``subprocess.CalledProcessError`` is raised on a non-zero exit status.
    """
    expanded = command.format_map(users)
    log.debug("executing %s", expanded)
    argv = shlex.split(expanded)
    subprocess.check_call(argv)
576
577
578
def check_output(command):
    r"""
    Run ``command`` and return its standard output as text.

    ``command`` is a single string; ``{...}`` placeholders are filled from
    ``users`` and the result is split into arguments with ``shlex.split``.
    ``subprocess.CalledProcessError`` is raised on a non-zero exit status.
    """
    expanded = command.format_map(users)
    log.debug("executing %s", expanded)
    argv = shlex.split(expanded)
    return subprocess.check_output(argv, universal_newlines=True)
583
584
585
def communicate(command, message):
    r"""
    Run ``command`` feeding ``message`` to its standard input.

    ``command`` is a single string; ``{...}`` placeholders are filled from
    ``users`` and the result is split into arguments with ``shlex.split``.
    ``RuntimeError`` is raised (after logging an error) if the process exits
    with a non-zero status.
    """
    expanded = command.format_map(users)
    log.debug("sending %s to %s", message, expanded)
    argv = shlex.split(expanded)
    process = subprocess.Popen(argv,
                               stdin=subprocess.PIPE,
                               universal_newlines=True)
    with process:
        # communicate() writes the message, closes stdin and waits for exit.
        process.communicate(message)
        failed = process.returncode != 0
        if failed:
            msg = "{} failed".format(expanded)
            log.error(msg)
            raise RuntimeError(msg)
596
597
598
def timer_delay(delay, test=None):
    r"""
    Wait with a countdown timer.

    ``delay`` is either a timedelta or the number of seconds.

    ``test`` is either ``None`` (default) or callable, in which case the timer
    stops as soon as a false value is returned. It is called once before each
    one-second sleep, including before the first one.

    The remaining time is rewritten in place on standard output (the line
    ends with a carriage return, not a newline). Nothing is printed and the
    function returns immediately if ``delay`` is not positive.
    """
    if isinstance(delay, datetime.timedelta):
        delay = delay.total_seconds()
    now = time.time()
    end = now + delay
    while now < end and (test is None or test()):
        # Whole seconds only, so the display reads like 0:01:59.
        remaining = datetime.timedelta(seconds=int(end - now))
        sys.stdout.write("  Please wait {} ...\r".format(remaining))
        sys.stdout.flush()
        time.sleep(1)
        now = time.time()
617
618
619
def update_repositories():
    r"""
    Clone/update repositories and checkout appropriate branches.

    Repositories listed in ``repositories`` are kept in the ``github``
    subdirectory of the current working directory, which is created if
    necessary. The working directory is restored on return, including when
    a git command fails.
    """
    def git(command):
        check_call("git " + command)

    start_dir = os.getcwd()
    github_dir = os.path.join(start_dir, "github")
    os.makedirs(github_dir, exist_ok=True)
    try:
        for user, repository, branch in repositories:
            log.info("updating repository %s", repository)
            os.chdir(github_dir)
            if not os.path.exists(repository):
                git("clone https://github.com/{}/{}.git".format(user, repository))
            os.chdir(repository)
            git("fetch")
            git("checkout " + branch)
            # "symbolic-ref -q HEAD" exits with 0 only when HEAD is a branch;
            # a detached HEAD (tag/commit checkout) has nothing to pull.
            if call("git symbolic-ref -q HEAD") == 0:
                git("pull")
    finally:
        os.chdir(start_dir)
638
639
640
def create_host_users():
    r"""
    Create host users if necessary.

    If creation fails (e.g. the accounts exist from previous runs), check
    that the existing group and users have the expected IDs; raise
    ``RuntimeError`` if any is missing or different.
    """
    log.info("creating users on the host")
    try:
        check_call("addgroup --gid {GID} {group}")
        check_call("adduser --uid {server_ID} --ingroup {group} --gecos '' "
                   "--disabled-password --no-create-home {server}")
        check_call("adduser --uid {worker_ID} --ingroup {group} --gecos '' "
                   "--disabled-password --no-create-home {worker}")
    except subprocess.CalledProcessError:
        try:
            # Lookups raise KeyError themselves for missing entries.
            group_entry = grp.getgrnam(users["group"])
            server_entry = pwd.getpwnam(users["server"])
            worker_entry = pwd.getpwnam(users["worker"])
            found = (group_entry.gr_gid,
                     server_entry.pw_uid, server_entry.pw_gid,
                     worker_entry.pw_uid, worker_entry.pw_gid)
            expected = (users["GID"],
                        users["server_ID"], users["GID"],
                        users["worker_ID"], users["GID"])
            if found != expected:
                raise KeyError
        except KeyError:
            raise RuntimeError("failed to create accounts on host")
664
665
666
def setup_container_users():
    r"""
    Create container users and setup SSH access.

    Creates the group and the server/worker accounts, generates an SSH key
    for the server, authorizes it for the worker, records localhost in the
    server's known hosts, and finally marks a set of worker files and
    directories immutable with ``chattr -R +i``.

    The statements below switch effective user IDs back and forth and are
    order dependent.
    """
    log.info("setting up users in the container")
    check_call("addgroup --gid {GID} {group}")
    check_call("adduser --uid {server_ID} --ingroup {group} --gecos '' "
               "--disabled-password {server}")
    check_call("adduser --uid {worker_ID} --ingroup {group} --gecos '' "
               "--disabled-password {worker}")

    shome = os.path.join("/home", users["server"])
    # rwxr-xr-x
    os.chmod(shome, stat.S_IRWXU |
                    stat.S_IRGRP | stat.S_IXGRP |
                    stat.S_IROTH | stat.S_IXOTH)
    os.chdir(shome)
    # Only effective IDs are changed, so files below are created as the
    # server user while the real UID stays untouched.
    os.setegid(users["GID"])
    os.seteuid(users["server_ID"])
    os.mkdir(".ssh", 0o700)
    check_call("ssh-keygen -t ed25519 -q -N '' -f .ssh/id_ed25519")

    whome = os.path.join("/home", users["worker"])
    os.chdir(whome)
    # Back to UID 0 before switching the effective user to the worker.
    os.setuid(0)
    os.seteuid(users["worker_ID"])
    os.mkdir(".cache", 0o700)
    os.mkdir(".sage")
    os.mkdir(".ssh", 0o700)
    files_to_lock = [
        ".cache/pip",
        ".sage/local",
        ".ssh",
        ".bash_logout",
        ".bash_profile",
        ".bashrc",
        ".profile",
    ]
    # Make sure every entry exists (as the worker) before it is locked.
    check_call(" ".join(["touch"] + files_to_lock))
    os.setuid(0)
    shutil.copy2(os.path.join(shome, ".ssh/id_ed25519.pub"),
                 ".ssh/authorized_keys")
    os.chown(".ssh/authorized_keys", users["worker_ID"], users["GID"])
    # Get the localhost in the known_hosts file.
    check_call("su -l {server} -c "
               "'ssh -q -oStrictHostKeyChecking=no {worker}@localhost whoami'")
    for f in files_to_lock:
        check_call("chattr -R +i " + f)
713
714
715
def become_server():
    r"""
    Adjust UID etc. to have files created as the server user.

    Also points ``HOME`` at the server's home directory, changes into it and
    provides a parallel ``MAKE`` command unless one is already set in the
    environment.
    """
    os.setgid(users["GID"])
    os.setuid(users["server_ID"])
    server_home = os.path.join("/home", users["server"])
    os.environ["HOME"] = server_home
    os.chdir(server_home)
    if "MAKE" not in os.environ:
        os.environ["MAKE"] = "make -j{}".format(os.cpu_count())
724
725
726
def install_sage():
    r"""
    Install Sage.

    Runs as the server user: moves the uploaded ``github/sage`` checkout to
    the home directory, builds it, and starts Sage once to evaluate an
    Octave expression.
    """
    become_server()
    shutil.move("github/sage", ".")
    os.chdir("sage")
    log.info("compiling Sage")
    for build_step in ("./bootstrap", "./configure", "make"):
        check_call(build_step)
    first_run_input = r"""
# make appropriate octave directory
octave.eval('1+2')
quit
"""
    communicate("./sage", first_run_input)
    log.info("successfully compiled Sage")
743
744
745
def install_packages():
    r"""
    Assuming Sage is already installed, install optional packages.

    Runs as the server user inside the ``sage`` directory: first the entries
    of ``sage_optional_packages``, then pip itself is upgraded, then the
    entries of ``python_packages`` one by one.
    """
    become_server()
    os.chdir("sage")

    log.info("installing optional Sage packages")
    for sage_package in sage_optional_packages:
        check_call("./sage -i -y {}".format(sage_package))

    log.info("installing pip packages")
    check_call("./sage -pip install --upgrade pip")
    version_output = check_output(
        "./sage -c 'import numpy; print(numpy.__version__)'")
    numpy_ver = version_output.strip()
    log.info(f"numpy version is expected to stay at {numpy_ver}")
    # Many packages may downgrade numpy, so we force it to be at the Sage version
    numpy_pin = f"numpy=={numpy_ver}"
    for pip_package in python_packages:
        check_call(f"./sage -pip install {numpy_pin} {pip_package}")

    os.chdir("..")
762
763
764
def install_sagecell():
    r"""
    Install SageCell, assuming Sage and other packages are already installed.

    Runs as the server user: moves ``github/sagecell`` to the home
    directory, removes the remaining ``github`` directory, writes
    ``provider_html`` into the templates and builds SageCell inside the
    Sage shell.
    """
    become_server()
    log.info("compiling SageCell")
    shutil.move("github/sagecell", ".")
    shutil.rmtree("github")
    os.chdir("sagecell")
    with open("templates/provider.html", "w", encoding="utf-8") as provider:
        provider.write(provider_html)
    build_command = "../sage/sage -sh -c 'make -B'"
    check_call(build_command)
    log.info("successfully compiled SageCell")
777
778
779
def install_config_files():
    r"""
    Install container's config files, adjusting names inside.

    ``config.py`` is copied three levels up from the ``compute_node``
    directory; every file in its subdirectories is copied to the same path
    relative to the filesystem root. In each copy ``{key}`` placeholders are
    replaced with the values from ``users``. Finally the ``sagecell``
    service is enabled.
    """
    log.info("copying configuration files")
    source_dir = os.path.join("/home", users["server"],
                              "sagecell/contrib/vm/compute_node")
    os.chdir(source_dir)

    def adjust_names(file):
        # Rewrite ``file`` in place with user/group placeholders filled in.
        with open(file) as f:
            text = f.read()
        for key, value in users.items():
            text = text.replace("{%s}" % key, str(value))
        with open(file, "w") as f:
            f.write(text)

    adjust_names(shutil.copy("config.py", "../../.."))
    for root, _, files in os.walk("."):
        if root != ".":
            for file in files:
                relative = os.path.join(root, file)
                # "./etc/x" -> "/etc/x": drop the leading dot.
                absolute = relative[1:]
                adjust_names(shutil.copy(relative, absolute))
    check_call("systemctl enable sagecell")
803
804
805
class SCLXC:
    r"""
    Wrapper for lxc.Container automatically performing prerequisite operations.

    ``name`` is the container name and ``c`` is the underlying
    ``lxc.Container`` object.
    """

    def __init__(self, name):
        self.name = name
        self.c = lxc.Container(self.name)

    def clone(self, clone_name, autostart=False, update=True):
        r"""
        Clone self, create a base container and destroy old clone if necessary.

        ``self`` is created first if it is not defined, optionally updated,
        and shut down. Any existing container called ``clone_name`` is
        destroyed. SageCell log files are removed from the clone.

        Return the clone as an ``SCLXC`` instance; raise ``RuntimeError`` if
        cloning fails.
        """
        if not self.is_defined():
            self.create()
        if update:
            self.update()
        self.shutdown()
        SCLXC(clone_name).destroy()
        log.info("cloning %s to %s", self.name, clone_name)
        if not self.c.clone(clone_name, flags=lxc.LXC_CLONE_SNAPSHOT):
            raise RuntimeError("failed to clone " + self.name)
        clone = SCLXC(clone_name)
        if autostart:
            clone.c.set_config_item("lxc.start.auto", "1")
            clone.c.set_config_item("lxc.start.delay", str(start_delay))
        # NOTE(review): the extracted source lost its indentation; the random
        # MAC address is assumed to be set for every clone, not only for
        # autostart ones - confirm against the repository.
        clone.c.set_config_item("lxc.net.0.hwaddr",
            "02:00:" + ":".join(["%02x" % random.randint(0, 255) for _ in range(4)]))
        clone.c.save_config()
        logdir = clone.c.get_config_item("lxc.rootfs.path") + "/var/log/"
        for logfile in ["sagecell.log", "sagecell-console.log"]:
            if os.path.exists(logdir + logfile):
                os.remove(logdir + logfile)
        return clone

    def create(self):
        r"""
        Create this container based on the previous one, destroy old one if necessary.

        It is the logical sequence of creating a fully configured SageMathCell container
        from scratch, but broken into several steps. Previous steps for the current container
        are performed if necessary, based on names. Random name is assumed to be a copy
        of "the end result".
        """
        self.destroy()
        log.info("creating %s", self.name)
        if self.name == lxcn_base:
            # From scratch
            # Try to automatically pick up proxy from host
            os.environ["HTTP_PROXY"] = "apt"
            try:
                created = self.c.create(
                    "download", 0,
                    {"dist": "ubuntu", "release": "noble", "arch": "amd64"},
                    "btrfs")
            finally:
                # Do not leak the setting, even if creation blows up.
                os.environ.pop("HTTP_PROXY", None)
            if not created:
                raise RuntimeError("failed to create " + self.name)

            self.update()
            # Need to preseed or there will be a dialog
            self.inside(communicate, "/usr/bin/debconf-set-selections",
                        "tmpreaper tmpreaper/readsecurity note")
            log.info("installing packages")
            self.inside("apt install -y " + " ".join(system_packages))
            # Relies on perl, so has to be after package installation
            self.inside("/usr/sbin/deluser ubuntu --remove-home")
            log.info("installing R packages")
            for package in R_packages:
                self.inside(f"""Rscript -e 'install.packages("{package}")'""")
                # Loading fails (non-zero exit) if installation did not work.
                self.inside(f"""Rscript -e 'library("{package}")'""")
        elif self.name == lxcn_sage:
            self.c = SCLXC(lxcn_base).clone(lxcn_sage).c
            create_host_users()
            self.inside(setup_container_users)
            # FIXME: work with temp folders properly
            self.inside(os.mkdir, "/tmp/sagecell", 0o730)
            self.inside(os.chown, "/tmp/sagecell",
                        users["server_ID"], users["GID"])
            # NOTE(review): this sets the mode to exactly S_ISGID, replacing
            # the 0o730 bits given to mkdir above - confirm it is intended.
            self.inside(os.chmod, "/tmp/sagecell", stat.S_ISGID)
            # Copy repositories into container
            update_repositories()
            log.info("uploading repositories to %s", self.name)
            root = self.c.get_config_item("lxc.rootfs.path")
            home = os.path.join(root, "home", users["server"])
            dot_cache = os.path.join(home, ".cache")
            shutil.copytree("github", os.path.join(home, "github"), symlinks=True)
            self.inside("chown -R {server}:{group} /home/{server}/github")
            try:
                shutil.copytree("dot_cache", dot_cache, symlinks=True)
                self.inside("chown -R {server}:{group} /home/{server}/.cache")
            except FileNotFoundError:
                # No cache saved by a previous precell build: nothing to seed.
                pass
            self.inside(install_sage)
        elif self.name == lxcn_precell:
            self.c = SCLXC(lxcn_sage).clone(lxcn_precell).c
            self.inside(install_packages)
            # Remove old versions of packages
            root = self.c.get_config_item("lxc.rootfs.path")
            home = os.path.join(root, "home", users["server"])
            dot_cache = os.path.join(home, ".cache")
            upstream = os.path.join(home, "sage/upstream")
            # Group tarballs by the part of the file name before the first "-".
            packages = {}
            for f in os.listdir(upstream):
                filename = os.path.join(upstream, f)
                name = f.split("-", 1)[0]
                packages.setdefault(name, []).append(
                    (os.stat(filename).st_mtime, filename))
            for package in packages.values():
                package.sort()
                # Keep the most recently modified file, delete the others.
                package.pop()
                for _, filename in package:
                    os.remove(filename)
            # Save tarballs and the cache on the host for future builds.
            try:
                shutil.rmtree("github/sage/upstream")
            except FileNotFoundError:
                pass
            shutil.move(upstream, "github/sage/upstream")
            try:
                shutil.rmtree("dot_cache")
            except FileNotFoundError:
                pass
            shutil.copytree(dot_cache, "dot_cache", symlinks=True)
        elif self.name == lxcn_sagecell:
            self.c = SCLXC(lxcn_precell).clone(lxcn_sagecell).c
            self.inside("su -c 'git -C /home/{server}/github/sagecell pull' {server}")
            self.inside(install_sagecell)
            self.inside(install_config_files)
            self.c.set_config_item("lxc.cgroup.memory.limit_in_bytes", "8G")
            self.c.save_config()
            self.shutdown()
            # Let first-time tasks to run and complete.
            self.start()
            timer_delay(start_delay)
        else:
            # If the name is not recognized as some intermediate step, we assume
            # that a copy of the fully built SageMathCell is desired
            self.c = SCLXC(lxcn_sagecell).clone(self.name).c

    def destroy(self):
        r"""
        Stop and destroy self if it exists.

        Raise ``RuntimeError`` if stopping or destroying fails.
        """
        if self.c.defined:
            log.info("destroying %s", self.name)
            if self.c.running and not self.c.stop():
                raise RuntimeError("failed to stop " + self.name)
            if not self.c.destroy():
                raise RuntimeError("failed to destroy " + self.name)
            # Fresh handle for the now undefined container.
            self.c = lxc.Container(self.name)
        else:
            log.debug("not destroying %s since it is not defined", self.name)

    def inside(self, command, *args):
        r"""
        Run a function or a system command inside the container.

        The container is started if necessary. If ``command`` is a string,
        ``{...}`` placeholders are filled from ``users`` and it is run as a
        system command; ``args`` are ignored. Otherwise ``command`` is called
        with ``args`` (string arguments get the same placeholder treatment).

        Raise ``RuntimeError`` if ``attach_wait`` reports a non-zero result.
        """
        self.start()
        if isinstance(command, str):
            command = command.format_map(users)
            log.debug("executing '%s' in %s", command, self.name)
            if self.c.attach_wait(lxc.attach_run_command,
                                  shlex.split(command)):
                raise RuntimeError("failed to execute '{}'".format(command))
        else:
            args = [arg.format_map(users) if isinstance(arg, str) else arg
                    for arg in args]

            def wrapper():
                command(*args)
                sys.exit()  # Otherwise attach_wait returns -1

            log.debug("executing %s with arguments %s in %s",
                      command, args, self.name)
            if self.c.attach_wait(wrapper):
                raise RuntimeError("failed to execute {} with arguments {}"
                                   .format(command, args))

    def ip(self):
        r"""
        Return the first IP address of the (started if necessary) container.
        """
        self.start()
        return self.c.get_ips()[0]

    def is_defined(self):
        r"""
        Return whether the underlying container is defined.
        """
        return self.c.defined

    def save_logs(self):
        r"""
        Archive ``/var/log/sagecell.log`` of this container on the host.

        The archive goes to the ``container_logs`` directory (created if
        necessary) and is named after the first and the last timestamps in
        the log. Nothing is done if the log file is missing or empty.
        """
        stamp_length = len("2014-12-28 15:00:02,315")
        root = self.c.get_config_item("lxc.rootfs.path")
        logdir = os.path.join(root, "var", "log")
        logname = "sagecell.log"
        fullname = os.path.join(logdir, logname)
        if not os.path.exists(fullname):
            return
        with open(fullname, "rb") as f:
            start = f.read(stamp_length).decode()
            # Only look at the last 64 KiB to find the final line.
            f.seek(0, os.SEEK_END)
            f.seek(max(f.tell() - 2**16, 0))
            tail = f.readlines()
        if not tail:
            # Empty log: there are no timestamps and nothing worth saving.
            return
        end = tail[-1][:stamp_length].decode()
        archname = "container_logs/%s to %s on %s" % (start, end, self.name)
        if not os.path.exists("container_logs"):
            os.mkdir("container_logs")
        log.info("saving %s", archname)
        shutil.make_archive(archname, "bztar", logdir, logname)

    def shutdown(self):
        r"""
        Shut down the container if it is running.

        Raise ``RuntimeError`` if it does not shut down within ``timeout``.
        """
        if self.c.running and not self.c.shutdown(timeout):
            raise RuntimeError("failed to shutdown " + self.name)

    def start(self):
        r"""
        Make sure that ``self`` is running and network works.

        Raise ``RuntimeError`` if the container cannot be started or gets no
        IP address within ``timeout``.
        """
        if not self.c.running and not self.c.start():
            raise RuntimeError("failed to start " + self.name)
        if not self.c.get_ips(timeout=timeout):
            raise RuntimeError("failed to start network in " + self.name)

    def update(self):
        r"""
        Update OS packages in ``self``.

        If the container is not defined yet, it is created instead.
        """
        if self.is_defined():
            log.info("updating packages in %s", self.name)
            try:
                self.inside("apt update")
            except RuntimeError:
                # We get here if /var/lib/dpkg/lock is locked:
                # let's wait a bit and try again once
                timer_delay(timeout)
                self.inside("apt update")
            self.inside("apt full-upgrade -y --auto-remove")
        else:
            self.create()
1037
1038
1039
def restart_haproxy(names, backup_names=()):
    r"""
    Regenerate HA-Proxy configuration file and restart it.

    ``names`` are the containers to be used as main servers, ``backup_names``
    are added to the same backends with the ``backup`` option. No compute
    section is generated if ``names`` is empty. A separate section is added
    for the tester container if it is defined.

    The result overwrites ``/etc/haproxy/haproxy.cfg``; HA-Proxy is then
    reloaded, or started if reloading fails.
    """
    log.debug("generating HAProxy configuration file")
    lines = [HAProxy_header]
    if names:
        def shift(node_name):
            # Server IDs: "A" nodes start at 1, the others after them.
            return 1 if node_name.endswith("A") else number_of_compute_nodes + 1

        section = HAProxy_section
        # str.replace rather than str.format: the template contains literal
        # braces and the {node}/{ip}/{id} fields must survive this step.
        for k, v in {"port": 80,
                     "suffix": "",
                     "peer_port": 1080,
                     "hostname": check_output("hostname").strip()}.items():
            section = section.replace("{" + k + "}", str(v))
        for line in section.splitlines():
            if "{node}" not in line:
                lines.append(line)
            else:
                # Server template line: one copy per node.
                for i, n in enumerate(names):
                    lines.append(line.format(
                        node=n, ip=SCLXC(n).ip(), id=i + shift(n)))
                line += " backup"
                for i, n in enumerate(backup_names):
                    lines.append(line.format(
                        node=n, ip=SCLXC(n).ip(), id=i + shift(n)))
    # NOTE(review): the extracted source lost its indentation; the tester
    # section is assumed to be independent of ``names`` - confirm.
    tester = SCLXC(lxcn_tester)
    if tester.is_defined():
        section = HAProxy_section
        for k, v in {"port": 8888,
                     "suffix": "_test",
                     "node": lxcn_tester,
                     "ip": tester.ip(),
                     "id": 1,
                     "peer_port": 1088,
                     "hostname": check_output("hostname").strip()}.items():
            section = section.replace("{" + k + "}", str(v))
        lines.append(section)
    lines.append(HAProxy_stats)
    with open("/etc/haproxy/haproxy.cfg", "w") as f:
        f.write("\n".join(lines))
    try:
        check_call("systemctl reload haproxy")
    except subprocess.CalledProcessError:
        check_call("systemctl start haproxy")
1083
1084
1085
# Logging: everything (DEBUG and up) goes to container_manager.log in the
# current directory, INFO and up is also shown on standard output.
# Nesting in the YAML below is significant: it is the dictConfig schema.
logging.config.dictConfig(yaml.safe_load("""
    version: 1
    formatters:
      file:
        format: '%(asctime)s %(levelname)s: %(message)s'
      console:
        format: '########## %(asctime)s %(levelname)s: %(message)s ##########'
    handlers:
      file:
        class: logging.FileHandler
        formatter: file
        filename: container_manager.log
        level: DEBUG
      console:
        class: logging.StreamHandler
        formatter: console
        stream: ext://sys.stdout
        level: INFO
    root:
      level: DEBUG
      handlers: [file, console]
    """))
log = logging.getLogger(__name__)
1108
1109
# Command line interface. Options are independent flags except for
# --master and --restoremaster, which cannot be given together.
parser = argparse.ArgumentParser(
    description="manage SageCell LXC containers",
    epilog="""
    Missing necessary containers are always created automatically.

    This script always overwrites system-wide HA-proxy configuration file and
    restarts HA-Proxy to resolve container names to new IP addresses.""")
parser.add_argument(
    "--savemaster", action="store_true",
    help="save existing master container")
parser.add_argument(
    "-b", "--base", action="store_true",
    help="rebuild 'OS and standard packages' container")
parser.add_argument(
    "-s", "--sage", action="store_true",
    help="rebuild Sage container")
parser.add_argument(
    "-p", "--precell", action="store_true",
    help="rebuild container with extra packages")
group = parser.add_mutually_exclusive_group()
group.add_argument(
    "-m", "--master", action="store_true",
    help="rebuild 'Sage and SageCell' container")
group.add_argument(
    "--restoremaster", action="store_true",
    help="restore previously saved master container")
parser.add_argument(
    "-t", "--tester", action="store_true",
    help="rebuild 'testing' container")
parser.add_argument(
    "--deploy", action="store_true",
    help="rotate deployed containers based on current master")
parser.add_argument(
    "--nodelay", action="store_true",
    help="don't wait for old containers to be out of use")
args = parser.parse_args()
1135
1136
# Do it only once and let users change it later.
if not os.path.exists("/etc/security/limits.d/sagecell.conf"):
    log.info("setting up security limits configuration file")
    with open("/etc/security/limits.d/sagecell.conf", "w") as f:
        f.write(limits_conf)
    log.info("Finish this session and start a new one for system configuration"
             " changes to take effect.")
    # Stop here: the rest of the script is meant to run in a new session.
    # sys.exit() is used since the bare exit() helper is only provided by
    # the site module.
    sys.exit()
if not os.path.exists("/etc/rsyslog.d/sagecell.conf"):
    log.info("setting up rsyslog configuration file")
    with open("/etc/rsyslog.d/sagecell.conf", "w") as f:
        f.write(rsyslog_conf)
    check_call("systemctl restart rsyslog")
1149
1150
# Main chain: base -- sage -- precell -- (sagecell, backup) -- sc-NA/sc-NB
# Requested rebuilds are performed in chain order.
if args.savemaster:
    SCLXC(lxcn_backup).create()
if args.base:
    SCLXC(lxcn_base).create()
if args.sage:
    SCLXC(lxcn_sage).create()
if args.precell:
    SCLXC(lxcn_precell).create()
if args.master:
    SCLXC(lxcn_sagecell).create()
if args.restoremaster:
    SCLXC(lxcn_backup).clone(lxcn_sagecell)

# Autostart containers: tester and deployed nodes.
if args.tester:
    SCLXC(lxcn_sagecell).clone(lxcn_tester, autostart=True).start()

# Compute nodes come in two sets, "A" and "B"; the set that is completely
# defined is considered to be up, the other one is the rotation target.
A_names = [f"{lxcn_prefix}{n}A" for n in range(number_of_compute_nodes)]
B_names = [f"{lxcn_prefix}{n}B" for n in range(number_of_compute_nodes)]
if all(SCLXC(n).is_defined() for n in A_names):
    up_names, old_names = A_names, B_names
elif all(SCLXC(n).is_defined() for n in B_names):
    up_names, old_names = B_names, A_names
else:
    up_names, old_names = [], A_names

if args.deploy:
    sagecell = SCLXC(lxcn_sagecell)
    sagecell.update()
    # Rotate: the other set becomes the new one.
    up_names, old_names = old_names, up_names
    for n in up_names:
        new_node = sagecell.clone(n, autostart=True, update=False)
        new_node.start()
    log.info("waiting for new containers to fully initialize...")
    timer_delay(start_delay)
    old_nodes = [SCLXC(n) for n in old_names]
    if old_nodes and not args.nodelay:
        # Remember HA-Proxy processes running before the reload below.
        old_haproxy = []
        for p in psutil.process_iter():
            try:
                if p.username() == 'haproxy':
                    old_haproxy.append(p)
            except psutil.NoSuchProcess:
                pass

        def test():
            # Truthy while any of the remembered processes is still alive.
            _, alive = psutil.wait_procs(old_haproxy, timeout=1)
            return alive

        # Keep old nodes as backups while their users finish.
        restart_haproxy(up_names, old_names)
        log.info("waiting for users to stop working with old containers...")
        timer_delay(deploy_delay, test)

restart_haproxy(up_names)

if args.deploy:
    for n in old_nodes:
        n.save_logs()
        n.destroy()
1211