Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemath
GitHub Repository: sagemath/sagecell
Path: blob/master/contrib/vm/container_manager.py
821 views
1
#!/usr/bin/env python3
2
3
import argparse
4
import datetime
5
import grp
6
import logging
7
import logging.config
8
import os
9
import pwd
10
import random
11
import shlex
12
import shutil
13
import stat
14
import subprocess
15
import sys
16
import time
17
18
import lxc
19
import psutil
20
import yaml
21
22
# Size of each deployment set: this many "A" containers and this many "B"
# containers are named from this value when rotating deployed nodes.
number_of_compute_nodes = 3

#This will be visible on root and help pages. Suggested template:
# Resources for your computation are provided by <a href="...">...</a>.
# Written verbatim to templates/provider.html by install_sagecell().
provider_html = r"""
"""

# Container names
lxcn_base = "base" # OS and packages
lxcn_sage = "sage" # Sage without extra packages
lxcn_precell = "precell" # Everything but SageCell and system configuration
lxcn_sagecell = "sagecell" # Sage and SageCell
lxcn_backup = "sagecell-backup" # Saved master for restoration if necessary
lxcn_tester = "sctest" # Accessible via special port, for testing
lxcn_prefix = "sc-" # Prefix for main compute nodes

# Timeout in seconds to wait for a container to shutdown, network to start etc.
timeout = 120
# Time after which SageCell should be up and running.
start_delay = 66
# How long to wait after starting new containers before destroying old ones.
deploy_delay = 2*60*60 # Two hours to allow all interacts finish "naturally".

# User names and IDs
# The keys double as placeholders: command strings containing e.g. {server}
# or {GID} are expanded with str.format_map(users) before being executed, and
# the same "{key}" markers are replaced inside copied configuration files.
users = {"group": "sagecell", "GID": 8888,
         "server": "sc_serv", "server_ID": 8888,
         "worker": "sc_work", "worker_ID": 9999}

# Github repositories as (user, repository, branch)
repositories = [
    ("sagemath", "sage", "master"),
    ("sagemath", "sagecell", "master"),
]
55
56
# Packages to be installed in the base container
57
# Passed as a single "apt install -y ..." command when the base container is
# created. Every package is listed exactly once; where a later category needs
# a package that already appears above, a comment marks it instead.
system_packages = [
    # SageMath prerequisites as of Sage 9.7
    'bc',
    'binutils',
    'bzip2',
    'ca-certificates',
    'cliquer',
    'cmake',
    'curl',
    'ecl',
    'eclib-tools',
    'fflas-ffpack',
    'flintqs',
    'g++',
    'gcc',
    'gengetopt',
    'gfan',
    'gfortran',
    'glpk-utils',
    'gmp-ecm',
    'lcalc',
    'libatomic-ops-dev',
    'libboost-dev',
    'libbraiding-dev',
    'libbz2-dev',
    'libcdd-dev',
    'libcdd-tools',
    'libcliquer-dev',
    'libcurl4-openssl-dev',
    'libec-dev',
    'libecm-dev',
    'libffi-dev',
    'libflint-dev',
    'libfplll-dev',
    'libfreetype6-dev',
    'libgc-dev',
    'libgd-dev',
    'libgf2x-dev',
    'libgiac-dev',
    'libgivaro-dev',
    'libglpk-dev',
    'libgmp-dev',
    'libgsl-dev',
    'libhomfly-dev',
    'libiml-dev',
    'liblfunction-dev',
    'liblinbox-dev',
    'liblrcalc-dev',
    'liblzma-dev',
    'libm4ri-dev',
    'libm4rie-dev',
    'libmpc-dev',
    'libmpfi-dev',
    'libmpfr-dev',
    'libncurses5-dev',
    'libntl-dev',
    'libopenblas-dev',
    'libpari-dev',
    'libpcre3-dev',
    'libplanarity-dev',
    'libppl-dev',
    'libprimesieve-dev',
    'libpython3-dev',
    'libqhull-dev',
    'libreadline-dev',
    'librw-dev',
    'libsingular4-dev',
    'libsqlite3-dev',
    'libssl-dev',
    'libsuitesparse-dev',
    'libsymmetrica2-dev',
    'libz-dev',
    'libzmq3-dev',
    'libzn-poly-dev',
    'm4',
    'make',
    'nauty',
    'ninja-build',
    'openssl',
    'palp',
    'pari-doc',
    'pari-elldata',
    'pari-galdata',
    'pari-galpol',
    'pari-gp2c',
    'pari-seadata',
    'patch',
    'perl',
    'pkg-config',
    'planarity',
    'ppl-dev',
    'python3',
    'python3-venv',
    'r-base-dev',
    'r-cran-lattice',
    'singular',
    'singular-doc',
    'sqlite3',
    'sympow',
    'tachyon',
    'tar',
    'tox',
    'xcas',
    'xz-utils',
    # SageMath development
    'autoconf',
    'automake',
    'git',
    'gpgconf',
    'libtool',
    # 'openssh', not available on Ubuntu 22.04
    'openssh-client',
    # 'pkg-config' is already listed among the prerequisites above
    # SageMath recommendations
    'default-jdk',
    'dvipng',
    'ffmpeg',
    'imagemagick',
    'latexmk',
    'libavdevice-dev',
    'pandoc',
    'tex-gyre',
    'texlive-fonts-recommended',
    'texlive-lang-cyrillic',
    'texlive-lang-english',
    'texlive-lang-european',
    'texlive-lang-french',
    'texlive-lang-german',
    'texlive-lang-italian',
    'texlive-lang-japanese',
    'texlive-lang-polish',
    'texlive-lang-portuguese',
    'texlive-lang-spanish',
    'texlive-latex-extra',
    'texlive-xetex',
    # SageMath optional
    '4ti2',
    'clang',
    'coinor-cbc',
    'coinor-libcbc-dev',
    'graphviz',
    'libfile-slurp-perl',
    'libgraphviz-dev',
    'libigraph-dev',
    'libisl-dev',
    'libjson-perl',
    'libmongodb-perl',
    'libnauty-dev',
    'libperl-dev',
    'libpolymake-dev',
    'libsvg-perl',
    'libterm-readkey-perl',
    'libterm-readline-gnu-perl',
    'libxml-libxslt-perl',
    'libxml-writer-perl',
    'libxml2-dev',
    'lrslib',
    # 'pari-gp2c' is already listed among the prerequisites above
    'pdf2svg',
    # 'polymake', triggers firefox snap that does not work in containers
    'texinfo',
    # SageMathCell
    'bison',
    'build-essential',
    'epstool',
    'fig2dev',
    'gettext',
    'gnuplot',
    'ipset',
    'iptables',
    'libcairo2-dev',
    'libgeos-dev',
    'libhdf5-dev',
    'libnetcdf-dev',
    'libopenmpi-dev',
    'libopenmpi3',
    'libproj-dev',
    'libsnappy-dev',
    'libsystemd-dev',
    'libxslt1-dev',
    'macaulay2',
    'nginx',
    'npm',
    'octave',
    'octave-econometrics',
    'octave-statistics',
    'octave-symbolic',
    'php8.3-fpm',
    'proj-bin',
    'python3-requests',
    'rsyslog-relp',
    'ssh',
    'texlive',
    'tk-dev',
    'tmpreaper',
    'unattended-upgrades',
    'unzip',
    'wget',
    # R packages
    'r-cran-desolve',
    'r-cran-ggally',
    'r-cran-ggeffects',
    'r-cran-ggplot2',
    'r-cran-lazyeval',
    'r-cran-pracma',
    'r-cran-reticulate',
    'r-cran-rhandsontable',
    'r-cran-rms',
    'r-cran-survey',
    'r-cran-tidyverse',
]
268
269
# R packages that are not available as system ones
270
# Each entry is installed in the base container with install.packages() and
# then loaded once with library(), one Rscript invocation per step.
R_packages = [
    "flextable",
    "formattable",
    "ggformula",
    "glmmTMB",
    "gt",
    "huxtable",
    "mosaic",
    "openintro",
    "reactable",
]
281
282
# Optional Sage packages to be installed
283
# Installed by install_packages() with a single "./sage -i -y ..." command.
sage_optional_packages = [
    "4ti2",
    "biopython",
    "bliss",
    "cbc",
    "database_cremona_ellcurve",
    "database_jones_numfield",
    "database_odlyzko_zeta",
    "database_symbolic_data",
    "dot2tex", # needs graphviz
    "fricas",
    "gap_packages",
    "gap3",
    "jmol",
    "jupyter_jsmol",
    "latte_int",
    "lie", # needs bison
    "lrslib",
    "mcqd",
    "normaliz",
    "pari_elldata",
    "pari_galpol",
    "pari_nftables",
    "pari_seadata",
    "pybtex", # needs unzip
    "pynormaliz",
    "qepcad",
    "saclib",
    "sagemath_giac",
    "tides",
    #"topcom", Does not work as of November 2022 with relying on system packages
]
315
316
# Python packages to be installed into Sage (via pip)
317
# install_packages() installs these one at a time, in list order, each in its
# own "./sage -pip install numpy==<Sage's version> <entry>" command, so order
# matters where one package must be present before another (see "future").
python_packages = [
    # Dependencies of SageMathCell
    "comm",
    "lockfile",
    "paramiko",
    "psutil",
    "sockjs-tornado",
    "git+https://github.com/systemd/python-systemd.git",
    # Optional
    "future", # fipy does not work without it installed first
    #"--no-build-isolation git+https://github.com/abelfunctions/abelfunctions", downgrades numpy
    "admcycles",
    "altair",
    "APMonitor",
    "astropy",
    "astroquery",
    #"autoviz", downgrades numpy
    "bioinfokit",
    "bitarray",
    "bokeh",
    "calplot",
    "cartopy",
    "chart_studio",
    "colorlog",
    "covid-daily",
    "cramjam",
    "cufflinks",
    "dash",
    "dask[array]",
    "drawdata",
    "dropbox",
    "duckdb",
    "emoji",
    "galgebra",
    "geopandas",
    "geoplot",
    "getdist",
    "ggplot",
    "gif",
    #"giotto-tda", wants sudo
    "google-api-python-client",
    "google-genai",
    "google-generativeai",
    "graphviz",
    "gspread",
    "fipy",
    "folium",
    "healpy",
    "h5py",
    "husl",
    "itikz",
    "july",
    "keras",
    "keyring",
    "koboextractor",
    "langchain",
    "langchain-openai",
    "langserve",
    "langserve[all]",
    #"lenstools", complains there is no numpy
    "lhsmdu",
    "litellm",
    "lxml",
    "manimlib",
    "mapclassify",
    "mathchem",
    "mistralai",
    "mpi4py",
    "msedge-selenium-tools",
    "munkres",
    "nest_asyncio",
    "netcdf4",
    "nltk",
    "numexpr",
    "oauth2client",
    "oct2py",
    "openai",
    "openpyxl",
    "pandas",
    "pandas-profiling",
    "patsy",
    "plotnine",
    "plotly",
    "polars",
    "pretty_html_table",
    "pydot",
    "pygnuplot",
    "PyPDF4",
    "pyproj",
    "pyswarms",
    "python-snappy",
    "python-ternary",
    "pyvo",
    "qiskit",
    "qiskit[nature]",
    "requests",
    "scikit-image",
    "scikit-learn",
    "scikit-tda",
    #"scimath", does not build
    "scrapy",
    "seaborn",
    "selenium",
    "Shapely",
    "SimPy",
    "snappy",
    "spacy",
    "SpeechRecognition",
    "spiceypy",
    "statsmodels",
    "streamlit",
    #"surface_dynamics", does not build
    "sweetviz",
    "tables",
    "tbcontrol",
    #"theano", does not build
    "tikzplotlib",
    "torch",
    "transformers",
    "tweepy",
    "vega_datasets",
    "WeasyPrint",
    "wordcloud",
    "xarray",
    "xlrd",
]
443
444
445
# limits configuration for the host - will not be overwritten later
446
limits_conf = """\
447
* - nofile 32768
448
root - nofile 32768
449
"""
450
451
452
# rsyslog configuration for the host - will not be overwritten later
453
rsyslog_conf = r"""global(maxMessageSize="64k")
454
455
module(load="imrelp")
456
input(type="imrelp" port="12514")
457
458
template(name="sagecell" type="list") {
459
property(name="hostname")
460
constant(value=" ")
461
property(name="syslogtag")
462
property(name="msg" spifno1stsp="on")
463
property(name="msg" droplastlf="on")
464
constant(value="\n")
465
}
466
467
if $syslogfacility-text == "local3" then
468
{
469
action(type="omfile"
470
file="/var/log/sagecell.stats.log"
471
template="sagecell")
472
stop
473
}
474
"""
475
476
477
# HA-Proxy configuration is regenerated every time the script is run.
478
HAProxy_header = """\
479
# Default from Ubuntu 22.04 LTS
480
global
481
log /dev/log local0
482
log /dev/log local1 notice
483
chroot /var/lib/haproxy
484
stats socket /run/haproxy/admin.sock mode 660 level admin expose-fd listeners
485
stats timeout 30s
486
user haproxy
487
group haproxy
488
daemon
489
490
# Default SSL material locations
491
ca-base /etc/ssl/certs
492
crt-base /etc/ssl/private
493
494
# See: https://ssl-config.mozilla.org/#server=haproxy&server-version=2.0.3&config=intermediate
495
ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384
496
ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256
497
ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets
498
499
defaults
500
log global
501
mode http
502
option httplog
503
option dontlognull
504
timeout connect 5000
505
timeout client 50000
506
timeout server 50000
507
errorfile 400 /etc/haproxy/errors/400.http
508
errorfile 403 /etc/haproxy/errors/403.http
509
errorfile 408 /etc/haproxy/errors/408.http
510
errorfile 500 /etc/haproxy/errors/500.http
511
errorfile 502 /etc/haproxy/errors/502.http
512
errorfile 503 /etc/haproxy/errors/503.http
513
errorfile 504 /etc/haproxy/errors/504.http
514
515
# SageMathCell additions
516
option http-server-close
517
option redispatch
518
timeout client-fin 50s
519
timeout tunnel 30m
520
"""
521
522
# {suffix} {port} {hostname} {peer_port} have to be set once
523
# lines with {node} and {id} should be repeated for each server
524
HAProxy_section = r"""
525
frontend http{suffix}
526
bind *:{port}
527
rate-limit sessions 10
528
http-request replace-path (/embedded_sagecell\.js.*) /static\1 if { url_beg /embedded_sagecell }
529
use_backend static{suffix} if { path_beg /static }
530
use_backend compute{suffix}
531
monitor-uri /?healthcheck
532
monitor fail if { nbsrv(compute{suffix}) lt 1 }
533
534
peers local{suffix}
535
peer {hostname} localhost:{peer_port}
536
537
backend static{suffix}
538
server {node} {ip}:8889 id {id} check
539
540
backend compute{suffix}
541
balance leastconn
542
stick-table type string len 36 size 1m expire 30m peers local{suffix}
543
stick on urlp(CellSessionID)
544
stick match req.hdr(Jupyter-Kernel-ID)
545
stick store-response res.hdr(Jupyter-Kernel-ID)
546
stick match path bytes(8,36) if { path_reg ^/kernel/.{36}/ }
547
option httpchk
548
549
server {node} {ip}:8888 id {id} check
550
"""
551
552
HAProxy_stats = """
553
listen stats
554
bind *:9999
555
stats enable
556
stats refresh 5s
557
stats uri /
558
stats show-legends
559
"""
560
561
562
def call(command):
    r"""
    Run ``command`` and return its exit status.

    Placeholders like ``{server}`` are filled in from ``users`` first; the
    result is split into arguments with ``shlex``, so no shell is involved.
    """
    expanded = command.format_map(users)
    log.debug("executing %s", expanded)
    argv = shlex.split(expanded)
    return subprocess.call(argv)
566
567
568
def check_call(command):
    r"""
    Run ``command``, raising ``subprocess.CalledProcessError`` on failure.

    Placeholders like ``{server}`` are filled in from ``users`` first; the
    result is split into arguments with ``shlex``, so no shell is involved.
    """
    expanded = command.format_map(users)
    log.debug("executing %s", expanded)
    argv = shlex.split(expanded)
    subprocess.check_call(argv)
572
573
574
def check_output(command):
    r"""
    Run ``command`` and return what it printed to standard output as text.

    Placeholders like ``{server}`` are filled in from ``users`` first.
    ``subprocess.CalledProcessError`` is raised if the command fails.
    """
    expanded = command.format_map(users)
    log.debug("executing %s", expanded)
    argv = shlex.split(expanded)
    return subprocess.check_output(argv, universal_newlines=True)
579
580
581
def communicate(command, message):
    r"""
    Run ``command`` feeding ``message`` to its standard input.

    Placeholders like ``{server}`` in ``command`` are filled in from
    ``users``. ``RuntimeError`` is raised if the exit status is not zero.
    """
    expanded = command.format_map(users)
    log.debug("sending %s to %s", message, expanded)
    process = subprocess.Popen(shlex.split(expanded),
                               stdin=subprocess.PIPE,
                               universal_newlines=True)
    with process:
        process.communicate(message)
    if process.returncode == 0:
        return
    failure = "{} failed".format(expanded)
    log.error(failure)
    raise RuntimeError(failure)
592
593
594
def timer_delay(delay, test=None):
    r"""
    Block for up to ``delay``, showing the remaining time on standard output.

    ``delay`` is a ``datetime.timedelta`` or a number of seconds.

    ``test``, if given, is called before every one-second step; the wait ends
    early as soon as it returns a false value.
    """
    seconds = delay
    if isinstance(seconds, datetime.timedelta):
        seconds = seconds.total_seconds()
    current = time.time()
    deadline = current + seconds
    while current < deadline:
        if test is not None and not test():
            break
        left = datetime.timedelta(seconds=int(deadline - current))
        # "\r" keeps the countdown on a single, repeatedly overwritten line.
        sys.stdout.write(" Please wait {} ...\r".format(left))
        sys.stdout.flush()
        time.sleep(1)
        current = time.time()
613
614
615
def update_repositories():
    r"""
    Clone/update repositories and checkout appropriate branches.

    Repositories listed in ``repositories`` are kept in the ``github``
    subdirectory of the current directory, which is created if necessary.
    The working directory of the process is restored on exit, including the
    case when one of the git commands fails.
    """
    start_dir = os.getcwd()
    os.makedirs("github", exist_ok=True)
    try:
        os.chdir("github")
        for user, repository, branch in repositories:
            log.info("updating repository %s", repository)
            if not os.path.exists(repository):
                check_call("git clone https://github.com/{}/{}.git"
                           .format(user, repository))
            os.chdir(repository)
            try:
                check_call("git fetch")
                check_call("git checkout " + branch)
                # "symbolic-ref -q" succeeds only if HEAD points to a branch;
                # there is nothing to pull for a detached HEAD.
                if call("git symbolic-ref -q HEAD") == 0:
                    check_call("git pull")
            finally:
                os.chdir(os.pardir)
    finally:
        os.chdir(start_dir)
634
635
636
def create_host_users():
    r"""
    Make sure that the SageCell group and accounts exist on the host.

    Creation is attempted first. If it fails, the group and both accounts
    must already exist with exactly the IDs given in ``users``, otherwise
    ``RuntimeError`` is raised.
    """
    log.info("creating users on the host")
    try:
        check_call("addgroup --gid {GID} {group}")
        check_call("adduser --uid {server_ID} --ingroup {group} --gecos '' "
                   "--disabled-password --no-create-home {server}")
        check_call("adduser --uid {worker_ID} --ingroup {group} --gecos '' "
                   "--disabled-password --no-create-home {worker}")
    except subprocess.CalledProcessError:
        try:
            # Missing entries raise KeyError on their own; wrong IDs are
            # reported through the same exception.
            group_entry = grp.getgrnam(users["group"])
            server_entry = pwd.getpwnam(users["server"])
            worker_entry = pwd.getpwnam(users["worker"])
            found = (group_entry.gr_gid,
                     server_entry.pw_uid, server_entry.pw_gid,
                     worker_entry.pw_uid, worker_entry.pw_gid)
            wanted = (users["GID"],
                      users["server_ID"], users["GID"],
                      users["worker_ID"], users["GID"])
            if found != wanted:
                raise KeyError
        except KeyError:
            raise RuntimeError("failed to create accounts on host")
660
661
662
def setup_container_users():
    r"""
    Create the group and both accounts and let the server SSH to the worker.

    The server account gets an ed25519 key pair whose public part becomes
    the worker's ``authorized_keys``. A number of files and directories in
    the worker's home are created and then made immutable with ``chattr``.

    The effective UID is switched several times so that files are created
    with the intended owner; the order of these calls matters.
    """
    log.info("setting up users in the containter")
    check_call("addgroup --gid {GID} {group}")
    check_call("adduser --uid {server_ID} --ingroup {group} --gecos '' "
               "--disabled-password {server}")
    check_call("adduser --uid {worker_ID} --ingroup {group} --gecos '' "
               "--disabled-password {worker}")
    server_home = os.path.join("/home", users["server"])
    worker_home = os.path.join("/home", users["worker"])

    # rwxr-xr-x
    os.chmod(server_home, 0o755)
    os.chdir(server_home)
    os.setegid(users["GID"])
    os.seteuid(users["server_ID"])
    os.mkdir(".ssh", 0o700)
    check_call("ssh-keygen -t ed25519 -q -N '' -f .ssh/id_ed25519")

    os.chdir(worker_home)
    # Back to UID 0 first, then act as the worker.
    os.setuid(0)
    os.seteuid(users["worker_ID"])
    os.mkdir(".cache", 0o700)
    os.mkdir(".sage")
    os.mkdir(".ssh", 0o700)
    locked_paths = [
        ".cache/pip",
        ".sage/local",
        ".ssh",
        ".bash_logout",
        ".bash_profile",
        ".bashrc",
        ".profile",
    ]
    check_call("touch " + " ".join(locked_paths))
    os.setuid(0)
    public_key = os.path.join(server_home, ".ssh/id_ed25519.pub")
    shutil.copy2(public_key, ".ssh/authorized_keys")
    os.chown(".ssh/authorized_keys", users["worker_ID"], users["GID"])
    # Get the localhost in the known_hosts file.
    check_call("su -l {server} -c "
               "'ssh -q -oStrictHostKeyChecking=no {worker}@localhost whoami'")
    # Locking comes last: authorized_keys had to be written into .ssh first.
    for path in locked_paths:
        check_call("chattr -R +i " + path)
709
710
711
def become_server():
    r"""
    Switch the current process to the server account.

    Sets GID and UID, points ``HOME`` to the server's home directory and
    changes into it. ``MAKE`` is set to a parallel make using all CPUs
    unless it is already present in the environment.
    """
    os.setgid(users["GID"])
    os.setuid(users["server_ID"])
    home = os.path.join("/home", users["server"])
    os.environ["HOME"] = home
    os.chdir(home)
    if "MAKE" not in os.environ:
        os.environ["MAKE"] = "make -j{}".format(os.cpu_count())
720
721
722
def install_sage():
    r"""
    Build Sage as the server user.

    The sources are moved from ``github/sage`` to ``sage`` in the server's
    home directory and compiled there. Afterwards a short Sage session
    evaluates an Octave expression.
    """
    become_server()
    shutil.move("github/sage", ".")
    os.chdir("sage")
    log.info("compiling Sage")
    for step in ("./bootstrap", "./configure", "make"):
        check_call(step)
    session_input = (
        "\n"
        "# make appropriate octave directory\n"
        "octave.eval('1+2')\n"
        "quit\n"
    )
    communicate("./sage", session_input)
    log.info("successfully compiled Sage")
739
740
741
def install_packages():
    r"""
    Install optional Sage packages and pip packages into an existing Sage.

    Runs as the server user inside the ``sage`` directory and returns to its
    parent directory at the end.
    """
    become_server()
    os.chdir("sage")
    log.info("installing optional Sage packages")
    check_call("./sage -i -y " + " ".join(sage_optional_packages))
    log.info("installing pip packages")
    check_call("./sage -pip install --upgrade pip")
    numpy_ver = check_output(
        "./sage -c 'import numpy; print(numpy.__version__)'").strip()
    log.info(f"numpy version is expected to stay at {numpy_ver}")
    # Many packages may downgrade numpy, so every single installation also
    # requests numpy at the version shipped with Sage.
    pinned_install = f"./sage -pip install numpy=={numpy_ver} "
    for package in python_packages:
        check_call(pinned_install + package)
    os.chdir("..")
757
758
759
def install_sagecell():
    r"""
    Build SageCell as the server user; Sage and packages must be in place.

    The sources are moved from ``github/sagecell`` to ``sagecell`` and the
    remaining ``github`` directory is deleted. ``provider_html`` is written
    to ``templates/provider.html`` before ``make -B`` runs in Sage's shell.
    """
    become_server()
    log.info("compiling SageCell")
    shutil.move("github/sagecell", ".")
    shutil.rmtree("github")
    os.chdir("sagecell")
    with open("templates/provider.html", "w", encoding="utf-8") as template:
        template.write(provider_html)
    check_call("../sage/sage -sh -c 'make -B'")
    log.info("successfully compiled SageCell")
772
773
774
def install_config_files():
    r"""
    Copy configuration files into place, filling in user and group names.

    ``config.py`` goes to the top of the SageCell tree. Every file found in
    a subdirectory of ``sagecell/contrib/vm/compute_node`` is copied to the
    same path relative to ``/``. In each copy ``{key}`` is replaced with the
    corresponding value from ``users``. Finally the ``sagecell`` service is
    enabled.
    """
    log.info("copying configuration files")
    source_dir = os.path.join("/home", users["server"],
                              "sagecell/contrib/vm/compute_node")
    os.chdir(source_dir)

    def adjust_names(file):
        with open(file) as f:
            text = f.read()
        for key, value in users.items():
            text = text.replace("{" + key + "}", str(value))
        with open(file, "w") as f:
            f.write(text)

    # shutil.copy returns the path of the new file.
    adjust_names(shutil.copy("config.py", "../../.."))
    for root, _, files in os.walk("."):
        if root != ".":
            for file in files:
                relative = os.path.join(root, file)
                # Dropping the leading "." turns "./etc/x" into "/etc/x".
                adjust_names(shutil.copy(relative, relative[1:]))
    check_call("systemctl enable sagecell")
798
799
800
class SCLXC(object):
    r"""
    Wrapper for lxc.Container automatically performing prerequisite operations.

    ``name`` is the container name and ``c`` the underlying ``lxc.Container``.
    Methods start, create or update containers on demand, so callers can just
    ask for the end result.
    """

    def __init__(self, name):
        self.name = name
        self.c = lxc.Container(self.name)

    def clone(self, clone_name, autostart=False, update=True):
        r"""
        Clone self, create a base container and destroy old clone if necessary.

        ``self`` is created if it does not exist, optionally updated, and shut
        down before a snapshot clone is made. With ``autostart`` the clone is
        configured to start automatically after ``start_delay`` seconds.
        SageCell log files copied from ``self`` are removed from the clone.

        Return the clone as a new ``SCLXC``. Raise ``RuntimeError`` if cloning
        fails.
        """
        if not self.is_defined():
            self.create()
        if update:
            self.update()
        self.shutdown()
        SCLXC(clone_name).destroy()
        log.info("cloning %s to %s", self.name, clone_name)
        if not self.c.clone(clone_name, flags=lxc.LXC_CLONE_SNAPSHOT):
            raise RuntimeError("failed to clone " + self.name)
        clone = SCLXC(clone_name)
        if autostart:
            clone.c.set_config_item("lxc.start.auto", "1")
            clone.c.set_config_item("lxc.start.delay", str(start_delay))
        # NOTE(review): assumed to apply to every clone, not only to
        # autostarted ones - confirm the nesting against the repository.
        clone.c.set_config_item("lxc.net.0.hwaddr",
            "02:00:" + ":".join(["%02x" % random.randint(0, 255) for _ in range(4)]))
        clone.c.save_config()
        logdir = clone.c.get_config_item("lxc.rootfs.path") + "/var/log/"
        for logfile in ["sagecell.log", "sagecell-console.log"]:
            if os.path.exists(logdir + logfile):
                os.remove(logdir + logfile)
        return clone

    def create(self):
        r"""
        Create this container based on the previous one, destroy old one if necessary.

        It is the logical sequence of creating a fully configured SageMathCell container
        from scratch, but broken into several steps. Previous steps for the current container
        are performed if necessary, based on names. Random name is assumed to be a copy
        of "the end result".
        """
        self.destroy()
        log.info("creating %s", self.name)
        if self.name == lxcn_base:
            # From scratch
            # Try to automatically pick up proxy from host
            os.environ["HTTP_PROXY"] = "apt"
            try:
                if not self.c.create(
                    "download", 0,
                    {"dist": "ubuntu", "release": "noble", "arch": "amd64"},
                    "btrfs"):
                    raise RuntimeError("failed to create " + self.name)
            finally:
                # The setting is meant for the creation step only.
                os.environ.pop("HTTP_PROXY")

            self.update()
            # Need to preseed or there will be a dialog
            self.inside(communicate, "/usr/bin/debconf-set-selections",
                        "tmpreaper tmpreaper/readsecurity note")
            log.info("installing packages")
            self.inside("apt install -y " + " ".join(system_packages))
            # Relies on perl, so has to be after package installation
            self.inside("/usr/sbin/deluser ubuntu --remove-home")
            log.info("installing R packages")
            for package in R_packages:
                self.inside(f"""Rscript -e 'install.packages("{package}")'""")
                # Loading the package right away - presumably to fail here
                # if the installation did not actually succeed.
                self.inside(f"""Rscript -e 'library("{package}")'""")
        elif self.name == lxcn_sage:
            self.c = SCLXC(lxcn_base).clone(lxcn_sage).c
            create_host_users()
            self.inside(setup_container_users)
            # FIXME: work with temp folders properly
            self.inside(os.mkdir, "/tmp/sagecell", 0o730)
            self.inside(os.chown, "/tmp/sagecell",
                        users["server_ID"], users["GID"])
            # NOTE(review): this leaves the setgid bit as the only mode bit,
            # discarding 0o730 from above - confirm that it is intended.
            self.inside(os.chmod, "/tmp/sagecell", stat.S_ISGID)
            # Copy repositories into container
            update_repositories()
            log.info("uploading repositories to %s", self.name)
            root = self.c.get_config_item("lxc.rootfs.path")
            home = os.path.join(root, "home", users["server"])
            dot_cache = os.path.join(home, ".cache")
            shutil.copytree("github", os.path.join(home, "github"), symlinks=True)
            self.inside("chown -R {server}:{group} /home/{server}/github")
            try:
                shutil.copytree("dot_cache", dot_cache, symlinks=True)
                self.inside("chown -R {server}:{group} /home/{server}/.cache")
            except FileNotFoundError:
                # No cache saved by a previous precell build.
                pass
            self.inside(install_sage)
        elif self.name == lxcn_precell:
            self.c = SCLXC(lxcn_sage).clone(lxcn_precell).c
            self.inside(install_packages)
            # Remove old versions of packages
            root = self.c.get_config_item("lxc.rootfs.path")
            home = os.path.join(root, "home", users["server"])
            dot_cache = os.path.join(home, ".cache")
            upstream = os.path.join(home, "sage/upstream")
            # Files grouped by the part of the name before the first "-".
            packages = dict()
            for f in os.listdir(upstream):
                filename = os.path.join(upstream, f)
                name = f.split("-", 1)[0]
                packages.setdefault(name, []).append(
                    (os.stat(filename).st_mtime, filename))
            for package in packages.values():
                # Keep the most recently modified file of each group.
                package.sort()
                package.pop()
                for _, filename in package:
                    os.remove(filename)
            # Save downloaded tarballs and caches on the host for later builds.
            try:
                shutil.rmtree("github/sage/upstream")
            except FileNotFoundError:
                pass
            shutil.move(upstream, "github/sage/upstream")
            try:
                shutil.rmtree("dot_cache")
            except FileNotFoundError:
                pass
            shutil.copytree(dot_cache, "dot_cache", symlinks=True)
        elif self.name == lxcn_sagecell:
            self.c = SCLXC(lxcn_precell).clone(lxcn_sagecell).c
            self.inside("su -c 'git -C /home/{server}/github/sagecell pull' {server}")
            self.inside(install_sagecell)
            self.inside(install_config_files)
            self.c.set_config_item("lxc.cgroup.memory.limit_in_bytes", "8G")
            self.c.save_config()
            self.shutdown()
            # Let first-time tasks run and complete.
            self.start()
            timer_delay(start_delay + 300)
        else:
            # If the name is not recognized as some intermediate step, we assume
            # that a copy of the fully built SageMathCell is desired
            self.c = SCLXC(lxcn_sagecell).clone(self.name).c

    def destroy(self):
        r"""
        Stop and destroy self if it exists.

        Raise ``RuntimeError`` if the container cannot be stopped or destroyed.
        """
        if self.c.defined:
            log.info("destroying %s", self.name)
            if self.c.running and not self.c.stop():
                raise RuntimeError("failed to stop " + self.name)
            if not self.c.destroy():
                raise RuntimeError("failed to destroy " + self.name)
            # Fresh handle for the (now undefined) container of the same name.
            self.c = lxc.Container(self.name)
        else:
            log.debug("not destroying %s since it is not defined", self.name)

    def inside(self, command, *args):
        r"""
        Run a function or a system command inside the container.

        A string ``command`` is expanded with ``users`` and executed as a
        system command; ``args`` are not used in this case. Otherwise
        ``command`` is called with ``args``, where string arguments are
        expanded with ``users`` as well.

        The container is started if necessary. Raise ``RuntimeError`` if the
        command or function does not finish successfully.
        """
        self.start()
        if isinstance(command, str):
            command = command.format_map(users)
            log.debug("executing '%s' in %s", command, self.name)
            if self.c.attach_wait(lxc.attach_run_command,
                                  shlex.split(command)):
                raise RuntimeError("failed to execute '{}'".format(command))
        else:
            args = [arg.format_map(users) if isinstance(arg, str) else arg
                    for arg in args]

            def wrapper():
                command(*args)
                sys.exit()  # Otherwise attach_wait returns -1

            log.debug("executing %s with arguments %s in %s",
                      command, args, self.name)
            if self.c.attach_wait(wrapper):
                raise RuntimeError("failed to execute {} with arguments {}"
                                   .format(command, args))

    def ip(self):
        r"""
        Return the first IP address of the container, starting it if necessary.
        """
        self.start()
        return self.c.get_ips()[0]

    def is_defined(self):
        r"""
        Return whether the container exists.
        """
        return self.c.defined

    def save_logs(self):
        r"""
        Archive ``/var/log/sagecell.log`` of the container on the host.

        The archive is put into ``container_logs`` in the current directory
        and is named after the leading characters of the first and the last
        log lines (normally their timestamps) and the container name.
        Nothing is done if the log file is missing or empty.
        """
        stamp_length = len("2014-12-28 15:00:02,315")
        root = self.c.get_config_item("lxc.rootfs.path")
        logdir = os.path.join(root, "var", "log")
        logname = "sagecell.log"
        fullname = os.path.join(logdir, logname)
        if not os.path.exists(fullname):
            return
        with open(fullname, "rb") as f:
            first = f.read(stamp_length)
            # Only the tail is read to find the last line of a large file.
            f.seek(0, os.SEEK_END)
            f.seek(max(f.tell() - 2**16, 0))
            tail = f.readlines()
        if not tail:
            # An empty log has no lines to name the archive after.
            return
        # A byte slice may cut a multi-byte character, so decode leniently.
        start = first.decode(errors="replace")
        end = tail[-1][:stamp_length].decode(errors="replace")
        archname = "container_logs/%s to %s on %s" % (start, end, self.name)
        os.makedirs("container_logs", exist_ok=True)
        log.info("saving %s", archname)
        shutil.make_archive(archname, "bztar", logdir, logname)

    def shutdown(self):
        r"""
        Shut the container down if it is running.

        Raise ``RuntimeError`` if it is still running after ``timeout``.
        """
        if self.c.running and not self.c.shutdown(timeout):
            raise RuntimeError("failed to shutdown " + self.name)

    def start(self):
        r"""
        Make sure that ``self`` is running and network works.

        Raise ``RuntimeError`` if the container does not start or gets no IP
        address within ``timeout``.
        """
        if not self.c.running and not self.c.start():
            raise RuntimeError("failed to start " + self.name)
        if not self.c.get_ips(timeout=timeout):
            raise RuntimeError("failed to start network in " + self.name)

    def update(self):
        r"""
        Update OS packages in ``self``.

        If the container does not exist, it is created instead.
        """
        if self.is_defined():
            log.info("updating packages in %s", self.name)
            try:
                self.inside("apt update")
            except RuntimeError:
                # We get here if /var/lib/dpkg/lock is locked:
                # let's wait a bit and try again once
                timer_delay(timeout)
                self.inside("apt update")
            self.inside("apt full-upgrade -y --auto-remove")
        else:
            self.create()
1032
1033
1034
def restart_haproxy(names, backup_names=()):
    r"""
    Regenerate HA-Proxy configuration file and restart it.

    ``names`` are the containers serving requests on port 80 and
    ``backup_names`` are added to the same backends as backup servers. If
    ``names`` is empty, this section is omitted completely. A separate
    section on port 8888 is generated for the tester container if it exists.

    Containers are started if necessary to find out their IP addresses.
    HA-Proxy is reloaded, or started if reloading fails.
    """
    log.debug("generating HAProxy configuration file")
    hostname = check_output("hostname").strip()

    def fill(template, values):
        # Plain replacement leaves the literal braces of HAProxy ACLs alone.
        for key, value in values.items():
            template = template.replace("{" + key + "}", str(value))
        return template

    def server_id(index, node):
        # Names ending with "A" get IDs from 1, all others continue after
        # number_of_compute_nodes, so that IDs of the two sets do not clash.
        first = 1 if node.endswith("A") else number_of_compute_nodes + 1
        return index + first

    lines = [HAProxy_header]
    if names:
        section = fill(HAProxy_section, {"port": 80,
                                         "suffix": "",
                                         "peer_port": 1080,
                                         "hostname": hostname})
        for line in section.splitlines():
            if "{node}" not in line:
                lines.append(line)
                continue
            # Server lines are repeated for every container.
            for i, n in enumerate(names):
                lines.append(line.format(
                    node=n, ip=SCLXC(n).ip(), id=server_id(i, n)))
            backup_line = line + " backup"
            for i, n in enumerate(backup_names):
                lines.append(backup_line.format(
                    node=n, ip=SCLXC(n).ip(), id=server_id(i, n)))
    tester = SCLXC(lxcn_tester)
    if tester.is_defined():
        lines.append(fill(HAProxy_section, {"port": 8888,
                                            "suffix": "_test",
                                            "node": lxcn_tester,
                                            "ip": tester.ip(),
                                            "id": 1,
                                            "peer_port": 1088,
                                            "hostname": hostname}))
    lines.append(HAProxy_stats)
    with open("/etc/haproxy/haproxy.cfg", "w") as f:
        f.write("\n".join(lines))
    try:
        check_call("systemctl reload haproxy")
    except subprocess.CalledProcessError:
        check_call("systemctl start haproxy")
1078
1079
1080
# Everything from DEBUG up goes to container_manager.log in the current
# directory; INFO and above is also shown on standard output in a more
# eye-catching format.
logging.config.dictConfig({
    "version": 1,
    "formatters": {
        "file": {
            "format": "%(asctime)s %(levelname)s: %(message)s",
        },
        "console": {
            "format": "########## %(asctime)s %(levelname)s: %(message)s ##########",
        },
    },
    "handlers": {
        "file": {
            "class": "logging.FileHandler",
            "formatter": "file",
            "filename": "container_manager.log",
            "level": "DEBUG",
        },
        "console": {
            "class": "logging.StreamHandler",
            "formatter": "console",
            "stream": "ext://sys.stdout",
            "level": "INFO",
        },
    },
    "root": {
        "level": "DEBUG",
        "handlers": ["file", "console"],
    },
})
log = logging.getLogger(__name__)
1103
1104
# Command line interface: every option is a simple on/off switch.
parser = argparse.ArgumentParser(
    description="manage SageCell LXC containers",
    epilog="""
Missing necessary containers are always created automatically.

This script always overwrites system-wide HA-proxy configuration file and
restarts HA-Proxy to resolve container names to new IP addresses.""")
_switch = dict(action="store_true")
parser.add_argument("--savemaster",
                    help="save existing master container", **_switch)
parser.add_argument("-b", "--base",
                    help="rebuild 'OS and standard packages' container",
                    **_switch)
parser.add_argument("-s", "--sage",
                    help="rebuild Sage container", **_switch)
parser.add_argument("-p", "--precell",
                    help="rebuild container with extra packages", **_switch)
# Rebuilding and restoring the master container exclude each other.
group = parser.add_mutually_exclusive_group()
group.add_argument("-m", "--master",
                   help="rebuild 'Sage and SageCell' container", **_switch)
group.add_argument("--restoremaster",
                   help="restore previously saved master container",
                   **_switch)
parser.add_argument("-t", "--tester",
                    help="rebuild 'testing' container", **_switch)
parser.add_argument("--deploy",
                    help="rotate deployed containers based on current master",
                    **_switch)
parser.add_argument("--nodelay",
                    help="don't wait for old containers to be out of use",
                    **_switch)
args = parser.parse_args()
1130
1131
# Do it only once and let users change it later.
if not os.path.exists("/etc/security/limits.d/sagecell.conf"):
    log.info("setting up security limits configuration file")
    with open("/etc/security/limits.d/sagecell.conf", "w") as f:
        f.write(limits_conf)
    log.info("Finish this session and start a new one for system configuration"
        " changes to take effect.")
    # sys.exit() rather than exit(): the latter is an interactive helper
    # provided by the site module and is not meant for use in programs.
    sys.exit()
# The rsyslog configuration is also written only if it is not there yet.
if not os.path.exists("/etc/rsyslog.d/sagecell.conf"):
    log.info("setting up rsyslog configuration file")
    with open("/etc/rsyslog.d/sagecell.conf", "w") as f:
        f.write(rsyslog_conf)
    check_call("systemctl restart rsyslog")
1144
1145
# Main chain: base -- sage -- precell -- (sagecell, backup) -- sc-NA/sc-NB
# Requested containers are rebuilt in this fixed order. The backup comes
# first, i.e. it is made before the master may get rebuilt.
rebuild_requests = (
    (args.savemaster, lxcn_backup),
    (args.base, lxcn_base),
    (args.sage, lxcn_sage),
    (args.precell, lxcn_precell),
    (args.master, lxcn_sagecell),
)
for requested, container_name in rebuild_requests:
    if requested:
        SCLXC(container_name).create()
if args.restoremaster:
    SCLXC(lxcn_backup).clone(lxcn_sagecell)

# Autostart containers: tester and deployed nodes.
if args.tester:
    tester_clone = SCLXC(lxcn_sagecell).clone(lxcn_tester, autostart=True)
    tester_clone.start()
1162
1163
# Deployed nodes exist as two alternating sets, "A" and "B". The set whose
# containers are all defined is in use; a deployment builds the other one.
A_names = [lxcn_prefix + str(n) + "A" for n in range(number_of_compute_nodes)]
B_names = [lxcn_prefix + str(n) + "B" for n in range(number_of_compute_nodes)]
for candidate, other in ((A_names, B_names), (B_names, A_names)):
    if all(SCLXC(n).is_defined() for n in candidate):
        up_names, old_names = candidate, other
        break
else:
    # No complete set: nothing is up, a deployment will create the A set.
    up_names, old_names = [], A_names

if args.deploy:
    sagecell = SCLXC(lxcn_sagecell)
    sagecell.update()
    # The sets change their roles.
    up_names, old_names = old_names, up_names
    for n in up_names:
        new_node = sagecell.clone(n, autostart=True, update=False)
        new_node.start()
    log.info("waiting for new containers to fully initialize...")
    timer_delay(start_delay)
    old_nodes = [SCLXC(n) for n in old_names]
    if old_nodes and not args.nodelay:
        # HAProxy processes running before the reload below.
        old_haproxy = []
        for p in psutil.process_iter():
            try:
                if p.username() == 'haproxy':
                    old_haproxy.append(p)
            except psutil.NoSuchProcess:
                pass

        def test():
            # True while any of the old HAProxy processes is still alive.
            gone, alive = psutil.wait_procs(old_haproxy, timeout=1)
            return alive

        # Old containers stay available as backup servers for a while.
        restart_haproxy(up_names, old_names)
        log.info("waiting for users to stop working with old containers...")
        timer_delay(deploy_delay, test)

restart_haproxy(up_names)

if args.deploy:
    for node in old_nodes:
        node.save_logs()
        node.destroy()
1205
1206