#!/usr/bin/env python3

#*****************************************************************************
# This is the sage monitor *daemon*, which cleans up after Sage.
# Some things that it cleans up:
#   * $DOT_SAGE/temp/HOSTNAME/pid directories
#   * Processes that Sage spawns.  If a copy of Sage isn't
#     running, then any process it spawned should have its
#     process group killed
#*****************************************************************************
#       Copyright (C) 2005, 2006, 2007 William Stein <[email protected]>
#
#  Distributed under the terms of the GNU General Public License (GPL)
#  as published by the Free Software Foundation; either version 2 of
#  the License, or (at your option) any later version.
#                  http://www.gnu.org/licenses/
#*****************************************************************************

import atexit
import errno
import glob
import logging
import os
import shutil
import signal
import socket
import sys
import time

HOSTNAME = os.environ.get('HOSTNAME', socket.gethostname())
# DOT_SAGE is required: a missing variable raises KeyError at import time
# rather than letting the cleaner guess which directory to delete from.
DOT_SAGE = os.environ['DOT_SAGE']

SAGE_TMP_ROOT = os.path.join(DOT_SAGE, 'temp', HOSTNAME)
logfile = os.path.join(SAGE_TMP_ROOT, 'cleaner.log')
pidfile = os.path.join(SAGE_TMP_ROOT, 'cleaner.pid')
# pidfile used by earlier versions of sage-cleaner (before #15457)
old_pidfile = os.path.join(DOT_SAGE, 'temp', 'cleaner-%s.pid' % HOSTNAME)

logger = logging.getLogger(__name__)


def mkdir_p(path):
    """
    A "mkdir -p" command that makes intermediate directories if necessary.

    An already existing directory is not an error; an ``OSError`` is
    raised if ``path`` cannot be created or exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)


def rm_rf(file_or_path):
    """
    Force recursive delete, ignoring errors.

    The path is first unlinked as a plain file.  If that fails because
    it is a directory (reported as ``EISDIR`` or ``EPERM``), the whole
    tree is removed instead.  Any other failure is silently ignored.
    """
    try:
        os.unlink(file_or_path)
    except OSError as e:
        if e.errno in (errno.EISDIR, errno.EPERM):
            shutil.rmtree(file_or_path, ignore_errors=True)


def is_running(pid):
    """
    Return True if and only if there is a process with id pid running.

    The check sends signal 0 to ``pid``.  Any ``OSError`` raised by that
    probe (including a permission error) is reported as "not running".
    """
    try:
        os.kill(pid, 0)
        return True
    except OSError:
        return False


def cleanup():
    """
    Clean up after every Sage process that is no longer running.

    Each entry of ``SAGE_TMP_ROOT`` other than the cleaner's own pid and
    log files is expected to be a directory named after the PID of a Sage
    process; anything else is deleted.  For each PID that is not running
    anymore, its spawned jobs are killed and its directory is removed
    (the directory is kept for a later retry if some job survived).

    Return the number of PID entries that were examined.
    """
    entries = os.listdir(SAGE_TMP_ROOT)
    own_files = ('cleaner.pid', 'cleaner.log')
    if not all(name in entries for name in own_files):
        logger.error('No cleaner pid/log in SAGE_TMP_ROOT')
    # Filter both names independently, so that a missing pidfile can
    # never cause our own logfile to be treated as a stray file.
    tmp_dirs = [entry for entry in entries if entry not in own_files]

    # Convert strings to integers
    pid_list = []
    for dir_entry in tmp_dirs:
        try:
            pid_list.append(int(dir_entry))
        except ValueError:
            badfile = os.path.join(SAGE_TMP_ROOT, dir_entry)
            logger.warning('File %s must not be in SAGE_TMP_ROOT, deleting', badfile)
            rm_rf(badfile)

    logger.info("Checking PIDs %s", pid_list)
    for parent_pid in pid_list:
        try:
            if is_running(parent_pid):
                continue
            logger.info("Process %s is no longer running, so we clean up", parent_pid)
            dir_name = os.path.join(SAGE_TMP_ROOT, str(parent_pid))
            spawned_processes = os.path.join(dir_name, 'spawned_processes')
            if not os.path.isfile(spawned_processes) \
                    or kill_spawned_jobs(spawned_processes, parent_pid):
                logger.info('Deleting %s', dir_name)
                rm_rf(dir_name)
        except Exception:
            # One broken entry must not stop the cleaning of the others.
            logger.exception("Exception while cleaning up PID %s:", parent_pid)

    return len(pid_list)


def cleanup_cruft():
    """
    Remove directories leftover from improper shutdown.

    Every directory found directly inside ``SAGE_TMP_ROOT`` is deleted;
    plain files are left alone.
    """
    for dir_entry in os.listdir(SAGE_TMP_ROOT):
        baddir = os.path.join(SAGE_TMP_ROOT, dir_entry)
        if os.path.isdir(baddir):
            logger.warning('Removing old directory %s from SAGE_TMP_ROOT', baddir)
            rm_rf(baddir)


def kill_spawned_jobs(jobfile, parent_pid):
    """
    Kill the process groups of all jobs listed in ``jobfile``.

    Each line of ``jobfile`` has the form ``PID COMMAND``; lines that do
    not parse are logged and skipped.  ``parent_pid`` is only used for
    log messages.

    Return True if none of the listed PIDs is running afterwards, and
    False if at least one job survived (so the caller can retry later).
    """
    logger.info("Killing %s's spawned jobs", parent_pid)
    killed_them_all = True
    with open(jobfile) as f:
        jobs = f.readlines()
    for job in jobs:
        try:
            pid, cmd = job.strip().split(' ', 1)
            pid = int(pid)
        except ValueError:
            logger.error("Exception while processing job %r from %s, ignoring", job, jobfile)
            continue
        logger.info("--> Killing %r with PID %s and parent PID %s", cmd, pid, parent_pid)
        try:
            pgrp = os.getpgid(pid)
            logger.info("--> Killing process group %s", pgrp)
            os.killpg(pgrp, signal.SIGKILL)
        except OSError:
            # Best effort: the is_running() check below decides the outcome.
            pass
        if is_running(pid):
            logger.error("--> Failed to kill %s", pid)
            # try again later
            killed_them_all = False

    return killed_them_all


def exit_handler():
    """
    Remove the cleaner's own pidfile and logfile (registered with atexit).
    """
    logger.info("Removing pidfile and logfile")
    rm_rf(pidfile)
    rm_rf(logfile)


def _read_pid(path):
    """
    Return the integer PID stored in the file ``path``, or ``None`` if
    the file cannot be read or does not contain an integer.
    """
    try:
        with open(path) as f:
            return int(f.read())
    except (OSError, ValueError):
        return None


def setup_daemon():
    """
    Prepare this process to run as the cleaner daemon.

    Creates ``SAGE_TMP_ROOT``, configures logging to stderr (DEBUG) and
    to the logfile (INFO), exits with status 0 if another cleaner (old
    or new style pidfile) is already running, and otherwise writes our
    own pidfile and registers :func:`exit_handler`.
    """
    mkdir_p(SAGE_TMP_ROOT)

    logger.setLevel(logging.DEBUG)
    h = logging.StreamHandler()  # log to stderr
    h.setLevel(logging.DEBUG)
    logger.addHandler(h)
    h = logging.FileHandler(filename=logfile)  # log to logfile
    h.setLevel(logging.INFO)
    logger.addHandler(h)

    logger.info("SAGE_TMP_ROOT = %s", SAGE_TMP_ROOT)

    # The old pidfile is checked first (to allow old and new
    # sage-cleaners to peacefully coexist), then the current one.
    for label, path in (("old sage-cleaner", old_pidfile),
                        ("sage-cleaner", pidfile)):
        pid = _read_pid(path)
        if pid is not None and is_running(pid):
            logger.info("%s is already running with PID %s, exiting", label, pid)
            # exit_handler is not registered yet, so the pidfile and
            # logfile of the running cleaner are left untouched.
            sys.exit(0)

    with open(pidfile, 'w') as f:
        f.write(str(os.getpid()))  # initialize pid file
    atexit.register(exit_handler)


def fix_old_mistakes():
    """
    Experience is simply the name we give our mistakes.

    Delete temporary directories and the pidfile that earlier versions
    created in locations which are no longer used.
    """
    # inconsistently escaped hyphens with underscores (https://github.com/sagemath/sage/issues/14055)
    wrong_hostname = HOSTNAME.replace('-', '_').replace('/', '_').replace('\\', '_')
    wrong_sage_tmp_root = os.path.join(DOT_SAGE, 'temp', wrong_hostname)
    if wrong_sage_tmp_root != SAGE_TMP_ROOT and os.path.exists(wrong_sage_tmp_root):
        logger.warning('Deleting invalid temp dir %s', wrong_sage_tmp_root)
        rm_rf(wrong_sage_tmp_root)

    # SAGE_TMP in DOT_SAGE/tmp instead of DOT_SAGE/temp
    old_root = glob.glob(os.path.join(DOT_SAGE, 'tmp', HOSTNAME + '-*'))
    if wrong_hostname != HOSTNAME:
        old_root += glob.glob(os.path.join(DOT_SAGE, 'tmp', wrong_hostname + '-*'))
    for old_tmp in old_root:
        logger.warning('Deleting invalid temp dir %s', old_tmp)
        rm_rf(old_tmp)

    # PID file before it was moved into SAGE_TMP_ROOT
    rm_rf(old_pidfile)


def main():
    """
    Run the cleaner daemon until no Sage directories are left to watch.

    The optional first command-line argument is the base number of
    seconds to sleep between two cleanup rounds (default: 10).
    """
    setup_daemon()
    fix_old_mistakes()
    logger.info("Starting sage-cleaner with PID %s", os.getpid())
    cleanup_cruft()

    if len(sys.argv) > 1:
        wait = int(sys.argv[1])
    else:
        wait = 10

    # Initial cleanup, ignore time
    running_sages = cleanup()
    cleanup_time = 0.0
    count = 1

    # In the first 10 iterations, continue anyway (even if there are
    # no Sage processes running) because it can happen that Sage is
    # not yet started.
    while count < 10 or running_sages > 0:
        # Time to wait = "wait" plus 20 times the time of last cleanup().
        # This ensures that sage-cleaner causes a load of at most 5%.
        time.sleep(wait + 20 * cleanup_time)
        count += 1
        t0 = time.time()
        running_sages = cleanup()
        cleanup_time = time.time() - t0
        logger.debug("cleanup() #%d took %.2fs", count, cleanup_time)

    logger.info("sage-cleaner is finished")


if __name__ == '__main__':
    main()