Path: blob/master/buildenv/jenkins/jobs/infrastructure/Cleanup-Nodes.groovy
23788 views
/*******************************************************************************
* Copyright (c) 2018, 2021 IBM Corp. and others
*
* This program and the accompanying materials are made available under
* the terms of the Eclipse Public License 2.0 which accompanies this
* distribution and is available at https://www.eclipse.org/legal/epl-2.0/
* or the Apache License, Version 2.0 which accompanies this distribution and
* is available at https://www.apache.org/licenses/LICENSE-2.0.
*
* This Source Code may also be made available under the following
* Secondary Licenses when the conditions for such availability set
* forth in the Eclipse Public License, v. 2.0 are satisfied: GNU
* General Public License, version 2 with the GNU Classpath
* Exception [1] and GNU General Public License, version 2 with the
* OpenJDK Assembly Exception [2].
*
* [1] https://www.gnu.org/software/classpath/license.html
* [2] http://openjdk.java.net/legal/assembly-exception.html
*
* SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0 OR LicenseRef-GPL-2.0 WITH Assembly-exception
*******************************************************************************/
// #10071 This is a Jenkins class
import hudson.slaves.OfflineCause
// Fallback values used whenever the matching job parameter is unset or empty.
defaultSetupLabel = 'worker'
defaultLabel = 'ci.role.build || ci.role.test'
defaultMode = 'cleanup'
defaultTime = '12'
defaultUnits = 'HOURS'
defaultReconnectTimeout = '20' //minutes
defaultSleepTime = '30' //seconds

// Label of the node that runs the setup stage (node discovery).
SETUP_LABEL = params.SETUP_LABEL ?: defaultSetupLabel

// Label expression selecting the nodes to clean up / sanitize.
LABEL = params.LABEL ?: defaultLabel

// expected MODE: cleanup | sanitize | all
// A comma-separated list is accepted as well. Whitespace is stripped BEFORE
// looking for 'all', so values such as ' all' or 'cleanup, all' are expanded
// instead of producing a mode list that matches no stage. A value that
// contains no mode at all (e.g. only whitespace) falls back to the default.
MODES = []
def requestedModes = params.MODE ? params.MODE.replaceAll("\\s", "").tokenize(',') : []
if (!requestedModes) {
    MODES.add(defaultMode)
} else if (requestedModes.contains('all')) {
    MODES.addAll(['cleanup', 'sanitize'])
} else {
    MODES = requestedModes
}

// Overall job timeout; the unit is upper-cased before being handed to timeout().
TIMEOUT_TIME = params.TIMEOUT_TIME ?: defaultTime
TIMEOUT_UNITS = params.TIMEOUT_UNITS ? params.TIMEOUT_UNITS.toUpperCase() : defaultUnits

// Optional Slack channel; notifications are skipped when it is empty.
SLACK_CHANNEL = params.SLACK_CHANNEL

// Minutes allowed for a sanitized node to disconnect and come back online.
RECONNECT_TIMEOUT = params.RECONNECT_TIMEOUT ?: defaultReconnectTimeout

// Seconds to wait between attempts at deleting stale workspaces.
SLEEP_TIME = params.SLEEP_TIME ?: defaultSleepTime

// Job parameter definitions (only consumed by the properties() call, which is
// currently commented out below).
// NOTE(review): Jenkins choice parameters normally take their default from the
// first entry of `choices`; confirm that `defaultValue` is honored here, and
// that 'all' is intentionally not offered as a choice.
PARAMETERS = [string(name: 'SETUP_LABEL', defaultValue: defaultSetupLabel),
    string(name: 'LABEL', defaultValue: defaultLabel),
    choice(name: 'MODE', choices: ['cleanup', 'sanitize'], defaultValue: defaultMode),
    string(name: 'TIMEOUT_TIME', defaultValue: defaultTime),
    string(name: 'TIMEOUT_UNITS', defaultValue: defaultUnits),
    string(name: 'RECONNECT_TIMEOUT', defaultValue: defaultReconnectTimeout),
    string(name: 'SLEEP_TIME', defaultValue: defaultSleepTime),
    string(name: 'SLACK_CHANNEL', defaultValue: '')]
/*
properties([buildDiscarder(logRotator(artifactDaysToKeepStr: '', artifactNumToKeepStr: '', daysToKeepStr: '', numToKeepStr: '5')),
pipelineTriggers([cron('''# Daily at 8am, 12pm, 4pm, 11pm (before nightly build)
0 8,12,16,23 * * *''')]),
parameters(PARAMETERS)])
*/
// State shared between the setup stage and the later phases:
//   jobs         - node name -> closure that runs the requested modes on that node
//   offlineNodes - node name -> offline cause, for nodes taken offline by a user
//   buildNodes   - names of nodes that are online or were disconnected by the system
jobs = [:]
offlineNodes = [:]
buildNodes = []

timeout(time: TIMEOUT_TIME.toInteger(), unit: TIMEOUT_UNITS) {
    timestamps {
        // Phase 1: on the setup node, walk every node matching LABEL and build
        // one job closure per online node.
        node(SETUP_LABEL) {
            try {
                // File/directory patterns left behind by tests; removed from /tmp
                // on every node (and from the workspace parent on Windows).
                def cleanDirs = ['NoEntryTest*.zip',
                    'auth*.login',
                    'tmp*',
                    'classes*',
                    'aci*',
                    'append*',
                    'mlib*',
                    'resource-*',
                    'openj9tr_resources*',
                    'testParentDir',
                    'jni-*',
                    'mauve',
                    'test*',
                    'blah-*.tmp',
                    'lines*.tmp',
                    'prefix*.json',
                    'sink*.tmp',
                    'source*.tmp',
                    'target*.tmp',
                    'sharedcacheapi',
                    'intermediateClassCreateTest',
                    'sh-np.*',
                    'xlc*',
                    'sh-np-*']

                for (aNode in jenkins.model.Jenkins.instance.getLabel(LABEL).getNodes()) {
                    def nodeName = aNode.getDisplayName()

                    if (aNode.toComputer().isOffline()) {
                        // skip offline nodes
                        def offlineCause = aNode.toComputer().getOfflineCause()
                        if (offlineCause instanceof OfflineCause.UserCause) {
                            // skip offline node disconnected by users
                            offlineNodes.put(nodeName, offlineCause.toString())
                        } else {
                            // cache nodes, will attempt to reconnect nodes disconnected by system later
                            buildNodes.add(nodeName)
                        }
                        continue
                    }

                    def nodeLabels = []
                    if (aNode.getLabelString()) {
                        nodeLabels.addAll(aNode.getLabelString().tokenize(' '))
                    }

                    buildNodes.add(nodeName)

                    // cache job
                    jobs[nodeName] = {
                        node(nodeName) {
                            if (MODES.contains('cleanup')) {
                                stage("${nodeName} - Cleanup Workspaces") {
                                    def buildWorkspace = "${env.WORKSPACE}"
                                    if (nodeLabels.contains('sw.os.windows')) {
                                        // convert windows path to unix path
                                        buildWorkspace = sh(script: "cygpath -u '${env.WORKSPACE}'", returnStdout: true).trim()
                                    }

                                    def cleanDirsStr = "/tmp/${cleanDirs.join(' /tmp/')}"
                                    if (nodeLabels.contains('sw.os.windows')) {
                                        // test resources
                                        cleanDirsStr += " ${buildWorkspace}/../../"
                                        cleanDirsStr += cleanDirs.join(" ${buildWorkspace}/../../")
                                        // shared classes cache
                                        cleanDirsStr += " ${buildWorkspace}/../../javasharedresources /tmp/javasharedresources /temp/javasharedresources"
                                    }

                                    // cleanup test results
                                    sh "rm -fr ${cleanDirsStr}"

                                    // Cleanup OSX shared memory and content in /cores
                                    if (nodeLabels.contains('sw.os.osx')) {
                                        retry(2) {
                                            sh """
                                                ipcs -ma
                                                ipcs -ma | awk '/^m / { if (\$9 == 0) { print \$2 }}' | xargs -n 1 ipcrm -m
                                                ipcs -ma
                                                du -sh /cores
                                                rm -rf /cores/*
                                                du -sh /cores
                                            """
                                        }
                                    }

                                    // Cleanup zOS datasets
                                    if (nodeLabels.contains('sw.os.zos')) {
                                        // Declared with `def`: this closure runs inside `parallel`, so an
                                        // undeclared variable would live in the script binding and be
                                        // shared (and overwritten) between concurrently cleaned nodes.
                                        def listcat = sh(script: "tso listcat | grep '${env.USER}' | cut -d. -f 2-", returnStdout: true).trim()
                                        // tokenize() yields no entries for an empty listing, so
                                        // `tso delete` is never invoked without a dataset name.
                                        for (dataset in listcat.tokenize('\n')) {
                                            sh "tso delete ${dataset}"
                                        }
                                    }

                                    // Clean up defunct pipelines workspaces
                                    def cleanWSDirs = get_other_workspaces("${buildWorkspace}/../")
                                    if (cleanWSDirs) {
                                        def cleanWSDirsStr = "${buildWorkspace}/../"
                                        cleanWSDirsStr += cleanWSDirs.join(" ${buildWorkspace}/../")

                                        // The `retry` step only re-runs its body when the body throws,
                                        // while `sh` with returnStatus: true reports a failure through
                                        // its return value. Attempts are therefore counted explicitly,
                                        // pausing SLEEP_TIME seconds before each new attempt.
                                        def maxAttempts = 3
                                        def retStatus = 0
                                        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
                                            if (retStatus != 0) {
                                                sleep(time: SLEEP_TIME.toInteger(), unit: 'SECONDS')
                                            }
                                            retStatus = sh(script: "rm -rf ${cleanWSDirsStr}", returnStatus: true)
                                            if (retStatus == 0) {
                                                break
                                            }
                                        }

                                        if (retStatus != 0) {
                                            throw new Exception("Could not delete old builds workspaces on ${nodeName}!")
                                        }
                                    }
                                }
                            }

                            if (MODES.contains('sanitize')) {
                                stage("${nodeName} - Sanitize node") {
                                    sanitize_node(nodeName)
                                }
                            }
                        }
                    }
                }

                if (offlineNodes) {
                    println("Offline nodes: ${offlineNodes.toString()}")
                }
            } catch (e) {
                // Report a failure of the setup stage, then let it fail the build.
                if (SLACK_CHANNEL) {
                    slackSend channel: SLACK_CHANNEL, color: 'danger', message: "Failed: ${env.JOB_NAME} #${env.BUILD_NUMBER} (<${env.BUILD_URL}|Open>)"
                }
                throw e
            } finally {
                cleanWs()
            }
        }

        // Phase 2: run the per-node jobs in parallel. Phase 3 (finally): when
        // sanitizing, try to reconnect every known node that is offline for a
        // reason other than a user action.
        try {
            parallel jobs
        } finally {
            if (MODES.contains('sanitize')) {
                // Slack-formatted links of the nodes that are still offline.
                // Named differently from the script-level `offlineNodes` map
                // to avoid shadowing it with a list.
                def stillOfflineNodes = []

                for (label in buildNodes.sort()) {
                    def knownNode = jenkins.model.Jenkins.instance.getNode(label)
                    if (knownNode) {
                        def aComputer = knownNode.toComputer()
                        if (aComputer.isOffline() && !(aComputer.getOfflineCause() instanceof OfflineCause.UserCause)) {
                            // reconnect node (asynchronously)
                            println("${label}: Reconnecting...")
                            aComputer.connect(true)

                            // NOTE(review): the reconnect above is asynchronous, so this check
                            // may run before the node had a chance to come back online.
                            if (aComputer.isOffline()) {
                                echo "Node: ${JENKINS_URL}${aComputer.getUrl()} - Status: offline - Cause: ${aComputer.getOfflineCause().toString()}"
                                stillOfflineNodes.add("<${JENKINS_URL}${aComputer.getUrl()}|${aComputer.getDisplayName()}>")
                            } else {
                                println("${label} is back online: ${aComputer.isOnline()}")
                            }
                        }
                    }
                }

                if (!stillOfflineNodes.isEmpty() && SLACK_CHANNEL) {
                    slackSend channel: SLACK_CHANNEL, color: 'warning', message: "${env.JOB_NAME} #${env.BUILD_NUMBER} (<${env.BUILD_URL}|Open>) left nodes offline: ${stillOfflineNodes.join(',')}"
                }
            }
        }
    }
}
/*
 * List the entries of workspaceDir, leaving out every entry whose name
 * starts with the current JOB_NAME (i.e. this build's own workspace).
 *
 * workspaceDir: directory to list; passed as-is to `ls` on the current node.
 * Returns the remaining entry names as a list (empty when nothing else is there).
 */
def get_other_workspaces(workspaceDir) {
    // `ls` is expected to succeed; a failure of the sh step aborts the caller
    def listing = sh(script: "ls ${workspaceDir}", returnStdout: true)
    def entries = listing.trim().tokenize(System.lineSeparator())

    // keep only the entries that do not belong to the running job
    return entries.findAll { entry -> !entry.startsWith(JOB_NAME) }
}
/*
 * Mark a Jenkins node temporarily offline, kill every process owned by the
 * agent user (or reboot the machine when the node is labelled sw.os.windows),
 * then disconnect and reconnect the node and wait until it is online again.
 *
 * nodeName: name of the node as known to Jenkins.
 * The disconnect/reconnect sequence is bounded by RECONNECT_TIMEOUT minutes.
 */
def sanitize_node(nodeName) {
    def targetNode = jenkins.model.Jenkins.instance.getNode(nodeName)
    def agent = targetNode.toComputer()

    agent.setTemporarilyOffline(true, null)

    try {
        def labels = targetNode.getLabelString()
        def cmd
        if (labels.contains("sw.os.windows")) {
            println("\t ${nodeName}: Rebooting...")
            // NB: user requires shut down permissions (SeShutdownPrivilege) or
            // belongs to the Administrators group
            cmd = "cmd.exe /K shutdown /f /r"
        } else {
            println("\t ${nodeName}: Killing all owned processes...")
            cmd = labels.contains("sw.os.zos") ? "ps -f -u ${env.USER} | awk '{print \$2}' | xargs kill -s KILL" : "kill -9 -1"
        }

        // run the kill/reboot command on the node
        sh(script: "${cmd}")
    } catch (e) {
        // a failure here is only logged; the reconnect below still runs
        println(e.getMessage())
    }

    // cycle the agent connection
    timeout(time: RECONNECT_TIMEOUT.toInteger(), unit: 'MINUTES') {
        println("\t ${nodeName}: Disconnecting...")
        agent.disconnect(null)
        agent.waitUntilOffline()

        println("\t ${nodeName}: Connecting...")
        agent.connect(false)
        agent.setTemporarilyOffline(false, null)
        agent.waitUntilOnline()

        println("\t ${nodeName} is back online: ${agent.isOnline()}")
    }
}