Path: blob/master/src/hotspot/os/linux/cgroupSubsystem_linux.cpp
40951 views
/*1* Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation.7*8* This code is distributed in the hope that it will be useful, but WITHOUT9* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or10* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License11* version 2 for more details (a copy is included in the LICENSE file that12* accompanied this code).13*14* You should have received a copy of the GNU General Public License version15* 2 along with this work; if not, write to the Free Software Foundation,16* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.17*18* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA19* or visit www.oracle.com if you need additional information or have any20* questions.21*22*/2324#include <string.h>25#include <math.h>26#include <errno.h>27#include "cgroupSubsystem_linux.hpp"28#include "cgroupV1Subsystem_linux.hpp"29#include "cgroupV2Subsystem_linux.hpp"30#include "logging/log.hpp"31#include "memory/allocation.hpp"32#include "runtime/globals.hpp"33#include "runtime/os.hpp"34#include "utilities/globalDefinitions.hpp"3536CgroupSubsystem* CgroupSubsystemFactory::create() {37CgroupV1MemoryController* memory = NULL;38CgroupV1Controller* cpuset = NULL;39CgroupV1Controller* cpu = NULL;40CgroupV1Controller* cpuacct = NULL;41CgroupInfo cg_infos[CG_INFO_LENGTH];42u1 cg_type_flags = INVALID_CGROUPS_GENERIC;43const char* proc_cgroups = "/proc/cgroups";44const char* proc_self_cgroup = "/proc/self/cgroup";45const char* proc_self_mountinfo = "/proc/self/mountinfo";4647bool valid_cgroup = determine_type(cg_infos, proc_cgroups, proc_self_cgroup, proc_self_mountinfo, &cg_type_flags);4849if (!valid_cgroup) {50// Could not detect cgroup type51return NULL;52}53assert(is_valid_cgroup(&cg_type_flags), "Expected valid cgroup type");5455if (is_cgroup_v2(&cg_type_flags)) {56// Cgroups v2 case, we have all the info we need.57// Construct the subsystem, free resources and return58// Note: any index in cg_infos will do as the path is the same for59// all controllers.60CgroupController* unified = new CgroupV2Controller(cg_infos[MEMORY_IDX]._mount_path, cg_infos[MEMORY_IDX]._cgroup_path);61log_debug(os, container)("Detected cgroups v2 unified hierarchy");62cleanup(cg_infos);63return new CgroupV2Subsystem(unified);64}6566/*67* Cgroup v1 case:68*69* Use info gathered previously from /proc/self/cgroup70* and map host mount point to71* local one via /proc/self/mountinfo content above72*73* Docker example:74* 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb04475*76* Host example:77* 5:memory:/user.slice78*79* Construct a path to the process specific memory and cpuset80* cgroup directory.81*82* For a container running under Docker from memory example above83* the paths would be:84*85* /sys/fs/cgroup/memory86*87* For a Host from memory example above the path would be:88*89* /sys/fs/cgroup/memory/user.slice90*91*/92assert(is_cgroup_v1(&cg_type_flags), "Cgroup v1 expected");93for (int i = 0; i < CG_INFO_LENGTH; i++) {94CgroupInfo info = cg_infos[i];95if (strcmp(info._name, "memory") == 0) {96memory = new CgroupV1MemoryController(info._root_mount_path, info._mount_path);97memory->set_subsystem_path(info._cgroup_path);98} else if (strcmp(info._name, "cpuset") == 0) {99cpuset = new CgroupV1Controller(info._root_mount_path, info._mount_path);100cpuset->set_subsystem_path(info._cgroup_path);101} else if (strcmp(info._name, "cpu") == 0) {102cpu = new CgroupV1Controller(info._root_mount_path, info._mount_path);103cpu->set_subsystem_path(info._cgroup_path);104} else if (strcmp(info._name, "cpuacct") == 0) {105cpuacct = new CgroupV1Controller(info._root_mount_path, info._mount_path);106cpuacct->set_subsystem_path(info._cgroup_path);107}108}109cleanup(cg_infos);110return new CgroupV1Subsystem(cpuset, cpu, cpuacct, memory);111}112113bool CgroupSubsystemFactory::determine_type(CgroupInfo* cg_infos,114const char* proc_cgroups,115const char* proc_self_cgroup,116const char* proc_self_mountinfo,117u1* flags) {118FILE *mntinfo = NULL;119FILE *cgroups = NULL;120FILE *cgroup = NULL;121char buf[MAXPATHLEN+1];122char *p;123bool is_cgroupsV2;124// true iff all controllers, memory, cpu, cpuset, cpuacct are enabled125// at the kernel level.126bool all_controllers_enabled;127128/*129* Read /proc/cgroups so as to be able to distinguish cgroups v2 vs cgroups v1.130*131* For cgroups v1 hierarchy (hybrid or legacy), cpu, cpuacct, cpuset, memory controllers132* must have non-zero for the hierarchy ID field and relevant controllers mounted.133* Conversely, for cgroups v2 (unified hierarchy), cpu, cpuacct, cpuset, memory134* controllers must have hierarchy ID 0 and the unified controller mounted.135*/136cgroups = fopen(proc_cgroups, "r");137if (cgroups == NULL) {138log_debug(os, container)("Can't open %s, %s",139proc_cgroups, os::strerror(errno));140*flags = INVALID_CGROUPS_GENERIC;141return false;142}143144while ((p = fgets(buf, MAXPATHLEN, cgroups)) != NULL) {145char name[MAXPATHLEN+1];146int hierarchy_id;147int enabled;148149// Format of /proc/cgroups documented via man 7 cgroups150if (sscanf(p, "%s %d %*d %d", name, &hierarchy_id, &enabled) != 3) {151continue;152}153if (strcmp(name, "memory") == 0) {154cg_infos[MEMORY_IDX]._name = os::strdup(name);155cg_infos[MEMORY_IDX]._hierarchy_id = hierarchy_id;156cg_infos[MEMORY_IDX]._enabled = (enabled == 1);157} else if (strcmp(name, "cpuset") == 0) {158cg_infos[CPUSET_IDX]._name = os::strdup(name);159cg_infos[CPUSET_IDX]._hierarchy_id = hierarchy_id;160cg_infos[CPUSET_IDX]._enabled = (enabled == 1);161} else if (strcmp(name, "cpu") == 0) {162cg_infos[CPU_IDX]._name = os::strdup(name);163cg_infos[CPU_IDX]._hierarchy_id = hierarchy_id;164cg_infos[CPU_IDX]._enabled = (enabled == 1);165} else if (strcmp(name, "cpuacct") == 0) {166cg_infos[CPUACCT_IDX]._name = os::strdup(name);167cg_infos[CPUACCT_IDX]._hierarchy_id = hierarchy_id;168cg_infos[CPUACCT_IDX]._enabled = (enabled == 1);169}170}171fclose(cgroups);172173is_cgroupsV2 = true;174all_controllers_enabled = true;175for (int i = 0; i < CG_INFO_LENGTH; i++) {176is_cgroupsV2 = is_cgroupsV2 && cg_infos[i]._hierarchy_id == 0;177all_controllers_enabled = all_controllers_enabled && cg_infos[i]._enabled;178}179180if (!all_controllers_enabled) {181// one or more controllers disabled, disable container support182log_debug(os, container)("One or more required controllers disabled at kernel level.");183cleanup(cg_infos);184*flags = INVALID_CGROUPS_GENERIC;185return false;186}187188/*189* Read /proc/self/cgroup and determine:190* - the cgroup path for cgroups v2 or191* - on a cgroups v1 system, collect info for mapping192* the host mount point to the local one via /proc/self/mountinfo below.193*/194cgroup = fopen(proc_self_cgroup, "r");195if (cgroup == NULL) {196log_debug(os, container)("Can't open %s, %s",197proc_self_cgroup, os::strerror(errno));198cleanup(cg_infos);199*flags = INVALID_CGROUPS_GENERIC;200return false;201}202203while ((p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) {204char *controllers;205char *token;206char *hierarchy_id_str;207int hierarchy_id;208char *cgroup_path;209210hierarchy_id_str = strsep(&p, ":");211hierarchy_id = atoi(hierarchy_id_str);212/* Get controllers and base */213controllers = strsep(&p, ":");214cgroup_path = strsep(&p, "\n");215216if (controllers == NULL) {217continue;218}219220while (!is_cgroupsV2 && (token = strsep(&controllers, ",")) != NULL) {221if (strcmp(token, "memory") == 0) {222assert(hierarchy_id == cg_infos[MEMORY_IDX]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");223cg_infos[MEMORY_IDX]._cgroup_path = os::strdup(cgroup_path);224} else if (strcmp(token, "cpuset") == 0) {225assert(hierarchy_id == cg_infos[CPUSET_IDX]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");226cg_infos[CPUSET_IDX]._cgroup_path = os::strdup(cgroup_path);227} else if (strcmp(token, "cpu") == 0) {228assert(hierarchy_id == cg_infos[CPU_IDX]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");229cg_infos[CPU_IDX]._cgroup_path = os::strdup(cgroup_path);230} else if (strcmp(token, "cpuacct") == 0) {231assert(hierarchy_id == cg_infos[CPUACCT_IDX]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");232cg_infos[CPUACCT_IDX]._cgroup_path = os::strdup(cgroup_path);233}234}235if (is_cgroupsV2) {236for (int i = 0; i < CG_INFO_LENGTH; i++) {237cg_infos[i]._cgroup_path = os::strdup(cgroup_path);238}239}240}241fclose(cgroup);242243// Find various mount points by reading /proc/self/mountinfo244// mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt245mntinfo = fopen(proc_self_mountinfo, "r");246if (mntinfo == NULL) {247log_debug(os, container)("Can't open %s, %s",248proc_self_mountinfo, os::strerror(errno));249cleanup(cg_infos);250*flags = INVALID_CGROUPS_GENERIC;251return false;252}253254bool cgroupv2_mount_point_found = false;255bool any_cgroup_mounts_found = false;256while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {257char tmp_mount_point[MAXPATHLEN+1];258char tmp_fs_type[MAXPATHLEN+1];259char tmproot[MAXPATHLEN+1];260char tmpmount[MAXPATHLEN+1];261char tmpcgroups[MAXPATHLEN+1];262char *cptr = tmpcgroups;263char *token;264265// Cgroup v2 relevant info. We only look for the _mount_path iff is_cgroupsV2 so266// as to avoid memory stomping of the _mount_path pointer later on in the cgroup v1267// block in the hybrid case.268//269if (is_cgroupsV2 && sscanf(p, "%*d %*d %*d:%*d %*s %s %*[^-]- %s %*s %*s", tmp_mount_point, tmp_fs_type) == 2) {270// we likely have an early match return (e.g. cgroup fs match), be sure we have cgroup2 as fstype271if (!cgroupv2_mount_point_found && strcmp("cgroup2", tmp_fs_type) == 0) {272cgroupv2_mount_point_found = true;273any_cgroup_mounts_found = true;274for (int i = 0; i < CG_INFO_LENGTH; i++) {275assert(cg_infos[i]._mount_path == NULL, "_mount_path memory stomping");276cg_infos[i]._mount_path = os::strdup(tmp_mount_point);277}278}279}280281/* Cgroup v1 relevant info282*283* Find the cgroup mount point for memory, cpuset, cpu, cpuacct284*285* Example for docker:286* 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory287*288* Example for host:289* 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory290*/291if (sscanf(p, "%*d %*d %*d:%*d %s %s %*[^-]- %s %*s %s", tmproot, tmpmount, tmp_fs_type, tmpcgroups) == 4) {292if (strcmp("cgroup", tmp_fs_type) != 0) {293// Skip cgroup2 fs lines on hybrid or unified hierarchy.294continue;295}296while ((token = strsep(&cptr, ",")) != NULL) {297if (strcmp(token, "memory") == 0) {298any_cgroup_mounts_found = true;299assert(cg_infos[MEMORY_IDX]._mount_path == NULL, "stomping of _mount_path");300cg_infos[MEMORY_IDX]._mount_path = os::strdup(tmpmount);301cg_infos[MEMORY_IDX]._root_mount_path = os::strdup(tmproot);302cg_infos[MEMORY_IDX]._data_complete = true;303} else if (strcmp(token, "cpuset") == 0) {304any_cgroup_mounts_found = true;305if (cg_infos[CPUSET_IDX]._mount_path != NULL) {306// On some systems duplicate cpuset controllers get mounted in addition to307// the main cgroup controllers most likely under /sys/fs/cgroup. In that308// case pick the one under /sys/fs/cgroup and discard others.309if (strstr(cg_infos[CPUSET_IDX]._mount_path, "/sys/fs/cgroup") != cg_infos[CPUSET_IDX]._mount_path) {310log_warning(os, container)("Duplicate cpuset controllers detected. Picking %s, skipping %s.",311tmpmount, cg_infos[CPUSET_IDX]._mount_path);312os::free(cg_infos[CPUSET_IDX]._mount_path);313cg_infos[CPUSET_IDX]._mount_path = os::strdup(tmpmount);314} else {315log_warning(os, container)("Duplicate cpuset controllers detected. Picking %s, skipping %s.",316cg_infos[CPUSET_IDX]._mount_path, tmpmount);317}318} else {319cg_infos[CPUSET_IDX]._mount_path = os::strdup(tmpmount);320}321cg_infos[CPUSET_IDX]._root_mount_path = os::strdup(tmproot);322cg_infos[CPUSET_IDX]._data_complete = true;323} else if (strcmp(token, "cpu") == 0) {324any_cgroup_mounts_found = true;325assert(cg_infos[CPU_IDX]._mount_path == NULL, "stomping of _mount_path");326cg_infos[CPU_IDX]._mount_path = os::strdup(tmpmount);327cg_infos[CPU_IDX]._root_mount_path = os::strdup(tmproot);328cg_infos[CPU_IDX]._data_complete = true;329} else if (strcmp(token, "cpuacct") == 0) {330any_cgroup_mounts_found = true;331assert(cg_infos[CPUACCT_IDX]._mount_path == NULL, "stomping of _mount_path");332cg_infos[CPUACCT_IDX]._mount_path = os::strdup(tmpmount);333cg_infos[CPUACCT_IDX]._root_mount_path = os::strdup(tmproot);334cg_infos[CPUACCT_IDX]._data_complete = true;335}336}337}338}339fclose(mntinfo);340341// Neither cgroup2 nor cgroup filesystems mounted via /proc/self/mountinfo342// No point in continuing.343if (!any_cgroup_mounts_found) {344log_trace(os, container)("No relevant cgroup controllers mounted.");345cleanup(cg_infos);346*flags = INVALID_CGROUPS_NO_MOUNT;347return false;348}349350if (is_cgroupsV2) {351if (!cgroupv2_mount_point_found) {352log_trace(os, container)("Mount point for cgroupv2 not found in /proc/self/mountinfo");353cleanup(cg_infos);354*flags = INVALID_CGROUPS_V2;355return false;356}357// Cgroups v2 case, we have all the info we need.358*flags = CGROUPS_V2;359return true;360}361362// What follows is cgroups v1363log_debug(os, container)("Detected cgroups hybrid or legacy hierarchy, using cgroups v1 controllers");364365if (!cg_infos[MEMORY_IDX]._data_complete) {366log_debug(os, container)("Required cgroup v1 memory subsystem not found");367cleanup(cg_infos);368*flags = INVALID_CGROUPS_V1;369return false;370}371if (!cg_infos[CPUSET_IDX]._data_complete) {372log_debug(os, container)("Required cgroup v1 cpuset subsystem not found");373cleanup(cg_infos);374*flags = INVALID_CGROUPS_V1;375return false;376}377if (!cg_infos[CPU_IDX]._data_complete) {378log_debug(os, container)("Required cgroup v1 cpu subsystem not found");379cleanup(cg_infos);380*flags = INVALID_CGROUPS_V1;381return false;382}383if (!cg_infos[CPUACCT_IDX]._data_complete) {384log_debug(os, container)("Required cgroup v1 cpuacct subsystem not found");385cleanup(cg_infos);386*flags = INVALID_CGROUPS_V1;387return false;388}389// Cgroups v1 case, we have all the info we need.390*flags = CGROUPS_V1;391return true;392393};394395void CgroupSubsystemFactory::cleanup(CgroupInfo* cg_infos) {396assert(cg_infos != NULL, "Invariant");397for (int i = 0; i < CG_INFO_LENGTH; i++) {398os::free(cg_infos[i]._name);399os::free(cg_infos[i]._cgroup_path);400os::free(cg_infos[i]._root_mount_path);401os::free(cg_infos[i]._mount_path);402}403}404405/* active_processor_count406*407* Calculate an appropriate number of active processors for the408* VM to use based on these three inputs.409*410* cpu affinity411* cgroup cpu quota & cpu period412* cgroup cpu shares413*414* Algorithm:415*416* Determine the number of available CPUs from sched_getaffinity417*418* If user specified a quota (quota != -1), calculate the number of419* required CPUs by dividing quota by period.420*421* If shares are in effect (shares != -1), calculate the number422* of CPUs required for the shares by dividing the share value423* by PER_CPU_SHARES.424*425* All results of division are rounded up to the next whole number.426*427* If neither shares or quotas have been specified, return the428* number of active processors in the system.429*430* If both shares and quotas have been specified, the results are431* based on the flag PreferContainerQuotaForCPUCount. If true,432* return the quota value. If false return the smallest value433* between shares or quotas.434*435* If shares and/or quotas have been specified, the resulting number436* returned will never exceed the number of active processors.437*438* return:439* number of CPUs440*/441int CgroupSubsystem::active_processor_count() {442int quota_count = 0, share_count = 0;443int cpu_count, limit_count;444int result;445446// We use a cache with a timeout to avoid performing expensive447// computations in the event this function is called frequently.448// [See 8227006].449CachingCgroupController* contrl = cpu_controller();450CachedMetric* cpu_limit = contrl->metrics_cache();451if (!cpu_limit->should_check_metric()) {452int val = (int)cpu_limit->value();453log_trace(os, container)("CgroupSubsystem::active_processor_count (cached): %d", val);454return val;455}456457cpu_count = limit_count = os::Linux::active_processor_count();458int quota = cpu_quota();459int period = cpu_period();460int share = cpu_shares();461462if (quota > -1 && period > 0) {463quota_count = ceilf((float)quota / (float)period);464log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count);465}466if (share > -1) {467share_count = ceilf((float)share / (float)PER_CPU_SHARES);468log_trace(os, container)("CPU Share count based on shares: %d", share_count);469}470471// If both shares and quotas are setup results depend472// on flag PreferContainerQuotaForCPUCount.473// If true, limit CPU count to quota474// If false, use minimum of shares and quotas475if (quota_count !=0 && share_count != 0) {476if (PreferContainerQuotaForCPUCount) {477limit_count = quota_count;478} else {479limit_count = MIN2(quota_count, share_count);480}481} else if (quota_count != 0) {482limit_count = quota_count;483} else if (share_count != 0) {484limit_count = share_count;485}486487result = MIN2(cpu_count, limit_count);488log_trace(os, container)("OSContainer::active_processor_count: %d", result);489490// Update cached metric to avoid re-reading container settings too often491cpu_limit->set_value(result, OSCONTAINER_CACHE_TIMEOUT);492493return result;494}495496/* memory_limit_in_bytes497*498* Return the limit of available memory for this process.499*500* return:501* memory limit in bytes or502* -1 for unlimited503* OSCONTAINER_ERROR for not supported504*/505jlong CgroupSubsystem::memory_limit_in_bytes() {506CachingCgroupController* contrl = memory_controller();507CachedMetric* memory_limit = contrl->metrics_cache();508if (!memory_limit->should_check_metric()) {509return memory_limit->value();510}511jlong mem_limit = read_memory_limit_in_bytes();512// Update cached metric to avoid re-reading container settings too often513memory_limit->set_value(mem_limit, OSCONTAINER_CACHE_TIMEOUT);514return mem_limit;515}516517518