Path: tools/testing/selftests/cgroup/test_memcontrol.c

/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE

#include <linux/limits.h>
#include <linux/oom.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netdb.h>
#include <errno.h>
#include <sys/mman.h>

#include "../kselftest.h"
#include "cgroup_util.h"

static bool has_localevents;
static bool has_recursiveprot;

int get_temp_fd(void)
{
	return open(".", O_TMPFILE | O_RDWR | O_EXCL);
}

int alloc_pagecache(int fd, size_t size)
{
	char buf[PAGE_SIZE];
	struct stat st;
	int i;

	if (fstat(fd, &st))
		goto cleanup;

	size += st.st_size;

	if (ftruncate(fd, size))
		goto cleanup;

	for (i = 0; i < size; i += sizeof(buf))
		read(fd, buf, sizeof(buf));

	return 0;

cleanup:
	return -1;
}

int alloc_anon(const char *cgroup, void *arg)
{
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	free(buf);
	return 0;
}

int is_swap_enabled(void)
{
	char buf[PAGE_SIZE];
	const char delim[] = "\n";
	int cnt = 0;
	char *line;

	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
		return -1;

	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
		cnt++;

	return cnt > 1;
}

int set_oom_adj_score(int pid, int score)
{
	char path[PATH_MAX];
	int fd, len;

	sprintf(path, "/proc/%d/oom_score_adj", pid);

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	len = dprintf(fd, "%d", score);
	if (len < 0) {
		close(fd);
		return len;
	}

	close(fd);
	return 0;
}

/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}

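/*
 * Allocate (and touch) 50M of anonymous memory, then check that both
 * memory.current and the "anon" entry in memory.stat reflect it.
 */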
static int alloc_anon_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	char *buf, *ptr;
	long anon, current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	if (!values_close(size, current, 3))
		goto cleanup;

	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
	if (anon < 0)
		goto cleanup;

	if (!values_close(anon, current, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

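/*
 * Allocate 50M of pagecache in a temporary file, then check that both
 * memory.current and the "file" entry in memory.stat reflect it.
 */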
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test creates a memory cgroup, allocates
 * some anonymous memory and some pagecache,
 * and checks memory.current, memory.peak, and some memory.stat values.
 */
static int test_memcg_current_peak(const char *root)
{
	int ret = KSFT_FAIL;
	long current, peak, peak_reset;
	char *memcg;
	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
	struct stat ss;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/*
	 * We'll open a few FDs for the same memory.peak file to exercise the
	 * free path. We need at least three to be closed in a different order
	 * than writes occurred to test the linked-list handling.
	 */
	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd == -1) {
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.peak's fd, try to figure out whether
	 * this kernel supports writing to that file in the first place (by
	 * checking the writable bit on the file's st_mode).
	 */
	if (fstat(peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd2 == -1)
		goto cleanup;

	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd3 == -1)
		goto cleanup;

	/* any non-empty string resets, but make it clear */
	static const char reset_string[] = "reset\n";

	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/* Make sure a completely independent read isn't affected by our FD-local reset above */
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	fd2_closed = true;
	if (close(peak_fd2))
		goto cleanup;

	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd4 == -1)
		goto cleanup;

	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd);
	if (peak > MB(30) || peak < 0)
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/* Make sure everything is back to normal */
	peak = cg_read_long_fd(peak_fd);
	if (peak < MB(50))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd4);
	if (peak < MB(50))
		goto cleanup;

	fd3_closed = true;
	if (close(peak_fd3))
		goto cleanup;

	fd4_closed = true;
	if (close(peak_fd4))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	close(peak_fd);
	if (!fd2_closed)
		close(peak_fd2);
	if (!fd3_closed)
		close(peak_fd3);
	if (!fd4_closed)
		close(peak_fd4);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

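/*
 * Allocate 50M of pagecache through the fd passed in @arg and keep the
 * process alive until its parent exits, so the charge stays in the cgroup.
 */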
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int ppid = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	while (getppid() == ppid)
		sleep(1);

	return 0;
}

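/*
 * Allocate (and touch) @arg bytes of anonymous memory and keep the
 * process alive until its parent exits.
 */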
static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	int ppid = getppid();
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	while (getppid() == ppid)
		sleep(1);

	free(buf);
	return 0;
}

/*
 * Wait until processes are killed asynchronously by the OOM killer.
 * If we exceed a timeout, fail.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int limit;

	for (limit = 10; limit > 0; limit--) {
		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
			return 0;

		usleep(100000);
	}
	return -1;
}

static bool reclaim_until(const char *memcg, long goal);

/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates a significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M [memory.events:low > 0]
 * A/B/D  memory.current ~= 21M [memory.events:low > 0]
 * A/B/E  memory.current ~= 0   [memory.events:low == 0 if !memory_recursiveprot,
 *                               undefined otherwise]
 * A/B/F  memory.current  = 0   [memory.events:low == 0]
 * (for origin of the numbers, see model in memcg_protection.m.)
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 *
 * Then we try to reclaim from A/B/C using memory.reclaim until its
 * usage reaches 10M.
 * This makes sure that:
 * (a) We ignore the protection of the reclaim target memcg.
 * (b) The previously calculated emin value (~29M) should be dismissed.
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	long current;
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1], attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 15))
		goto cleanup;

	if (!values_close(c[1], MB(21), 20))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents allocating anon memory\n");
		goto cleanup;
	}

	current = min ? MB(50) : MB(30);
	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
		goto cleanup;

	if (!reclaim_until(children[0], MB(10)))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	/*
	 * Child 2 has memory.low=0, but some low protection may still be
	 * distributed down from its parent with memory.low=50M if cgroup2
	 * memory_recursiveprot mount option is enabled. Ignore the low
	 * event count in this case.
	 */
	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int ignore_low_events_index = has_recursiveprot ? 2 : -1;
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i == ignore_low_events_index)
			continue;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;
	}

	ret = KSFT_PASS;

cleanup:
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}

static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}

static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}

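/*
 * Allocate 50M of pagecache in a cgroup limited to 30M (by memory.high
 * or memory.max) and check that the usage stays close to the limit.
 */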
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, high, max;
	int fd;

	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (!values_close(current, MB(30), 5))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test checks that memory.high limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_high(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long high;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
		goto cleanup;

	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	high = cg_read_key_long(memcg, "memory.events", "high ");
	if (high <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

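/* Allocate and mlock() @arg bytes of anonymous memory in a single shot. */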
static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   0, 0);
	if (buf == MAP_FAILED)
		return -1;

	mlock(buf, size);
	munmap(buf, size);
	return 0;
}

/*
 * This test checks that memory.high is able to throttle a big single-shot
 * allocation, i.e. a large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * Reclaim from @memcg until usage reaches @goal by writing to
 * memory.reclaim.
 *
 * This function will return false if the usage is already below the
 * goal.
 *
 * This function assumes that writing to memory.reclaim is the only
 * source of change in memory.current (no concurrent allocations or
 * reclaim).
 *
 * This function makes sure memory.reclaim is sane. It will return
 * false if memory.reclaim's error codes do not make sense, even if
 * the usage goal was satisfied.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	char buf[64];
	int retries, err;
	long current, to_reclaim;
	bool reclaimed = false;

	for (retries = 5; retries > 0; retries--) {
		current = cg_read_long(memcg, "memory.current");

		if (current < goal || values_close(current, goal, 3))
			break;
		/* Did memory.reclaim return 0 incorrectly? */
		else if (reclaimed)
			return false;

		to_reclaim = current - goal;
		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
		err = cg_write(memcg, "memory.reclaim", buf);
		if (!err)
			reclaimed = true;
		else if (err != -EAGAIN)
			return false;
	}
	return reclaimed;
}

/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL;
	int fd = -1;
	int retries;
	char *memcg;
	long current, expected_usage;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else
		expected_usage = MB(50);

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			     expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M, this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	if (!reclaim_until(memcg, MB(30)))
		goto cleanup;

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	close(fd);

	return ret;
}

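/*
 * Allocate (and touch) 50M of anonymous memory under the memory.max
 * limit passed in @arg, then check that the overflow went to swap:
 * memory.current sits at the limit and memory.current plus
 * memory.swap.current adds up to the allocation size.
 */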
static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
	long mem_max = (long)arg;
	size_t size = MB(50);
	char *buf, *ptr;
	long mem_current, swap_current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	mem_current = cg_read_long(cgroup, "memory.current");
	if (!mem_current || !values_close(mem_current, mem_max, 3))
		goto cleanup;

	swap_current = cg_read_long(cgroup, "memory.swap.current");
	if (!swap_current ||
	    !values_close(mem_current + swap_current, size, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out. Additionally, it verifies
 * that memory.swap.peak reflects the high watermark and can be reset.
 */
static int test_memcg_swap_max_peak(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max, peak;
	struct stat ss;
	int swap_peak_fd = -1, mem_peak_fd = -1;

	/* any non-empty string resets */
	static const char reset_string[] = "foobarbaz";

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	swap_peak_fd = cg_open(memcg, "memory.swap.peak",
			       O_RDWR | O_APPEND | O_CLOEXEC);

	if (swap_peak_fd == -1) {
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.swap.peak's fd, try to figure out
	 * whether this kernel supports writing to that file in the first
	 * place (by checking the writable bit on the file's st_mode).
	 */
	if (fstat(swap_peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (mem_peak_fd == -1)
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak"))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	/* switch the swap and mem fds into local-peak tracking mode */
	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));

	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	if (cg_read_long(memcg, "memory.peak"))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	/*
	 * open, reset and close the peak swap on another FD to make sure
	 * multiple extant fds don't corrupt the linked-list
	 */
	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	/* actually reset on the fds */
	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak > MB(10))
		goto cleanup;

	/*
	 * The cgroup is now empty, but there may be a page or two associated
	 * with the open FD accounted to it.
	 */
	peak = cg_read_long_fd(mem_peak_fd);
	if (peak > MB(1))
		goto cleanup;

	if (cg_read_long(memcg, "memory.peak") < MB(29))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(19))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (mem_peak_fd != -1 && close(mem_peak_fd))
		ret = KSFT_FAIL;
	if (swap_peak_fd != -1 && close(swap_peak_fd))
		ret = KSFT_FAIL;
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM. Then it checks for oom and oom_kill events in
 * memory.events.
 */
static int test_memcg_oom_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

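/*
 * The TCP server runs in a child process: it binds to srv_args->port,
 * reports the bind() result through the ctl pipe (so the caller can
 * retry another port on EADDRINUSE), and once a client connects,
 * writes to the socket until the connection is reset.
 */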
struct tcp_server_args {
	unsigned short port;
	int ctl[2];
};

static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}

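/*
 * The client connects to the given port on localhost and keeps reading,
 * checking on every iteration that the growth of memory.current matches
 * the "sock" entry in memory.stat.
 */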
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	allocated = cg_read_long(cgroup, "memory.current");
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}

/*
 * This test checks socket memory accounting.
 * The test forks a TCP server that listens on a random port between
 * 1000 and 61000. Once it gets a client connection, it starts writing
 * to its socket.
 * The TCP client interleaves reads from the socket with checks that
 * memory.current and memory.stat.sock stay similar.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		waitpid(pid, NULL, 0);
	}

	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.stat", "sock "))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.group.oom set. Then it checks that all
 * processes in the leaf were killed. It also checks that oom_events
 * were propagated to the parent level.
 */
static int test_memcg_oom_group_leaf_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long parent_oom_events;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(child, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(child, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(child, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;

	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.group.oom set. Then it checks that all
 * processes in the parent and leaf were killed.
 */
static int test_memcg_oom_group_parent_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "memory.max", "80M"))
		goto cleanup;

	if (cg_write(parent, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(parent, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));

	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;
	if (cg_test_proc_killed(parent))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.group.oom set. Then it checks that all
 * processes were killed except those set with OOM_SCORE_ADJ_MIN.
 */
static int test_memcg_oom_group_score_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	int safe_pid;

	memcg = cg_name(root, "memcg_test_0");

	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.oom.group", "1"))
		goto cleanup;

	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
		goto cleanup;

	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
		goto cleanup;

	if (kill(safe_pid, SIGKILL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (memcg)
		cg_destroy(memcg);
	free(memcg);

	return ret;
}

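/*
 * The T() macro pairs each test function with its stringified name, so
 * main() can report results through the kselftest harness.
 */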
#define T(x) { x, #x }
struct memcg_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current_peak),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max_peak),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
};
#undef T

int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, proc_status, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root), NULL))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	proc_status = proc_mount_contains("memory_recursiveprot");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_recursiveprot = proc_status;

	proc_status = proc_mount_contains("memory_localevents");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_localevents = proc_status;

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}