Path: blob/main/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 * Copyright (C) 2007 The Regents of the University of California.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <[email protected]>.
 * UCRL-CODE-235197
 *
 * This file is part of the SPL, Solaris Porting Layer.
 *
 * The SPL is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * The SPL is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/vmem.h>

/*
 * As a general rule kmem_alloc() allocations should be small, preferably
 * just a few pages, since they must be physically contiguous.  Therefore, a
 * rate limited warning will be printed to the console for any kmem_alloc()
 * which exceeds a reasonable threshold.
 *
 * The default warning threshold is set to sixteen pages but capped at 64K to
 * accommodate systems using large pages.  This value was selected to be small
 * enough to ensure the largest allocations are quickly noticed and fixed,
 * but large enough to avoid logging any warnings when an allocation size is
 * larger than optimal but not a serious concern.  Since this value is tunable,
 * developers are encouraged to set it lower when testing so any new largish
 * allocations are quickly caught.  These warnings may be disabled by setting
 * the threshold to zero.
 */
unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
module_param(spl_kmem_alloc_warn, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_warn,
	"Warning threshold in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_warn);

/*
 * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
 * Allocations which are marginally smaller than this limit may succeed but
 * should still be avoided due to the expense of locating a contiguous range
 * of free pages.  Therefore, a maximum kmem size with a reasonable safety
 * margin of 4x is set.  Kmem_alloc() allocations larger than this maximum
 * will quickly fail.  Vmem_alloc() allocations less than or equal to this
 * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
 */
unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
module_param(spl_kmem_alloc_max, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_max,
	"Maximum size in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_max);

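/*
 * Illustrative note: because both tunables above are registered with
 * module_param(..., 0644), they can normally be read and adjusted at
 * runtime through the module parameter interface.  The exact path depends
 * on how the SPL is packaged and loaded, but typically something like:
 *
 *	# cat /sys/module/spl/parameters/spl_kmem_alloc_warn
 *	# echo 0 > /sys/module/spl/parameters/spl_kmem_alloc_warn
 *
 * Setting spl_kmem_alloc_warn to zero disables the large kmem_alloc()
 * warnings described above.
 */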

int
kmem_debugging(void)
{
	return (0);
}
EXPORT_SYMBOL(kmem_debugging);

char *
kmem_vasprintf(const char *fmt, va_list ap)
{
	va_list aq;
	char *ptr;

	do {
		va_copy(aq, ap);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
		va_end(aq);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_vasprintf);

char *
kmem_asprintf(const char *fmt, ...)
{
	va_list ap;
	char *ptr;

	do {
		va_start(ap, fmt);
		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
		va_end(ap);
	} while (ptr == NULL);

	return (ptr);
}
EXPORT_SYMBOL(kmem_asprintf);

static char *
__strdup(const char *str, int flags)
{
	char *ptr;
	int n;

	n = strlen(str);
	ptr = kmalloc(n + 1, kmem_flags_convert(flags));
	if (ptr)
		memcpy(ptr, str, n + 1);

	return (ptr);
}

char *
kmem_strdup(const char *str)
{
	return (__strdup(str, KM_SLEEP));
}
EXPORT_SYMBOL(kmem_strdup);

void
kmem_strfree(char *str)
{
	kfree(str);
}
EXPORT_SYMBOL(kmem_strfree);

void *
spl_kvmalloc(size_t size, gfp_t lflags)
{
	/*
	 * GFP_KERNEL allocations can safely use kvmalloc which may
	 * improve performance by avoiding a) high latency caused by
	 * vmalloc's on-access allocation, b) performance loss due to
	 * MMU memory address mapping and c) vmalloc locking overhead.
	 * This has the side-effect that the slab statistics will
	 * incorrectly report this as a vmem allocation, but that is
	 * purely cosmetic.
	 */
	if ((lflags & GFP_KERNEL) == GFP_KERNEL)
		return (kvmalloc(size, lflags));

	gfp_t kmalloc_lflags = lflags;

	if (size > PAGE_SIZE) {
		/*
		 * We need to set __GFP_NOWARN here since spl_kvmalloc is not
		 * only called by spl_kmem_alloc_impl but can be called
		 * directly with custom lflags, too.  In that case
		 * kmem_flags_convert does not get called, which would
		 * implicitly set __GFP_NOWARN.
		 */
		kmalloc_lflags |= __GFP_NOWARN;

		/*
		 * N.B. __GFP_RETRY_MAYFAIL is supported only for large
		 * (>32kB) allocations.
		 *
		 * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY
		 * for !costly requests because there is no other way to tell
		 * the allocator that we want to fail rather than retry
		 * endlessly.
		 */
		if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
		    (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
			kmalloc_lflags |= __GFP_NORETRY;
		}
	}

	/*
	 * We first try kmalloc - even for big sizes - and fall back to
	 * spl_vmalloc if that fails.
	 *
	 * For non-__GFP_RECLAIM allocations we always stick to
	 * kmalloc_node, and fail when kmalloc is not successful (returns
	 * NULL).
	 * We cannot fall back to spl_vmalloc in this case because spl_vmalloc
	 * internally uses GFP_KERNEL allocations.
	 */
	void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
	if (ptr || size <= PAGE_SIZE ||
	    (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
		return (ptr);
	}

	return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
}

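/*
 * Illustrative sketch of the behavior above: a GFP_KERNEL caller takes the
 * plain kvmalloc() path (which may itself fall back to vmalloc), while a
 * caller that does not permit full direct reclaim only ever tries
 * kmalloc_node() and fails rather than falling back to spl_vmalloc():
 *
 *	buf = spl_kvmalloc(len, GFP_KERNEL);			// kvmalloc path
 *	buf = spl_kvmalloc(len, GFP_NOWAIT | __GFP_NOWARN);	// kmalloc only
 */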

/*
 * General purpose unified implementation of kmem_alloc().  It is an
 * amalgamation of Linux and Illumos allocator design.  It should never be
 * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
 * relatively portable.  Consumers may only access this function through
 * wrappers that enforce the common flags to ensure portability.
 */
inline void *
spl_kmem_alloc_impl(size_t size, int flags, int node)
{
	gfp_t lflags = kmem_flags_convert(flags);
	void *ptr;

	/*
	 * Log abnormally large allocations and rate limit the console output.
	 * Allocations larger than spl_kmem_alloc_warn should be performed
	 * through the vmem_alloc()/vmem_zalloc() interfaces.
	 */
	if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
	    !(flags & KM_VMEM)) {
		printk(KERN_WARNING
		    "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
		    "https://github.com/openzfs/zfs/issues/new\n",
		    (unsigned long)size, flags);
		dump_stack();
	}

	/*
	 * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
	 * unlike kmem_alloc() with KM_SLEEP on Illumos.
	 */
	do {
		/*
		 * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
		 * is unsafe.  This must fail for all kmem_alloc() and
		 * kmem_zalloc() callers.
		 *
		 * For vmem_alloc() and vmem_zalloc() callers it is permissible
		 * to use spl_vmalloc().  However, in general use of
		 * spl_vmalloc() is strongly discouraged because a global lock
		 * must be acquired.  Contention on this lock can significantly
		 * impact performance so frequently manipulating the virtual
		 * address space is strongly discouraged.
		 */
		if (size > spl_kmem_alloc_max) {
			if (flags & KM_VMEM) {
				ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
			} else {
				return (NULL);
			}
		} else {
			/*
			 * We use kmalloc when doing kmem_alloc(KM_NOSLEEP),
			 * because kvmalloc/vmalloc may sleep.  We also use
			 * kmalloc on systems with limited kernel VA space
			 * (e.g. 32-bit), which have HIGHMEM.  Otherwise we
			 * use kvmalloc, which tries to get contiguous
			 * physical memory (fast, like kmalloc) and falls back
			 * on using virtual memory to stitch together pages
			 * (slow, like vmalloc).
			 */
#ifdef CONFIG_HIGHMEM
			if (flags & KM_VMEM) {
#else
			if ((flags & KM_VMEM) || !(flags & KM_NOSLEEP)) {
#endif
				ptr = spl_kvmalloc(size, lflags);
			} else {
				ptr = kmalloc_node(size, lflags, node);
			}
		}

		if (likely(ptr) || (flags & KM_NOSLEEP))
			return (ptr);

		/*
		 * Try hard to satisfy the allocation.  However, when progress
		 * cannot be made, the allocation is allowed to fail.
		 */
		if ((lflags & GFP_KERNEL) == GFP_KERNEL)
			lflags |= __GFP_RETRY_MAYFAIL;

		/*
		 * Use cond_resched() instead of congestion_wait() to avoid
		 * deadlocking systems where there are no block devices.
		 */
		cond_resched();
	} while (1);

	return (NULL);
}

inline void
spl_kmem_free_impl(const void *buf, size_t size)
{
	if (is_vmalloc_addr(buf))
		vfree(buf);
	else
		kfree(buf);
}

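/*
 * Summary of the allocation paths above (derived from the code rather than
 * any formal interface guarantee):
 *
 *	size >  spl_kmem_alloc_max, KM_VMEM	-> spl_vmalloc()
 *	size >  spl_kmem_alloc_max, !KM_VMEM	-> NULL (kmem_alloc() misuse)
 *	size <= spl_kmem_alloc_max, KM_VMEM	-> spl_kvmalloc()
 *	size <= spl_kmem_alloc_max, KM_NOSLEEP	-> kmalloc_node()
 *	size <= spl_kmem_alloc_max, sleeping	-> spl_kvmalloc()
 *						   (kmalloc_node() on HIGHMEM)
 *
 * KM_NOSLEEP failures are returned to the caller immediately; sleeping
 * allocations are retried in the loop, with __GFP_RETRY_MAYFAIL and
 * cond_resched(), until an attempt succeeds.
 */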

/*
 * Memory allocation and accounting for kmem_* style allocations.  When
 * DEBUG_KMEM is enabled the total memory allocated will be tracked and
 * any memory leaked will be reported during module unload.
 *
 * ./configure --enable-debug-kmem
 */
#ifdef DEBUG_KMEM

/* Shim layer memory accounting */
atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
uint64_t kmem_alloc_max = 0;

EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);

inline void *
spl_kmem_alloc_debug(size_t size, int flags, int node)
{
	void *ptr;

	ptr = spl_kmem_alloc_impl(size, flags, node);
	if (ptr) {
		atomic64_add(size, &kmem_alloc_used);
		if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
			kmem_alloc_max = atomic64_read(&kmem_alloc_used);
	}

	return (ptr);
}

inline void
spl_kmem_free_debug(const void *ptr, size_t size)
{
	atomic64_sub(size, &kmem_alloc_used);
	spl_kmem_free_impl(ptr, size);
}

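/*
 * Example only: kmem_alloc_used tracks the bytes currently outstanding and
 * kmem_alloc_max records a best-effort high-water mark (its update is not
 * atomic with respect to concurrent allocations).  A balanced allocation
 * and free leaves the counter unchanged:
 *
 *	ptr = spl_kmem_alloc_debug(1024, KM_SLEEP, NUMA_NO_NODE);
 *	// kmem_alloc_used += 1024, kmem_alloc_max possibly raised
 *	spl_kmem_free_debug(ptr, 1024);
 *	// kmem_alloc_used -= 1024
 */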

/*
 * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 * but also the location of every alloc and free.  When the SPL module is
 * unloaded a list of all leaked addresses and where they were allocated
 * will be dumped to the console.  Enabling this feature has a significant
 * impact on performance but it makes finding memory leaks straightforward.
 *
 * Not surprisingly with debugging enabled the xmem_locks are very highly
 * contended particularly on xfree().  If we want to run with this detailed
 * debugging enabled for anything other than debugging we need to minimize
 * the contention by moving to a lock per xmem_table entry model.
 *
 * ./configure --enable-debug-kmem-tracking
 */
#ifdef DEBUG_KMEM_TRACKING

#include <linux/hash.h>
#include <linux/ctype.h>

#define	KMEM_HASH_BITS		10
#define	KMEM_TABLE_SIZE		(1 << KMEM_HASH_BITS)

typedef struct kmem_debug {
	struct hlist_node kd_hlist;	/* Hash node linkage */
	struct list_head kd_list;	/* List of all allocations */
	void *kd_addr;			/* Allocation pointer */
	size_t kd_size;			/* Allocation size */
	const char *kd_func;		/* Allocation function */
	int kd_line;			/* Allocation line */
} kmem_debug_t;

static spinlock_t kmem_lock;
static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
static struct list_head kmem_list;

static kmem_debug_t *
kmem_del_init(spinlock_t *lock, struct hlist_head *table,
    int bits, const void *addr)
{
	struct hlist_head *head;
	struct hlist_node *node = NULL;
	struct kmem_debug *p;
	unsigned long flags;

	spin_lock_irqsave(lock, flags);

	head = &table[hash_ptr((void *)addr, bits)];
	hlist_for_each(node, head) {
		p = list_entry(node, struct kmem_debug, kd_hlist);
		if (p->kd_addr == addr) {
			hlist_del_init(&p->kd_hlist);
			list_del_init(&p->kd_list);
			spin_unlock_irqrestore(lock, flags);
			return (p);
		}
	}

	spin_unlock_irqrestore(lock, flags);

	return (NULL);
}

inline void *
spl_kmem_alloc_track(size_t size, int flags,
    const char *func, int line, int node)
{
	void *ptr = NULL;
	kmem_debug_t *dptr;
	unsigned long irq_flags;

	dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
	if (dptr == NULL)
		return (NULL);

	dptr->kd_func = __strdup(func, flags);
	if (dptr->kd_func == NULL) {
		kfree(dptr);
		return (NULL);
	}

	ptr = spl_kmem_alloc_debug(size, flags, node);
	if (ptr == NULL) {
		kfree(dptr->kd_func);
		kfree(dptr);
		return (NULL);
	}

	INIT_HLIST_NODE(&dptr->kd_hlist);
	INIT_LIST_HEAD(&dptr->kd_list);

	dptr->kd_addr = ptr;
	dptr->kd_size = size;
	dptr->kd_line = line;

	spin_lock_irqsave(&kmem_lock, irq_flags);
	hlist_add_head(&dptr->kd_hlist,
	    &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
	list_add_tail(&dptr->kd_list, &kmem_list);
	spin_unlock_irqrestore(&kmem_lock, irq_flags);

	return (ptr);
}

inline void
spl_kmem_free_track(const void *ptr, size_t size)
{
	kmem_debug_t *dptr;

	/* Ignore NULL pointer since we haven't tracked it at all */
	if (ptr == NULL)
		return;

	/* Must exist in hash due to kmem_alloc() */
	dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
	ASSERT3P(dptr, !=, NULL);
	ASSERT3S(dptr->kd_size, ==, size);

	kfree(dptr->kd_func);
	kfree(dptr);

	spl_kmem_free_debug(ptr, size);
}
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

/*
 * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
 */
void *
spl_kmem_alloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_alloc);

void *
spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

	flags |= KM_ZERO;

#if !defined(DEBUG_KMEM)
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_zalloc);

void
spl_kmem_free(const void *buf, size_t size)
{
#if !defined(DEBUG_KMEM)
	return (spl_kmem_free_impl(buf, size));
#elif !defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_free_debug(buf, size));
#else
	return (spl_kmem_free_track(buf, size));
#endif
}
EXPORT_SYMBOL(spl_kmem_free);

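/*
 * Typical consumer usage (illustrative): callers are expected to reach the
 * functions above through the kmem_alloc(), kmem_zalloc() and kmem_free()
 * wrappers declared in sys/kmem.h, which supply the caller's function name
 * and line number, e.g.:
 *
 *	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
 *	...
 *	kmem_free(data, sizeof (*data));
 *
 * Note that kmem_free() takes the original allocation size; outside of the
 * DEBUG_KMEM builds that size is not verified against the allocation.
 */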

#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
static char *
spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
{
	int size = ((len - 1) < kd->kd_size) ?
	    (len - 1) : kd->kd_size;
	int i, flag = 1;

	ASSERT(str != NULL && len >= 17);
	memset(str, 0, len);

	/*
	 * Check for a fully printable string, and while we are at
	 * it place the printable characters in the passed buffer.
	 */
	for (i = 0; i < size; i++) {
		str[i] = ((char *)(kd->kd_addr))[i];
		if (isprint(str[i])) {
			continue;
		} else {
			/*
			 * Minimum number of printable characters found
			 * to make it worthwhile to print this as ascii.
			 */
			if (i > min)
				break;

			flag = 0;
			break;
		}
	}

	if (!flag) {
		sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
		    *((uint8_t *)kd->kd_addr),
		    *((uint8_t *)kd->kd_addr + 2),
		    *((uint8_t *)kd->kd_addr + 4),
		    *((uint8_t *)kd->kd_addr + 6),
		    *((uint8_t *)kd->kd_addr + 8),
		    *((uint8_t *)kd->kd_addr + 10),
		    *((uint8_t *)kd->kd_addr + 12),
		    *((uint8_t *)kd->kd_addr + 14));
	}

	return (str);
}

static int
spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
{
	int i;

	spin_lock_init(lock);
	INIT_LIST_HEAD(list);

	for (i = 0; i < size; i++)
		INIT_HLIST_HEAD(&kmem_table[i]);

	return (0);
}

static void
spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
{
	unsigned long flags;
	kmem_debug_t *kd = NULL;
	char str[17];

	spin_lock_irqsave(lock, flags);
	if (!list_empty(list))
		printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
		    "size", "data", "func", "line");

	list_for_each_entry(kd, list, kd_list) {
		printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
		    (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
		    kd->kd_func, kd->kd_line);
	}

	spin_unlock_irqrestore(lock, flags);
}
#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */

int
spl_kmem_init(void)
{
#ifdef DEBUG_KMEM
	atomic64_set(&kmem_alloc_used, 0);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

	return (0);
}

void
spl_kmem_fini(void)
{
#ifdef DEBUG_KMEM
	/*
	 * Display all unreclaimed memory addresses, including the
	 * allocation size and the first few bytes of what's located
	 * at that address to aid in debugging.  Performance is not
	 * a serious concern here since it is module unload time.
	 */
	if (atomic64_read(&kmem_alloc_used) != 0)
		printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
		    (unsigned long)atomic64_read(&kmem_alloc_used),
		    kmem_alloc_max);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
}

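/*
 * For reference (illustrative output, values made up): with
 * DEBUG_KMEM_TRACKING enabled, the leak report emitted by
 * spl_kmem_fini_tracking() at module unload looks roughly like the
 * following, one line per leaked allocation:
 *
 *	address          size  data             func:line
 *	ffff8800c9f3be00 64    ................ example_caller:123
 *
 * The "data" column comes from spl_sprintf_addr(): if the leading bytes of
 * the allocation are printable they are shown as ASCII, otherwise a sample
 * of the first bytes is hex encoded.
 */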