Path: blob/main/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 * Copyright (C) 2007 The Regents of the University of California.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 * UCRL-CODE-235197
 *
 * This file is part of the SPL, Solaris Porting Layer.
 *
 * The SPL is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * The SPL is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 */

#define	SPL_KMEM_CACHE_IMPLEMENTING

#include <sys/kmem.h>
#include <sys/kmem_cache.h>
#include <sys/taskq.h>
#include <sys/timer.h>
#include <sys/vmem.h>
#include <sys/wait.h>
#include <sys/string.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/prefetch.h>

/*
 * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
 * with smp_mb__{before,after}_atomic() because they were redundant. This is
 * only used inside our SLAB allocator, so we implement an internal wrapper
 * here to give us smp_mb__{before,after}_atomic() on older kernels.
 */
#ifndef smp_mb__before_atomic
#define	smp_mb__before_atomic(x)	smp_mb__before_clear_bit(x)
#endif

#ifndef smp_mb__after_atomic
#define	smp_mb__after_atomic(x)		smp_mb__after_clear_bit(x)
#endif

/*
 * Cache magazines are an optimization designed to minimize the cost of
 * allocating memory.  They do this by keeping a per-cpu cache of recently
 * freed objects, which can then be reallocated without taking a lock.  This
 * can improve performance on highly contended caches.  However, because
 * objects in magazines will prevent otherwise empty slabs from being
 * immediately released this may not be ideal for low memory machines.
 *
 * For this reason spl_kmem_cache_magazine_size can be used to set a maximum
 * magazine size.  When this value is set to 0 the magazine size will be
 * automatically determined based on the object size.  Otherwise magazines
 * will be limited to 2-256 objects per magazine (i.e. per CPU).  Magazines
 * may never be entirely disabled in this implementation.
 */
static unsigned int spl_kmem_cache_magazine_size = 0;
module_param(spl_kmem_cache_magazine_size, uint, 0444);
MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
	"Default magazine size (2-256), set automatically (0)");
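
/*
 * For example (illustrative only, assuming the standard Linux spl kernel
 * module), this read-only parameter can be set at module load time via a
 * modprobe option such as:
 *
 *	options spl spl_kmem_cache_magazine_size=64
 */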

static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");

static unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
module_param(spl_kmem_cache_max_size, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");

/*
 * For small objects the Linux slab allocator should be used to make the most
 * efficient use of the memory.  However, large objects are not supported by
 * the Linux slab and therefore the SPL implementation is preferred.  A cutoff
 * of 16K was determined to be optimal for architectures using 4K pages and
 * to also work well on architectures using larger 64K page sizes.
 */
static unsigned int spl_kmem_cache_slab_limit =
	SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE;
module_param(spl_kmem_cache_slab_limit, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
	"Objects less than N bytes use the Linux slab");

/*
 * The number of threads available to allocate new slabs for caches.  This
 * should not need to be tuned but it is available for performance analysis.
 */
static unsigned int spl_kmem_cache_kmem_threads = 4;
module_param(spl_kmem_cache_kmem_threads, uint, 0444);
MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
	"Number of spl_kmem_cache threads");

/*
 * Slab allocation interfaces
 *
 * While the Linux slab implementation was inspired by the Solaris
 * implementation I cannot use it to emulate the Solaris APIs.  I
 * require two features which are not provided by the Linux slab.
 *
 * 1) Constructors AND destructors.  Recent versions of the Linux
 *    kernel have removed support for destructors.  This is a deal
 *    breaker for the SPL which contains particularly expensive
 *    initializers for mutexes, condition variables, etc.  We also
 *    require a minimal level of cleanup for these data types unlike
 *    many Linux data types which do not need to be explicitly destroyed.
 *
 * 2) Virtual address space backed slab.  Callers of the Solaris slab
 *    expect it to work well for both small and very large allocations.
 *    Because of memory fragmentation the Linux slab which is backed
 *    by kmalloc'ed memory performs very badly when confronted with
 *    large numbers of large allocations.  Basing the slab on the
 *    virtual address space removes the need for contiguous pages
 *    and greatly improves performance for large allocations.
 *
 * For these reasons, the SPL has its own slab implementation with
 * the needed features.  It is not as highly optimized as either the
 * Solaris or Linux slabs, but it should get me most of what is
 * needed until it can be optimized or obsoleted by another approach.
 *
 * One serious concern I do have about this method is the relatively
 * small virtual address space on 32bit arches.  This will seriously
 * constrain the size of the slab caches and their performance.
 */

struct list_head spl_kmem_cache_list;	/* List of caches */
struct rw_semaphore spl_kmem_cache_sem;	/* Cache list lock */
static taskq_t *spl_kmem_cache_taskq;	/* Task queue for aging / reclaim */

static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);

static void *
kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
{
	gfp_t lflags = kmem_flags_convert(flags);
	void *ptr;

	if (skc->skc_flags & KMC_RECLAIMABLE)
		lflags |= __GFP_RECLAIMABLE;
	ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);

	/* Resulting allocated memory will be page aligned */
	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));

	return (ptr);
}

static void
kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
{
	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));

	/*
	 * The Linux direct reclaim path uses this out of band value to
	 * determine if forward progress is being made.  Normally this is
	 * incremented by kmem_freepages() which is part of the various
	 * Linux slab implementations.  However, since we are using none
	 * of that infrastructure we are responsible for incrementing it.
	 */
	if (current->reclaim_state)
#ifdef	HAVE_RECLAIM_STATE_RECLAIMED
		current->reclaim_state->reclaimed += size >> PAGE_SHIFT;
#else
		current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
#endif
	vfree(ptr);
}

/*
 * Required space for each aligned sks.
 */
static inline uint32_t
spl_sks_size(spl_kmem_cache_t *skc)
{
	return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
	    skc->skc_obj_align, uint32_t));
}

/*
 * Required space for each aligned object.
 */
static inline uint32_t
spl_obj_size(spl_kmem_cache_t *skc)
{
	uint32_t align = skc->skc_obj_align;

	return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
	    P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
}

uint64_t
spl_kmem_cache_inuse(kmem_cache_t *cache)
{
	return (cache->skc_obj_total);
}
EXPORT_SYMBOL(spl_kmem_cache_inuse);

uint64_t
spl_kmem_cache_entry_size(kmem_cache_t *cache)
{
	return (cache->skc_obj_size);
}
EXPORT_SYMBOL(spl_kmem_cache_entry_size);

/*
 * Lookup the spl_kmem_object_t for an object given that object.
 */
static inline spl_kmem_obj_t *
spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
{
	return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
	    skc->skc_obj_align, uint32_t));
}

/*
 * It's important that we pack the spl_kmem_obj_t structure and the
 * actual objects into one large address space to minimize the number
 * of calls to the allocator.  It is far better to do a few large
 * allocations and then subdivide it ourselves.  Now which allocator
 * we use requires balancing a few trade offs.
 *
 * For small objects we use kmem_alloc() because as long as you are
 * only requesting a small number of pages (ideally just one) it's cheap.
 * However, when you start requesting multiple pages with kmem_alloc()
 * it gets increasingly expensive since it requires contiguous pages.
 * For this reason we shift to vmem_alloc() for slabs of large objects
 * which removes the need for contiguous pages.  We do not use
 * vmem_alloc() in all cases because there is significant locking
 * overhead in __get_vm_area_node().  This function takes a single
 * global lock when acquiring an available virtual address range which
 * serializes all vmem_alloc()'s for all slab caches.  Using slightly
 * different allocation functions for small and large objects should
 * give us the best of both worlds.
 *
 *   +------------------------+
 *   | spl_kmem_slab_t --+-+  |
 *   | skc_obj_size    <-+ |  |
 *   | spl_kmem_obj_t      |  |
 *   | skc_obj_size    <---+  |
 *   | spl_kmem_obj_t      |  |
 *   | ...                 v  |
 *   +------------------------+
 */
static spl_kmem_slab_t *
spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
{
	spl_kmem_slab_t *sks;
	void *base;
	uint32_t obj_size;

	base = kv_alloc(skc, skc->skc_slab_size, flags);
	if (base == NULL)
		return (NULL);

	sks = (spl_kmem_slab_t *)base;
	sks->sks_magic = SKS_MAGIC;
	sks->sks_objs = skc->skc_slab_objs;
	sks->sks_age = jiffies;
	sks->sks_cache = skc;
	INIT_LIST_HEAD(&sks->sks_list);
	INIT_LIST_HEAD(&sks->sks_free_list);
	sks->sks_ref = 0;
	obj_size = spl_obj_size(skc);

	for (int i = 0; i < sks->sks_objs; i++) {
		void *obj = base + spl_sks_size(skc) + (i * obj_size);

		ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
		spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj);
		sko->sko_addr = obj;
		sko->sko_magic = SKO_MAGIC;
		sko->sko_slab = sks;
		INIT_LIST_HEAD(&sko->sko_list);
		list_add_tail(&sko->sko_list, &sks->sks_free_list);
	}

	return (sks);
}

/*
 * Remove a slab from the complete or partial list.  It must be called with
 * the 'skc->skc_lock' held, but the actual free must be performed
 * outside the lock to prevent deadlocking on vmem addresses.
 */
static void
spl_slab_free(spl_kmem_slab_t *sks,
    struct list_head *sks_list, struct list_head *sko_list)
{
	spl_kmem_cache_t *skc;

	ASSERT(sks->sks_magic == SKS_MAGIC);
	ASSERT0(sks->sks_ref);

	skc = sks->sks_cache;
	ASSERT(skc->skc_magic == SKC_MAGIC);

	/*
	 * Update slab/objects counters in the cache, then remove the
	 * slab from the skc->skc_partial_list.  Finally add the slab
	 * and all its objects into the private work lists where the
	 * destructors will be called and the memory freed to the system.
	 */
	skc->skc_obj_total -= sks->sks_objs;
	skc->skc_slab_total--;
	list_del(&sks->sks_list);
	list_add(&sks->sks_list, sks_list);
	list_splice_init(&sks->sks_free_list, sko_list);
}
This is all done outside the347* skc->skc_lock since this allows the destructor to sleep, and348* allows us to perform a conditional reschedule when a freeing a349* large number of objects and slabs back to the system.350*/351352list_for_each_entry_safe(sko, n, &sko_list, sko_list) {353ASSERT(sko->sko_magic == SKO_MAGIC);354}355356list_for_each_entry_safe(sks, m, &sks_list, sks_list) {357ASSERT(sks->sks_magic == SKS_MAGIC);358kv_free(skc, sks, skc->skc_slab_size);359}360}361362static spl_kmem_emergency_t *363spl_emergency_search(struct rb_root *root, void *obj)364{365struct rb_node *node = root->rb_node;366spl_kmem_emergency_t *ske;367unsigned long address = (unsigned long)obj;368369while (node) {370ske = container_of(node, spl_kmem_emergency_t, ske_node);371372if (address < ske->ske_obj)373node = node->rb_left;374else if (address > ske->ske_obj)375node = node->rb_right;376else377return (ske);378}379380return (NULL);381}382383static int384spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)385{386struct rb_node **new = &(root->rb_node), *parent = NULL;387spl_kmem_emergency_t *ske_tmp;388unsigned long address = ske->ske_obj;389390while (*new) {391ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);392393parent = *new;394if (address < ske_tmp->ske_obj)395new = &((*new)->rb_left);396else if (address > ske_tmp->ske_obj)397new = &((*new)->rb_right);398else399return (0);400}401402rb_link_node(&ske->ske_node, parent, new);403rb_insert_color(&ske->ske_node, root);404405return (1);406}407408/*409* Allocate a single emergency object and track it in a red black tree.410*/411static int412spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)413{414gfp_t lflags = kmem_flags_convert(flags);415spl_kmem_emergency_t *ske;416int order = get_order(skc->skc_obj_size);417int empty;418419/* Last chance use a partial slab if one now exists */420spin_lock(&skc->skc_lock);421empty = list_empty(&skc->skc_partial_list);422spin_unlock(&skc->skc_lock);423if (!empty)424return (-EEXIST);425426if (skc->skc_flags & KMC_RECLAIMABLE)427lflags |= __GFP_RECLAIMABLE;428ske = kmalloc(sizeof (*ske), lflags);429if (ske == NULL)430return (-ENOMEM);431432ske->ske_obj = __get_free_pages(lflags, order);433if (ske->ske_obj == 0) {434kfree(ske);435return (-ENOMEM);436}437438spin_lock(&skc->skc_lock);439empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);440if (likely(empty)) {441skc->skc_obj_total++;442skc->skc_obj_emergency++;443if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)444skc->skc_obj_emergency_max = skc->skc_obj_emergency;445}446spin_unlock(&skc->skc_lock);447448if (unlikely(!empty)) {449free_pages(ske->ske_obj, order);450kfree(ske);451return (-EINVAL);452}453454*obj = (void *)ske->ske_obj;455456return (0);457}458459/*460* Locate the passed object in the red black tree and free it.461*/462static int463spl_emergency_free(spl_kmem_cache_t *skc, void *obj)464{465spl_kmem_emergency_t *ske;466int order = get_order(skc->skc_obj_size);467468spin_lock(&skc->skc_lock);469ske = spl_emergency_search(&skc->skc_emergency_tree, obj);470if (ske) {471rb_erase(&ske->ske_node, &skc->skc_emergency_tree);472skc->skc_obj_emergency--;473skc->skc_obj_total--;474}475spin_unlock(&skc->skc_lock);476477if (ske == NULL)478return (-ENOENT);479480free_pages(ske->ske_obj, order);481kfree(ske);482483return (0);484}485486/*487* Release objects from the per-cpu magazine back to their slab. 

/*
 * Release objects from the per-cpu magazine back to their slab.  The flush
 * argument contains the max number of entries to remove from the magazine.
 */
static void
spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
{
	spin_lock(&skc->skc_lock);

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(skm->skm_magic == SKM_MAGIC);

	int count = MIN(flush, skm->skm_avail);
	for (int i = 0; i < count; i++)
		spl_cache_shrink(skc, skm->skm_objs[i]);

	skm->skm_avail -= count;
	memmove(skm->skm_objs, &(skm->skm_objs[count]),
	    sizeof (void *) * skm->skm_avail);

	spin_unlock(&skc->skc_lock);
}

/*
 * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
 * When on-slab we want to target spl_kmem_cache_obj_per_slab.  However,
 * for very small objects we may end up with more than this so as not
 * to waste space in the minimal allocation of a single page.
 */
static int
spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
{
	uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;

	sks_size = spl_sks_size(skc);
	obj_size = spl_obj_size(skc);
	max_size = (spl_kmem_cache_max_size * 1024 * 1024);
	tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);

	if (tgt_size <= max_size) {
		tgt_objs = (tgt_size - sks_size) / obj_size;
	} else {
		tgt_objs = (max_size - sks_size) / obj_size;
		tgt_size = (tgt_objs * obj_size) + sks_size;
	}

	if (tgt_objs == 0)
		return (-ENOSPC);

	*objs = tgt_objs;
	*size = tgt_size;

	return (0);
}

/*
 * Make a guess at reasonable per-cpu magazine size based on the size of
 * each object and the cost of caching N of them in each magazine.  Long
 * term this should really adapt based on an observed usage heuristic.
 */
static int
spl_magazine_size(spl_kmem_cache_t *skc)
{
	uint32_t obj_size = spl_obj_size(skc);
	int size;

	if (spl_kmem_cache_magazine_size > 0)
		return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));

	/* Per-magazine sizes below assume a 4KiB page size */
	if (obj_size > (PAGE_SIZE * 256))
		size = 4;	/* Minimum 4MiB per-magazine */
	else if (obj_size > (PAGE_SIZE * 32))
		size = 16;	/* Minimum 2MiB per-magazine */
	else if (obj_size > (PAGE_SIZE))
		size = 64;	/* Minimum 256KiB per-magazine */
	else if (obj_size > (PAGE_SIZE / 4))
		size = 128;	/* Minimum 128KiB per-magazine */
	else
		size = 256;

	return (size);
}

/*
 * Allocate a per-cpu magazine to associate with a specific core.
 */
static spl_kmem_magazine_t *
spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
{
	spl_kmem_magazine_t *skm;
	int size = sizeof (spl_kmem_magazine_t) +
	    sizeof (void *) * skc->skc_mag_size;

	skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
	if (skm) {
		skm->skm_magic = SKM_MAGIC;
		skm->skm_avail = 0;
		skm->skm_size = skc->skc_mag_size;
		skm->skm_refill = skc->skc_mag_refill;
		skm->skm_cache = skc;
		skm->skm_cpu = cpu;
	}

	return (skm);
}

/*
 * Free a per-cpu magazine associated with a specific core.
 */
static void
spl_magazine_free(spl_kmem_magazine_t *skm)
{
	ASSERT(skm->skm_magic == SKM_MAGIC);
	ASSERT0(skm->skm_avail);
	kfree(skm);
}

/*
 * Create all per-cpu magazines of reasonable sizes.
 */
static int
spl_magazine_create(spl_kmem_cache_t *skc)
{
	int i = 0;

	ASSERT0((skc->skc_flags & KMC_SLAB));

	skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
	    num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
	skc->skc_mag_size = spl_magazine_size(skc);
	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;

	for_each_possible_cpu(i) {
		skc->skc_mag[i] = spl_magazine_alloc(skc, i);
		if (!skc->skc_mag[i]) {
			for (i--; i >= 0; i--)
				spl_magazine_free(skc->skc_mag[i]);

			kfree(skc->skc_mag);
			return (-ENOMEM);
		}
	}

	return (0);
}

/*
 * Destroy all per-cpu magazines.
 */
static void
spl_magazine_destroy(spl_kmem_cache_t *skc)
{
	spl_kmem_magazine_t *skm;
	int i = 0;

	ASSERT0((skc->skc_flags & KMC_SLAB));

	for_each_possible_cpu(i) {
		skm = skc->skc_mag[i];
		spl_cache_flush(skc, skm, skm->skm_avail);
		spl_magazine_free(skm);
	}

	kfree(skc->skc_mag);
}

/*
 * Create an object cache based on the following arguments:
 * name		cache name
 * size		cache object size
 * align	cache object alignment
 * ctor		cache object constructor
 * dtor		cache object destructor
 * reclaim	cache object reclaim
 * priv		cache private data for ctor/dtor/reclaim
 * vmp		unused must be NULL
 * flags
 *	KMC_KVMEM	Force kvmem backed SPL cache
 *	KMC_SLAB	Force Linux slab backed cache
 *	KMC_NODEBUG	Disable debugging (unsupported)
 *	KMC_RECLAIMABLE	Memory can be freed under pressure
 */
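/*
 * Example usage (illustrative sketch only; the cache, object type, and
 * callback names below are hypothetical):
 *
 *	skc = spl_kmem_cache_create("example_cache", sizeof (example_t), 0,
 *	    example_ctor, example_dtor, NULL, NULL, NULL, KMC_KVMEM);
 *	obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
 *	...
 *	spl_kmem_cache_free(skc, obj);
 *	spl_kmem_cache_destroy(skc);
 */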
spl_kmem_cache_t *
spl_kmem_cache_create(const char *name, size_t size, size_t align,
    spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim,
    void *priv, void *vmp, int flags)
{
	gfp_t lflags = kmem_flags_convert(KM_SLEEP);
	spl_kmem_cache_t *skc;
	int rc;

	/*
	 * Unsupported flags
	 */
	ASSERT0P(vmp);
	ASSERT0P(reclaim);

	might_sleep();

	skc = kzalloc(sizeof (*skc), lflags);
	if (skc == NULL)
		return (NULL);

	skc->skc_magic = SKC_MAGIC;
	skc->skc_name_size = strlen(name) + 1;
	skc->skc_name = kmalloc(skc->skc_name_size, lflags);
	if (skc->skc_name == NULL) {
		kfree(skc);
		return (NULL);
	}
	strlcpy(skc->skc_name, name, skc->skc_name_size);

	skc->skc_ctor = ctor;
	skc->skc_dtor = dtor;
	skc->skc_private = priv;
	skc->skc_vmp = vmp;
	skc->skc_linux_cache = NULL;
	skc->skc_flags = flags;
	skc->skc_obj_size = size;
	skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
	atomic_set(&skc->skc_ref, 0);

	INIT_LIST_HEAD(&skc->skc_list);
	INIT_LIST_HEAD(&skc->skc_complete_list);
	INIT_LIST_HEAD(&skc->skc_partial_list);
	skc->skc_emergency_tree = RB_ROOT;
	spin_lock_init(&skc->skc_lock);
	init_waitqueue_head(&skc->skc_waitq);
	skc->skc_slab_fail = 0;
	skc->skc_slab_create = 0;
	skc->skc_slab_destroy = 0;
	skc->skc_slab_total = 0;
	skc->skc_slab_alloc = 0;
	skc->skc_slab_max = 0;
	skc->skc_obj_total = 0;
	skc->skc_obj_alloc = 0;
	skc->skc_obj_max = 0;
	skc->skc_obj_deadlock = 0;
	skc->skc_obj_emergency = 0;
	skc->skc_obj_emergency_max = 0;

	rc = percpu_counter_init(&skc->skc_linux_alloc, 0, GFP_KERNEL);
	if (rc != 0) {
		kfree(skc->skc_name);
		kfree(skc);
		return (NULL);
	}

	/*
	 * Verify the requested alignment restriction is sane.
	 */
	if (align) {
		VERIFY(ISP2(align));
		VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
		VERIFY3U(align, <=, PAGE_SIZE);
		skc->skc_obj_align = align;
	}

	/*
	 * When no specific type of slab is requested (kmem, vmem, or
	 * linuxslab) then select a cache type based on the object size
	 * and default tunables.
	 */
	if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) {
		if (spl_kmem_cache_slab_limit &&
		    size <= (size_t)spl_kmem_cache_slab_limit) {
			/*
			 * Objects smaller than spl_kmem_cache_slab_limit can
			 * use the Linux slab for better space-efficiency.
			 */
			skc->skc_flags |= KMC_SLAB;
		} else {
			/*
			 * All other objects are considered large and are
			 * placed on kvmem backed slabs.
			 */
			skc->skc_flags |= KMC_KVMEM;
		}
	}

	/*
	 * Given the type of slab allocate the required resources.
	 */
	if (skc->skc_flags & KMC_KVMEM) {
		rc = spl_slab_size(skc,
		    &skc->skc_slab_objs, &skc->skc_slab_size);
		if (rc)
			goto out;

		rc = spl_magazine_create(skc);
		if (rc)
			goto out;
	} else {
		unsigned long slabflags = 0;

		if (size > spl_kmem_cache_slab_limit)
			goto out;

		if (skc->skc_flags & KMC_RECLAIMABLE)
			slabflags |= SLAB_RECLAIM_ACCOUNT;

		skc->skc_linux_cache = kmem_cache_create_usercopy(
		    skc->skc_name, size, align, slabflags, 0, size, NULL);
		if (skc->skc_linux_cache == NULL)
			goto out;
	}

	down_write(&spl_kmem_cache_sem);
	list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
	up_write(&spl_kmem_cache_sem);

	return (skc);
out:
	kfree(skc->skc_name);
	percpu_counter_destroy(&skc->skc_linux_alloc);
	kfree(skc);
	return (NULL);
}
EXPORT_SYMBOL(spl_kmem_cache_create);

/*
 * Register a move callback for cache defragmentation.
 * XXX: Unimplemented but harmless to stub out for now.
 */
void
spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
    kmem_cbrc_t (move)(void *, void *, size_t, void *))
{
	ASSERT(move != NULL);
}
EXPORT_SYMBOL(spl_kmem_cache_set_move);

/*
 * Destroy a cache and all objects associated with the cache.
 */
void
spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
{
	DECLARE_WAIT_QUEUE_HEAD(wq);
	taskqid_t id;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB));

	down_write(&spl_kmem_cache_sem);
	list_del_init(&skc->skc_list);
	up_write(&spl_kmem_cache_sem);

	/* Cancel and wait for any pending delayed tasks */
	VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));

	spin_lock(&skc->skc_lock);
	id = skc->skc_taskqid;
	spin_unlock(&skc->skc_lock);

	taskq_cancel_id(spl_kmem_cache_taskq, id);

	/*
	 * Wait until all current callers complete, this is mainly
	 * to catch the case where a low memory situation triggers a
	 * cache reaping action which races with this destroy.
	 */
	wait_event(wq, atomic_read(&skc->skc_ref) == 0);

	if (skc->skc_flags & KMC_KVMEM) {
		spl_magazine_destroy(skc);
		spl_slab_reclaim(skc);
	} else {
		ASSERT(skc->skc_flags & KMC_SLAB);
		kmem_cache_destroy(skc->skc_linux_cache);
	}

	spin_lock(&skc->skc_lock);

	/*
	 * Validate there are no objects in use and free all the
	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
	 */
	ASSERT0(skc->skc_slab_alloc);
	ASSERT0(skc->skc_obj_alloc);
	ASSERT0(skc->skc_slab_total);
	ASSERT0(skc->skc_obj_total);
	ASSERT0(skc->skc_obj_emergency);
	ASSERT(list_empty(&skc->skc_complete_list));

	ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0);
	percpu_counter_destroy(&skc->skc_linux_alloc);

	spin_unlock(&skc->skc_lock);

	kfree(skc->skc_name);
	kfree(skc);
}
EXPORT_SYMBOL(spl_kmem_cache_destroy);

/*
 * Allocate an object from a slab attached to the cache.  This is used to
 * repopulate the per-cpu magazine caches in batches when they run low.
 */
static void *
spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
{
	spl_kmem_obj_t *sko;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(sks->sks_magic == SKS_MAGIC);

	sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
	ASSERT(sko->sko_magic == SKO_MAGIC);
	ASSERT(sko->sko_addr != NULL);

	/* Remove from sks_free_list */
	list_del_init(&sko->sko_list);

	sks->sks_age = jiffies;
	sks->sks_ref++;
	skc->skc_obj_alloc++;

	/* Track max obj usage statistics */
	if (skc->skc_obj_alloc > skc->skc_obj_max)
		skc->skc_obj_max = skc->skc_obj_alloc;

	/* Track max slab usage statistics */
	if (sks->sks_ref == 1) {
		skc->skc_slab_alloc++;

		if (skc->skc_slab_alloc > skc->skc_slab_max)
			skc->skc_slab_max = skc->skc_slab_alloc;
	}

	return (sko->sko_addr);
}

/*
 * Generic slab allocation function to be run by the global work queues.
 * It is responsible for allocating a new slab, linking it in to the list
 * of partial slabs, and then waking any waiters.
 */
static int
__spl_cache_grow(spl_kmem_cache_t *skc, int flags)
{
	spl_kmem_slab_t *sks;

	fstrans_cookie_t cookie = spl_fstrans_mark();
	sks = spl_slab_alloc(skc, flags);
	spl_fstrans_unmark(cookie);

	spin_lock(&skc->skc_lock);
	if (sks) {
		skc->skc_slab_total++;
		skc->skc_obj_total += sks->sks_objs;
		list_add_tail(&sks->sks_list, &skc->skc_partial_list);

		smp_mb__before_atomic();
		clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
		smp_mb__after_atomic();
	}
	spin_unlock(&skc->skc_lock);

	return (sks == NULL ? -ENOMEM : 0);
}

static void
spl_cache_grow_work(void *data)
{
	spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
	spl_kmem_cache_t *skc = ska->ska_cache;

	int error = __spl_cache_grow(skc, ska->ska_flags);

	atomic_dec(&skc->skc_ref);
	smp_mb__before_atomic();
	clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
	smp_mb__after_atomic();
	if (error == 0)
		wake_up_all(&skc->skc_waitq);

	kfree(ska);
}

/*
 * Returns non-zero when a new slab should be available.
 */
static int
spl_cache_grow_wait(spl_kmem_cache_t *skc)
{
	return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
}

/*
 * No available objects on any slabs, create a new slab.  Note that this
 * functionality is disabled for KMC_SLAB caches which are backed by the
 * Linux slab.
 */
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
{
	int remaining, rc = 0;

	ASSERT0(flags & ~KM_PUBLIC_MASK);
	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT0((skc->skc_flags & KMC_SLAB));

	*obj = NULL;

	/*
	 * Since we can't sleep attempt an emergency allocation to satisfy
	 * the request.  The only alternative is to fail the allocation, but
	 * it's preferable to try.  The use of KM_NOSLEEP is expected to be
	 * rare.
	 */
	if (flags & KM_NOSLEEP)
		return (spl_emergency_alloc(skc, flags, obj));

	might_sleep();

	/*
	 * Before allocating a new slab wait for any reaping to complete and
	 * then return so the local magazine can be rechecked for new objects.
	 */
	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
		rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
		    TASK_UNINTERRUPTIBLE);
		return (rc ? rc : -EAGAIN);
	}

	/*
	 * Note: It would be nice to reduce the overhead of context switching
	 * and improve NUMA locality by trying to allocate a new slab in the
	 * current process context with the KM_NOSLEEP flag.
	 *
	 * However, this can't be applied to vmem/kvmem due to a bug that
	 * spl_vmalloc() doesn't honor gfp flags in page table allocation.
	 */

	/*
	 * This is handled by dispatching a work request to the global work
	 * queue.  This allows us to asynchronously allocate a new slab while
	 * retaining the ability to safely fall back to smaller synchronous
	 * allocations to ensure forward progress is always maintained.
	 */
	if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
		spl_kmem_alloc_t *ska;

		ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
		if (ska == NULL) {
			clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
			smp_mb__after_atomic();
			wake_up_all(&skc->skc_waitq);
			return (-ENOMEM);
		}

		atomic_inc(&skc->skc_ref);
		ska->ska_cache = skc;
		ska->ska_flags = flags;
		taskq_init_ent(&ska->ska_tqe);
		taskq_dispatch_ent(spl_kmem_cache_taskq,
		    spl_cache_grow_work, ska, 0, &ska->ska_tqe);
	}

	/*
	 * The goal here is to only detect the rare case where a virtual slab
	 * allocation has deadlocked.  We must be careful to minimize the use
	 * of emergency objects which are more expensive to track.  Therefore,
	 * we set a very long timeout for the asynchronous allocation and if
	 * the timeout is reached the cache is flagged as deadlocked.  From
	 * this point only new emergency objects will be allocated until the
	 * asynchronous allocation completes and clears the deadlocked flag.
	 */
	if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
		rc = spl_emergency_alloc(skc, flags, obj);
	} else {
		remaining = wait_event_timeout(skc->skc_waitq,
		    spl_cache_grow_wait(skc), HZ / 10);

		if (!remaining) {
			spin_lock(&skc->skc_lock);
			if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
				set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
				skc->skc_obj_deadlock++;
			}
			spin_unlock(&skc->skc_lock);
		}

		rc = -ENOMEM;
	}

	return (rc);
}

/*
 * Refill a per-cpu magazine with objects from the slabs for this cache.
 * Ideally the magazine can be repopulated using existing objects which have
 * been released, however if we are unable to locate enough free objects new
 * slabs of objects will be created.  On success NULL is returned, otherwise
 * the address of a single emergency object is returned for use by the caller.
 */
static void *
spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
{
	spl_kmem_slab_t *sks;
	int count = 0, rc, refill;
	void *obj = NULL;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(skm->skm_magic == SKM_MAGIC);

	refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
	spin_lock(&skc->skc_lock);

	while (refill > 0) {
		/* No slabs available we may need to grow the cache */
		if (list_empty(&skc->skc_partial_list)) {
			spin_unlock(&skc->skc_lock);

			local_irq_enable();
			rc = spl_cache_grow(skc, flags, &obj);
			local_irq_disable();

			/* Emergency object for immediate use by caller */
			if (rc == 0 && obj != NULL)
				return (obj);

			if (rc)
				goto out;

			/* Rescheduled to different CPU skm is not local */
			if (skm != skc->skc_mag[smp_processor_id()])
				goto out;

			/*
			 * Potentially rescheduled to the same CPU but
			 * allocations may have occurred from this CPU while
			 * we were sleeping so recalculate max refill.
			 */
			refill = MIN(refill, skm->skm_size - skm->skm_avail);

			spin_lock(&skc->skc_lock);
			continue;
		}

		/* Grab the next available slab */
		sks = list_entry((&skc->skc_partial_list)->next,
		    spl_kmem_slab_t, sks_list);
		ASSERT(sks->sks_magic == SKS_MAGIC);
		ASSERT(sks->sks_ref < sks->sks_objs);
		ASSERT(!list_empty(&sks->sks_free_list));

		/*
		 * Consume as many objects as needed to refill the requested
		 * cache.  We must also be careful not to overfill it.
		 */
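		/*
		 * Note: the "++count" below never terminates this loop (it is
		 * always non-zero); it only tallies how many objects have been
		 * moved into the magazine during this refill pass.
		 */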
		while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
		    ++count) {
			ASSERT(skm->skm_avail < skm->skm_size);
			ASSERT(count < skm->skm_size);
			skm->skm_objs[skm->skm_avail++] =
			    spl_cache_obj(skc, sks);
		}

		/* Move slab to skc_complete_list when full */
		if (sks->sks_ref == sks->sks_objs) {
			list_del(&sks->sks_list);
			list_add(&sks->sks_list, &skc->skc_complete_list);
		}
	}

	spin_unlock(&skc->skc_lock);
out:
	return (NULL);
}

/*
 * Release an object back to the slab from which it came.
 */
static void
spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
{
	spl_kmem_slab_t *sks = NULL;
	spl_kmem_obj_t *sko = NULL;

	ASSERT(skc->skc_magic == SKC_MAGIC);

	sko = spl_sko_from_obj(skc, obj);
	ASSERT(sko->sko_magic == SKO_MAGIC);
	sks = sko->sko_slab;
	ASSERT(sks->sks_magic == SKS_MAGIC);
	ASSERT(sks->sks_cache == skc);
	list_add(&sko->sko_list, &sks->sks_free_list);

	sks->sks_age = jiffies;
	sks->sks_ref--;
	skc->skc_obj_alloc--;

	/*
	 * Move slab to skc_partial_list when no longer full.  Slabs
	 * are added to the head to keep the partial list in quasi-full
	 * sorted order.  Fuller at the head, emptier at the tail.
	 */
	if (sks->sks_ref == (sks->sks_objs - 1)) {
		list_del(&sks->sks_list);
		list_add(&sks->sks_list, &skc->skc_partial_list);
	}

	/*
	 * Move empty slabs to the end of the partial list so
	 * they can be easily found and freed during reclamation.
	 */
	if (sks->sks_ref == 0) {
		list_del(&sks->sks_list);
		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
		skc->skc_slab_alloc--;
	}
}

/*
 * Allocate an object from the per-cpu magazine, or if the magazine
 * is empty directly allocate from a slab and repopulate the magazine.
 */
void *
spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
{
	spl_kmem_magazine_t *skm;
	void *obj = NULL;

	ASSERT0(flags & ~KM_PUBLIC_MASK);
	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));

	/*
	 * Allocate directly from a Linux slab.  All optimizations are left
	 * to the underlying cache; we only need to guarantee that KM_SLEEP
	 * callers will never fail.
	 */
	if (skc->skc_flags & KMC_SLAB) {
		struct kmem_cache *slc = skc->skc_linux_cache;
		do {
			obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
		} while ((obj == NULL) && !(flags & KM_NOSLEEP));

		if (obj != NULL) {
			/*
			 * Even though we leave everything up to the
			 * underlying cache we still keep track of
			 * how many objects we've allocated in it for
			 * better debuggability.
			 */
			percpu_counter_inc(&skc->skc_linux_alloc);
		}
		goto ret;
	}

	local_irq_disable();

restart:
	/*
	 * Safe to update per-cpu structure without lock, but
	 * in the restart case we must be careful to reacquire
	 * the local magazine since this may have changed
	 * when we need to grow the cache.
	 */
	skm = skc->skc_mag[smp_processor_id()];
	ASSERT(skm->skm_magic == SKM_MAGIC);

	if (likely(skm->skm_avail)) {
		/* Object available in CPU cache, use it */
		obj = skm->skm_objs[--skm->skm_avail];
	} else {
		obj = spl_cache_refill(skc, skm, flags);
		if ((obj == NULL) && !(flags & KM_NOSLEEP))
			goto restart;

		local_irq_enable();
		goto ret;
	}

	local_irq_enable();
	ASSERT(obj);
	ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));

ret:
	/* Pre-emptively migrate object to CPU L1 cache */
	if (obj) {
		if (obj && skc->skc_ctor)
			skc->skc_ctor(obj, skc->skc_private, flags);
		else
			prefetchw(obj);
	}

	return (obj);
}
EXPORT_SYMBOL(spl_kmem_cache_alloc);

/*
 * Free an object back to the local per-cpu magazine; there is no
 * guarantee that this is the same magazine the object was originally
 * allocated from.  We may need to flush entire magazines back to the
 * slabs to make space.
 */
void
spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
{
	spl_kmem_magazine_t *skm;
	unsigned long flags;
	int do_reclaim = 0;
	int do_emergency = 0;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));

	/*
	 * Run the destructor
	 */
	if (skc->skc_dtor)
		skc->skc_dtor(obj, skc->skc_private);

	/*
	 * Free the object back to the underlying Linux slab.
	 */
	if (skc->skc_flags & KMC_SLAB) {
		kmem_cache_free(skc->skc_linux_cache, obj);
		percpu_counter_dec(&skc->skc_linux_alloc);
		return;
	}

	/*
	 * While a cache has outstanding emergency objects all freed objects
	 * must be checked.  However, since emergency objects will never use
	 * a virtual address these objects can be safely excluded as an
	 * optimization.
	 */
	if (!is_vmalloc_addr(obj)) {
		spin_lock(&skc->skc_lock);
		do_emergency = (skc->skc_obj_emergency > 0);
		spin_unlock(&skc->skc_lock);

		if (do_emergency && (spl_emergency_free(skc, obj) == 0))
			return;
	}

	local_irq_save(flags);

	/*
	 * Safe to update per-cpu structure without lock, but because
	 * no remote memory allocation tracking is performed it is
	 * entirely possible to allocate an object from one CPU cache
	 * and return it to another.
	 */
	skm = skc->skc_mag[smp_processor_id()];
	ASSERT(skm->skm_magic == SKM_MAGIC);

	/*
	 * Per-CPU cache full, flush it to make space for this object,
	 * this may result in an empty slab which can be reclaimed once
	 * interrupts are re-enabled.
	 */
	if (unlikely(skm->skm_avail >= skm->skm_size)) {
		spl_cache_flush(skc, skm, skm->skm_refill);
		do_reclaim = 1;
	}

	/* Available space in cache, use it */
	skm->skm_objs[skm->skm_avail++] = obj;

	local_irq_restore(flags);

	if (do_reclaim)
		spl_slab_reclaim(skc);
}
EXPORT_SYMBOL(spl_kmem_cache_free);

/*
 * Depending on how many and which objects are released it may simply
 * repopulate the local magazine which will then need to age-out.  Objects
 * which cannot fit in the magazine will be released back to their slabs
 * which will also need to age out before being released.  This is all just
 * best effort and we do not want to thrash creating and destroying slabs.
 */
void
spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
{
	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));

	if (skc->skc_flags & KMC_SLAB)
		return;

	atomic_inc(&skc->skc_ref);

	/*
	 * Prevent concurrent cache reaping when contended.
	 */
	if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
		goto out;

	/* Reclaim from the magazine and free all now empty slabs. */
	unsigned long irq_flags;
	local_irq_save(irq_flags);
	spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
	spl_cache_flush(skc, skm, skm->skm_avail);
	local_irq_restore(irq_flags);

	spl_slab_reclaim(skc);
	clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags);
	smp_mb__after_atomic();
	wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
out:
	atomic_dec(&skc->skc_ref);
}
EXPORT_SYMBOL(spl_kmem_cache_reap_now);

/*
 * This is stubbed out for code consistency with other platforms.  There
 * is existing logic to prevent concurrent reaping so while this is ugly
 * it should do no harm.
 */
int
spl_kmem_cache_reap_active(void)
{
	return (0);
}
EXPORT_SYMBOL(spl_kmem_cache_reap_active);

/*
 * Reap all free slabs from all registered caches.
 */
void
spl_kmem_reap(void)
{
	spl_kmem_cache_t *skc = NULL;

	down_read(&spl_kmem_cache_sem);
	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
		spl_kmem_cache_reap_now(skc);
	}
	up_read(&spl_kmem_cache_sem);
}
EXPORT_SYMBOL(spl_kmem_reap);

int
spl_kmem_cache_init(void)
{
	init_rwsem(&spl_kmem_cache_sem);
	INIT_LIST_HEAD(&spl_kmem_cache_list);
	spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
	    spl_kmem_cache_kmem_threads, maxclsyspri,
	    spl_kmem_cache_kmem_threads * 8, INT_MAX,
	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);

	if (spl_kmem_cache_taskq == NULL)
		return (-ENOMEM);

	return (0);
}

void
spl_kmem_cache_fini(void)
{
	taskq_destroy(spl_kmem_cache_taskq);
}