Path: blob/main/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
48775 views
// SPDX-License-Identifier: BSD-2-Clause1/*2* Copyright (c) 2021 Klara Systems, Inc.3* All rights reserved.4*5* Redistribution and use in source and binary forms, with or without6* modification, are permitted provided that the following conditions7* are met:8* 1. Redistributions of source code must retain the above copyright9* notice, this list of conditions and the following disclaimer.10* 2. Redistributions in binary form must reproduce the above copyright11* notice, this list of conditions and the following disclaimer in the12* documentation and/or other materials provided with the distribution.13*14* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND15* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE16* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE17* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE18* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL19* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS20* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)21* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT22* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY23* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF24* SUCH DAMAGE.25*/2627/*28* Copyright (c) 2025, Rob Norris <[email protected]>29*/3031#include <sys/types.h>32#include <sys/sysmacros.h>33#include <sys/kmem.h>34#include <linux/file.h>35#include <linux/magic.h>36#include <sys/zone.h>37#include <sys/string.h>3839#if defined(CONFIG_USER_NS)40#include <linux/statfs.h>41#include <linux/proc_ns.h>42#endif4344#include <sys/mutex.h>4546static kmutex_t zone_datasets_lock;47static struct list_head zone_datasets;4849typedef struct zone_datasets {50struct list_head zds_list; /* zone_datasets linkage */51struct user_namespace *zds_userns; /* namespace reference */52struct list_head zds_datasets; /* datasets for the namespace */53} zone_datasets_t;5455typedef struct zone_dataset {56struct list_head zd_list; /* zone_dataset linkage */57size_t zd_dsnamelen; /* length of name */58char zd_dsname[]; /* name of the member dataset */59} zone_dataset_t;6061#ifdef CONFIG_USER_NS6263/*64* Linux 6.18 moved the generic namespace type away from ns->ops->type onto65* ns_common itself.66*/67#ifdef HAVE_NS_COMMON_TYPE68#define ns_is_newuser(ns) \69((ns)->ns_type == CLONE_NEWUSER)70#else71#define ns_is_newuser(ns) \72((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER)73#endif7475/*76* Returns:77* - 0 on success78* - EBADF if it cannot open the provided file descriptor79* - ENOTTY if the file itself is a not a user namespace file. We want to80* intercept this error in the ZFS layer. We cannot just return one of the81* ZFS_ERR_* errors here as we want to preserve the seperation of the ZFS82* and the SPL layers.83*/84static int85user_ns_get(int fd, struct user_namespace **userns)86{87struct kstatfs st;88struct file *nsfile;89struct ns_common *ns;90int error;9192if ((nsfile = fget(fd)) == NULL)93return (EBADF);94if (vfs_statfs(&nsfile->f_path, &st) != 0) {95error = ENOTTY;96goto done;97}98if (st.f_type != NSFS_MAGIC) {99error = ENOTTY;100goto done;101}102ns = get_proc_ns(file_inode(nsfile));103if (!ns_is_newuser(ns)) {104error = ENOTTY;105goto done;106}107*userns = container_of(ns, struct user_namespace, ns);108109error = 0;110done:111fput(nsfile);112113return (error);114}115#endif /* CONFIG_USER_NS */116117static unsigned int118user_ns_zoneid(struct user_namespace *user_ns)119{120unsigned int r;121122r = user_ns->ns.inum;123124return (r);125}126127static struct zone_datasets *128zone_datasets_lookup(unsigned int nsinum)129{130zone_datasets_t *zds;131132list_for_each_entry(zds, &zone_datasets, zds_list) {133if (user_ns_zoneid(zds->zds_userns) == nsinum)134return (zds);135}136return (NULL);137}138139#ifdef CONFIG_USER_NS140static struct zone_dataset *141zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen)142{143zone_dataset_t *zd;144145list_for_each_entry(zd, &zds->zds_datasets, zd_list) {146if (zd->zd_dsnamelen != dsnamelen)147continue;148if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0)149return (zd);150}151152return (NULL);153}154155static int156zone_dataset_cred_check(cred_t *cred)157{158159if (!uid_eq(cred->uid, GLOBAL_ROOT_UID))160return (EPERM);161162return (0);163}164#endif /* CONFIG_USER_NS */165166static int167zone_dataset_name_check(const char *dataset, size_t *dsnamelen)168{169170if (dataset[0] == '\0' || dataset[0] == '/')171return (ENOENT);172173*dsnamelen = strlen(dataset);174/* Ignore trailing slash, if supplied. */175if (dataset[*dsnamelen - 1] == '/')176(*dsnamelen)--;177178return (0);179}180181int182zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd)183{184#ifdef CONFIG_USER_NS185struct user_namespace *userns;186zone_datasets_t *zds;187zone_dataset_t *zd;188int error;189size_t dsnamelen;190191if ((error = zone_dataset_cred_check(cred)) != 0)192return (error);193if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)194return (error);195if ((error = user_ns_get(userns_fd, &userns)) != 0)196return (error);197198mutex_enter(&zone_datasets_lock);199zds = zone_datasets_lookup(user_ns_zoneid(userns));200if (zds == NULL) {201zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP);202INIT_LIST_HEAD(&zds->zds_list);203INIT_LIST_HEAD(&zds->zds_datasets);204zds->zds_userns = userns;205/*206* Lock the namespace by incresing its refcount to prevent207* the namespace ID from being reused.208*/209get_user_ns(userns);210list_add_tail(&zds->zds_list, &zone_datasets);211} else {212zd = zone_dataset_lookup(zds, dataset, dsnamelen);213if (zd != NULL) {214mutex_exit(&zone_datasets_lock);215return (EEXIST);216}217}218219zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP);220zd->zd_dsnamelen = dsnamelen;221strlcpy(zd->zd_dsname, dataset, dsnamelen + 1);222INIT_LIST_HEAD(&zd->zd_list);223list_add_tail(&zd->zd_list, &zds->zds_datasets);224225mutex_exit(&zone_datasets_lock);226return (0);227#else228return (ENXIO);229#endif /* CONFIG_USER_NS */230}231EXPORT_SYMBOL(zone_dataset_attach);232233int234zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd)235{236#ifdef CONFIG_USER_NS237struct user_namespace *userns;238zone_datasets_t *zds;239zone_dataset_t *zd;240int error;241size_t dsnamelen;242243if ((error = zone_dataset_cred_check(cred)) != 0)244return (error);245if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)246return (error);247if ((error = user_ns_get(userns_fd, &userns)) != 0)248return (error);249250mutex_enter(&zone_datasets_lock);251zds = zone_datasets_lookup(user_ns_zoneid(userns));252if (zds != NULL)253zd = zone_dataset_lookup(zds, dataset, dsnamelen);254if (zds == NULL || zd == NULL) {255mutex_exit(&zone_datasets_lock);256return (ENOENT);257}258259list_del(&zd->zd_list);260kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);261262/* Prune the namespace entry if it has no more delegations. */263if (list_empty(&zds->zds_datasets)) {264/*265* Decrease the refcount now that the namespace is no longer266* used. It is no longer necessary to prevent the namespace ID267* from being reused.268*/269put_user_ns(userns);270list_del(&zds->zds_list);271kmem_free(zds, sizeof (*zds));272}273274mutex_exit(&zone_datasets_lock);275return (0);276#else277return (ENXIO);278#endif /* CONFIG_USER_NS */279}280EXPORT_SYMBOL(zone_dataset_detach);281282/*283* A dataset is visible if:284* - It is a parent of a namespace entry.285* - It is one of the namespace entries.286* - It is a child of a namespace entry.287*288* A dataset is writable if:289* - It is one of the namespace entries.290* - It is a child of a namespace entry.291*292* The parent datasets of namespace entries are visible and293* read-only to provide a path back to the root of the pool.294*/295int296zone_dataset_visible(const char *dataset, int *write)297{298zone_datasets_t *zds;299zone_dataset_t *zd;300size_t dsnamelen, zd_len;301int visible;302303/* Default to read-only, in case visible is returned. */304if (write != NULL)305*write = 0;306if (zone_dataset_name_check(dataset, &dsnamelen) != 0)307return (0);308if (INGLOBALZONE(curproc)) {309if (write != NULL)310*write = 1;311return (1);312}313314mutex_enter(&zone_datasets_lock);315zds = zone_datasets_lookup(crgetzoneid(curproc->cred));316if (zds == NULL) {317mutex_exit(&zone_datasets_lock);318return (0);319}320321visible = 0;322list_for_each_entry(zd, &zds->zds_datasets, zd_list) {323zd_len = strlen(zd->zd_dsname);324if (zd_len > dsnamelen) {325/*326* The name of the namespace entry is longer than that327* of the dataset, so it could be that the dataset is a328* parent of the namespace entry.329*/330visible = memcmp(zd->zd_dsname, dataset,331dsnamelen) == 0 &&332zd->zd_dsname[dsnamelen] == '/';333if (visible)334break;335} else if (zd_len == dsnamelen) {336/*337* The name of the namespace entry is as long as that338* of the dataset, so perhaps the dataset itself is the339* namespace entry.340*/341visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0;342if (visible) {343if (write != NULL)344*write = 1;345break;346}347} else {348/*349* The name of the namespace entry is shorter than that350* of the dataset, so perhaps the dataset is a child of351* the namespace entry.352*/353visible = memcmp(zd->zd_dsname, dataset,354zd_len) == 0 && dataset[zd_len] == '/';355if (visible) {356if (write != NULL)357*write = 1;358break;359}360}361}362363mutex_exit(&zone_datasets_lock);364return (visible);365}366EXPORT_SYMBOL(zone_dataset_visible);367368unsigned int369global_zoneid(void)370{371unsigned int z = 0;372373#if defined(CONFIG_USER_NS)374z = user_ns_zoneid(&init_user_ns);375#endif376377return (z);378}379EXPORT_SYMBOL(global_zoneid);380381unsigned int382crgetzoneid(const cred_t *cr)383{384unsigned int r = 0;385386#if defined(CONFIG_USER_NS)387r = user_ns_zoneid(cr->user_ns);388#endif389390return (r);391}392EXPORT_SYMBOL(crgetzoneid);393394boolean_t395inglobalzone(proc_t *proc)396{397#if defined(CONFIG_USER_NS)398return (proc->cred->user_ns == &init_user_ns);399#else400return (B_TRUE);401#endif402}403EXPORT_SYMBOL(inglobalzone);404405int406spl_zone_init(void)407{408mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL);409INIT_LIST_HEAD(&zone_datasets);410return (0);411}412413void414spl_zone_fini(void)415{416zone_datasets_t *zds;417zone_dataset_t *zd;418419/*420* It would be better to assert an empty zone_datasets, but since421* there's no automatic mechanism for cleaning them up if the user422* namespace is destroyed, just do it here, since spl is about to go423* out of context.424*/425while (!list_empty(&zone_datasets)) {426zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list);427while (!list_empty(&zds->zds_datasets)) {428zd = list_entry(zds->zds_datasets.next,429zone_dataset_t, zd_list);430list_del(&zd->zd_list);431kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);432}433put_user_ns(zds->zds_userns);434list_del(&zds->zds_list);435kmem_free(zds, sizeof (*zds));436}437mutex_destroy(&zone_datasets_lock);438}439440441