Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
48775 views
1
// SPDX-License-Identifier: BSD-2-Clause
2
/*
3
* Copyright (c) 2021 Klara Systems, Inc.
4
* All rights reserved.
5
*
6
* Redistribution and use in source and binary forms, with or without
7
* modification, are permitted provided that the following conditions
8
* are met:
9
* 1. Redistributions of source code must retain the above copyright
10
* notice, this list of conditions and the following disclaimer.
11
* 2. Redistributions in binary form must reproduce the above copyright
12
* notice, this list of conditions and the following disclaimer in the
13
* documentation and/or other materials provided with the distribution.
14
*
15
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25
* SUCH DAMAGE.
26
*/
27
28
/*
29
* Copyright (c) 2025, Rob Norris <[email protected]>
30
*/
31
32
#include <sys/types.h>
33
#include <sys/sysmacros.h>
34
#include <sys/kmem.h>
35
#include <linux/file.h>
36
#include <linux/magic.h>
37
#include <sys/zone.h>
38
#include <sys/string.h>
39
40
#if defined(CONFIG_USER_NS)
41
#include <linux/statfs.h>
42
#include <linux/proc_ns.h>
43
#endif
44
45
#include <sys/mutex.h>
46
47
static kmutex_t zone_datasets_lock;
48
static struct list_head zone_datasets;
49
50
typedef struct zone_datasets {
51
struct list_head zds_list; /* zone_datasets linkage */
52
struct user_namespace *zds_userns; /* namespace reference */
53
struct list_head zds_datasets; /* datasets for the namespace */
54
} zone_datasets_t;
55
56
typedef struct zone_dataset {
57
struct list_head zd_list; /* zone_dataset linkage */
58
size_t zd_dsnamelen; /* length of name */
59
char zd_dsname[]; /* name of the member dataset */
60
} zone_dataset_t;
61
62
#ifdef CONFIG_USER_NS
63
64
/*
 * Linux 6.18 moved the generic namespace type away from ns->ops->type onto
 * ns_common itself.
 *
 * ns_is_newuser(ns) is true when the namespace type of the given ns_common
 * is CLONE_NEWUSER.  The fallback variant evaluates its argument more than
 * once, so do not pass an expression with side effects.
 */
#ifdef HAVE_NS_COMMON_TYPE
#define	ns_is_newuser(ns)	\
	((ns)->ns_type == CLONE_NEWUSER)
#else
#define	ns_is_newuser(ns)	\
	((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER)
#endif
75
76
/*
77
* Returns:
78
* - 0 on success
79
* - EBADF if it cannot open the provided file descriptor
80
* - ENOTTY if the file itself is a not a user namespace file. We want to
81
* intercept this error in the ZFS layer. We cannot just return one of the
82
* ZFS_ERR_* errors here as we want to preserve the seperation of the ZFS
83
* and the SPL layers.
84
*/
85
static int
86
user_ns_get(int fd, struct user_namespace **userns)
87
{
88
struct kstatfs st;
89
struct file *nsfile;
90
struct ns_common *ns;
91
int error;
92
93
if ((nsfile = fget(fd)) == NULL)
94
return (EBADF);
95
if (vfs_statfs(&nsfile->f_path, &st) != 0) {
96
error = ENOTTY;
97
goto done;
98
}
99
if (st.f_type != NSFS_MAGIC) {
100
error = ENOTTY;
101
goto done;
102
}
103
ns = get_proc_ns(file_inode(nsfile));
104
if (!ns_is_newuser(ns)) {
105
error = ENOTTY;
106
goto done;
107
}
108
*userns = container_of(ns, struct user_namespace, ns);
109
110
error = 0;
111
done:
112
fput(nsfile);
113
114
return (error);
115
}
116
#endif /* CONFIG_USER_NS */
117
118
static unsigned int
119
user_ns_zoneid(struct user_namespace *user_ns)
120
{
121
unsigned int r;
122
123
r = user_ns->ns.inum;
124
125
return (r);
126
}
127
128
static struct zone_datasets *
129
zone_datasets_lookup(unsigned int nsinum)
130
{
131
zone_datasets_t *zds;
132
133
list_for_each_entry(zds, &zone_datasets, zds_list) {
134
if (user_ns_zoneid(zds->zds_userns) == nsinum)
135
return (zds);
136
}
137
return (NULL);
138
}
139
140
#ifdef CONFIG_USER_NS
141
static struct zone_dataset *
142
zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen)
143
{
144
zone_dataset_t *zd;
145
146
list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
147
if (zd->zd_dsnamelen != dsnamelen)
148
continue;
149
if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0)
150
return (zd);
151
}
152
153
return (NULL);
154
}
155
156
static int
157
zone_dataset_cred_check(cred_t *cred)
158
{
159
160
if (!uid_eq(cred->uid, GLOBAL_ROOT_UID))
161
return (EPERM);
162
163
return (0);
164
}
165
#endif /* CONFIG_USER_NS */
166
167
/*
 * Validate a dataset name and compute its effective length.
 *
 * Returns ENOENT if dataset is NULL, empty, or begins with '/'.  Otherwise
 * returns 0 and stores in *dsnamelen the length of the name, not counting a
 * single trailing '/' if one was supplied.  *dsnamelen is not modified on
 * failure.
 */
static int
zone_dataset_name_check(const char *dataset, size_t *dsnamelen)
{
	size_t len;

	if (dataset == NULL || dataset[0] == '\0' || dataset[0] == '/')
		return (ENOENT);

	/* The name is non-empty here, so len >= 1 and len - 1 is safe. */
	len = strlen(dataset);
	/* Ignore trailing slash, if supplied. */
	if (dataset[len - 1] == '/')
		len--;

	*dsnamelen = len;
	return (0);
}
181
182
int
183
zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd)
184
{
185
#ifdef CONFIG_USER_NS
186
struct user_namespace *userns;
187
zone_datasets_t *zds;
188
zone_dataset_t *zd;
189
int error;
190
size_t dsnamelen;
191
192
if ((error = zone_dataset_cred_check(cred)) != 0)
193
return (error);
194
if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
195
return (error);
196
if ((error = user_ns_get(userns_fd, &userns)) != 0)
197
return (error);
198
199
mutex_enter(&zone_datasets_lock);
200
zds = zone_datasets_lookup(user_ns_zoneid(userns));
201
if (zds == NULL) {
202
zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP);
203
INIT_LIST_HEAD(&zds->zds_list);
204
INIT_LIST_HEAD(&zds->zds_datasets);
205
zds->zds_userns = userns;
206
/*
207
* Lock the namespace by incresing its refcount to prevent
208
* the namespace ID from being reused.
209
*/
210
get_user_ns(userns);
211
list_add_tail(&zds->zds_list, &zone_datasets);
212
} else {
213
zd = zone_dataset_lookup(zds, dataset, dsnamelen);
214
if (zd != NULL) {
215
mutex_exit(&zone_datasets_lock);
216
return (EEXIST);
217
}
218
}
219
220
zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP);
221
zd->zd_dsnamelen = dsnamelen;
222
strlcpy(zd->zd_dsname, dataset, dsnamelen + 1);
223
INIT_LIST_HEAD(&zd->zd_list);
224
list_add_tail(&zd->zd_list, &zds->zds_datasets);
225
226
mutex_exit(&zone_datasets_lock);
227
return (0);
228
#else
229
return (ENXIO);
230
#endif /* CONFIG_USER_NS */
231
}
232
EXPORT_SYMBOL(zone_dataset_attach);
233
234
int
235
zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd)
236
{
237
#ifdef CONFIG_USER_NS
238
struct user_namespace *userns;
239
zone_datasets_t *zds;
240
zone_dataset_t *zd;
241
int error;
242
size_t dsnamelen;
243
244
if ((error = zone_dataset_cred_check(cred)) != 0)
245
return (error);
246
if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
247
return (error);
248
if ((error = user_ns_get(userns_fd, &userns)) != 0)
249
return (error);
250
251
mutex_enter(&zone_datasets_lock);
252
zds = zone_datasets_lookup(user_ns_zoneid(userns));
253
if (zds != NULL)
254
zd = zone_dataset_lookup(zds, dataset, dsnamelen);
255
if (zds == NULL || zd == NULL) {
256
mutex_exit(&zone_datasets_lock);
257
return (ENOENT);
258
}
259
260
list_del(&zd->zd_list);
261
kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
262
263
/* Prune the namespace entry if it has no more delegations. */
264
if (list_empty(&zds->zds_datasets)) {
265
/*
266
* Decrease the refcount now that the namespace is no longer
267
* used. It is no longer necessary to prevent the namespace ID
268
* from being reused.
269
*/
270
put_user_ns(userns);
271
list_del(&zds->zds_list);
272
kmem_free(zds, sizeof (*zds));
273
}
274
275
mutex_exit(&zone_datasets_lock);
276
return (0);
277
#else
278
return (ENXIO);
279
#endif /* CONFIG_USER_NS */
280
}
281
EXPORT_SYMBOL(zone_dataset_detach);
282
283
/*
284
* A dataset is visible if:
285
* - It is a parent of a namespace entry.
286
* - It is one of the namespace entries.
287
* - It is a child of a namespace entry.
288
*
289
* A dataset is writable if:
290
* - It is one of the namespace entries.
291
* - It is a child of a namespace entry.
292
*
293
* The parent datasets of namespace entries are visible and
294
* read-only to provide a path back to the root of the pool.
295
*/
296
int
297
zone_dataset_visible(const char *dataset, int *write)
298
{
299
zone_datasets_t *zds;
300
zone_dataset_t *zd;
301
size_t dsnamelen, zd_len;
302
int visible;
303
304
/* Default to read-only, in case visible is returned. */
305
if (write != NULL)
306
*write = 0;
307
if (zone_dataset_name_check(dataset, &dsnamelen) != 0)
308
return (0);
309
if (INGLOBALZONE(curproc)) {
310
if (write != NULL)
311
*write = 1;
312
return (1);
313
}
314
315
mutex_enter(&zone_datasets_lock);
316
zds = zone_datasets_lookup(crgetzoneid(curproc->cred));
317
if (zds == NULL) {
318
mutex_exit(&zone_datasets_lock);
319
return (0);
320
}
321
322
visible = 0;
323
list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
324
zd_len = strlen(zd->zd_dsname);
325
if (zd_len > dsnamelen) {
326
/*
327
* The name of the namespace entry is longer than that
328
* of the dataset, so it could be that the dataset is a
329
* parent of the namespace entry.
330
*/
331
visible = memcmp(zd->zd_dsname, dataset,
332
dsnamelen) == 0 &&
333
zd->zd_dsname[dsnamelen] == '/';
334
if (visible)
335
break;
336
} else if (zd_len == dsnamelen) {
337
/*
338
* The name of the namespace entry is as long as that
339
* of the dataset, so perhaps the dataset itself is the
340
* namespace entry.
341
*/
342
visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0;
343
if (visible) {
344
if (write != NULL)
345
*write = 1;
346
break;
347
}
348
} else {
349
/*
350
* The name of the namespace entry is shorter than that
351
* of the dataset, so perhaps the dataset is a child of
352
* the namespace entry.
353
*/
354
visible = memcmp(zd->zd_dsname, dataset,
355
zd_len) == 0 && dataset[zd_len] == '/';
356
if (visible) {
357
if (write != NULL)
358
*write = 1;
359
break;
360
}
361
}
362
}
363
364
mutex_exit(&zone_datasets_lock);
365
return (visible);
366
}
367
EXPORT_SYMBOL(zone_dataset_visible);
368
369
/*
 * Return the zone ID of the global zone: the zone ID of init_user_ns when
 * CONFIG_USER_NS is defined, and 0 otherwise.
 */
unsigned int
global_zoneid(void)
{
	unsigned int z = 0;

#if defined(CONFIG_USER_NS)
	z = user_ns_zoneid(&init_user_ns);
#endif

	return (z);
}
EXPORT_SYMBOL(global_zoneid);
381
382
unsigned int
383
crgetzoneid(const cred_t *cr)
384
{
385
unsigned int r = 0;
386
387
#if defined(CONFIG_USER_NS)
388
r = user_ns_zoneid(cr->user_ns);
389
#endif
390
391
return (r);
392
}
393
EXPORT_SYMBOL(crgetzoneid);
394
395
boolean_t
396
inglobalzone(proc_t *proc)
397
{
398
#if defined(CONFIG_USER_NS)
399
return (proc->cred->user_ns == &init_user_ns);
400
#else
401
return (B_TRUE);
402
#endif
403
}
404
EXPORT_SYMBOL(inglobalzone);
405
406
int
407
spl_zone_init(void)
408
{
409
mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL);
410
INIT_LIST_HEAD(&zone_datasets);
411
return (0);
412
}
413
414
void
415
spl_zone_fini(void)
416
{
417
zone_datasets_t *zds;
418
zone_dataset_t *zd;
419
420
/*
421
* It would be better to assert an empty zone_datasets, but since
422
* there's no automatic mechanism for cleaning them up if the user
423
* namespace is destroyed, just do it here, since spl is about to go
424
* out of context.
425
*/
426
while (!list_empty(&zone_datasets)) {
427
zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list);
428
while (!list_empty(&zds->zds_datasets)) {
429
zd = list_entry(zds->zds_datasets.next,
430
zone_dataset_t, zd_list);
431
list_del(&zd->zd_list);
432
kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
433
}
434
put_user_ns(zds->zds_userns);
435
list_del(&zds->zds_list);
436
kmem_free(zds, sizeof (*zds));
437
}
438
mutex_destroy(&zone_datasets_lock);
439
}
440
441