Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/lib/libzpool/kernel.c
48375 views
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
* CDDL HEADER START
4
*
5
* The contents of this file are subject to the terms of the
6
* Common Development and Distribution License (the "License").
7
* You may not use this file except in compliance with the License.
8
*
9
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10
* or https://opensource.org/licenses/CDDL-1.0.
11
* See the License for the specific language governing permissions
12
* and limitations under the License.
13
*
14
* When distributing Covered Code, include this CDDL HEADER in each
15
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16
* If applicable, add the following below this CDDL HEADER, with the
17
* fields enclosed by brackets "[]" replaced with your own identifying
18
* information: Portions Copyright [yyyy] [name of copyright owner]
19
*
20
* CDDL HEADER END
21
*/
22
/*
23
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
26
* Copyright (c) 2025, Klara, Inc.
27
*/
28
29
#include <assert.h>
30
#include <fcntl.h>
31
#include <libgen.h>
32
#include <poll.h>
33
#include <stdio.h>
34
#include <stdlib.h>
35
#include <string.h>
36
#include <limits.h>
37
#include <libzutil.h>
38
#include <sys/crypto/icp.h>
39
#include <sys/processor.h>
40
#include <sys/rrwlock.h>
41
#include <sys/spa.h>
42
#include <sys/spa_impl.h>
43
#include <sys/stat.h>
44
#include <sys/systeminfo.h>
45
#include <sys/time.h>
46
#include <sys/utsname.h>
47
#include <sys/zfs_context.h>
48
#include <sys/zfs_onexit.h>
49
#include <sys/zfs_vfsops.h>
50
#include <sys/zstd/zstd.h>
51
#include <sys/zvol.h>
52
#include <zfs_fletcher.h>
53
#include <zlib.h>
54
55
/*
56
* Emulation of kernel services in userland.
57
*/
58
59
uint64_t physmem;
60
uint32_t hostid;
61
struct utsname hw_utsname;
62
63
/* If set, all blocks read will be copied to the specified directory. */
64
char *vn_dumpdir = NULL;
65
66
/* this only exists to have its address taken */
67
struct proc p0;
68
69
/*
70
* =========================================================================
71
* threads
72
* =========================================================================
73
*
74
* TS_STACK_MIN is dictated by the minimum allowed pthread stack size. While
75
* TS_STACK_MAX is somewhat arbitrary, it was selected to be large enough for
76
* the expected stack depth while small enough to avoid exhausting address
77
* space with high thread counts.
78
*/
79
#define TS_STACK_MIN MAX(PTHREAD_STACK_MIN, 32768)
80
#define TS_STACK_MAX (256 * 1024)
81
82
/*
 * Heap-allocated trampoline record: pairs a kernel-style thread entry
 * point with its argument so both can be passed through pthread_create().
 */
struct zk_thread_wrapper {
	void (*func)(void *);
	void *arg;
};

/*
 * pthread start routine.  Takes ownership of the malloc()ed wrapper passed
 * in 'arg': its contents are saved on the stack and the allocation is freed
 * before the real entry point runs.  Always returns NULL.
 */
static void *
zk_thread_wrapper(void *arg)
{
	struct zk_thread_wrapper *heap = arg;
	void (*entry)(void *) = heap->func;
	void *entry_arg = heap->arg;

	free(heap);
	entry(entry_arg);

	return (NULL);
}
96
97
kthread_t *
98
zk_thread_create(const char *name, void (*func)(void *), void *arg,
99
size_t stksize, int state)
100
{
101
pthread_attr_t attr;
102
pthread_t tid;
103
char *stkstr;
104
struct zk_thread_wrapper *ztw;
105
int detachstate = PTHREAD_CREATE_DETACHED;
106
107
VERIFY0(pthread_attr_init(&attr));
108
109
if (state & TS_JOINABLE)
110
detachstate = PTHREAD_CREATE_JOINABLE;
111
112
VERIFY0(pthread_attr_setdetachstate(&attr, detachstate));
113
114
/*
115
* We allow the default stack size in user space to be specified by
116
* setting the ZFS_STACK_SIZE environment variable. This allows us
117
* the convenience of observing and debugging stack overruns in
118
* user space. Explicitly specified stack sizes will be honored.
119
* The usage of ZFS_STACK_SIZE is discussed further in the
120
* ENVIRONMENT VARIABLES sections of the ztest(1) man page.
121
*/
122
if (stksize == 0) {
123
stkstr = getenv("ZFS_STACK_SIZE");
124
125
if (stkstr == NULL)
126
stksize = TS_STACK_MAX;
127
else
128
stksize = MAX(atoi(stkstr), TS_STACK_MIN);
129
}
130
131
VERIFY3S(stksize, >, 0);
132
stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE);
133
134
/*
135
* If this ever fails, it may be because the stack size is not a
136
* multiple of system page size.
137
*/
138
VERIFY0(pthread_attr_setstacksize(&attr, stksize));
139
VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE));
140
141
VERIFY(ztw = malloc(sizeof (*ztw)));
142
ztw->func = func;
143
ztw->arg = arg;
144
VERIFY0(pthread_create(&tid, &attr, zk_thread_wrapper, ztw));
145
VERIFY0(pthread_attr_destroy(&attr));
146
147
pthread_setname_np(tid, name);
148
149
return ((void *)(uintptr_t)tid);
150
}
151
152
/*
153
* =========================================================================
154
* kstats
155
* =========================================================================
156
*/
157
kstat_t *
158
kstat_create(const char *module, int instance, const char *name,
159
const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag)
160
{
161
(void) module, (void) instance, (void) name, (void) class, (void) type,
162
(void) ndata, (void) ks_flag;
163
return (NULL);
164
}
165
166
void
167
kstat_install(kstat_t *ksp)
168
{
169
(void) ksp;
170
}
171
172
void
173
kstat_delete(kstat_t *ksp)
174
{
175
(void) ksp;
176
}
177
178
void
179
kstat_set_raw_ops(kstat_t *ksp,
180
int (*headers)(char *buf, size_t size),
181
int (*data)(char *buf, size_t size, void *data),
182
void *(*addr)(kstat_t *ksp, loff_t index))
183
{
184
(void) ksp, (void) headers, (void) data, (void) addr;
185
}
186
187
/*
188
* =========================================================================
189
* mutexes
190
* =========================================================================
191
*/
192
193
void
194
mutex_init(kmutex_t *mp, char *name, int type, void *cookie)
195
{
196
(void) name, (void) type, (void) cookie;
197
VERIFY0(pthread_mutex_init(&mp->m_lock, NULL));
198
memset(&mp->m_owner, 0, sizeof (pthread_t));
199
}
200
201
void
202
mutex_destroy(kmutex_t *mp)
203
{
204
VERIFY0(pthread_mutex_destroy(&mp->m_lock));
205
}
206
207
void
208
mutex_enter(kmutex_t *mp)
209
{
210
VERIFY0(pthread_mutex_lock(&mp->m_lock));
211
mp->m_owner = pthread_self();
212
}
213
214
int
215
mutex_enter_check_return(kmutex_t *mp)
216
{
217
int error = pthread_mutex_lock(&mp->m_lock);
218
if (error == 0)
219
mp->m_owner = pthread_self();
220
return (error);
221
}
222
223
int
224
mutex_tryenter(kmutex_t *mp)
225
{
226
int error = pthread_mutex_trylock(&mp->m_lock);
227
if (error == 0) {
228
mp->m_owner = pthread_self();
229
return (1);
230
} else {
231
VERIFY3S(error, ==, EBUSY);
232
return (0);
233
}
234
}
235
236
void
237
mutex_exit(kmutex_t *mp)
238
{
239
memset(&mp->m_owner, 0, sizeof (pthread_t));
240
VERIFY0(pthread_mutex_unlock(&mp->m_lock));
241
}
242
243
/*
244
* =========================================================================
245
* rwlocks
246
* =========================================================================
247
*/
248
249
void
250
rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
251
{
252
(void) name, (void) type, (void) arg;
253
VERIFY0(pthread_rwlock_init(&rwlp->rw_lock, NULL));
254
rwlp->rw_readers = 0;
255
rwlp->rw_owner = 0;
256
}
257
258
void
259
rw_destroy(krwlock_t *rwlp)
260
{
261
VERIFY0(pthread_rwlock_destroy(&rwlp->rw_lock));
262
}
263
264
void
265
rw_enter(krwlock_t *rwlp, krw_t rw)
266
{
267
if (rw == RW_READER) {
268
VERIFY0(pthread_rwlock_rdlock(&rwlp->rw_lock));
269
atomic_inc_uint(&rwlp->rw_readers);
270
} else {
271
VERIFY0(pthread_rwlock_wrlock(&rwlp->rw_lock));
272
rwlp->rw_owner = pthread_self();
273
}
274
}
275
276
void
277
rw_exit(krwlock_t *rwlp)
278
{
279
if (RW_READ_HELD(rwlp))
280
atomic_dec_uint(&rwlp->rw_readers);
281
else
282
rwlp->rw_owner = 0;
283
284
VERIFY0(pthread_rwlock_unlock(&rwlp->rw_lock));
285
}
286
287
int
288
rw_tryenter(krwlock_t *rwlp, krw_t rw)
289
{
290
int error;
291
292
if (rw == RW_READER)
293
error = pthread_rwlock_tryrdlock(&rwlp->rw_lock);
294
else
295
error = pthread_rwlock_trywrlock(&rwlp->rw_lock);
296
297
if (error == 0) {
298
if (rw == RW_READER)
299
atomic_inc_uint(&rwlp->rw_readers);
300
else
301
rwlp->rw_owner = pthread_self();
302
303
return (1);
304
}
305
306
VERIFY3S(error, ==, EBUSY);
307
308
return (0);
309
}
310
311
uint32_t
312
zone_get_hostid(void *zonep)
313
{
314
/*
315
* We're emulating the system's hostid in userland.
316
*/
317
(void) zonep;
318
return (hostid);
319
}
320
321
int
322
rw_tryupgrade(krwlock_t *rwlp)
323
{
324
(void) rwlp;
325
return (0);
326
}
327
328
/*
329
* =========================================================================
330
* condition variables
331
* =========================================================================
332
*/
333
334
void
335
cv_init(kcondvar_t *cv, char *name, int type, void *arg)
336
{
337
(void) name, (void) type, (void) arg;
338
VERIFY0(pthread_cond_init(cv, NULL));
339
}
340
341
void
342
cv_destroy(kcondvar_t *cv)
343
{
344
VERIFY0(pthread_cond_destroy(cv));
345
}
346
347
void
348
cv_wait(kcondvar_t *cv, kmutex_t *mp)
349
{
350
memset(&mp->m_owner, 0, sizeof (pthread_t));
351
VERIFY0(pthread_cond_wait(cv, &mp->m_lock));
352
mp->m_owner = pthread_self();
353
}
354
355
int
356
cv_wait_sig(kcondvar_t *cv, kmutex_t *mp)
357
{
358
cv_wait(cv, mp);
359
return (1);
360
}
361
362
int
363
cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
364
{
365
int error;
366
struct timeval tv;
367
struct timespec ts;
368
clock_t delta;
369
370
delta = abstime - ddi_get_lbolt();
371
if (delta <= 0)
372
return (-1);
373
374
VERIFY0(gettimeofday(&tv, NULL));
375
376
ts.tv_sec = tv.tv_sec + delta / hz;
377
ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz);
378
if (ts.tv_nsec >= NANOSEC) {
379
ts.tv_sec++;
380
ts.tv_nsec -= NANOSEC;
381
}
382
383
memset(&mp->m_owner, 0, sizeof (pthread_t));
384
error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
385
mp->m_owner = pthread_self();
386
387
if (error == ETIMEDOUT)
388
return (-1);
389
390
VERIFY0(error);
391
392
return (1);
393
}
394
395
int
396
cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
397
int flag)
398
{
399
(void) res;
400
int error;
401
struct timeval tv;
402
struct timespec ts;
403
hrtime_t delta;
404
405
ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE);
406
407
delta = tim;
408
if (flag & CALLOUT_FLAG_ABSOLUTE)
409
delta -= gethrtime();
410
411
if (delta <= 0)
412
return (-1);
413
414
VERIFY0(gettimeofday(&tv, NULL));
415
416
ts.tv_sec = tv.tv_sec + delta / NANOSEC;
417
ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % NANOSEC);
418
if (ts.tv_nsec >= NANOSEC) {
419
ts.tv_sec++;
420
ts.tv_nsec -= NANOSEC;
421
}
422
423
memset(&mp->m_owner, 0, sizeof (pthread_t));
424
error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
425
mp->m_owner = pthread_self();
426
427
if (error == ETIMEDOUT)
428
return (-1);
429
430
VERIFY0(error);
431
432
return (1);
433
}
434
435
void
436
cv_signal(kcondvar_t *cv)
437
{
438
VERIFY0(pthread_cond_signal(cv));
439
}
440
441
void
442
cv_broadcast(kcondvar_t *cv)
443
{
444
VERIFY0(pthread_cond_broadcast(cv));
445
}
446
447
/*
448
* =========================================================================
449
* procfs list
450
* =========================================================================
451
*/
452
453
/* No-op stand-in: the format and its arguments are discarded. */
void
seq_printf(struct seq_file *m, const char *fmt, ...)
{
	(void) m;
	(void) fmt;
}
458
459
void
460
procfs_list_install(const char *module,
461
const char *submodule,
462
const char *name,
463
mode_t mode,
464
procfs_list_t *procfs_list,
465
int (*show)(struct seq_file *f, void *p),
466
int (*show_header)(struct seq_file *f),
467
int (*clear)(procfs_list_t *procfs_list),
468
size_t procfs_list_node_off)
469
{
470
(void) module, (void) submodule, (void) name, (void) mode, (void) show,
471
(void) show_header, (void) clear;
472
mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
473
list_create(&procfs_list->pl_list,
474
procfs_list_node_off + sizeof (procfs_list_node_t),
475
procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
476
procfs_list->pl_next_id = 1;
477
procfs_list->pl_node_offset = procfs_list_node_off;
478
}
479
480
void
481
procfs_list_uninstall(procfs_list_t *procfs_list)
482
{
483
(void) procfs_list;
484
}
485
486
void
487
procfs_list_destroy(procfs_list_t *procfs_list)
488
{
489
ASSERT(list_is_empty(&procfs_list->pl_list));
490
list_destroy(&procfs_list->pl_list);
491
mutex_destroy(&procfs_list->pl_lock);
492
}
493
494
/*
 * Lvalue for the pln_id of the procfs_list_node_t embedded in 'obj' at
 * the list's pl_node_offset.  Both arguments are parenthesized so that
 * any expression can be passed safely.
 */
#define	NODE_ID(procfs_list, obj) \
	(((procfs_list_node_t *)(((char *)(obj)) + \
	(procfs_list)->pl_node_offset))->pln_id)
497
498
void
499
procfs_list_add(procfs_list_t *procfs_list, void *p)
500
{
501
ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
502
NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
503
list_insert_tail(&procfs_list->pl_list, p);
504
}
505
506
/*
507
* =========================================================================
508
* vnode operations
509
* =========================================================================
510
*/
511
512
/*
513
* =========================================================================
514
* Figure out which debugging statements to print
515
* =========================================================================
516
*/
517
518
/* Comma-separated list of debug selectors; set by dprintf_setup(). */
static char *dprintf_string;
/* Non-zero once the "on" selector has been seen. */
static int dprintf_print_all;

/*
 * Report whether 'string' appears as a complete element of the
 * comma-separated selector list (format: file1.c,function_name1,file2.c).
 * Returns 1 on a match, 0 otherwise or when no list is set.
 */
int
dprintf_find_string(const char *string)
{
	int len = strlen(string);

	for (char *cur = dprintf_string; cur != NULL; ) {
		/* Match only when the element ends right after 'string'. */
		if (strncmp(cur, string, len) == 0 &&
		    (cur[len] == ',' || cur[len] == '\0'))
			return (1);

		cur = strchr(cur, ',');
		if (cur != NULL)
			cur++;		/* step over the ',' */
	}

	return (0);
}
542
543
void
544
dprintf_setup(int *argc, char **argv)
545
{
546
int i, j;
547
548
/*
549
* Debugging can be specified two ways: by setting the
550
* environment variable ZFS_DEBUG, or by including a
551
* "debug=..." argument on the command line. The command
552
* line setting overrides the environment variable.
553
*/
554
555
for (i = 1; i < *argc; i++) {
556
int len = strlen("debug=");
557
/* First look for a command line argument */
558
if (strncmp("debug=", argv[i], len) == 0) {
559
dprintf_string = argv[i] + len;
560
/* Remove from args */
561
for (j = i; j < *argc; j++)
562
argv[j] = argv[j+1];
563
argv[j] = NULL;
564
(*argc)--;
565
}
566
}
567
568
if (dprintf_string == NULL) {
569
/* Look for ZFS_DEBUG environment variable */
570
dprintf_string = getenv("ZFS_DEBUG");
571
}
572
573
/*
574
* Are we just turning on all debugging?
575
*/
576
if (dprintf_find_string("on"))
577
dprintf_print_all = 1;
578
579
if (dprintf_string != NULL)
580
zfs_flags |= ZFS_DEBUG_DPRINTF;
581
}
582
583
/*
584
* =========================================================================
585
* debug printfs
586
* =========================================================================
587
*/
588
void
589
__dprintf(boolean_t dprint, const char *file, const char *func,
590
int line, const char *fmt, ...)
591
{
592
/* Get rid of annoying "../common/" prefix to filename. */
593
const char *newfile = zfs_basename(file);
594
595
va_list adx;
596
if (dprint) {
597
/* dprintf messages are printed immediately */
598
599
if (!dprintf_print_all &&
600
!dprintf_find_string(newfile) &&
601
!dprintf_find_string(func))
602
return;
603
604
/* Print out just the function name if requested */
605
flockfile(stdout);
606
if (dprintf_find_string("pid"))
607
(void) printf("%d ", getpid());
608
if (dprintf_find_string("tid"))
609
(void) printf("%ju ",
610
(uintmax_t)(uintptr_t)pthread_self());
611
if (dprintf_find_string("cpu"))
612
(void) printf("%u ", getcpuid());
613
if (dprintf_find_string("time"))
614
(void) printf("%llu ", gethrtime());
615
if (dprintf_find_string("long"))
616
(void) printf("%s, line %d: ", newfile, line);
617
(void) printf("dprintf: %s: ", func);
618
va_start(adx, fmt);
619
(void) vprintf(fmt, adx);
620
va_end(adx);
621
funlockfile(stdout);
622
} else {
623
/* zfs_dbgmsg is logged for dumping later */
624
size_t size;
625
char *buf;
626
int i;
627
628
size = 1024;
629
buf = umem_alloc(size, UMEM_NOFAIL);
630
i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);
631
632
if (i < size) {
633
va_start(adx, fmt);
634
(void) vsnprintf(buf + i, size - i, fmt, adx);
635
va_end(adx);
636
}
637
638
__zfs_dbgmsg(buf);
639
640
umem_free(buf, size);
641
}
642
}
643
644
/*
645
* =========================================================================
646
* cmn_err() and panic()
647
* =========================================================================
648
*/
649
650
/*
 * Final step of every panic path.  If LIBZPOOL_PANIC_STOP is set to a
 * non-zero number, SIGSTOP is first sent to the whole process group; once
 * execution continues (or immediately otherwise) the process aborts.
 */
static __attribute__((noreturn)) void
panic_stop_or_abort(void)
{
	const char *stopenv = getenv("LIBZPOOL_PANIC_STOP");
	int want_stop = (stopenv != NULL && atoi(stopenv));

	if (want_stop) {
		fputs("libzpool: LIBZPOOL_PANIC_STOP is set, sending "
		    "SIGSTOP to process group\n", stderr);
		fflush(stderr);

		kill(0, SIGSTOP);

		fputs("libzpool: continued after panic stop, "
		    "aborting\n", stderr);
	}

	abort();	/* think of it as a "user-level crash dump" */
}
667
668
static void
669
vcmn_msg(int ce, const char *fmt, va_list adx)
670
{
671
switch (ce) {
672
case CE_IGNORE:
673
return;
674
case CE_CONT:
675
break;
676
case CE_NOTE:
677
fputs("libzpool: NOTICE: ", stderr);
678
break;
679
case CE_WARN:
680
fputs("libzpool: WARNING: ", stderr);
681
break;
682
case CE_PANIC:
683
fputs("libzpool: PANIC: ", stderr);
684
break;
685
default:
686
fputs("libzpool: [unknown severity %d]: ", stderr);
687
break;
688
}
689
690
vfprintf(stderr, fmt, adx);
691
if (ce != CE_CONT)
692
fputc('\n', stderr);
693
fflush(stderr);
694
}
695
696
void
697
vcmn_err(int ce, const char *fmt, va_list adx)
698
{
699
vcmn_msg(ce, fmt, adx);
700
701
if (ce == CE_PANIC)
702
panic_stop_or_abort();
703
}
704
705
/* Variadic front end for vcmn_err(); does not return for CE_PANIC. */
void
cmn_err(int ce, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vcmn_err(ce, fmt, args);
	va_end(args);
}
714
715
__attribute__((noreturn)) void
716
panic(const char *fmt, ...)
717
{
718
va_list adx;
719
720
va_start(adx, fmt);
721
vcmn_msg(CE_PANIC, fmt, adx);
722
va_end(adx);
723
724
panic_stop_or_abort();
725
}
726
727
__attribute__((noreturn)) void
728
vpanic(const char *fmt, va_list adx)
729
{
730
vcmn_msg(CE_PANIC, fmt, adx);
731
panic_stop_or_abort();
732
}
733
734
/*
735
* =========================================================================
736
* misc routines
737
* =========================================================================
738
*/
739
740
void
741
delay(clock_t ticks)
742
{
743
(void) poll(0, 0, ticks * (1000 / hz));
744
}
745
746
/*
747
* Find highest one bit set.
748
* Returns bit number + 1 of highest bit that is set, otherwise returns 0.
749
* The __builtin_clzll() function is supported by both GCC and Clang.
750
*/
751
int
highbit64(uint64_t i)
{
	int nbits = NBBY * sizeof (uint64_t);

	/* __builtin_clzll() is undefined for 0, so handle it up front. */
	if (i == 0)
		return (0);

	return (nbits - __builtin_clzll(i));
}
759
760
/*
761
* Find lowest one bit set.
762
* Returns bit number + 1 of lowest bit that is set, otherwise returns 0.
763
* The __builtin_ffsll() function is supported by both GCC and Clang.
764
*/
765
int
lowbit64(uint64_t i)
{
	return ((i == 0) ? 0 : __builtin_ffsll(i));
}
773
774
/* Device nodes opened by random_init(). */
const char *random_path = "/dev/random";
const char *urandom_path = "/dev/urandom";
/* Descriptors for the two devices; -1 while closed. */
static int random_fd = -1;
static int urandom_fd = -1;
777
778
void
779
random_init(void)
780
{
781
VERIFY((random_fd = open(random_path, O_RDONLY | O_CLOEXEC)) != -1);
782
VERIFY((urandom_fd = open(urandom_path, O_RDONLY | O_CLOEXEC)) != -1);
783
}
784
785
void
786
random_fini(void)
787
{
788
close(random_fd);
789
close(urandom_fd);
790
791
random_fd = -1;
792
urandom_fd = -1;
793
}
794
795
static int
796
random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
797
{
798
size_t resid = len;
799
ssize_t bytes;
800
801
ASSERT(fd != -1);
802
803
while (resid != 0) {
804
bytes = read(fd, ptr, resid);
805
ASSERT3S(bytes, >=, 0);
806
ptr += bytes;
807
resid -= bytes;
808
}
809
810
return (0);
811
}
812
813
int
814
random_get_bytes(uint8_t *ptr, size_t len)
815
{
816
return (random_get_bytes_common(ptr, len, random_fd));
817
}
818
819
int
820
random_get_pseudo_bytes(uint8_t *ptr, size_t len)
821
{
822
return (random_get_bytes_common(ptr, len, urandom_fd));
823
}
824
825
int
826
ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
827
{
828
errno = 0;
829
*result = strtoull(str, nptr, base);
830
if (*result == 0)
831
return (errno);
832
return (0);
833
}
834
835
utsname_t *
836
utsname(void)
837
{
838
return (&hw_utsname);
839
}
840
841
/*
842
* =========================================================================
843
* kernel emulation setup & teardown
844
* =========================================================================
845
*/
846
/*
 * Out-of-memory callback (registered in kernel_init()): report the
 * condition on stderr and abort.  The return statement is never reached.
 */
static int
umem_out_of_memory(void)
{
	static const char errmsg[] =
	    "out of memory -- generating core dump\n";

	(void) fprintf(stderr, "%s", errmsg);
	abort();
	return (0);
}
855
856
static void
857
spa_config_load(void)
858
{
859
void *buf = NULL;
860
nvlist_t *nvlist, *child;
861
nvpair_t *nvpair;
862
char *pathname;
863
zfs_file_t *fp;
864
zfs_file_attr_t zfa;
865
uint64_t fsize;
866
int err;
867
868
/*
869
* Open the configuration file.
870
*/
871
pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
872
873
(void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
874
875
err = zfs_file_open(pathname, O_RDONLY, 0, &fp);
876
if (err)
877
err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp);
878
879
kmem_free(pathname, MAXPATHLEN);
880
881
if (err)
882
return;
883
884
if (zfs_file_getattr(fp, &zfa))
885
goto out;
886
887
fsize = zfa.zfa_size;
888
buf = kmem_alloc(fsize, KM_SLEEP);
889
890
/*
891
* Read the nvlist from the file.
892
*/
893
if (zfs_file_read(fp, buf, fsize, NULL) < 0)
894
goto out;
895
896
/*
897
* Unpack the nvlist.
898
*/
899
if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
900
goto out;
901
902
/*
903
* Iterate over all elements in the nvlist, creating a new spa_t for
904
* each one with the specified configuration.
905
*/
906
mutex_enter(&spa_namespace_lock);
907
nvpair = NULL;
908
while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
909
if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
910
continue;
911
912
child = fnvpair_value_nvlist(nvpair);
913
914
if (spa_lookup(nvpair_name(nvpair)) != NULL)
915
continue;
916
(void) spa_add(nvpair_name(nvpair), child, NULL);
917
}
918
mutex_exit(&spa_namespace_lock);
919
920
nvlist_free(nvlist);
921
922
out:
923
if (buf != NULL)
924
kmem_free(buf, fsize);
925
926
zfs_file_close(fp);
927
}
928
929
void
930
kernel_init(int mode)
931
{
932
extern uint_t rrw_tsd_key;
933
934
umem_nofail_callback(umem_out_of_memory);
935
936
physmem = sysconf(_SC_PHYS_PAGES);
937
938
dprintf("physmem = %llu pages (%.2f GB)\n", (u_longlong_t)physmem,
939
(double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
940
941
hostid = (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0;
942
943
random_init();
944
945
VERIFY0(uname(&hw_utsname));
946
947
system_taskq_init();
948
icp_init();
949
950
zstd_init();
951
952
spa_init((spa_mode_t)mode);
953
spa_config_load();
954
955
fletcher_4_init();
956
957
tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
958
}
959
960
/*
 * Shut down the userland kernel emulation, stopping the subsystems in
 * the reverse of the order in which kernel_init() started them.
 */
void
kernel_fini(void)
{
	fletcher_4_fini();
	spa_fini();

	zstd_fini();

	icp_fini();
	system_taskq_fini();

	random_fini();
}
973
974
uid_t
975
crgetuid(cred_t *cr)
976
{
977
(void) cr;
978
return (0);
979
}
980
981
uid_t
982
crgetruid(cred_t *cr)
983
{
984
(void) cr;
985
return (0);
986
}
987
988
gid_t
989
crgetgid(cred_t *cr)
990
{
991
(void) cr;
992
return (0);
993
}
994
995
int
996
crgetngroups(cred_t *cr)
997
{
998
(void) cr;
999
return (0);
1000
}
1001
1002
gid_t *
1003
crgetgroups(cred_t *cr)
1004
{
1005
(void) cr;
1006
return (NULL);
1007
}
1008
1009
int
1010
zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
1011
{
1012
(void) name, (void) cr;
1013
return (0);
1014
}
1015
1016
int
1017
zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
1018
{
1019
(void) from, (void) to, (void) cr;
1020
return (0);
1021
}
1022
1023
int
1024
zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
1025
{
1026
(void) name, (void) cr;
1027
return (0);
1028
}
1029
1030
int
1031
secpolicy_zfs(const cred_t *cr)
1032
{
1033
(void) cr;
1034
return (0);
1035
}
1036
1037
ksiddomain_t *
1038
ksid_lookupdomain(const char *dom)
1039
{
1040
ksiddomain_t *kd;
1041
1042
kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
1043
kd->kd_name = spa_strdup(dom);
1044
return (kd);
1045
}
1046
1047
void
1048
ksiddomain_rele(ksiddomain_t *ksid)
1049
{
1050
spa_strfree(ksid->kd_name);
1051
umem_free(ksid, sizeof (ksiddomain_t));
1052
}
1053
1054
/*
 * Format into a newly allocated string using vasprintf().  The caller's
 * va_list is left untouched; a copy is consumed instead.  Formatting or
 * allocation failure trips a VERIFY.
 */
char *
kmem_vasprintf(const char *fmt, va_list adx)
{
	va_list adx_copy;
	char *buf = NULL;

	va_copy(adx_copy, adx);
	VERIFY(vasprintf(&buf, fmt, adx_copy) != -1);
	va_end(adx_copy);

	return (buf);
}
1066
1067
/*
 * printf-style formatting into a newly allocated string using
 * vasprintf().  Formatting or allocation failure trips a VERIFY.
 */
char *
kmem_asprintf(const char *fmt, ...)
{
	va_list args;
	char *buf = NULL;

	va_start(args, fmt);
	VERIFY(vasprintf(&buf, fmt, args) != -1);
	va_end(args);

	return (buf);
}
1079
1080
/*
 * snprintf() returns the number of characters that it would have printed
 * whenever it is limited by the value of the size variable, rather than
 * the number of characters that it did print.  This can cause misbehavior
 * on subsequent uses of the return value, so we define a safe version
 * that returns the number of characters actually stored, not counting the
 * terminating NUL character.  Subsequent use of this by the safe string
 * functions is safe whether it is snprintf(), strlcat() or strlcpy().
 *
 * A size of 0 writes nothing and returns 0.  If vsnprintf() reports an
 * error, 'str' is set to the empty string and 0 is returned.
 */
int
kmem_scnprintf(char *restrict str, size_t size, const char *restrict fmt, ...)
{
	int n;
	va_list ap;

	/* Make the 0 case a no-op so that we do not return -1 */
	if (size == 0)
		return (0);

	va_start(ap, fmt);
	n = vsnprintf(str, size, fmt, ap);
	va_end(ap);

	/*
	 * A negative result must be handled before the size comparison:
	 * compared as unsigned it would look like a truncation and be
	 * reported as size - 1 characters written.
	 */
	if (n < 0) {
		str[0] = '\0';
		return (0);
	}

	if ((size_t)n >= size)
		n = size - 1;

	return (n);
}
1108
1109
zfs_file_t *
1110
zfs_onexit_fd_hold(int fd, minor_t *minorp)
1111
{
1112
(void) fd;
1113
*minorp = 0;
1114
return (NULL);
1115
}
1116
1117
void
1118
zfs_onexit_fd_rele(zfs_file_t *fp)
1119
{
1120
(void) fp;
1121
}
1122
1123
int
1124
zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
1125
uintptr_t *action_handle)
1126
{
1127
(void) minor, (void) func, (void) data, (void) action_handle;
1128
return (0);
1129
}
1130
1131
fstrans_cookie_t
1132
spl_fstrans_mark(void)
1133
{
1134
return ((fstrans_cookie_t)0);
1135
}
1136
1137
void
1138
spl_fstrans_unmark(fstrans_cookie_t cookie)
1139
{
1140
(void) cookie;
1141
}
1142
1143
/* Cache reaping is not emulated: a reap is never reported as active. */
int
kmem_cache_reap_active(void)
{
	const int active = 0;

	return (active);
}
1148
1149
/* No-op: zvol minors are not created in userland. */
void
zvol_create_minors(const char *name)
{
	(void) name;
}
1154
1155
void
1156
zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
1157
{
1158
(void) spa, (void) name, (void) async;
1159
}
1160
1161
void
1162
zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname,
1163
boolean_t async)
1164
{
1165
(void) spa, (void) oldname, (void) newname, (void) async;
1166
}
1167
1168
/*
1169
* Open file
1170
*
1171
* path - fully qualified path to file
1172
* flags - file attributes O_READ / O_WRITE / O_EXCL
1173
* fpp - pointer to return file pointer
1174
*
1175
* Returns 0 on success underlying error on failure.
1176
*/
1177
int
1178
zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
1179
{
1180
int fd;
1181
int dump_fd;
1182
int err;
1183
int old_umask = 0;
1184
zfs_file_t *fp;
1185
struct stat64 st;
1186
1187
if (!(flags & O_CREAT) && stat64(path, &st) == -1)
1188
return (errno);
1189
1190
if (!(flags & O_CREAT) && S_ISBLK(st.st_mode))
1191
flags |= O_DIRECT;
1192
1193
if (flags & O_CREAT)
1194
old_umask = umask(0);
1195
1196
fd = open64(path, flags, mode);
1197
if (fd == -1)
1198
return (errno);
1199
1200
if (flags & O_CREAT)
1201
(void) umask(old_umask);
1202
1203
if (vn_dumpdir != NULL) {
1204
char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL);
1205
const char *inpath = zfs_basename(path);
1206
1207
(void) snprintf(dumppath, MAXPATHLEN,
1208
"%s/%s", vn_dumpdir, inpath);
1209
dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);
1210
umem_free(dumppath, MAXPATHLEN);
1211
if (dump_fd == -1) {
1212
err = errno;
1213
close(fd);
1214
return (err);
1215
}
1216
} else {
1217
dump_fd = -1;
1218
}
1219
1220
(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
1221
1222
fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL);
1223
fp->f_fd = fd;
1224
fp->f_dump_fd = dump_fd;
1225
*fpp = fp;
1226
1227
return (0);
1228
}
1229
1230
void
1231
zfs_file_close(zfs_file_t *fp)
1232
{
1233
close(fp->f_fd);
1234
if (fp->f_dump_fd != -1)
1235
close(fp->f_dump_fd);
1236
1237
umem_free(fp, sizeof (zfs_file_t));
1238
}
1239
1240
/*
1241
* Stateful write - use os internal file pointer to determine where to
1242
* write and update on successful completion.
1243
*
1244
* fp - pointer to file (pipe, socket, etc) to write to
1245
* buf - buffer to write
1246
* count - # of bytes to write
1247
* resid - pointer to count of unwritten bytes (if short write)
1248
*
1249
* Returns 0 on success errno on failure.
1250
*/
1251
int
1252
zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
1253
{
1254
ssize_t rc;
1255
1256
rc = write(fp->f_fd, buf, count);
1257
if (rc < 0)
1258
return (errno);
1259
1260
if (resid) {
1261
*resid = count - rc;
1262
} else if (rc != count) {
1263
return (EIO);
1264
}
1265
1266
return (0);
1267
}
1268
1269
/*
1270
* Stateless write - os internal file pointer is not updated.
1271
*
1272
* fp - pointer to file (pipe, socket, etc) to write to
1273
* buf - buffer to write
1274
* count - # of bytes to write
1275
* off - file offset to write to (only valid for seekable types)
1276
* resid - pointer to count of unwritten bytes
1277
*
1278
* Returns 0 on success errno on failure.
1279
*/
1280
int
1281
zfs_file_pwrite(zfs_file_t *fp, const void *buf,
1282
size_t count, loff_t pos, uint8_t ashift, ssize_t *resid)
1283
{
1284
ssize_t rc, split, done;
1285
int sectors;
1286
1287
/*
1288
* To simulate partial disk writes, we split writes into two
1289
* system calls so that the process can be killed in between.
1290
* This is used by ztest to simulate realistic failure modes.
1291
*/
1292
sectors = count >> ashift;
1293
split = (sectors > 0 ? rand() % sectors : 0) << ashift;
1294
rc = pwrite64(fp->f_fd, buf, split, pos);
1295
if (rc != -1) {
1296
done = rc;
1297
rc = pwrite64(fp->f_fd, (char *)buf + split,
1298
count - split, pos + split);
1299
}
1300
#ifdef __linux__
1301
if (rc == -1 && errno == EINVAL) {
1302
/*
1303
* Under Linux, this most likely means an alignment issue
1304
* (memory or disk) due to O_DIRECT, so we abort() in order
1305
* to catch the offender.
1306
*/
1307
abort();
1308
}
1309
#endif
1310
1311
if (rc < 0)
1312
return (errno);
1313
1314
done += rc;
1315
1316
if (resid) {
1317
*resid = count - done;
1318
} else if (done != count) {
1319
return (EIO);
1320
}
1321
1322
return (0);
1323
}
1324
1325
/*
1326
* Stateful read - use os internal file pointer to determine where to
1327
* read and update on successful completion.
1328
*
1329
* fp - pointer to file (pipe, socket, etc) to read from
1330
* buf - buffer to write
1331
* count - # of bytes to read
1332
* resid - pointer to count of unread bytes (if short read)
1333
*
1334
* Returns 0 on success errno on failure.
1335
*/
1336
int
1337
zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
1338
{
1339
int rc;
1340
1341
rc = read(fp->f_fd, buf, count);
1342
if (rc < 0)
1343
return (errno);
1344
1345
if (resid) {
1346
*resid = count - rc;
1347
} else if (rc != count) {
1348
return (EIO);
1349
}
1350
1351
return (0);
1352
}
1353
1354
/*
1355
* Stateless read - os internal file pointer is not updated.
1356
*
1357
* fp - pointer to file (pipe, socket, etc) to read from
1358
* buf - buffer to write
1359
* count - # of bytes to write
1360
* off - file offset to read from (only valid for seekable types)
1361
* resid - pointer to count of unwritten bytes (if short write)
1362
*
1363
* Returns 0 on success errno on failure.
1364
*/
1365
int
1366
zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
1367
ssize_t *resid)
1368
{
1369
ssize_t rc;
1370
1371
rc = pread64(fp->f_fd, buf, count, off);
1372
if (rc < 0) {
1373
#ifdef __linux__
1374
/*
1375
* Under Linux, this most likely means an alignment issue
1376
* (memory or disk) due to O_DIRECT, so we abort() in order to
1377
* catch the offender.
1378
*/
1379
if (errno == EINVAL)
1380
abort();
1381
#endif
1382
return (errno);
1383
}
1384
1385
if (fp->f_dump_fd != -1) {
1386
int status;
1387
1388
status = pwrite64(fp->f_dump_fd, buf, rc, off);
1389
ASSERT(status != -1);
1390
}
1391
1392
if (resid) {
1393
*resid = count - rc;
1394
} else if (rc != count) {
1395
return (EIO);
1396
}
1397
1398
return (0);
1399
}
1400
1401
/*
1402
* lseek - set / get file pointer
1403
*
1404
* fp - pointer to file (pipe, socket, etc) to read from
1405
* offp - value to seek to, returns current value plus passed offset
1406
* whence - see man pages for standard lseek whence values
1407
*
1408
* Returns 0 on success errno on failure (ESPIPE for non seekable types)
1409
*/
1410
int
1411
zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
1412
{
1413
loff_t rc;
1414
1415
rc = lseek(fp->f_fd, *offp, whence);
1416
if (rc < 0)
1417
return (errno);
1418
1419
*offp = rc;
1420
1421
return (0);
1422
}
1423
1424
/*
1425
* Get file attributes
1426
*
1427
* filp - file pointer
1428
* zfattr - pointer to file attr structure
1429
*
1430
* Currently only used for fetching size and file mode
1431
*
1432
* Returns 0 on success or error code of underlying getattr call on failure.
1433
*/
1434
int
1435
zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
1436
{
1437
struct stat64 st;
1438
1439
if (fstat64_blk(fp->f_fd, &st) == -1)
1440
return (errno);
1441
1442
zfattr->zfa_size = st.st_size;
1443
zfattr->zfa_mode = st.st_mode;
1444
1445
return (0);
1446
}
1447
1448
/*
1449
* Sync file to disk
1450
*
1451
* filp - file pointer
1452
* flags - O_SYNC and or O_DSYNC
1453
*
1454
* Returns 0 on success or error code of underlying sync call on failure.
1455
*/
1456
int
1457
zfs_file_fsync(zfs_file_t *fp, int flags)
1458
{
1459
(void) flags;
1460
1461
if (fsync(fp->f_fd) < 0)
1462
return (errno);
1463
1464
return (0);
1465
}
1466
1467
/*
1468
* deallocate - zero and/or deallocate file storage
1469
*
1470
* fp - file pointer
1471
* offset - offset to start zeroing or deallocating
1472
* len - length to zero or deallocate
1473
*/
1474
int
1475
zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
1476
{
1477
int rc;
1478
#if defined(__linux__)
1479
rc = fallocate(fp->f_fd,
1480
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
1481
#elif defined(__FreeBSD__) && (__FreeBSD_version >= 1400029)
1482
struct spacectl_range rqsr = {
1483
.r_offset = offset,
1484
.r_len = len,
1485
};
1486
rc = fspacectl(fp->f_fd, SPACECTL_DEALLOC, &rqsr, 0, &rqsr);
1487
#else
1488
(void) fp, (void) offset, (void) len;
1489
rc = EOPNOTSUPP;
1490
#endif
1491
if (rc)
1492
return (SET_ERROR(rc));
1493
return (0);
1494
}
1495
1496
/*
1497
* Request current file pointer offset
1498
*
1499
* fp - pointer to file
1500
*
1501
* Returns current file offset.
1502
*/
1503
loff_t
1504
zfs_file_off(zfs_file_t *fp)
1505
{
1506
return (lseek(fp->f_fd, SEEK_CUR, 0));
1507
}
1508
1509
/*
 * unlink file
 *
 * path - fully qualified file path
 *
 * Thin wrapper around remove(); its return value is handed back as is,
 * i.e. 0 on success.
 *
 * OPTIONAL
 */
int
zfs_file_unlink(const char *path)
{
	int rc = remove(path);

	return (rc);
}
1523
1524
/*
1525
* Get reference to file pointer
1526
*
1527
* fd - input file descriptor
1528
*
1529
* Returns pointer to file struct or NULL.
1530
* Unsupported in user space.
1531
*/
1532
zfs_file_t *
1533
zfs_file_get(int fd)
1534
{
1535
(void) fd;
1536
abort();
1537
return (NULL);
1538
}
1539
/*
1540
* Drop reference to file pointer
1541
*
1542
* fp - pointer to file struct
1543
*
1544
* Unsupported in user space.
1545
*/
1546
void
1547
zfs_file_put(zfs_file_t *fp)
1548
{
1549
abort();
1550
(void) fp;
1551
}
1552
1553
/*
 * No-op in this file: both names are accepted and ignored.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	(void) oldname;
	(void) newname;
}
1558
1559
void
1560
spa_import_os(spa_t *spa)
1561
{
1562
(void) spa;
1563
}
1564
1565
void
1566
spa_export_os(spa_t *spa)
1567
{
1568
(void) spa;
1569
}
1570
1571
void
1572
spa_activate_os(spa_t *spa)
1573
{
1574
(void) spa;
1575
}
1576
1577
void
1578
spa_deactivate_os(spa_t *spa)
1579
{
1580
(void) spa;
1581
}
1582
1583